# Appraisal vs Metaphor

## Import Packages

In [1]:
import os
import pandas as pd
import re
from ast import literal_eval

## Reading and Cleaning Data

### Metaphor Annotations

In [2]:
# borrowed from Jodie
# helper function to build a list of lists containing the start and end indices
# and the difference between these indices
# where labels is the labels associated with a specific text
def labels_to_list(labels):
  annotations = []

  labels = literal_eval(labels)

  for label in labels:
    tags = []
    tags.append(int(label['start']))
    tags.append(int(label['end']))

    annotations.append(tags)

  return annotations

In [3]:
# load in metaphor annotation json and save it to a pandas dataframe
file_path = 'metaphor_labels_mar26_2025.json'
met_df = pd.read_json(file_path)

# extracting labels from the annotations column
met_df['labels'] = met_df.apply(lambda row: row.annotations[0]['result'], axis=1)

# changing file names so they match the appraisal folder names
met_df['filename'] = met_df.apply(lambda row: re.sub(r"^[^_]*-", '', row.file_upload), axis=1)
met_df['filename'] = met_df.apply(lambda row: re.sub(r"_fixed", '', row.filename[:-4]), axis=1)

In [4]:
# creating a dictionary of metaphor labels, where each key is a filename
met_labels = {}
for name in met_df['filename'].unique():
    # creating a new dataframe only containing labels corresponding to one file
    new_df = met_df[met_df['filename']==name][['filename', 'labels']].reset_index()
    # creating a list to save the labels in 
    labels_list = []
    for el in new_df['labels'][0]:
        # adding labels to the label list using the helper function
        labels_list.append(labels_to_list(str([el['value']]))[0])
    # saving the list of labels to the dictionary 
    met_labels[name] = labels_list

### Appraisal Annotations

In [5]:
def label_extractor(dic, txt):
    '''
    takes dic (a dictionary of dataframes) and txt (a string containing the label type to extract) as input
    returns a dictionary with the same keys as dic, where the values are lists of label lists (e.g., [[1,7],[9,15]])
    '''
    # creating a dictionary of labels, where each key is a filename and each value is a list of labels
    labels_dic = {}
    # looping through all the keys and values in the input dictionary
    for name, df in dic.items():
        # creating an empty list of labels labels
        labels_list = []
        # dropping rows with no labels 
        cleaned_df = df[df.label != '_'][['indices', 'label']].dropna()
        # if there are no labels, assigns an empty list
        if len(cleaned_df.label) == 0:
            labels_dic[name]=labels_list
        else:
            # extracting rows that contain the input txt in the label column
            cleaned_df = cleaned_df[cleaned_df.label.str.contains(txt)]
            # adding labels to the list of labels
            for r in cleaned_df.indices:
                labels_list.append([int(x) for x in r.split('-')])
            labels_dic[name]=labels_list
    return labels_dic

In [6]:
# creating a dictionary of appraisal annotation dataframes, where each key is a filename
appraisal_dict = {}

# looping through each file name in the metaphor annotations 
for folder_id in met_df['filename'].unique():
    path = 'SOCC/annotated/Appraisal/Appraisal_annotations/curation/' + folder_id
    try:
        # loading in appraisal annotations in folders that end with '.txt'
        filename = os.listdir(path + '.txt')[0]
        # reading and saving the annotations to a pandas dataframe
        df = pd.read_csv(path + '.txt/' + filename, sep = '\t', header = None, 
                         skiprows=6, names=['no.','indices','text','attitude','label','polarity'])
        # saving the dataframe to the appraisal dictionary 
        appraisal_dict[folder_id] = df
    except:
        try:
            # loading in appraisal annotations in folders that end with '.tsv'
            filename = os.listdir(path + '.tsv')[0]
            # reading and saving the annotations to a pandas dataframe
            df = pd.read_csv(path + '.tsv/' + filename, sep = '\t', header = None, 
            skiprows=6, names=['no.','indices','text','attitude','label','polarity'])
            # saving the dataframe to the appraisal dictionary
            appraisal_dict[folder_id] = df
        except:
            # prints the name of any file for which there is no appraisal annotation
            print('DOES NOT EXIST:', folder_id)

DOES NOT EXIST: aboriginal_17


In [7]:
# a dictionary where every key is a file name and every value is a list of the character indices in the negative labels 
neg_appr_labels = label_extractor(appraisal_dict, 'neg')
# a dictionary where every key is a file name and every value is a list of the character indices in the positive labels 
pos_appr_labels = label_extractor(appraisal_dict, 'pos')

## Comparison 
### what are we measuring? how?
We are deriving four metrics: (1) the percentage of metaphorical units that are labelled as positive, (2) the percentage of positive units that are labelled as metaphorical, (3) the percentage of metaphorical units that are labelled as negative, and (4) the percentage of negative units that are labelled as metaphorical. For (1), we first go through each label in the metaphor annotations, and check if 30% of the characters show up in a positive label. If they do, we say that that metaphorical unit is labelled as positive. We then calculate the percentage of positive metaphorical units by dividing the number of metaphors that are labelled as positive by the number of units labelled as metaphorical overall. The methodology is similar for (2)-(4). 

In [8]:
def overlap_calculator(dic1, dic2):
    '''
    finds the percentages of overlap between two dictionaries of labels and returns a dictionary where the values are percentages
    '''
    # setting counter for number of labels overall
    dic1_total_labels, dic2_total_labels = 0, 0
    
    # number of labels in each dic that overlap (e.g., if dic1 has labels [1,3] and [4,9] for a file 
    # and dic2 has a label [1,9] for the same file, then dic1 has two overlapping labels with dic2 
    # and dic2 has 1 overlapping label with dic1)
    overall_overlap_cnt_dic1, overall_overlap_cnt_dic2 = 0, 0
    
    for file in list(dic1.keys()):
        # initializing overlap count for each file in each dictionary; setting it to 0
        overlap_cnt_dic1, overlap_cnt_dic2 = 0, 0

        # loops through each label list in the list of list of labels corresponding to each key in the dictionary 
        for label_dic1 in dic1[file]:
            # creating a set of the characters contained in the indices (e.g., [1,5] -> [1,2,3,4,5])
            label_dic1_characters = set(list(range(label_dic1[0],label_dic1[1]+1)))
            # loops through the labels in the second dictionary for the same key
            for label_dic2 in dic2[file]:
                # creating a set of the characters contained in the indices (e.g., [1,5] -> [1,2,3,4,5])
                label_dic2_characters = set(list(range(label_dic2[0],label_dic2[1]+1)))
                # variable representing the characters included in both labels (intersection)
                overlap = label_dic1_characters & label_dic2_characters
                # variable representing the elements included in either label (union)
                universe = label_dic2_characters | label_dic2_characters
                
                # calculating overlap % if at least one element exists in both labels 
                if len(overlap) > 0:
                    # the percentage of the first label that is included in the second label
                    result_dic1 = float(len(overlap)) / len(label_dic1_characters) * 100
                    # the percentage of the second label that is included in the first label
                    result_dic2 = float(len(overlap)) / len(label_dic2_characters) * 100
                    # if at least 30% of the first label is included in the second label, it counts as overlap
                    if result_dic1 >= 30:
                        # increasing counters by 1
                        overlap_cnt_dic1+=1
                        overall_overlap_cnt_dic1+=1
                    # if at least 30% of the second label is included in the first label, it counts as overlap
                    if result_dic2 >= 30:
                        # increasing counters by 1
                        overlap_cnt_dic2+=1
                        overall_overlap_cnt_dic2+=1

        # adding to counter of total labels for dic1
        dic1_total_labels = dic1_total_labels + len(dic1[file])
        # adding to counter of total labels for dic2
        dic2_total_labels = dic2_total_labels + len(dic2[file])
    return {'percentage of dic1 units that are labelled in dic2': overall_overlap_cnt_dic1/dic1_total_labels,
            'percentage of dic2 units that are labelled in dic1': overall_overlap_cnt_dic2/dic2_total_labels}

In [9]:
overlap_percentage_pos = overlap_calculator(pos_appr_labels, met_labels).values()
print('percentage of metaphorical units that are labelled as positive:', list(overlap_percentage_pos)[1])
print('percentage of positive units that are labelled as metaphors:', list(overlap_percentage_pos)[0])

percentage of metaphorical units that are labelled as positive: 0.07861635220125786
percentage of positive units that are labelled as metaphors: 0.07496607869742199


In [10]:
overlap_percentage_neg = overlap_calculator(neg_appr_labels, met_labels).values()
print('percentage of metaphorical units that are labelled as negative:', list(overlap_percentage_neg)[1])
print('percentage of negative units that are labelled as metaphors:', list(overlap_percentage_neg)[0])

percentage of metaphorical units that are labelled as negative: 0.30542452830188677
percentage of negative units that are labelled as metaphors: 0.08239953632148377


## Miscellaneous

In [11]:
# checking which units contain both a positive and a negative label
for name, df in appraisal_dict.items():
    cleaned_df = df[df.label != '_'][['indices', 'text', 'label']].dropna()
    if len(cleaned_df) > 0:
        neg_df = cleaned_df[cleaned_df.label.str.contains('neg')]
        pos_neg = neg_df[neg_df.label.str.contains('pos')]
        if len(pos_neg) > 0:
            print(name)
            print(pos_neg)

hillary_5
  indices    text          label
7   39-45  damage  pos[2]|neg[3]
hillary_45
    indices        text          label
89  428-438  destroying  pos[6]|neg[7]
90  439-445      others  pos[6]|neg[7]
91  446-448          to  pos[6]|neg[7]
92  449-453        gain  pos[6]|neg[7]
93  454-459       power  pos[6]|neg[7]
94  460-464        over  pos[6]|neg[7]
95  465-471      others  pos[6]|neg[7]
hillary_80
  indices       text          label
8   44-53  illegally  pos[1]|neg[2]
budget_18
   indices  text          label
18   78-82  good  neg[2]|pos[3]
budget_22
    indices    text          label
24  134-136      in  neg[3]|pos[4]
25  137-143  charge  neg[3]|pos[4]
32  173-178   aware  neg[3]|pos[5]
budget_25
     indices     text           label
39   178-185  awesome   neg[4]|pos[5]
102  486-493  surplus  neg[9]|pos[11]
107  517-521     good  neg[9]|pos[12]
budget_34
  indices    text          label
8   50-56  fooled  pos[1]|neg[2]
daycare_1
    indices         text          label
30  15