# MIP/Appraisal Corpus Analysis

## Import Packages

In [1]:
import os
import pandas as pd
import re
from ast import literal_eval
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# blank English pipeline; only its vocabulary is used below
nlp = English()
# Tokenizer built from the vocab alone (no prefix/suffix/infix rules are passed in);
# the length table further down shows e.g. 'Ha-Ha-Ha!!!' counted as a single token
# NOTE(review): presumably this splits on whitespace only - confirm against the spaCy docs
tokenizer = Tokenizer(nlp.vocab)

  from .autonotebook import tqdm as notebook_tqdm


## Reading and Cleaning Data

### Metaphor Annotations

In [2]:
# borrowed from Jodie
# helper function to build a list of [start, end] index pairs, one pair per label,
# where labels is the string form of the list of labels associated with a specific text
def labels_to_list(labels):
  """Turn the string form of a label list into ``[start, end]`` pairs.

  labels: a string holding a Python literal list of dicts, each with
          'start' and 'end' entries
  returns one ``[start, end]`` list of ints per label, in input order
  """
  parsed_labels = literal_eval(labels)
  return [[int(entry['start']), int(entry['end'])] for entry in parsed_labels]

In [3]:
# read the EDITED metaphor annotation export (json) into a pandas dataframe
file_path = 'MIP-at-2025-06-11-edit.json'
met_df = pd.read_json(file_path)

# labels = the 'result' entry of the first annotation of every row
met_df['labels'] = [annotation[0]['result'] for annotation in met_df['annotations']]

# changing file names so they match the appraisal folder names:
# strip the leading part that ends in '-' and contains no '_', cut the last four
# characters (presumably the file extension), then remove the '_fixed' and '_NEW' markers
met_df['filename'] = [
    re.sub(r"_NEW", '', re.sub(r"_fixed", '', re.sub(r"^[^_]*-", '', upload)[:-4]))
    for upload in met_df['file_upload']
]

# checking for duplicated files
met_df.loc[met_df.duplicated(subset=['filename']), 'filename']

Series([], Name: filename, dtype: object)

In [4]:
# number of rows (one per file) in the metaphor dataframe
len(met_df['filename'])

1045

In [5]:
# creating a dictionary of metaphor labels: filename -> list of [start, end] pairs
met_labels = {}
for name in met_df['filename'].unique():
    # rows (filename and labels only) belonging to this one file
    new_df = met_df.loc[met_df['filename'] == name, ['filename', 'labels']].reset_index()
    # each label's 'value' dict goes through the helper, which yields its [start, end] pair;
    # only the labels of the first matching row are used
    labels_list = [labels_to_list(str([el['value']]))[0] for el in new_df['labels'][0]]
    # saving the list of labels to the dictionary
    met_labels[name] = labels_list

In [6]:
# number of files that have metaphor labels
len(met_labels)

1045

### Appraisal Annotations

In [7]:
def extractor(dic, col, txt):
    '''
    collects one [start, end] span per distinct matching label value, for every file

    dic: dictionary mapping a name to a dataframe that has an 'indices' column
         (strings such as '39-45') and the column named by col
    col: a string with the name of the label column to read
    txt: a string looked up in the label column with str.contains
    returns a dictionary with the same keys as dic, where each value is a list of
    [lowest start, highest end] lists (e.g., [[1,7],[9,15]]), one per distinct
    matching label value, in order of first appearance

    NOTE(review): rows are grouped by the exact label string, so a stacked value
    such as 'pos[2]|neg[3]' forms its own group - confirm this is intended.
    '''
    labels_dic = {}
    for name, df in dic.items():
        # '_' marks a row with no label; rows with missing values are dropped as well
        labelled = df.loc[df[col] != '_', ['indices', col]].dropna()
        spans = []
        # with nothing labelled the empty list is stored as is
        if len(labelled) > 0:
            # distinct label values that contain txt
            matching = labelled.loc[labelled[col].str.contains(txt), col].unique()
            for label_value in matching:
                # widen the span over every row carrying exactly this label value
                lowest, highest = 1000000000, -1
                for index_range in labelled.loc[labelled[col] == label_value, 'indices']:
                    bounds = [int(part) for part in index_range.split('-')]
                    if bounds[0] < lowest:
                        lowest = bounds[0]
                    if bounds[1] > highest:
                        highest = bounds[1]
                spans.append([lowest, highest])
        labels_dic[name] = spans
    return labels_dic

In [8]:
# creating a dictionary of appraisal annotation dataframes, where each key is a filename
appraisal_dict = {}

# list of file names in the metaphor annotations
filenames = list(met_df['filename'].unique())

# looping through each file name in the metaphor annotations
for folder_id in filenames:
    path = 'SOCC/annotated/Appraisal/Appraisal_annotations/curation/' + folder_id
    # the annotations sit in a folder named after the file, ending in '.txt' or '.tsv';
    # '.txt' is tried first
    for extension in ('.txt', '.tsv'):
        try:
            # taking the first entry listed in the folder
            filename = os.listdir(path + extension)[0]
            # reading and saving the annotations to a pandas dataframe
            df = pd.read_csv(path + extension + '/' + filename, sep='\t', header=None,
                             skiprows=6, names=['no.','indices','text','attitude','label','polarity'])
        except (OSError, IndexError):
            # folder missing (OSError) or empty (IndexError): try the next extension;
            # any other error (e.g. a malformed file) is raised instead of being
            # misreported as a missing annotation
            continue
        # saving the dataframe to the appraisal dictionary
        appraisal_dict[folder_id] = df
        break
    else:
        # reached only when neither folder produced a dataframe:
        # prints the name of any file for which there is no appraisal annotation
        print('DOES NOT EXIST:', folder_id)

DOES NOT EXIST: aboriginal_17


In [9]:
# dropping metaphor labels for files without appraisal annotations ('aboriginal_17');
# filtering instead of `del` keeps this cell safe to re-run, where `del` raised KeyError
met_labels = {name: spans for name, spans in met_labels.items() if name in appraisal_dict}

In [10]:
# number of files with appraisal annotations loaded
len(appraisal_dict)

1044

## Corpus Analysis

In [11]:
# creating a new dataframe based on the metaphor dataframe, with fewer columns,
# and removing the file that is not in the appraisal annotations;
# .copy() makes it an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning
df_analysis = met_df.loc[met_df.filename != 'aboriginal_17', ['filename', 'data']].copy()
# creating a column with the number of tokens the tokenizer produces for each text
df_analysis['length'] = df_analysis.apply(lambda row: len(tokenizer(row.data['text'])), axis=1)

In [12]:
# shortest comments: every row whose token count equals the minimum
df_analysis[df_analysis['length'] == df_analysis['length'].min()]

Unnamed: 0,filename,data,length
147,watch_30,{'text': 'Baloney.'},1
154,watch_37,{'text': 'Exactly!'},1
420,china_39,{'text': 'LOL...!'},1
676,trump_38,{'text': 'Ha-Ha-Ha!!!'},1


In [13]:
# longest comments: every row whose token count equals the maximum
df_analysis[df_analysis['length'] == df_analysis['length'].max()]

Unnamed: 0,filename,data,length
847,uber_94,{'text': 'Uber drivers have filed a class acti...,793


In [14]:
# statistics: comment count, token range, mean and total token count
n_comments = len(df_analysis)
total_tokens = sum(df_analysis.length)
summary = ('There are {} comments in the corpus. The comments range between {} and {} in number of tokens, with an average of {}. '
           'The total number of tokens in the corpus is {}.')
print(summary.format(n_comments, min(df_analysis.length), max(df_analysis.length),
                     total_tokens/n_comments, total_tokens))

There are 1044 comments in the corpus. The comments range between 1 and 793 in number of tokens, with an average of 62.15900383141762. The total number of tokens in the corpus is 64894.


# LIST OF FILENAMES FOR BOTH APPRAISAL AND LABEL STUDIO

In [15]:
# number of entries in the appraisal curation folder
path = 'SOCC/annotated/Appraisal/Appraisal_annotations/curation/'
len(os.listdir(path))

1043

In [16]:
# lower-cased metaphor file names, kept both as the index and as a column
lowered_names = met_df['filename'].str.lower()
met_files = pd.DataFrame({'filename': lowered_names, 'metaphor_files': lowered_names}).set_index('filename')

In [17]:
# lower-cased appraisal folder names with '.tsv' / '.txt' removed,
# kept both as the index and as a column
appraisal_names = [entry.replace('.tsv','').replace('.txt','').lower() for entry in os.listdir(path)]
appr_files = pd.DataFrame({'filename': appraisal_names, 'appraisal_files': appraisal_names}).set_index('filename')

In [18]:
# outer join on the shared 'filename' index, so a name missing on either side shows up as NaN;
# the index itself is discarded afterwards
filenames_df = met_files.join(appr_files, how='outer').reset_index(drop=True)
# missing names per column
filenames_df.isna().sum()

metaphor_files     0
appraisal_files    1
dtype: int64

In [19]:
# names present in the appraisal folder but absent from the metaphor annotations
set(appr_files['appraisal_files']).difference(met_files['metaphor_files'])

set()

In [20]:
# names present in the metaphor annotations but absent from the appraisal folder
set(met_files['metaphor_files']).difference(appr_files['appraisal_files'])

{'aboriginal_17'}

In [21]:
# filenames_df.to_csv('filenames.csv')

## Miscellaneous

In [22]:
# checking which units contain both a positive and a negative label
for name, df in appraisal_dict.items():
    cleaned_df = df.loc[df.label != '_', ['indices', 'text', 'label']].dropna()
    # files without any labelled row are skipped
    if len(cleaned_df) == 0:
        continue
    has_neg = cleaned_df.label.str.contains('neg')
    has_pos = cleaned_df.label.str.contains('pos')
    # rows whose label mentions both polarities
    pos_neg = cleaned_df[has_neg & has_pos]
    if len(pos_neg) > 0:
        print(name)
        print(pos_neg)

hillary_5
  indices    text          label
7   39-45  damage  pos[2]|neg[3]
hillary_45
    indices        text          label
89  428-438  destroying  pos[6]|neg[7]
90  439-445      others  pos[6]|neg[7]
91  446-448          to  pos[6]|neg[7]
92  449-453        gain  pos[6]|neg[7]
93  454-459       power  pos[6]|neg[7]
94  460-464        over  pos[6]|neg[7]
95  465-471      others  pos[6]|neg[7]
hillary_80
  indices       text          label
8   44-53  illegally  pos[1]|neg[2]
budget_18
   indices  text          label
18   78-82  good  neg[2]|pos[3]
budget_22
    indices    text          label
24  134-136      in  neg[3]|pos[4]
25  137-143  charge  neg[3]|pos[4]
32  173-178   aware  neg[3]|pos[5]
budget_25
     indices     text           label
39   178-185  awesome   neg[4]|pos[5]
102  486-493  surplus  neg[9]|pos[11]
107  517-521     good  neg[9]|pos[12]
budget_34
  indices    text          label
8   50-56  fooled  pos[1]|neg[2]
daycare_1
    indices         text          label
30  15