# MIP vs CMT

## Import Packages

In [1]:
import pandas as pd
import re
from ast import literal_eval

## Reading and Cleaning Data

### MIP Annotations

In [2]:
# borrowed from Jodie
def labels_to_list(labels):
  """Collect the [start, end] offsets of every label attached to one text.

  Parameters
  ----------
  labels : str or iterable of dict
      Either the string representation of a list of label dicts (it is
      parsed with ``literal_eval``) or the already-parsed list itself.
      Every dict must provide 'start' and 'end' entries that ``int()``
      accepts; any other keys are ignored.

  Returns
  -------
  list of [int, int]
      One ``[start, end]`` pair per label, in input order. The span length
      is NOT included; callers compute ``end - start`` themselves.

  Raises
  ------
  ValueError, SyntaxError
      From ``literal_eval`` when a string argument is not a valid literal.
  KeyError
      When a label has no 'start' or 'end' entry.
  """
  # only strings need parsing; an already-parsed list is used as is, so callers
  # do not have to round-trip their data through str()
  if isinstance(labels, str):
    labels = literal_eval(labels)

  return [[int(label['start']), int(label['end'])] for label in labels]

In [3]:
# load in MIP annotation json and save it to a pandas dataframe
mip_file_path = 'MIP-at-2025-08-08.json'
mip_met_df = pd.read_json(mip_file_path)

# extracting labels from the annotations column
# (only the first annotation of each task is used)
mip_met_df['labels'] = mip_met_df['annotations'].map(lambda annotations: annotations[0]['result'])

# fixing file names, in this order:
#   1. drop the upload prefix: everything up to the last '-' that occurs
#      before the first '_'
#   2. drop the last 4 characters -- presumably a '.txt'-style extension; TODO confirm
#   3. lowercase the name
#   4. drop the '_fixed' and '_new' markers
# The markers are removed AFTER lowercasing, so they are matched in lower case.
# The previous version looked for an upper-case "_NEW" in a name that had
# already been lower-cased, so that marker was never actually removed.
# NOTE(review): if both 'x' and 'x_new' were uploaded they now share the
# filename 'x' -- confirm that is the intended outcome.
mip_met_df['filename'] = (
    mip_met_df['file_upload']
    .str.replace(r"^[^_]*-", '', regex=True)
    .str[:-4]
    .str.lower()
    .str.replace('_fixed', '', regex=False)
    .str.replace('_new', '', regex=False)
)

# adding text column
mip_met_df['text'] = mip_met_df['data'].map(lambda data: data['text'])

### CMT Annotations

In [4]:
# read the CMT annotation export (json) into a pandas dataframe
cmt_file_path = 'CMT_July30.json'
cmt_met_df = pd.read_json(cmt_file_path)

# labels = the 'result' entry of the first annotation of every task
cmt_met_df['labels'] = cmt_met_df['annotations'].map(lambda anns: anns[0]['result'])

# file name clean-up, done in one pass per upload name:
#   - inner re.sub: drop the upload prefix (everything up to the last '-'
#     that occurs before the first '_')
#   - [:-4]: drop the last 4 characters
#   - outer re.sub: drop every '_fixed' marker
cmt_met_df['filename'] = cmt_met_df['file_upload'].map(
    lambda upload: re.sub(r"_fixed", '', re.sub(r"^[^_]*-", '', upload)[:-4])
)

# text of each comment, pulled out of the 'data' column
cmt_met_df['text'] = cmt_met_df['data'].map(lambda task_data: task_data['text'])

## File Name Comparison

In [5]:
# checking to see which comments do not match, based on their filenames

# one lookup table per layer: filename -> text of the FIRST row stored under
# that filename (built once instead of re-filtering the dataframes in the loop)
mip_text_by_name = mip_met_df.drop_duplicates('filename').set_index('filename')['text'].to_dict()
cmt_text_by_name = cmt_met_df.drop_duplicates('filename').set_index('filename')['text'].to_dict()

cnt = 0
for name, cmt_text in cmt_text_by_name.items():
    # a CMT file without any MIP counterpart is reported (and counted as a
    # mismatch) instead of aborting the whole check with a KeyError
    if name not in mip_text_by_name:
        print('missing in MIP:', name)
        cnt += 1
        continue

    mip_text = mip_text_by_name[name]
    # checks whether the last 10 characters match
    if mip_text[-10:] != cmt_text[-10:]:
        # if not, prints the file name in CMT and the text associated with it in both MIP and CMT
        print('no match:', name)
        print(mip_text)
        print(cmt_text)
        cnt += 1
# prints the number of files that do not match
print(cnt)

no match: aboriginal_45
Mr. Hughes draws a connection?  More like states the obvious.Which is why an inquiry is not needed.  There is nothing to figure out.  First Nations women are disproportionately victims of violence because they disproportionately live in poverty and poor housing with parents who are uneducated and unemployed not to mention alcoholics and drug addicts largely incapable of looking after themselves let alone children.  
Mr. Hughes draws a connection? More like states the obvious. Which is why an inquiry is not needed. There is nothing to figure out. First Nations women are disproportionately victims of violence because they disproportionately live in poverty and poor housing with parents who are uneducated and unemployed not to mention alcoholics and drug addicts largely incapable of looking after themselves let alone children.
no match: aboriginal_56
Time for the elders and chiefs to stand up to the plate and take a leadership role! 
Time for the elders and chiefs 

In [6]:
# all names up to watch are added to the list of CMT comments that are identical in MIP (based on manual checking)
# NOTE: the cut-off is a hand-counted position in the CMT file order, so it has
# to be re-checked whenever the CMT export changes.
N_MANUALLY_VERIFIED = 125
mip_names = cmt_met_df['filename'].unique().tolist()[:N_MANUALLY_VERIFIED]

# creating new dataframes that only contain the watch comments
cmt_watch_df = cmt_met_df[cmt_met_df['filename'].str.contains('watch')]
mip_watch_df = mip_met_df[mip_met_df['filename'].str.contains('watch')]

# filename -> text of the first row stored under that filename, per layer
cmt_watch_text = cmt_watch_df.drop_duplicates('filename').set_index('filename')['text'].to_dict()
mip_watch_text = mip_watch_df.drop_duplicates('filename').set_index('filename')['text'].to_dict()

for name, cmt_text in cmt_watch_text.items():
    # text stored in MIP under the same filename (None when MIP has no such file)
    mip_text = mip_watch_text.get(name)

    # checking that the final 10 characters match
    if mip_text is not None and mip_text[-10:] == cmt_text[-10:]:
        # if yes, the name is added to the list
        mip_names.append(name)
        continue

    # if not, we search for other comments that match (corresponding final 30 characters).
    # regex=False: the tail is raw comment text, so characters such as '?', '('
    # or '$' must be compared literally rather than interpreted as a pattern.
    tail_hits = mip_watch_df['text'].str.contains(cmt_text[-30:], regex=False, na=False)
    mip_matches = mip_watch_df.loc[tail_hits, 'filename'].unique()

    # if there is only one match, we add that filename to the list of names
    if len(mip_matches) == 1:
        mip_names.append(mip_matches[0])
    # if there is no match, or more than one match, we print out the relevant information
    else:
        print('no match:', name)
        print(cmt_text)
        print(mip_text)
        print(mip_matches)

In [7]:
# number of entries collected in mip_names; this counts list entries, so a
# filename that was appended more than once is counted more than once
len(mip_names)

150

## Analysis of Reduced MIP Corpus

In [8]:
# creating a reduced MIP dataframe with only the comments in the CMT layer
mip_reduced = mip_met_df[mip_met_df['filename'].isin(mip_names)]

# creating a dictionary of metaphor labels: each key is a filename, each value
# is the list of [start, end] offsets of that file's labels.
# Only the first row stored under a filename is used. The offsets are read
# straight from each result's 'value' dict, with no str()/literal_eval
# round trip.
first_row_labels = mip_reduced.drop_duplicates('filename').set_index('filename')['labels']
met_labels = {
    name: [[int(el['value']['start']), int(el['value']['end'])] for el in results]
    for name, results in first_row_labels.items()
}

# creating a list to contain the length (end - start) of each label
labels_diff = [end - start for spans in met_labels.values() for start, end in spans]

n_labels = len(labels_diff)
if n_labels:
    print('There are {} labels in the MIP annotation layer, and the average span of each label is {} characters.'.format(n_labels,
                                                                                                                        sum(labels_diff)/n_labels))
else:
    # nothing to average: avoid a ZeroDivisionError on an empty selection
    print('There are no labels in the MIP annotation layer.')

There are 429 labels in the MIP annotation layer, and the average span of each label is 6.384615384615385 characters.
