# File ID Equivalencies Across MIP, CMT, and Appraisal

This notebook creates a dataframe that shows how the file names in MIP, Appraisal, the CMT Sample, and the CMT Project map to one another. 
It also generates source texts for all of the comments in the MIP project and saves them under their MIP names.

## Import Packages

In [1]:
import os
import pandas as pd
import numpy as np
import re
from thefuzz import fuzz

## Reading and Cleaning Data

### MIP Annotations

In [20]:
# Read the MIP metaphor-annotation export (JSON) into a dataframe
file_path = 'MIP-at-2025-08-08.json'
met_df = pd.read_json(file_path)

# labels = the 'result' entry of the first annotation on each row
met_df['labels'] = met_df['annotations'].map(lambda annotations: annotations[0]['result'])

# Turn each uploaded file name into a folder ID, one step at a time:
#   1. strip a leading run of non-underscore characters that ends in '-'
#   2. cut the last four characters, then remove any '_fixed' marker
#   3. remove any '_NEW' marker and lowercase what is left
upload_names = met_df['file_upload']
prefix_removed = upload_names.map(lambda upload: re.sub(r"^[^_]*-", '', upload))
fixed_removed = prefix_removed.map(lambda stem: re.sub(r"_fixed", '', stem[:-4]))
met_df['filename'] = fixed_removed.map(lambda stem: re.sub(r"_NEW", '', stem).lower())

# text = the value stored under the 'text' key of each row's data dict
met_df['text'] = met_df['data'].map(lambda payload: payload['text'])

# rows whose folder ID repeats an earlier row (an empty result means no duplicates)
met_df.loc[met_df.duplicated(subset=['filename']), 'filename']

Series([], Name: filename, dtype: object)

In [21]:
# number of rows in the MIP dataframe
len(met_df.filename)

1043

### CMT Sample Annotations

In [22]:
# Read the CMT Sample annotation export (JSON) into its own dataframe
cmt_file_path = 'CMT_July30.json'
cmt_met_df = pd.read_json(cmt_file_path)

# labels = the 'result' entry of the first annotation on each row
cmt_met_df['labels'] = cmt_met_df['annotations'].map(lambda annotations: annotations[0]['result'])

# Clean the uploaded file names in two steps:
#   1. strip a leading run of non-underscore characters that ends in '-'
#   2. cut the last four characters, then remove any '_fixed' marker
# (unlike the MIP names, these are not lowercased and '_NEW' is not removed)
cmt_upload_names = cmt_met_df['file_upload']
cmt_prefix_removed = cmt_upload_names.map(lambda upload: re.sub(r"^[^_]*-", '', upload))
cmt_met_df['filename'] = cmt_prefix_removed.map(lambda stem: re.sub(r"_fixed", '', stem[:-4]))

# text = the value stored under the 'text' key of each row's data dict
cmt_met_df['text'] = cmt_met_df['data'].map(lambda payload: payload['text'])

### Appraisal Annotations

In [23]:
# Each entry of the curation directory is expected to be a folder holding CURATION_USER.tsv
curation_root = 'SOCC/annotated/Appraisal/Appraisal_annotations/curation'
appraisal_dir = os.listdir(curation_root)
# appraisal_files = [file.replace('.txt','').replace('.tsv','') for file in appraisal_dir]

# folder name (with '.txt' / '.tsv' removed) -> source text rebuilt from the annotation file
appraisal_text_dict = {}
for folder in appraisal_dir:
    tsv_path = f"{curation_root}/{folder}/CURATION_USER.tsv"
    with open(tsv_path, 'r', encoding="utf8") as tsv_file:
        # keep only the lines that start with '#Text'
        text_lines = [row for row in tsv_file if row.startswith('#Text')]
    joined_text = ''.join(text_lines)
    entry_name = folder.replace('.txt', '').replace('.tsv', '')
    # every '#Text=' marker is swapped for a double-quote character
    appraisal_text_dict[entry_name] = joined_text.replace('#Text=', '"')


In [24]:
# number of entries listed in the Appraisal curation directory
len(appraisal_dir)

1043

## MIP/Appraisal Mapping

In [25]:
# Build a dictionary where the MIP name is the key and the Appraisal name is the value.
# A pair counts as "the same comment" when the fuzzy similarity of the two texts
# reaches MATCH_THRESHOLD.
MATCH_THRESHOLD = 80  # minimum fuzz.ratio score required for a match

appr_mip_corresp = {}
mip_filenames = met_df.filename.unique()

# Look up each MIP text once (first row per file name) instead of re-filtering
# the whole dataframe on every loop iteration.
mip_text_by_name = {}
for mip_name, mip_text in zip(met_df['filename'], met_df['text']):
    mip_text_by_name.setdefault(mip_name, str(mip_text))

for name in mip_filenames:
    # text corresponding to the given name in MIP
    met_text = mip_text_by_name[name]

    # First guess: the Appraisal entry with the identical name. A missing entry
    # is the only expected failure, so it is handled explicitly; a bare `except`
    # here would also hide genuine errors raised while scoring.
    appr_text = appraisal_text_dict.get(name)
    sim_score = fuzz.ratio(met_text, appr_text) if appr_text is not None else 0

    if sim_score >= MATCH_THRESHOLD:
        # same name, same text: it's a match
        appr_mip_corresp[name] = name
        continue

    # Otherwise look for the corresponding name among the Appraisal entries of
    # the same topic that have not been matched yet.
    topic = re.sub(r"_\d+", '', name)
    already_matched = set(appr_mip_corresp.values())
    cand_dict = {}
    # iterate the dictionary directly (not a set) so the diagnostics below
    # come out in a stable order
    for candidate, cand_text in appraisal_text_dict.items():
        if topic not in candidate or candidate in already_matched:
            continue
        sim_score_cand = fuzz.ratio(met_text, cand_text)
        if sim_score_cand >= MATCH_THRESHOLD:
            # candidate name -> its similarity score
            cand_dict[candidate] = sim_score_cand

    if len(cand_dict) == 1:
        # exactly one suitable candidate: it's a match
        appr_mip_corresp[name] = next(iter(cand_dict))
    else:
        # zero or several candidates: report the name for manual resolution
        print(name)
        print(cand_dict)

In [34]:
# # duplicate files in MIP
# d = appr_mip_corresp
# flipped = {}

# for key, value in d.items():
#     if value not in flipped:
#         flipped[value] = [key]
#     else:
#         flipped[value].append(key)
# [(value, key) for key, value in flipped.items() if len(value) > 1]

In [33]:
# # files in MIP with no equivalent in Appraisal
# set(met_df.filename.unique())-set(appr_mip_corresp.keys())

In [31]:
# # files in Appraisal with no equivalent in MIP
# appraisal_files = [file.replace('.txt','').replace('.tsv','') for file in appraisal_dir]
# set(appraisal_files)-set(appr_mip_corresp.values())

In [30]:
# # prints the comments in Appraisal with no equivalent in MIP
# for x in set(appraisal_files)-set(appr_mip_corresp.values()):
#     print(x)
#     print(appraisal_text_dict[x])

In [32]:
# # names in Appraisal but not MIP
# set(appraisal_files)-set(mip_filenames)

In [43]:
# two-column table: one row per (MIP name, Appraisal name) pair found above
mip_appraisal_df = pd.DataFrame(
    list(appr_mip_corresp.items()),
    columns=['MIP', 'Appraisal'],
)

## MIP/CMT Mapping

In [45]:
# Build a dictionary where the MIP name is the key and the CMT Sample name is the value.
# A pair counts as "the same comment" when the fuzzy similarity of the two texts
# reaches MATCH_THRESHOLD.
MATCH_THRESHOLD = 80  # minimum fuzz.ratio score required for a match

cmt_mip_corresp = {}

# Look up each text once (first row per file name) instead of re-filtering the
# dataframes on every loop iteration.
mip_text_by_name = {}
for mip_name, mip_text in zip(met_df['filename'], met_df['text']):
    mip_text_by_name.setdefault(mip_name, mip_text)
cmt_text_by_name = {}
for cmt_name, cmt_text in zip(cmt_met_df['filename'], cmt_met_df['text']):
    cmt_text_by_name.setdefault(cmt_name, cmt_text)

for name in cmt_met_df.filename.unique():
    cmt_text = cmt_text_by_name[name]

    # First guess: the MIP file with the identical name. If MIP has no such
    # name the score is 0 and the topic search below runs; indexing the empty
    # selection used to raise here, so the fallback was unreachable.
    # NOTE(review): MIP names are lowercased and stripped of '_NEW' when loaded,
    # CMT names are not — confirm the two naming schemes really line up.
    mip_text = mip_text_by_name.get(name)
    sim_score = fuzz.ratio(mip_text, cmt_text) if mip_text is not None else 0

    if sim_score >= MATCH_THRESHOLD:
        cmt_mip_corresp[name] = name
        continue

    # Otherwise search the MIP files of the same topic that are still unmatched.
    topic = re.sub(r"_\d+", '', name)
    cand_dict = {}
    for candidate, cand_text in mip_text_by_name.items():
        # Candidates are MIP names, and MIP names are the KEYS of
        # cmt_mip_corresp, so "already matched" must be tested against the
        # keys (the values are CMT Sample names).
        if topic not in candidate or candidate in cmt_mip_corresp:
            continue
        sim_score_cand = fuzz.ratio(cmt_text, cand_text)
        if sim_score_cand >= MATCH_THRESHOLD:
            # candidate name -> its similarity score
            cand_dict[candidate] = sim_score_cand

    if len(cand_dict) == 1:
        # exactly one suitable candidate: it's a match
        cmt_mip_corresp[next(iter(cand_dict))] = name
    else:
        # zero or several candidates: report the name for manual resolution
        print(name)
        print(cand_dict)

In [46]:
# two-column table: one row per (MIP name, CMT Sample name) pair found above
mip_cmt_df = pd.DataFrame(
    list(cmt_mip_corresp.items()),
    columns=['MIP', 'CMT Sample'],
)

## Central DataFrame

In [47]:
# Outer-join the two mapping tables on the MIP name, keeping rows from both sides
mapping_df = mip_appraisal_df.merge(mip_cmt_df, on='MIP', how='outer')
# CMT Project: the MIP name for rows with no CMT Sample entry, missing (NaN) for rows that are in the sample
not_in_sample = mapping_df['CMT Sample'].isna()
mapping_df['CMT Project'] = mapping_df['MIP'].where(not_in_sample)

In [48]:
# preview the combined MIP / Appraisal / CMT Sample / CMT Project mapping table
mapping_df

Unnamed: 0,MIP,Appraisal,CMT Sample,CMT Project
0,aboriginal_1,aboriginal_1,aboriginal_1,
1,aboriginal_10,aboriginal_10,,aboriginal_10
2,aboriginal_11,aboriginal_11,,aboriginal_11
3,aboriginal_12,aboriginal_12,,aboriginal_12
4,aboriginal_13,aboriginal_13,,aboriginal_13
...,...,...,...,...
1038,watch_91,watch_91,,watch_91
1039,watch_92,watch_92,,watch_92
1040,watch_93,watch_93,,watch_93
1041,watch_95,watch_95,,watch_95


In [49]:
# write the mapping table to disk; the row index is kept as the first CSV column
mapping_df.to_csv('mapping_spreadsheet.csv', index=True)

## Generating New Source Comments (based on what is in MIP)

In [62]:
# folder path to save the files in for the CMT Project
path_proj = 'SOURCES_FROM_MIP/CMT_PROJECT/'
# create the folder if it is missing, so a fresh checkout does not fail on open()
os.makedirs(path_proj, exist_ok=True)

# typographic characters and the replacement written to disk, applied in this order
# NOTE(review): the en dash is turned into an em dash rather than a hyphen — confirm this is intended
char_replacements = [('“', '"'), ('”', '"'), ('‘', "'"), ('’', "'"), ('…', '...'), ('–', '—')]

# first text per MIP file name, built once instead of re-filtering met_df for every file
mip_text_by_name = {}
for mip_name, mip_text in zip(met_df['filename'], met_df['text']):
    mip_text_by_name.setdefault(mip_name, mip_text)

# all the MIP comments minus the ones already annotated in the CMT Sample
new_cmt = mapping_df['CMT Project'].dropna().unique()
for name in new_cmt:
    text = mip_text_by_name[name]
    for fancy, plain in char_replacements:
        text = text.replace(fancy, plain)
    # saving to a .txt file with the corresponding name in MIP
    with open(path_proj + name + '.txt', "w", encoding="utf-8") as text_file:
        text_file.write(text)

In [65]:
# folder path to save the files in for the CMT Sample
path_sample = 'SOURCES_FROM_MIP/CMT_SAMPLE/'
# create the folder if it is missing, so a fresh checkout does not fail on open()
os.makedirs(path_sample, exist_ok=True)

# typographic characters and the replacement written to disk, applied in this order
# NOTE(review): the en dash is turned into an em dash rather than a hyphen — confirm this is intended
char_replacements = [('“', '"'), ('”', '"'), ('‘', "'"), ('’', "'"), ('…', '...'), ('–', '—')]

# first text per MIP file name, built once instead of re-filtering met_df for every file
mip_text_by_name = {}
for mip_name, mip_text in zip(met_df['filename'], met_df['text']):
    mip_text_by_name.setdefault(mip_name, mip_text)

# the MIP comments that were already annotated in the CMT Sample
sample_cmt = mapping_df.dropna(axis=0, subset=['CMT Sample']).MIP.unique()
for name in sample_cmt:
    text = mip_text_by_name[name]
    for fancy, plain in char_replacements:
        text = text.replace(fancy, plain)
    # saving to a .txt file with the corresponding name in MIP
    with open(path_sample + name + '.txt', "w", encoding="utf-8") as text_file:
        text_file.write(text)