## Fuzzy Wuzzy Method

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz

llm_output_df = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

annotations_df = pd.read_csv("Annotations.csv")




In [2]:
#Simple Ratio: When both strings are expected to be in similar order and fully comparable.
#**Partial Ratio: When one string is a part of the other or has extra noise (e.g., extra words).
#**Token Sort Ratio: When the strings have the same words but may be in a different order.
#Token Set Ratio: When the strings have repeated or redundant words and you want to focus on unique words.

# try a search method and isolate highest similarity

In [3]:
columns_to_compare = ['subj', 'rel', 'obj']

#karrouri, 12 anti, fava 



low_similarity_threshold = 30
overall_similarity_scores = []
count=0

for idx in range(min(len(annotations_df), len(llm_output_df))):
    similarity_scores = []
    
    for col in columns_to_compare:
        handwritten = annotations_df.iloc[idx][col]
        llm = llm_output_df.iloc[idx][col]
        
        #partial ratio because the words should be similar but there can be extra words around 
        similarity_score = fuzz.partial_ratio(handwritten, llm)
        similarity_scores.append(similarity_score)
    

    average_similarity = sum(similarity_scores) / len(similarity_scores)
    overall_similarity_scores.append(average_similarity)
    

#     if average_similarity < low_similarity_threshold:
        

#         print(f"Row {idx} in annotations has low similarity (Avg score: {average_similarity}):")
#         print(annotations_df.iloc[idx])
#         print('---------------------------')
#         print(llm_output_df.iloc[idx])
#         count+=1
    if average_similarity > low_similarity_threshold:
        print(annotations_df.iloc[idx])
        print('---------------------------')
        print(llm_output_df.iloc[idx])
        count+=1
        print('===========================')
        
overall_similarity_score = sum(overall_similarity_scores) / len(overall_similarity_scores)
print(f"\nOverall Similarity Score: {overall_similarity_score}")
print(f"These are the mismatches: {count}")

subj           Depression
rel           Side effect
obj     Suicidal behavior
Name: 0, dtype: object
---------------------------
Unnamed: 0                                                    0
subj                                  Major Depressive Disorder
rel                                                  Definition
obj           a prevalent psychiatric disorder that often le...
Name: 0, dtype: object
subj     Depression
rel     Side effect
obj       Mortality
Name: 1, dtype: object
---------------------------
Unnamed: 0                                                    1
subj          Selective Serotonin Reuptake Inhibitor antidep...
rel                                        first-line treatment
obj                                   Major Depressive Disorder
Name: 1, dtype: object
subj                      Depression
rel                      Side effect
obj     Slower response to treatment
Name: 2, dtype: object
---------------------------
Unnamed: 0                              

## TF- IDF Vector and Cosine Method

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
#cosine method is good for when the length of the comparisons don't matter- the content is more emphasized

In [5]:
ground_truth = pd.read_csv("Annotations.csv")
llm_file =  pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

# turn the rows into a single string
ground_truth['combined'] = ground_truth.astype(str).agg(' '.join, axis=1)
llm_file['combined'] = llm_file.astype(str).agg(' '.join, axis=1)

# vectorize
vectorizer = TfidfVectorizer()
ground_truth_tfidf = vectorizer.fit_transform(ground_truth['combined'])
llm_tfidf = vectorizer.transform(llm_file['combined'])

# compares each row of the ground truth to each row of the llm file
similarity_matrix = cosine_similarity(ground_truth_tfidf, llm_tfidf)

# extract the row in the llm output with the highest similarity 
highest_sim = similarity_matrix.max(axis=1)
highest_sim_index = similarity_matrix.argmax(axis=1)

# llm_file = llm_file.reset_index(drop=True)
result = pd.DataFrame({
    'ground_truth_row': ground_truth['combined'],
    'most_similar_row': llm_file['combined'].iloc[highest_sim_index].apply(lambda x: x).values,
    'similarity_score': highest_sim
})


result.to_csv('similarity_results.csv', index=False)

print(result)
print(result['similarity_score'].mean())

                                      ground_truth_row  \
0             Depression Side effect Suicidal behavior   
1                     Depression Side effect Mortality   
2    Depression Side effect Slower response to trea...   
3     Pharmacotherapy Treats Acute phase of depression   
4                        Pharmacotherapy Entails SSRIs   
..                                                 ...   
171  Fluoxetine Side effects Generally acceptable s...   
172           Fluoxetine Moderate efficacy for Anxiety   
173  Fluoxetine High efficiency for  Obsessive-comp...   
174       Duloxetine Side effects Lower acceptability    
175  Duloxetine High efficiency for  Neuropathic pa...   

                                      most_similar_row  similarity_score  
0    5 All Depression Associated with high rates of...          0.531832  
1    5 All Depression Associated with high rates of...          0.351954  
2    6 Acute treatment duration for Major Depressio...          0.304288  
3  