## Fuzzy Wuzzy Method

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz

llm_output_df = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

annotations_df = pd.read_csv("Annotations.csv")




In [2]:
#Simple Ratio: When both strings are expected to be in similar order and fully comparable.
#**Partial Ratio: When one string is a part of the other or has extra noise (e.g., extra words).
#**Token Sort Ratio: When the strings have the same words but may be in a different order.
#Token Set Ratio: When the strings have repeated or redundant words and you want to focus on unique words.

# try a search method and isolate highest similarity

In [3]:
columns_to_compare = ['subj', 'rel', 'obj']

#karrouri, 12 anti, fava 



low_similarity_threshold = 30
overall_similarity_scores = []
count=0

for idx in range(min(len(annotations_df), len(llm_output_df))):
    similarity_scores = []
    
    for col in columns_to_compare:
        handwritten = annotations_df.iloc[idx][col]
        llm = llm_output_df.iloc[idx][col]
        
        #partial ratio because the words should be similar but there can be extra words around 
        similarity_score = fuzz.partial_ratio(handwritten, llm)
        similarity_scores.append(similarity_score)
    

    average_similarity = sum(similarity_scores) / len(similarity_scores)
    overall_similarity_scores.append(average_similarity)
    

#     if average_similarity < low_similarity_threshold:
        

#         print(f"Row {idx} in annotations has low similarity (Avg score: {average_similarity}):")
#         print(annotations_df.iloc[idx])
#         print('---------------------------')
#         print(llm_output_df.iloc[idx])
#         count+=1
    if average_similarity > low_similarity_threshold:
        print(annotations_df.iloc[idx])
        print('---------------------------')
        print(llm_output_df.iloc[idx])
        count+=1
        print('===========================')
        
overall_similarity_score = sum(overall_similarity_scores) / len(overall_similarity_scores)
print(f"\nOverall Similarity Score: {overall_similarity_score}")
print(f"These are the mismatches: {count}")

subj           Depression
rel           Side effect
obj     Suicidal behavior
Name: 0, dtype: object
---------------------------
Unnamed: 0                                                    0
subj                                  Major Depressive Disorder
rel                                                  Definition
obj           a prevalent psychiatric disorder that often le...
Name: 0, dtype: object
subj     Depression
rel     Side effect
obj       Mortality
Name: 1, dtype: object
---------------------------
Unnamed: 0                                                    1
subj          Selective Serotonin Reuptake Inhibitor antidep...
rel                                        first-line treatment
obj                                   Major Depressive Disorder
Name: 1, dtype: object
subj                      Depression
rel                      Side effect
obj     Slower response to treatment
Name: 2, dtype: object
---------------------------
Unnamed: 0                              

## TF- IDF Vector and Cosine Method

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
#cosine method is good for when the length of the comparisons don't matter- the content is more emphasized

In [5]:
ground_truth = pd.read_csv("Annotations.csv")
llm_file =  pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

# turn the rows into a single string
ground_truth['combined'] = ground_truth.astype(str).agg(' '.join, axis=1)
llm_file['combined'] = llm_file.astype(str).agg(' '.join, axis=1)

# vectorize
vectorizer = TfidfVectorizer()
ground_truth_tfidf = vectorizer.fit_transform(ground_truth['combined'])
llm_tfidf = vectorizer.transform(llm_file['combined'])

# compares each row of the ground truth to each row of the llm file
similarity_matrix = cosine_similarity(ground_truth_tfidf, llm_tfidf)

# extract the row in the llm output with the highest similarity 
highest_sim = similarity_matrix.max(axis=1)
highest_sim_index = similarity_matrix.argmax(axis=1)

# llm_file = llm_file.reset_index(drop=True)
result = pd.DataFrame({
    'ground_truth_row': ground_truth['combined'],
    'most_similar_row': llm_file['combined'].iloc[highest_sim_index].apply(lambda x: x).values,
    'similarity_score': highest_sim
})


result.to_csv('similarity_results.csv', index=False)

print(result)
print(result['similarity_score'].mean())

                                      ground_truth_row  \
0             Depression Side effect Suicidal behavior   
1                     Depression Side effect Mortality   
2    Depression Side effect Slower response to trea...   
3     Pharmacotherapy Treats Acute phase of depression   
4                        Pharmacotherapy Entails SSRIs   
..                                                 ...   
171  Fluoxetine Side effects Generally acceptable s...   
172           Fluoxetine Moderate efficacy for Anxiety   
173  Fluoxetine High efficiency for  Obsessive-comp...   
174       Duloxetine Side effects Lower acceptability    
175  Duloxetine High efficiency for  Neuropathic pa...   

                                      most_similar_row  similarity_score  
0    5 All Depression Associated with high rates of...          0.531832  
1    5 All Depression Associated with high rates of...          0.351954  
2    6 Acute treatment duration for Major Depressio...          0.304288  
3  

## GPT Critic

In [13]:
import concurrent.futures
import re
import openai
import pandas as pd

openai.api_key = ''

# Load data
llm_output = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")
ground_truth = pd.read_csv("Annotations.csv")

# Score extraction with regex
def extract_score(evaluation):
    match = re.search(r'(?:Overall correctness score|Score|I would score the LLM output a)[:\s]*([0-1](?:\.\d+)?)', evaluation, re.IGNORECASE)
    
    if match:
        return float(match.group(1))
    else:
        partial_match = re.search(r'correctness.*?([0-1](?:\.\d+)?)', evaluation, re.IGNORECASE)
        if partial_match:
            return float(partial_match.group(1))
        
        print(f"Failed to extract score from evaluation: {evaluation}")
        return -1

# Relevance check
def is_relevant(ground_truth, llm_output):
    gt_terms = set(ground_truth.lower().split())
    llm_terms = set(llm_output.lower().split())
    relevance = gt_terms.intersection(llm_terms)
    return len(relevance) > 1

# Create prompt for OpenAI GPT model
def create_prompt(ground_truth_row, llm_output_row):
    prompt = f"""
    I want you to compare the following statements.

    Ground Truth: {ground_truth_row}
    
    LLM Output: {llm_output_row}
    
    Based on the ground truth, evaluate the correctness of the LLM output in terms of:
    1. Factual correctness: Are the facts in the LLM output correct compared to the ground truth?
    2. Omissions: Does the LLM output miss any key details present in the ground truth?
    3. Contradictions: Does the LLM output contradict any information in the ground truth?

    You are required to provide a score from 0 to 1 for overall correctness (1 being highly correct), and include a short explanation of the result.
    """
    return prompt

# Get evaluation from GPT model
def get_critic_gpt_evaluation(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return None

# Evaluate a single ground truth row against LLM output
def evaluate_row(gt_row, llm_output):
    best_evaluation = None
    best_llm_row = None
    best_score = -1
    irrelevant_flag = False

    for j in range(len(llm_output)):
        llm_subj = llm_output.loc[j, 'subj']
        llm_rel = llm_output.loc[j, 'rel']
        llm_obj = llm_output.loc[j, 'obj']
        llm_row = f"{llm_subj} {llm_rel} {llm_obj}"
        
        if not is_relevant(gt_row, llm_row):
            irrelevant_flag = True
            continue     

        prompt = create_prompt(gt_row, llm_row)
        evaluation = get_critic_gpt_evaluation(prompt)

        if evaluation:
            score = extract_score(evaluation)
            if score == -1:
                print(f"Score extraction failed for Ground Truth: '{gt_row}' and LLM Output: '{llm_row}'\nFull Evaluation: {evaluation}")
            else:
                print(f"Evaluation for Ground Truth: '{gt_row}' and LLM Output: '{llm_row}' resulted in score: {score}")

            if score > best_score:
                best_score = score
                best_evaluation = evaluation
                best_llm_row = llm_row
    
    if best_llm_row and best_evaluation:
        return {
            'ground_truth': gt_row,
            'best_llm_output': best_llm_row,
            'best_evaluation': best_evaluation
        }
    elif irrelevant_flag:
        print(f"No relevant match for Ground Truth: '{gt_row}'")
        return None

# Parallel processing of ground truth rows
best_matches = []
ground_truth_rows = []

for i in range(len(ground_truth)):
    gt_subj = ground_truth.loc[i, 'subj']
    gt_rel = ground_truth.loc[i, 'rel']
    gt_obj = ground_truth.loc[i, 'obj']
    gt_row = f"{gt_subj} {gt_rel} {gt_obj}"
    ground_truth_rows.append(gt_row)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to_row = {executor.submit(evaluate_row, gt_row, llm_output): gt_row for gt_row in ground_truth_rows}

    for future in concurrent.futures.as_completed(future_to_row):
        try:
            result = future.result()
            if result:
                best_matches.append(result)
        except Exception as e:
            print(f"Error processing a row: {e}")

# Print results
print('----------------------')
for match in best_matches:
    print(f"Ground Truth: {match['ground_truth']}")
    print(f"Best LLM Output: {match['best_llm_output']}")
    print(f"Best Evaluation: {match['best_evaluation']}")
    print('----------------------')

# Save results to DataFrame and optionally CSV
results_df = pd.DataFrame(best_matches)
print(results_df.head(10))
# results_df.to_csv('comparison_results.csv', index=False)


No relevant match for Ground Truth: 'Pharmacotherapy Entails SSRIs'
No relevant match for Ground Truth: 'Pharmacotherapy Entails TCAs'
No relevant match for Ground Truth: 'Pharmacotherapy Entails Ketamine '
Evaluation for Ground Truth: 'Monoamine oxidase inhibitors Same efficacy as Tricyclic antidepressants' and LLM Output: 'Monoamine Oxidase Inhibitors Less commonly used Major Depressive Disorder' resulted in score: 0.5
Evaluation for Ground Truth: 'Depression Side effect Suicidal behavior' and LLM Output: 'Ketamine Properties very quick effects on resistant unipolar depression and acute suicidal ideation' resulted in score: 0.8
Evaluation for Ground Truth: 'Depression Side effect Slower response to treatment' and LLM Output: 'All Depression Definition a prevalent psychiatric disorder that often leads to poor quality of life and impaired functioning' resulted in score: 0.5
Evaluation for Ground Truth: 'Depression Side effect Mortality' and LLM Output: 'All Depression Associated with h

## RAGA evaluation

### Optimized GPT Critic parallel batch

In [12]:
import concurrent.futures
import re
import openai
import pandas as pd

openai.api_key = ''

llm_output = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")
ground_truth = pd.read_csv("Annotations.csv")

def extract_raga_scores(evaluation):
    """
    Extract scores for RAGA components: Retrieval, Augmentation, Generation, Attribution.
    """
    retrieval = re.search(r'Retrieval:\s*([0-1](?:\.\d+)?)', evaluation)
    augmentation = re.search(r'Augmentation:\s*([0-1](?:\.\d+)?)', evaluation)
    generation = re.search(r'Generation:\s*([0-1](?:\.\d+)?)', evaluation)
    attribution = re.search(r'Attribution:\s*([0-1](?:\.\d+)?)', evaluation)

    if retrieval and augmentation and generation and attribution:
        return {
            'retrieval': float(retrieval.group(1)),
            'augmentation': float(augmentation.group(1)),
            'generation': float(generation.group(1)),
            'attribution': float(attribution.group(1))
        }
    return None

def is_relevant(ground_truth, llm_output):
    gt_terms = set(ground_truth.lower().split())
    llm_terms = set(llm_output.lower().split())
    relevance = gt_terms.intersection(llm_terms)
    return len(relevance) > 1

def create_raga_prompt(ground_truth_row, llm_output_row):
    prompt = f"""
    I want you to compare the following statements.

    Ground Truth: {ground_truth_row}
    
    LLM Output: {llm_output_row}
    
    Based on the ground truth, evaluate the correctness of the LLM output in terms of:
    1. **Retrieval**: How well the LLM output retrieves information relevant to the ground truth.
    2. **Augmentation**: How well the LLM output synthesizes the retrieved information to form a coherent response.
    3. **Generation**: Is the LLM output fluent and grammatically correct?
    4. **Attribution**: Can the LLM output be directly attributed to the ground truth, without unsupported claims?

    You are required to provide a score from 0 to 1 for each component (1 being the best), and include a short explanation of the result.
    """
    return prompt

def get_critic_gpt_evaluation(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return None

def evaluate_row_raga(gt_row, llm_output):
    best_evaluation = None
    best_llm_row = None
    best_scores = {
        'retrieval': -1,
        'augmentation': -1,
        'generation': -1,
        'attribution': -1
    }

    for j in range(len(llm_output)):
        llm_subj = llm_output.loc[j, 'subj']
        llm_rel = llm_output.loc[j, 'rel']
        llm_obj = llm_output.loc[j, 'obj']
        llm_row = f"{llm_subj} {llm_rel} {llm_obj}"
        
        if not is_relevant(gt_row, llm_row):
            continue

        prompt = create_raga_prompt(gt_row, llm_row)
        evaluation = get_critic_gpt_evaluation(prompt)

        if evaluation:
            raga_scores = extract_raga_scores(evaluation)
            if raga_scores:
                print(f"Evaluation for Ground Truth: '{gt_row}' and LLM Output: '{llm_row}' resulted in RAGA scores: {raga_scores}")

                # Check if this evaluation has better RAGA scores
                if sum(raga_scores.values()) > sum(best_scores.values()):
                    best_scores = raga_scores
                    best_evaluation = evaluation
                    best_llm_row = llm_row

    if best_llm_row and best_evaluation:
        return {
            'ground_truth': gt_row,
            'best_llm_output': best_llm_row,
            'best_evaluation': best_evaluation,
            **best_scores
        }
    return None

# Parallel processing of ground truth rows using RAGA
best_matches = []
ground_truth_rows = []

for i in range(len(ground_truth)):
    gt_subj = ground_truth.loc[i, 'subj']
    gt_rel = ground_truth.loc[i, 'rel']
    gt_obj = ground_truth.loc[i, 'obj']
    gt_row = f"{gt_subj} {gt_rel} {gt_obj}"
    ground_truth_rows.append(gt_row)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to_row = {executor.submit(evaluate_row_raga, gt_row, llm_output): gt_row for gt_row in ground_truth_rows}

    for future in concurrent.futures.as_completed(future_to_row):
        try:
            result = future.result()
            if result:
                best_matches.append(result)
        except Exception as e:
            print(f"Error processing a row: {e}")

# Print RAGA results
print('----------------------')
for match in best_matches:
    print(f"Ground Truth: {match['ground_truth']}")
    print(f"Best LLM Output: {match['best_llm_output']}")
    print(f"Best Evaluation: {match['best_evaluation']}")
    print(f"Retrieval Score: {match['retrieval']}, Augmentation Score: {match['augmentation']}, Generation Score: {match['generation']}, Attribution Score: {match['attribution']}")
    print('----------------------')

# Save results to DataFrame and optionally CSV
results_df = pd.DataFrame(best_matches)
print(results_df.head(10))
# results_df.to_csv('comparison_raga_results.csv', index=False)

Evaluation for Ground Truth: 'Depression Side effect Suicidal behavior' and LLM Output: 'Ketamine Properties very quick effects on resistant unipolar depression and acute suicidal ideation' resulted in RAGA scores: {'retrieval': 0.75, 'augmentation': 0.8, 'generation': 0.9, 'attribution': 0.7}
Evaluation for Ground Truth: 'Monoamine oxidase inhibitors Same efficacy as Tricyclic antidepressants' and LLM Output: 'Monoamine Oxidase Inhibitors Less commonly used Major Depressive Disorder' resulted in RAGA scores: {'retrieval': 0.5, 'augmentation': 0.3, 'generation': 0.7, 'attribution': 0.2}
Evaluation for Ground Truth: 'Monoamine oxidase inhibitors Side effects Hepatotoxicity and hypertensive crises ' and LLM Output: 'Monoamine Oxidase Inhibitors Less commonly used Major Depressive Disorder' resulted in RAGA scores: {'retrieval': 0.5, 'augmentation': 0.2, 'generation': 0.8, 'attribution': 0.3}
Evaluation for Ground Truth: 'Depression Side effect Slower response to treatment' and LLM Output