## Fuzzy Wuzzy Method

In [1]:
import pandas as pd

from fuzzywuzzy import fuzz

llm_output_df = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

annotations_df = pd.read_csv("Annotations.csv")




In [None]:
#Simple Ratio: When both strings are expected to be in similar order and fully comparable.
#**Partial Ratio: When one string is a part of the other or has extra noise (e.g., extra words).
#**Token Sort Ratio: When the strings have the same words but may be in a different order.
#Token Set Ratio: When the strings have repeated or redundant words and you want to focus on unique words.

# try a search method and isolate highest similarity

In [14]:
columns_to_compare = ['subj', 'rel', 'obj']

#karrouri, 12 anti, fava 



low_similarity_threshold = 30
overall_similarity_scores = []
count=0

for idx in range(min(len(annotations_df), len(llm_output_df))):
    similarity_scores = []
    
    for col in columns_to_compare:
        handwritten = annotations_df.iloc[idx][col]
        llm = llm_output_df.iloc[idx][col]
        
        #partial ratio because the words should be similar but there can be extra words around 
        similarity_score = fuzz.partial_ratio(handwritten, llm)
        similarity_scores.append(similarity_score)
    

    average_similarity = sum(similarity_scores) / len(similarity_scores)
    overall_similarity_scores.append(average_similarity)
    

#     if average_similarity < low_similarity_threshold:
        

#         print(f"Row {idx} in annotations has low similarity (Avg score: {average_similarity}):")
#         print(annotations_df.iloc[idx])
#         print('---------------------------')
#         print(llm_output_df.iloc[idx])
#         count+=1
    if average_similarity > low_similarity_threshold:
        print(annotations_df.iloc[idx])
        print('---------------------------')
        print(llm_output_df.iloc[idx])
        count+=1
        print('===========================')
        
overall_similarity_score = sum(overall_similarity_scores) / len(overall_similarity_scores)
print(f"\nOverall Similarity Score: {overall_similarity_score}")
print(f"These are the mismatches: {count}")

subj           Depression
rel           Side effect
obj     Suicidal behavior
Name: 0, dtype: object
---------------------------
Unnamed: 0                                                    0
subj                                  Major Depressive Disorder
rel                                                  Definition
obj           a prevalent psychiatric disorder that often le...
Name: 0, dtype: object
subj     Depression
rel     Side effect
obj       Mortality
Name: 1, dtype: object
---------------------------
Unnamed: 0                                                    1
subj          Selective Serotonin Reuptake Inhibitor antidep...
rel                                        first-line treatment
obj                                   Major Depressive Disorder
Name: 1, dtype: object
subj                      Depression
rel                      Side effect
obj     Slower response to treatment
Name: 2, dtype: object
---------------------------
Unnamed: 0                              

## TF- IDF Vector and Cosine Method

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
#cosine method is good for when the length of the comparisons don't matter- the content is more emphasized

In [23]:
ground_truth = pd.read_csv("Annotations.csv")
llm_file =  pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

# turn the rows into a single string
ground_truth['combined'] = ground_truth.astype(str).agg(' '.join, axis=1)
llm_file['combined'] = llm_file.astype(str).agg(' '.join, axis=1)

# vectorize
vectorizer = TfidfVectorizer()
ground_truth_tfidf = vectorizer.fit_transform(ground_truth['combined'])
llm_tfidf = vectorizer.transform(llm_file['combined'])

# compares each row of the ground truth to each row of the llm file
similarity_matrix = cosine_similarity(ground_truth_tfidf, llm_tfidf)

# extract the row in the llm output with the highest similarity 
highest_sim = similarity_matrix.max(axis=1)
highest_sim_index = similarity_matrix.argmax(axis=1)

# llm_file = llm_file.reset_index(drop=True)
result = pd.DataFrame({
    'ground_truth_row': ground_truth['combined'],
    'most_similar_row': llm_file['combined'].iloc[highest_sim_index].apply(lambda x: x).values,
    'similarity_score': highest_sim
})


result.to_csv('similarity_results.csv', index=False)

print(result)
print(result['similarity_score'].mean())

                                      ground_truth_row  \
0             Depression Side effect Suicidal behavior   
1                     Depression Side effect Mortality   
2    Depression Side effect Slower response to trea...   
3     Pharmacotherapy Treats Acute phase of depression   
4                        Pharmacotherapy Entails SSRIs   
..                                                 ...   
171  Fluoxetine Side effects Generally acceptable s...   
172           Fluoxetine Moderate efficacy for Anxiety   
173  Fluoxetine High efficiency for  Obsessive-comp...   
174       Duloxetine Side effects Lower acceptability    
175  Duloxetine High efficiency for  Neuropathic pa...   

                                      most_similar_row  similarity_score  
0    5 All Depression Associated with high rates of...          0.531832  
1    5 All Depression Associated with high rates of...          0.351954  
2    6 Acute treatment duration for Major Depressio...          0.304288  
3  

In [3]:
#!pip install deepeval --user

## G-eval 

In [2]:
from collections import namedtuple
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
import pandas as pd
import os
import openai
import traceback 

llm_output = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

ground_truth = pd.read_csv("Annotations.csv")


os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.getenv("OPENAI_API_KEY")

original_init = GEval.__init__


def patched_init(self, name, evaluation_steps=None, evaluation_params=None, criteria=None, *args, **kwargs):
    evaluation_params = evaluation_params if evaluation_params is not None else []
    if criteria is None and evaluation_steps is None:
        raise ValueError("Either 'criteria' or 'evaluation_steps' must be provided.")
    original_init(self, name, evaluation_params=evaluation_params, criteria=criteria, evaluation_steps=evaluation_steps, *args, **kwargs)
    if self.evaluation_cost is None:
        self.evaluation_cost = 0
GEval.__init__ = patched_init

TestCase = namedtuple('TestCase', ['input', 'actual_output'])
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)


best_matches = []

for i in range(len(ground_truth)):
    gt_subj = ground_truth.loc[i, 'subj']
    gt_rel = ground_truth.loc[i, 'rel']
    gt_obj = ground_truth.loc[i, 'obj']
    gt_row = f"{gt_subj} {gt_rel} {gt_obj}"
    best_evaluation = None
    best_llm_row = None

    for j in range(len(llm_output)):
        llm_subj = llm_output.loc[j, 'subj']
        llm_rel = llm_output.loc[j, 'rel']
        llm_obj = llm_output.loc[j, 'obj']
        llm_row = f"{llm_subj} {llm_rel} {llm_obj}"

        test_case = TestCase(input=gt_row, actual_output=llm_row)
        print(f"Evaluating test case: {test_case}")

        try:
            evaluation = correctness_metric.evaluate(test_case)
            print(f"Evaluation result: {evaluation}")

            if isinstance(evaluation, dict) and 'score' in evaluation:
                print(f"Score: {evaluation['score']}")
            else:
                print(f"Unexpected evaluation result format: {evaluation}")

        except Exception as e:
            print(f"Error during evaluation: {e}")
            traceback.print_exc()

        if best_evaluation is None or evaluation['score'] > best_evaluation['score']:
            best_evaluation = evaluation
            best_llm_row = llm_row

    best_matches.append({
        'ground_truth': gt_row,
        'best_llm_output': best_llm_row,
        'best_evaluation_score': best_evaluation['score']
    })

result_df = pd.DataFrame(best_matches)
print(result_df)


ImportError: cannot import name 'OpenAI' from 'openai' (C:\Users\anush\AppData\Local\anaconda3\Lib\site-packages\openai\__init__.py)

## GPT Critic

In [2]:
import openai
import pandas as pd

openai.api_key = ''


llm_output = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

ground_truth = pd.read_csv("Annotations.csv")

import re

def extract_score(evaluation):
    match = re.search(r'Score:\s*([0-1]\.\d+|[0-1])', evaluation)
    if match:
        return float(match.group(1))
    return -1  

def create_prompt(ground_truth_row, llm_output_row):
    prompt = f"""
    I want you to compare the following statements.

    Ground Truth: {ground_truth_row}
    
    LLM Output: {llm_output_row}
    
    Based on the ground truth, evaluate the correctness of the LLM output in terms of:
    1. Factual correctness: Are the facts in the LLM output correct compared to the ground truth?
    2. Omissions: Does the LLM output miss any key details present in the ground truth?
    3. Contradictions: Does the LLM output contradict any information in the ground truth?

    Provide a score from 0 to 1 for overall correctness, and include a short explanation of the result.
    """
    return prompt

def get_critic_gpt_evaluation(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return None

best_matches = []


for i in range(len(ground_truth)):
    gt_subj = ground_truth.loc[i, 'subj']
    gt_rel = ground_truth.loc[i, 'rel']
    gt_obj = ground_truth.loc[i, 'obj']
    gt_row = f"{gt_subj} {gt_rel} {gt_obj}"

    best_evaluation = None
    best_llm_row = None
    best_score = -1  
    
    for j in range(len(llm_output)):
        llm_subj = llm_output.loc[j, 'subj']
        llm_rel = llm_output.loc[j, 'rel']
        llm_obj = llm_output.loc[j, 'obj']
        llm_row = f"{llm_subj} {llm_rel} {llm_obj}"

        prompt = create_prompt(gt_row, llm_row)

        evaluation = get_critic_gpt_evaluation(prompt)
        if evaluation:
            score = extract_score(evaluation) 
            if score > best_score:
                best_score = score
                best_evaluation = evaluation
                best_llm_row = llm_row
                
    best_matches.append({
        'ground_truth': gt_row,
        'best_llm_output': best_llm_row,
        'best_evaluation': best_evaluation
    })
    

print('----------------------')
for match in best_matches:
    print(f"Ground Truth: {match['ground_truth']}")
    print(f"Best LLM Output: {match['best_llm_output']}")
    print(f"Best Evaluation: {match['best_evaluation']}")
    print('----------------------')


result_df = pd.DataFrame(best_matches)
print(results_df.head(2))
# result_df.to_csv('comparison_results.csv', index=False)
# print("Evaluation completed and saved to comparison_results.csv")


## GPT Critic- Parallel Batch Request

In [None]:
import concurrent.futures
import re
import openai
import pandas as pd

openai.api_key = ''


llm_output = pd.read_csv("Temperature1_WithoutExamples_cleaned.csv")

ground_truth = pd.read_csv("Annotations.csv")


def extract_score(evaluation):
    match = re.search(r'Score:\s*([0-1]\.\d+|[0-1])', evaluation)
    if match:
        return float(match.group(1))
    return -1

def create_prompt(ground_truth_row, llm_output_row):
    prompt = f"""
    I want you to compare the following statements.

    Ground Truth: {ground_truth_row}
    
    LLM Output: {llm_output_row}
    
    Based on the ground truth, evaluate the correctness of the LLM output in terms of:
    1. Factual correctness: Are the facts in the LLM output correct compared to the ground truth?
    2. Omissions: Does the LLM output miss any key details present in the ground truth?
    3. Contradictions: Does the LLM output contradict any information in the ground truth?

    Provide a score from 0 to 1 for overall correctness, and include a short explanation of the result.
    """
    return prompt

def get_critic_gpt_evaluation(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return None

def evaluate_row(gt_row, llm_output):
    best_evaluation = None
    best_llm_row = None
    best_score = -1

    for j in range(len(llm_output)):
        llm_subj = llm_output.loc[j, 'subj']
        llm_rel = llm_output.loc[j, 'rel']
        llm_obj = llm_output.loc[j, 'obj']
        llm_row = f"{llm_subj} {llm_rel} {llm_obj}"

        prompt = create_prompt(gt_row, llm_row)
        evaluation = get_critic_gpt_evaluation(prompt)

        if evaluation:
            score = extract_score(evaluation)
            if score > best_score:
                best_score = score
                best_evaluation = evaluation
                best_llm_row = llm_row
    
    return {
        'ground_truth': gt_row,
        'best_llm_output': best_llm_row,
        'best_evaluation': best_evaluation
    }

best_matches = []
ground_truth_rows = []

for i in range(len(ground_truth)):
    gt_subj = ground_truth.loc[i, 'subj']
    gt_rel = ground_truth.loc[i, 'rel']
    gt_obj = ground_truth.loc[i, 'obj']
    gt_row = f"{gt_subj} {gt_rel} {gt_obj}"
    ground_truth_rows.append(gt_row)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to_row = {executor.submit(evaluate_row, gt_row, llm_output): gt_row for gt_row in ground_truth_rows}

    for future in concurrent.futures.as_completed(future_to_row):
        result = future.result()
        if result:
            best_matches.append(result)


print('----------------------')
for match in best_matches:
    print(f"Ground Truth: {match['ground_truth']}")
    print(f"Best LLM Output: {match['best_llm_output']}")
    print(f"Best Evaluation: {match['best_evaluation']}")
    print('----------------------')


Error during evaluation: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)
Error during evaluation: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002BD63A88D90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error during evaluation: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002BD6358F190>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error during evaluation: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries excee

In [1]:
#pip install openai==0.28