# GPT Critic

In [3]:
import concurrent.futures
import re
from openai import OpenAI
import pandas as pd
import numpy as np
from tqdm import tqdm

# Initialize OpenAI client
# Module-level client shared by every call to get_critic_gpt_evaluation.
# NOTE(review): the api_key literal is an empty string in this file --
# presumably redacted before sharing. A real key has to be supplied before
# running; prefer loading it from the environment over hard-coding it here.
client = OpenAI(api_key='')

# Labelled score in the closed interval [0, 1]. The trailing negative
# lookahead rejects numbers that merely *start* with 0 or 1 (e.g. "10" or
# "1.75") so an out-of-range reply is never silently read as a valid score.
_SCORE_PATTERN = re.compile(
    r'(?:Overall correctness score|Score|I would score the LLM output a)'
    r'[:\s]*'
    r'(0(?:\.\d+)?|1(?:\.0+)?)'
    r'(?!\.?\d)',
    re.IGNORECASE,
)


def extract_score(evaluation):
    """Extract the numerical score from a GPT evaluation.

    The text is searched (case-insensitively) for one of the labels
    "Overall correctness score", "Score" or "I would score the LLM output a",
    followed by optional colons/whitespace and a number between 0 and 1.

    Args:
        evaluation: Raw evaluation text returned by the critic model.

    Returns:
        The score as a float in [0, 1], or -1 when no in-range labelled
        score is found (including when ``evaluation`` is not a string).
    """
    if not isinstance(evaluation, str):
        return -1
    match = _SCORE_PATTERN.search(evaluation)
    return float(match.group(1)) if match else -1

def is_relevant(ground_truth, llm_output):
    """Report whether two statements share enough vocabulary to be compared.

    Both strings are lower-cased and split on whitespace; the statements are
    considered relevant to each other when at least two distinct tokens
    appear in both.
    """
    def _tokens(text):
        # Distinct, case-folded whitespace-separated tokens of ``text``.
        return set(text.lower().split())

    overlap = _tokens(ground_truth) & _tokens(llm_output)
    return len(overlap) >= 2

def get_critic_gpt_evaluation(llm_text, gt_text):
    """Get evaluation from GPT for a single comparison.

    Sends one chat-completion request asking the model to score ``llm_text``
    against ``gt_text`` in the format 'Score: X.XX' plus a one-sentence
    explanation.

    Args:
        llm_text: The LLM-produced statement being judged.
        gt_text: The ground-truth statement it is compared against.

    Returns:
        The stripped reply text, or None when the request fails or the reply
        carries no text content. Errors are printed, never raised.
    """
    prompt = f"""
        Compare these statements:
        Ground Truth: {gt_text}
        LLM Output: {llm_text}
        
        Score from 0 to 1 for overall correctness (1 being highly correct). Format: 'Score: X.XX'
        Brief explanation (one sentence).
        """

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            # The critic is used as a measuring instrument: ask for the least
            # random decoding so repeated runs give comparable scores.
            temperature=0,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort by design: one failed request must not abort the batch.
        print(f"Error during evaluation: {e}")
        return None

    # A reply may have no text content; treat that as "no evaluation" rather
    # than relying on an AttributeError from calling .strip() on None.
    if content is None:
        return None
    return content.strip()

def evaluate_single_output(llm_row, ground_truth_df):
    """Find the ground-truth statement that best matches one LLM output.

    The row's 'subj', 'rel' and 'obj' fields are joined with spaces into a
    statement, which is compared against every ground-truth row built the
    same way. Rows that fail ``is_relevant`` are skipped without calling the
    critic; among the rest, the evaluation with the highest extracted score
    wins (the first one seen wins ties).

    Returns:
        A dict with keys 'llm_output', 'best_matching_ground_truth',
        'best_evaluation' and 'best_score'. When nothing could be scored the
        last three are None, None and -1.
    """
    candidate = "{} {} {}".format(llm_row['subj'], llm_row['rel'], llm_row['obj'])
    outcome = {
        'llm_output': candidate,
        'best_matching_ground_truth': None,
        'best_evaluation': None,
        'best_score': -1,
    }

    for _, reference_row in ground_truth_df.iterrows():
        reference = "{} {} {}".format(
            reference_row['subj'], reference_row['rel'], reference_row['obj']
        )

        # Only pay for a critic call when the two statements overlap.
        if is_relevant(reference, candidate):
            verdict = get_critic_gpt_evaluation(candidate, reference)
            if not verdict:
                continue

            value = extract_score(verdict)
            if value > outcome['best_score']:
                outcome['best_score'] = value
                outcome['best_evaluation'] = verdict
                outcome['best_matching_ground_truth'] = reference

    return outcome

def evaluate_all_outputs(llm_output_df, ground_truth_df, max_workers=10):
    """Process all LLM outputs in parallel.

    Each row of ``llm_output_df`` is submitted to a thread pool and evaluated
    with ``evaluate_single_output`` against ``ground_truth_df``.

    Args:
        llm_output_df: DataFrame of LLM outputs, one statement per row.
        ground_truth_df: DataFrame of ground-truth statements.
        max_workers: Maximum number of worker threads.

    Returns:
        A list of result dicts for the rows that obtained a score
        (``best_score != -1``), ordered by the row's position in
        ``llm_output_df`` so repeated runs produce identically ordered
        output. Rows whose evaluation raised are reported and omitted.
    """
    results_by_position = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each task's input position; futures complete in arbitrary
        # order and the position is what restores the input order afterwards.
        future_to_position = {
            executor.submit(evaluate_single_output, row, ground_truth_df): position
            for position, (_, row) in enumerate(llm_output_df.iterrows())
        }

        # Process results with progress bar
        with tqdm(total=len(llm_output_df), desc="Evaluating relationships") as pbar:
            for future in concurrent.futures.as_completed(future_to_position):
                try:
                    result = future.result()
                    if result['best_score'] != -1:
                        results_by_position[future_to_position[future]] = result
                except Exception as e:
                    print(f"Error processing row: {e}")
                pbar.update(1)

    return [results_by_position[position] for position in sorted(results_by_position)]

# Driver: score every LLM relationship file against the ground truth.
# final_evals collects one average score per file, in rel_docs order.
final_evals = []
ground_truth = pd.read_csv("ground_truth.csv")
rel_docs = [
    "NewRels_Skip2_cummulative.csv",
    "NewRels_Skip2_increments.csv",
    "NewRels_Skip3_cummulative.csv",
    "NewRels_Skip3_increments.csv",
    "NewRels_Skip4_increments.csv",
    "Temperature0point2.csv",
    "Temperature1_WithExamples.csv",
    "Temperature1_WithoutExamples_cleaned.csv",
    "Temperature1_WithoutExamples.csv",
]

for doc in rel_docs:
    llm_output = pd.read_csv(doc)
    print(f"Processing {len(llm_output)} LLM outputs against {len(ground_truth)} ground truth statements...")

    # Score every row of this file with the GPT critic.
    evaluations = evaluate_all_outputs(llm_output, ground_truth, max_workers=10)

    # Mean of the scored rows; 0 when the file produced no usable score.
    valid_scores = [entry['best_score'] for entry in evaluations if entry['best_score'] != -1]
    if valid_scores:
        average_score = np.mean(valid_scores)
    else:
        average_score = 0

    print(f"\nOverall Average Score: {average_score:.2f}")
    print(f"Processed {len(evaluations)} evaluations")

    final_evals.append(average_score)

    # Preview the per-row results, then persist them next to the input name.
    results_df = pd.DataFrame(evaluations)
    print("\nSummary DataFrame:")
    print(results_df.head())
    results_df.to_csv(f'GPT_critic_eval_for_{doc}', index=False)

Processing 163 LLM outputs against 176 ground truth statements...


Evaluating relationships: 100%|███████████████| 163/163 [04:38<00:00,  1.71s/it]



Overall Average Score: 0.72
Processed 158 evaluations

Summary DataFrame:
                                          llm_output  \
0  Selective serotonin reuptake inhibitors (SSRIs...   
1  Monoamine oxidase inhibitors (MAOIs) Specific ...   
2  Monoamine oxidase inhibitors (MAOIs) Less comm...   
3  Monoamine oxidase inhibitors (MAOIs) second-li...   
4  Monoamine Oxidase Inhibitors (MAOIs) Definitio...   

                          best_matching_ground_truth  \
0              SSRIs Treat Acute phase of depression   
1  Monoamine oxidase inhibitors Demonstrated spec...   
2  Depression Side effect Slower response to trea...   
3  Depression Side effect Slower response to trea...   
4  Pharmacotherapy Entails Monoamine oxidase inhi...   

                                     best_evaluation  best_score  
0  Score: 0.95\nThe LLM output is highly correct ...        0.95  
1  Score: 0.90\nThe LLM output is mostly correct ...        0.90  
2  Score: 0.75\nThe LLM output is somewhat relate.

Evaluating relationships: 100%|███████████████| 172/172 [03:32<00:00,  1.24s/it]



Overall Average Score: 0.66
Processed 164 evaluations

Summary DataFrame:
                                          llm_output  \
0  depression with atypical features subset depre...   
1  selective serotonin reuptake inhibitors (ssris...   
2  selective serotonin reuptake inhibitors (ssris...   
3  tricyclic antidepressants (tca) more effective...   
4  venlafaxine comparable to tricyclic antidepres...   

                          best_matching_ground_truth  \
0  Problem-solving therapy Use Preventing depress...   
1  Norepinephrine reuptake inhibitors Eligible fo...   
2  Depression Side effect Slower response to trea...   
3  Norepinephrine reuptake inhibitors Eligible fo...   
4  Monoamine oxidase inhibitors Same efficacy as ...   

                                     best_evaluation  best_score  
0  Score: 0.50\nThe LLM output only partially cap...        0.50  
1  Score: 0.50\nThe LLM output is partially corre...        0.50  
2  Score: 0.50\nThe LLM output is not highly corr.

Evaluating relationships: 100%|███████████████| 106/106 [02:41<00:00,  1.53s/it]



Overall Average Score: 0.72
Processed 103 evaluations

Summary DataFrame:
                                          llm_output  \
0  selective serotonin reuptake inhibitors first-...   
1  serotonin-norepinephrine reuptake inhibitors (...   
2  monoamine oxidase inhibitors (maois) less comm...   
3  selective serotonin reuptake inhibitors (ssris...   
4  tricyclic antidepressants (tcas) comparable to...   

                          best_matching_ground_truth  \
0  Pharmacotherapy Entails Norepinephrine reuptak...   
1  Pharmacotherapy Entails Norepinephrine reuptak...   
2  Pharmacotherapy Entails Monoamine oxidase inhi...   
3  Pharmacotherapy Entails Norepinephrine reuptak...   
4                   Nefazodone Is comparable to SSRI   

                                     best_evaluation  best_score  
0  Score: 0.50\nThe LLM output is partially corre...        0.50  
1  Score: 0.75\nThe LLM output is generally corre...        0.75  
2  Score: 0.75\nThe LLM output provides some cont.

Evaluating relationships: 100%|███████████████| 110/110 [02:38<00:00,  1.44s/it]



Overall Average Score: 0.69
Processed 106 evaluations

Summary DataFrame:
                                          llm_output  \
0  venlafaxine comparable to tricyclic antidepres...   
1  mirtazapine properties increases serotonin or ...   
2  selective serotonin reuptake inhibitors (ssris...   
3  selective serotonin reuptake inhibitors (ssris...   
4  vortioxetine specific efficacy in treating dep...   

                          best_matching_ground_truth  \
0  Monoamine oxidase inhibitors Same efficacy as ...   
1  Mirtazapine Increases Availability of serotoni...   
2  Pharmacotherapy Entails Norepinephrine reuptak...   
3  Pharmacotherapy Entails Norepinephrine reuptak...   
4  Monoamine oxidase inhibitors Demonstrated spec...   

                                     best_evaluation  best_score  
0  Score: 0.75\nThe LLM output is mostly correct ...        0.75  
1  Score: 0.90\nThe LLM output is mostly correct ...        0.90  
2  Score: 0.50\nThe LLM output includes inaccurac.

Evaluating relationships: 100%|█████████████████| 84/84 [01:52<00:00,  1.34s/it]



Overall Average Score: 0.66
Processed 81 evaluations

Summary DataFrame:
                                          llm_output  \
0  ketamine highly effective treatment-resistant ...   
1  selective serotonin reuptake inhibitors (ssris...   
2  monoamine oxidase inhibitors (maois) less comm...   
3  serotonin-norepinephrine reuptake inhibitors (...   
4  ketamine not effective as a miracle drug for a...   

                          best_matching_ground_truth  \
0  Ketamine Has effects on Unipolar, bipolar depr...   
1  Pharmacotherapy Entails Norepinephrine reuptak...   
2  Monoamine oxidase inhibitors Eligible for Pati...   
3  Pharmacotherapy Entails Norepinephrine reuptak...   
4                  Mirtazapine As effective as SSRIs   

                                     best_evaluation  best_score  
0  Score: 0.75\nThe LLM output is mostly correct ...        0.75  
1  Score: 0.50\nThe LLM output contains some corr...        0.50  
2  Score: 0.75\nThe LLM output correctly recogniz..

Evaluating relationships: 100%|███████████████| 109/109 [02:49<00:00,  1.56s/it]



Overall Average Score: 0.75
Processed 105 evaluations

Summary DataFrame:
                                          llm_output  \
0  VNS Usage treatment-resistant unilateral or bi...   
1  ECT Highly effective for Major Unipolar Depres...   
2          SSRIs More commonly used Major Depression   
3  Ketamine Highly effective for treatment-resist...   
4  Bupropion side effects minimal weight gain or ...   

                          best_matching_ground_truth  \
0  Ketamine Has effects on Unipolar, bipolar depr...   
1  Electroconvulsive therapy Effective Acute phas...   
2              SSRIs Treat Acute phase of depression   
3  Ketamine Has effects on Unipolar, bipolar depr...   
4  Bupropion Properties A better tolerability tha...   

                                     best_evaluation  best_score  
0  Score: 0.75\nThe LLM output is mostly correct ...        0.75  
1  Score: 0.75\nThe LLM output is mostly correct ...        0.75  
2  Score: 0.75\nThe LLM output is mostly correct .

Evaluating relationships: 100%|█████████████████| 97/97 [03:37<00:00,  2.24s/it]



Overall Average Score: 0.74
Processed 97 evaluations

Summary DataFrame:
                                          llm_output  \
0  Selective Serotonin Reuptake Inhibitors (SSRIs...   
1                ECT Highly effective for Depression   
2  Anxious Depression more common than nonanxious...   
3  Ketamine Highly effective for treatment-resist...   
4  Anxious Depression side effects low response r...   

                          best_matching_ground_truth  \
0  Pharmacotherapy Entails Norepinephrine reuptak...   
1  Electroconvulsive therapy Effective Acute phas...   
2  Bupropion Finding Is more likely than SSRIs to...   
3  Ketamine Has effects on Unipolar, bipolar depr...   
4           Depression Side effect Suicidal behavior   

                                     best_evaluation  best_score  
0  Score: 0.50\nThe LLM output is not highly corr...        0.50  
1  Score: 0.75\nThe LLM output largely captures t...        0.75  
2  Score: 0.50\nThe LLM output is not directly re..

Evaluating relationships: 100%|███████████████| 116/116 [03:29<00:00,  1.81s/it]



Overall Average Score: 0.76
Processed 113 evaluations

Summary DataFrame:
                                          llm_output  \
0  Selective Serotonin Reuptake Inhibitor antidep...   
1  SSRIs More commonly used Major Depressive Diso...   
2  Monoamine Oxidase Inhibitors Less commonly use...   
3  Vagus Nerve Stimulation Treats resistant unila...   
4  Selective Serotonin Reuptake Inhibitors (SSRIs...   

                          best_matching_ground_truth  \
0  Bupropion Properties Norepinephrine and dopami...   
1  Bupropion Finding Is more likely than SSRIs to...   
2  Pharmacotherapy Entails Monoamine oxidase inhi...   
3   Pharmacotherapy Treats Acute phase of depression   
4  Norepinephrine reuptake inhibitors Eligible fo...   

                                     best_evaluation  best_score  
0  Score: 0.50\nThe LLM output is incorrect as it...        0.50  
1  Score: 0.75\nThe LLM output incorrectly focuse...        0.75  
2  Score: 0.75\nThe LLM output correctly identifi.

Evaluating relationships: 100%|███████████████| 116/116 [03:27<00:00,  1.79s/it]


Overall Average Score: 0.75
Processed 113 evaluations

Summary DataFrame:
                                          llm_output  \
0  Selective Serotonin Reuptake Inhibitor antidep...   
1  SSRIs More commonly used Major Depressive Diso...   
2  Monoamine Oxidase Inhibitors Less commonly use...   
3  Vagus Nerve Stimulation Treats resistant unila...   
4  Selective Serotonin Reuptake Inhibitors (SSRIs...   

                          best_matching_ground_truth  \
0  Bupropion Properties Norepinephrine and dopami...   
1  Bupropion Properties Has a more activating pro...   
2  Pharmacotherapy Entails Monoamine oxidase inhi...   
3   Pharmacotherapy Treats Acute phase of depression   
4  Depression Side effect Slower response to trea...   

                                     best_evaluation  best_score  
0  Score: 0.50\nThe LLM output is incorrect as it...        0.50  
1  Score: 0.50\nThe LLM output partially captures...        0.50  
2  Score: 0.75\nThe LLM output is mostly correct .




In [5]:
final_evals

[0.7168987341772152,
 0.6623170731707316,
 0.7179611650485436,
 0.6867924528301886,
 0.6574074074074074,
 0.747142857142857,
 0.7402061855670105,
 0.7610619469026548,
 0.7477876106194691]