# GPT Critic

In [9]:
import concurrent.futures
import re
from openai import OpenAI
import pandas as pd
import numpy as np
from tqdm import tqdm

# Initialize OpenAI client.
# No api_key is passed on purpose: the OpenAI client then falls back to the
# OPENAI_API_KEY environment variable. The previous hard-coded empty string
# made every request fail authentication and encouraged pasting a secret
# into the notebook source.
client = OpenAI()

def extract_score(evaluation):
    """Extract the numerical score from a GPT evaluation string.

    The text is scanned for one of the lead-ins "Overall correctness score",
    "Score" or "I would score the LLM output a" (case-insensitive), followed
    by optional colons/whitespace and a number. The first such number that
    lies in the closed interval [0, 1] is returned.

    Args:
        evaluation: Raw evaluation text returned by the critic model. May be
            empty or None.

    Returns:
        The score as a float in [0, 1], or -1 when no valid score is found.
    """
    if not evaluation:
        return -1

    # Capture the whole number (including forms like ".75") instead of only a
    # leading 0/1 digit, so that "10" or "1.5" are seen for what they are and
    # can be rejected rather than silently read as 1.0 / 1.5.
    pattern = (
        r'(?:Overall correctness score|Score|I would score the LLM output a)'
        r'[:\s]*(\d+(?:\.\d+)?|\.\d+)'
    )
    for match in re.finditer(pattern, evaluation, re.IGNORECASE):
        score = float(match.group(1))
        # The prompt asks for a score from 0 to 1; anything else is invalid.
        if 0.0 <= score <= 1.0:
            return score
    return -1

def is_relevant(ground_truth, llm_output):
    """Tell whether two statements share enough vocabulary to be compared.

    Both strings are lower-cased and split on whitespace; the statements are
    considered relevant to each other when at least two distinct tokens
    appear in both.

    Args:
        ground_truth: Ground-truth statement text.
        llm_output: LLM-produced statement text.

    Returns:
        True if two or more distinct tokens are shared, otherwise False.
    """
    reference_tokens = {token for token in ground_truth.lower().split()}
    shared_tokens = reference_tokens & set(llm_output.lower().split())
    return len(shared_tokens) >= 2

def get_critic_gpt_evaluation(llm_text, gt_text):
    """Ask the chat model to score one LLM statement against one ground truth.

    A single user message containing both statements is sent to the
    module-level ``client`` using the "gpt-3.5-turbo" model with a cap of
    150 completion tokens.

    Args:
        llm_text: The LLM-produced statement.
        gt_text: The ground-truth statement to compare against.

    Returns:
        The stripped text of the first completion choice, or None if any
        exception is raised while building or sending the request (the error
        is printed).
    """
    try:
        # Assembled piece by piece so the exact whitespace sent to the model
        # is explicit; the resulting text is the same indented prompt.
        prompt = (
            "\n"
            "        Compare these statements:\n"
            f"        Ground Truth: {gt_text}\n"
            f"        LLM Output: {llm_text}\n"
            "        \n"
            "        Score from 0 to 1 for overall correctness (1 being highly correct). Format: 'Score: X.XX'\n"
            "        Brief explanation (one sentence).\n"
            "        "
        )
        chat_messages = [{"role": "user", "content": prompt}]
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=150
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as err:
        print(f"Error during evaluation: {err}")
        return None

def evaluate_single_output(llm_row, ground_truth_df):
    """Find the best-scoring ground-truth statement for one LLM output row.

    The row's 'subj', 'rel' and 'obj' fields are joined with spaces into a
    statement. Every ground-truth row is rendered the same way; pairs that
    fail ``is_relevant`` are skipped, the rest are sent to
    ``get_critic_gpt_evaluation`` and scored with ``extract_score``. The
    highest-scoring evaluation is kept.

    Args:
        llm_row: Mapping-like row with 'subj', 'rel' and 'obj' entries.
        ground_truth_df: DataFrame with 'subj', 'rel' and 'obj' columns.

    Returns:
        A dict with keys 'llm_output', 'best_matching_ground_truth',
        'best_evaluation' and 'best_score'. When nothing was scored,
        'best_score' is -1 and the two "best" entries are None.
    """
    candidate = f"{llm_row['subj']} {llm_row['rel']} {llm_row['obj']}"
    outcome = {
        'llm_output': candidate,
        'best_matching_ground_truth': None,
        'best_evaluation': None,
        'best_score': -1,
    }

    for _index, reference_row in ground_truth_df.iterrows():
        reference = f"{reference_row['subj']} {reference_row['rel']} {reference_row['obj']}"

        if is_relevant(reference, candidate):
            critique = get_critic_gpt_evaluation(candidate, reference)
            if not critique:
                # Failed or empty evaluation: nothing to score.
                continue

            parsed_score = extract_score(critique)
            # Strictly greater, so the first of several equal scores wins and
            # unparseable evaluations (-1) never replace the initial state.
            if parsed_score > outcome['best_score']:
                outcome['best_score'] = parsed_score
                outcome['best_evaluation'] = critique
                outcome['best_matching_ground_truth'] = reference

    return outcome

def evaluate_all_outputs(llm_output_df, ground_truth_df, max_workers=10):
    """Evaluate every LLM output row in parallel and collect the scored ones.

    Each row of ``llm_output_df`` is submitted to a thread pool as one
    ``evaluate_single_output`` task. Results whose 'best_score' is -1 (nothing
    could be scored) are dropped, as are rows whose task raised (the error is
    printed). A tqdm progress bar advances once per finished task.

    Args:
        llm_output_df: DataFrame of LLM outputs, one statement per row.
        ground_truth_df: DataFrame of ground-truth statements, passed
            unchanged to every task.
        max_workers: Maximum number of worker threads.

    Returns:
        A list of result dicts from ``evaluate_single_output``, ordered by the
        position of the originating row in ``llm_output_df``.
    """
    # Keyed by input position so the final list does not depend on which
    # thread happens to finish first.
    results_by_position = {}
    total = len(llm_output_df)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create future tasks, remembering each row's position in the input.
        future_to_row = {
            executor.submit(evaluate_single_output, row, ground_truth_df): i
            for i, (_, row) in enumerate(llm_output_df.iterrows())
        }

        # Process results with progress bar
        with tqdm(total=total, desc="Evaluating relationships") as pbar:
            for future in concurrent.futures.as_completed(future_to_row):
                try:
                    result = future.result()
                    if result['best_score'] != -1:
                        results_by_position[future_to_row[future]] = result
                except Exception as e:
                    print(f"Error processing row: {e}")
                pbar.update(1)

    # as_completed yields in completion order; restore input order here.
    return [results_by_position[i] for i in sorted(results_by_position)]

# Read the LLM-produced relations and the ground-truth relations.
# NOTE(review): inputs are read from "Results/" while the output below is
# written to "../Results/" - confirm both resolve to the intended directory.
llm_output = pd.read_csv("Results/NewRels_Skip3_increments.csv")
ground_truth = pd.read_csv("Results/ground_truth.csv")

print(f"Processing {len(llm_output)} LLM outputs against {len(ground_truth)} ground truth statements...")

# Score every LLM output against the ground truth using 10 worker threads.
evaluations = evaluate_all_outputs(llm_output, ground_truth, max_workers=10)

# Average the scores, ignoring any entry carrying the -1 "no score" sentinel;
# fall back to 0 when there is nothing to average.
valid_scores = [entry['best_score'] for entry in evaluations if entry['best_score'] != -1]
if valid_scores:
    average_score = np.mean(valid_scores)
else:
    average_score = 0

# Report the headline numbers.
print(f"\nOverall Average Score: {average_score:.2f}")
print(f"Processed {len(evaluations)} evaluations")

# Tabulate the per-row results and show the first few rows.
results_df = pd.DataFrame(evaluations)
print("\nSummary DataFrame:")
print(results_df.head())

# Persist the full results table without the index column.
results_df.to_csv('../Results/GPT_critic_llm_evaluation_results.csv', index=False)

Processing 110 LLM outputs against 176 ground truth statements...


Evaluating relationships: 100%|█████████████████████████████████████████████████████████████████████████████████| 110/110 [03:23<00:00,  1.85s/it]


Overall Average Score: 0.69
Processed 106 evaluations

Summary DataFrame:
                                          llm_output  \
0  venlafaxine comparable to tricyclic antidepres...   
1  selective serotonin reuptake inhibitors (ssris...   
2  mirtazapine properties increases serotonin or ...   
3  selective serotonin reuptake inhibitors (ssris...   
4  vortioxetine specific efficacy in treating dep...   

                          best_matching_ground_truth  \
0  Monoamine oxidase inhibitors Same efficacy as ...   
1  Norepinephrine reuptake inhibitors Eligible fo...   
2  Mirtazapine Increases Availability of serotoni...   
3  Pharmacotherapy Entails Norepinephrine reuptak...   
4  Problem-solving therapy Use Preventing depress...   

                                     best_evaluation  best_score  
0  Score: 0.75\nThe LLM output is accurate in ide...        0.75  
1  Score: 0.50\nThe LLM output is only partially ...        0.50  
2  Score: 0.90\nThe LLM output is almost identica.


