# GPT Critic

In [None]:
import concurrent.futures
import re
from openai import OpenAI
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import fitz
# Initialize OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into the notebook; when the variable is unset the
# original empty placeholder is used, exactly as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ''))
def read_pdf(pdf_file):
    """Extract lower-cased text fragments from every page of a PDF.

    The plain text of each page is lower-cased and split on the literal
    separator ". "; the fragments of all pages are concatenated in page
    order.

    Args:
        pdf_file: Path of the PDF, passed to ``fitz.open``.

    Returns:
        list[str]: The fragments in document order. They are not stripped
        or otherwise cleaned.
    """
    sentences = []
    # Bind the opened document to its own name rather than rebinding the
    # ``pdf_file`` argument.
    with fitz.open(pdf_file) as document:
        for page in document:
            page_text = page.get_text("text").lower()
            sentences.extend(page_text.split(". "))
    return sentences
def read_files(root_dir, hand):
    """Build the list of reference statements.

    Args:
        root_dir: Directory whose entries ending in '.pdf' are read with
            ``read_pdf``.
        hand: DataFrame of hand annotations; its 'subj', 'rel' and 'obj'
            columns are read row by row.

    Returns:
        list[str]: All PDF fragments (in ``os.listdir`` order) followed by
        one "subj rel obj" string per annotation row.
    """
    pdf_names = [name for name in os.listdir(root_dir) if name.endswith('.pdf')]

    lines = []
    for name in pdf_names:
        lines.extend(read_pdf(f"{root_dir}/{name}"))

    # Hand annotations come after all of the PDF text.
    for _, row in hand.iterrows():
        rel, subj, obj = row['rel'], row['subj'], row['obj']
        lines.append(f"{subj} {rel} {obj}")

    return lines

# A score label followed by a non-negative decimal number. The whole number is
# captured so that values such as "10" are seen in full, not as a leading "1".
_SCORE_PATTERN = re.compile(
    r'(?:Overall correctness score|Score|I would score the LLM output a)'
    r'[:\s]*(\d+(?:\.\d+)?)',
    re.IGNORECASE,
)


def extract_score(evaluation):
    """Extract the numerical score from a GPT evaluation.

    The text is scanned for a score label ("Overall correctness score",
    "Score" or "I would score the LLM output a", case-insensitive) followed
    by a number. The first labelled number that lies in the range 0 to 1 is
    returned; labelled numbers outside that range are skipped.

    Args:
        evaluation: Evaluation text; ``None`` or an empty string is allowed.

    Returns:
        The score as a float between 0 and 1, or -1 when no valid score is
        found.
    """
    if not evaluation:
        return -1
    for match in _SCORE_PATTERN.finditer(evaluation):
        score = float(match.group(1))
        if 0.0 <= score <= 1.0:
            return score
    return -1

def is_relevant(ground_truth, llm_output):
    """Tell whether the two texts share at least two distinct tokens.

    Both texts are lower-cased and split on whitespace before comparing.
    """
    reference_tokens = set(ground_truth.lower().split())
    shared_tokens = reference_tokens.intersection(llm_output.lower().split())
    return len(shared_tokens) >= 2

def get_critic_gpt_evaluation(llm_text, gt_text):
    """Ask the chat model to grade one LLM statement against one reference.

    Args:
        llm_text: Statement produced by the LLM.
        gt_text: Ground-truth statement to compare against.

    Returns:
        The stripped text of the first completion choice, or None when any
        exception is raised while building or sending the request (the error
        is printed).
    """
    try:
        prompt = f"""
        Compare these statements:
        Ground Truth: {gt_text}
        LLM Output: {llm_text}
        
        Score from 0 to 1 for overall correctness (1 being highly correct). Format: 'Score: X.XX'
        Brief explanation (one sentence).
        """
        request = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 150,
        }
        completion = client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as exc:
        print(f"Error during evaluation: {exc}")
        return None

def evaluate_single_output(llm_row, ground_truth):
    """Find the ground-truth entry that best supports one LLM output.

    Args:
        llm_row: Mapping with 'subj', 'rel' and 'obj' entries; they are joined
            with spaces to form the statement under test.
        ground_truth: Iterable of ground-truth strings.

    Returns:
        dict with keys 'llm_output', 'best_matching_ground_truth',
        'best_evaluation' and 'best_score'. The score stays -1 (and the other
        two "best" fields None) when no relevant entry yields a parsable
        score.
    """
    llm_text = f"{llm_row['subj']} {llm_row['rel']} {llm_row['obj']}"
    summary = {
        'llm_output': llm_text,
        'best_matching_ground_truth': None,
        'best_evaluation': None,
        'best_score': -1,
    }

    for candidate in ground_truth:
        # Only entries passing the token-overlap filter are sent to the critic.
        if is_relevant(candidate, llm_text):
            verdict = get_critic_gpt_evaluation(llm_text, candidate)
            if not verdict:
                continue
            candidate_score = extract_score(verdict)
            # Strict comparison: on ties the earliest entry is kept.
            if candidate_score > summary['best_score']:
                summary['best_matching_ground_truth'] = candidate
                summary['best_evaluation'] = verdict
                summary['best_score'] = candidate_score

    return summary

def evaluate_all_outputs(llm_output_df, ground_truth_df, max_workers=10):
    """Evaluate every LLM output row concurrently on a thread pool.

    Args:
        llm_output_df: DataFrame whose rows are passed one by one to
            ``evaluate_single_output``.
        ground_truth_df: Ground-truth collection, forwarded unchanged to
            ``evaluate_single_output``.
        max_workers: Size of the thread pool.

    Returns:
        list[dict]: Results whose 'best_score' is not -1, in the order the
        tasks completed. Rows whose task raised are reported with a printed
        message and skipped.
    """
    kept = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # One task per LLM output row.
        pending = [
            pool.submit(evaluate_single_output, row, ground_truth_df)
            for _, row in llm_output_df.iterrows()
        ]

        # The progress bar advances once per finished task, kept or not.
        with tqdm(total=len(llm_output_df), desc="Evaluating relationships") as progress:
            for finished in concurrent.futures.as_completed(pending):
                try:
                    outcome = finished.result()
                    if outcome['best_score'] != -1:
                        kept.append(outcome)
                except Exception as exc:
                    print(f"Error processing row: {exc}")
                progress.update(1)

    return kept

# Per-file average scores, appended in the order the files are evaluated below.
final_evals = []

# Hand annotations plus PDF text form the ground-truth statement list.
hand = pd.read_csv("../Results/ground_truth.csv")
ground_truth = read_files("../Docs", hand)

# Result files located under ../Results.
# NOTE(review): "Temperature1_WithoutExamples.csv" appears twice; confirm
# whether one of the entries was meant to name a different file.
rel_docs = [
    "Temperature0point2.csv",
    "Temperature1_WithExamples.csv",
    "Temperature1_WithoutExamples.csv",
    "Temperature1_WithoutExamples.csv",
    'NewRels_Skip1_Temperature1.csv',
]

# Only the last entry of rel_docs is evaluated.
for doc in rel_docs[-1:]:
    doc_ = f'../Results/{doc}'
    llm_output = pd.read_csv(doc_)
    print(f"Processing {len(llm_output)} LLM outputs against {len(ground_truth)} ground truth statements...")

    # Score every LLM output against the ground truth.
    evaluations = evaluate_all_outputs(llm_output, ground_truth, max_workers=10)

    # Average of the scored results; 0 when nothing was scored.
    valid_scores = [entry['best_score'] for entry in evaluations if entry['best_score'] != -1]
    if valid_scores:
        average_score = np.mean(valid_scores)
    else:
        average_score = 0

    # Report.
    print(f"\nOverall Average Score {doc_}: {average_score:.2f}")
    print(f"Processed {len(evaluations)} evaluations")

    # Keep the average and show a preview of the per-row results.
    final_evals.append(average_score)
    results_df = pd.DataFrame(evaluations)
    print("\nSummary DataFrame:")
    print(results_df.head())

    # Persist the per-row results next to the notebook.
    results_df.to_csv(f'GPT_critic_eval_for_{doc}', index=False)

Processing 314 LLM outputs against 1412 ground truth statements...


Evaluating relationships:   0%|                         | 0/314 [00:02<?, ?it/s]


In [7]:
# final_evals gets one entry per file processed by the evaluation loop, and that
# loop iterates a trailing slice of rel_docs (rel_docs[-1:]). The scores therefore
# belong to the last len(final_evals) entries of rel_docs, not the first ones.
evaluated_docs = rel_docs[len(rel_docs) - len(final_evals):]
for doc_name, avg_score in zip(evaluated_docs, final_evals):
    print(f"Accuracy for {doc_name}: {avg_score:.2f}")

Accuracy for NewRels_Skip2_cummulative.csv: 0.72
Accuracy for NewRels_Skip2_increments.csv: 0.67
Accuracy for NewRels_Skip3_cummulative.csv: 0.72
Accuracy for NewRels_Skip3_increments.csv: 0.68
Accuracy for NewRels_Skip4_increments.csv: 0.65
Accuracy for Temperature0point2.csv: 0.76
Accuracy for Temperature1_WithExamples.csv: 0.74
Accuracy for Temperature1_WithoutExamples_cleaned.csv: 0.76
Accuracy for Temperature1_WithoutExamples.csv: 0.76


In [49]:
def read_pdf(pdf_file):
    """Extract lower-cased text fragments from a PDF, skipping front matter.

    The plain text of each page is lower-cased and split on the literal
    separator ". ". Fragments are discarded until the first one containing
    'abstract' or 'intro' is seen; that fragment (in full) and every later
    fragment, on this and all following pages, are kept.

    Args:
        pdf_file: Path of the PDF, passed to ``fitz.open``.

    Returns:
        list[str]: The kept fragments in document order; empty when neither
        marker occurs anywhere in the document.
    """
    sentences = []
    started = False
    # Bind the opened document to its own name rather than rebinding the
    # ``pdf_file`` argument.
    with fitz.open(pdf_file) as document:
        for page in document:
            for fragment in page.get_text("text").lower().split(". "):
                if not started and ('abstract' in fragment or 'intro' in fragment):
                    started = True
                if started:
                    sentences.append(fragment)
    return sentences
def read_files(root_dir, hand):
    """Gather reference statements from PDFs and hand annotations.

    Args:
        root_dir: Directory to scan; entries whose name ends in '.pdf' are
            read with ``read_pdf``.
        hand: DataFrame providing 'subj', 'rel' and 'obj' columns.

    Returns:
        list[str]: PDF fragments in ``os.listdir`` order, followed by one
        "subj rel obj" string per row of ``hand``.
    """
    lines = []

    for entry in os.listdir(root_dir):
        if entry.endswith('.pdf'):
            lines.extend(read_pdf(f"{root_dir}/{entry}"))

    # Append the hand annotations after the PDF text.
    for _, annotation in hand.iterrows():
        rel = annotation['rel']
        subj = annotation['subj']
        obj = annotation['obj']
        lines.append(f"{subj} {rel} {obj}")

    return lines
# Reload the hand annotations and rebuild the ground-truth list with the
# read_pdf / read_files definitions from this cell.
hand = pd.read_csv("../Results/ground_truth.csv")
ground_truth = read_files(root_dir="../Docs", hand=hand)

In [None]:
# final_evals gets one entry per file processed by the evaluation loop, and that
# loop iterates a trailing slice of rel_docs (rel_docs[-1:]). The scores therefore
# belong to the last len(final_evals) entries of rel_docs, not the first ones.
evaluated_docs = rel_docs[len(rel_docs) - len(final_evals):]
for doc_name, avg_score in zip(evaluated_docs, final_evals):
    print(f"Accuracy for {doc_name}: {avg_score:.2f}")

In [None]:
Overall Average Score ../Results/NewRels_Skip2_increments.csv: 0.89
Overall Average Score ../Results/NewRels_Skip3_increments.csv: 0.86
Overall Average Score ../Results/NewRels_Skip4_increments.csv: 0.87
Overall Average Score ../Results/Temperature1_WithoutExamples.csv: 0.90
Overall Average Score ../Results/Temperature0point2.csv: 0.91