# GPT Critic

In [3]:
import concurrent.futures
import re
from openai import OpenAI
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import fitz
# Initialize OpenAI client
client = OpenAI(api_key='')
def read_pdf(pdf_file):
    start=False
    sentences=[]
    start_idx=0
    with fitz.open(pdf_file) as pdf_file:
        for page_index, page in enumerate(pdf_file):
            text = page.get_text("text").lower()
            text=text.split(". ")
            sentences.extend(text)
                
    return sentences
def read_files(root_dir, hand):
    
    lines=[]
    for files in os.listdir(root_dir):
        if files[-4:] != '.pdf':
            continue
        sentences = read_pdf(f"{root_dir}/{files}")
        lines.extend(sentences)

    # read in hand annotations
    for p in hand.iterrows():
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}" 
        lines.append(out)


    return lines

def extract_score(evaluation):
    """Extract numerical score from GPT evaluation"""
    match = re.search(r'(?:Overall correctness score|Score|I would score the LLM output a)[:\s]*([0-1](?:\.\d+)?)', evaluation, re.IGNORECASE)
    return float(match.group(1)) if match else -1

def is_relevant(ground_truth, llm_output):
    """Check if LLM output is relevant to ground truth"""
    gt_terms = set(ground_truth.lower().split())
    llm_terms = set(llm_output.lower().split())
    return len(gt_terms.intersection(llm_terms)) > 1

def get_critic_gpt_evaluation(llm_text, gt_text):
    """Get evaluation from GPT for a single comparison"""
    try:
        prompt = f"""
        Compare these statements:
        Ground Truth: {gt_text}
        LLM Output: {llm_text}
        
        Score from 0 to 1 for overall correctness (1 being highly correct). Format: 'Score: X.XX'
        Brief explanation (one sentence).
        """
        
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return None

def evaluate_single_output(llm_row, ground_truth):
    """Evaluate a single LLM output against all ground truth entries"""
    llm_text = f"{llm_row['subj']} {llm_row['rel']} {llm_row['obj']}"
    best_score = -1
    best_evaluation = None
    best_gt_row = None
    
    for gt in ground_truth:
        if not is_relevant(gt, llm_text):
            continue
            
        evaluation = get_critic_gpt_evaluation(llm_text, gt)
        
        if evaluation:
            score = extract_score(evaluation)
            if score > best_score:
                best_score = score
                best_evaluation = evaluation
                best_gt_row = gt
    
    return {
        'llm_output': llm_text,
        'best_matching_ground_truth': best_gt_row,
        'best_evaluation': best_evaluation,
        'best_score': best_score
    }

def evaluate_all_outputs(llm_output_df, ground_truth_df, max_workers=10):
    """Process all LLM outputs in parallel"""
    all_evaluations = []
    total = len(llm_output_df)
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create future tasks
        future_to_row = {
            executor.submit(evaluate_single_output, row, ground_truth_df): i 
            for i, (_, row) in enumerate(llm_output_df.iterrows())
        }
        
        # Process results with progress bar
        with tqdm(total=total, desc="Evaluating relationships") as pbar:
            for future in concurrent.futures.as_completed(future_to_row):
                try:
                    result = future.result()
                    if result['best_score'] != -1:
                        all_evaluations.append(result)
                except Exception as e:
                    print(f"Error processing row: {e}")
                pbar.update(1)
    
    return all_evaluations

# Load data
final_evals = []
hand = pd.read_csv("../Results/ground_truth.csv")
ground_truth = read_files("../Docs",hand)
rel_docs = ["Temperature0point2.csv", 
            "Temperature1_WithExamples.csv", "Temperature1_WithoutExamples.csv", 
            "Temperature1_WithoutExamples.csv"]

for doc in rel_docs:
    doc_ = f'../Results/{doc}'
    llm_output = pd.read_csv(doc_)
    print(f"Processing {len(llm_output)} LLM outputs against {len(ground_truth)} ground truth statements...")

    # Run evaluation
    evaluations = evaluate_all_outputs(
        llm_output_df=llm_output,
        ground_truth_df=ground_truth,
        max_workers=10
    )

    # Calculate average score
    valid_scores = [eval['best_score'] for eval in evaluations if eval['best_score'] != -1]
    average_score = np.mean(valid_scores) if valid_scores else 0
    
    # Print results
    print(f"\nOverall Average Score {doc_}: {average_score:.2f}")
    print(f"Processed {len(evaluations)} evaluations")
    # Save results to DataFrame
    final_evals.append(average_score)
    results_df = pd.DataFrame(evaluations)
    print("\nSummary DataFrame:")
    print(results_df.head())
    
    # Save results
    results_df.to_csv(f'GPT_critic_eval_for_{doc}', index=False)

Processing 109 LLM outputs against 1412 ground truth statements...


Evaluating relationships: 100%|███████████████| 109/109 [36:35<00:00, 20.14s/it]



Overall Average Score ../Results/Temperature0point2.csv: 0.89
Processed 109 evaluations

Summary DataFrame:
                                          llm_output  \
0                  Problem-solve therapy synonym PST   
1  VNS Usage treatment-resistant unilateral or bi...   
2  Anxious Depression side effects low remission ...   
3  Ketamine Highly effective for treatment-resist...   
4          SSRIs More commonly used Major Depression   

                          best_matching_ground_truth  \
0  despite its small effect sizes, pst is compara...   
1  possible neurotoxicity and \ndrug dependence\n...   
2  for both endpoints—remis-\nsion as defined by ...   
3  side effects and adverse events in patients in...   
4  we used response \nrate instead of a continuou...   

                                     best_evaluation  best_score  
0  Score: 0.50\nThe LLM output correctly identifi...        0.50  
1  Score: 0.75\nThe LLM output is somewhat relate...        0.75  
2  Score: 1.00\n

Evaluating relationships: 100%|███████████████| 97/97 [1:12:44<00:00, 45.00s/it]



Overall Average Score ../Results/Temperature1_WithExamples.csv: 0.91
Processed 97 evaluations

Summary DataFrame:
                                          llm_output  \
0  Anxious Depression side effects low remission ...   
1  Anxious Depression side effects low remission ...   
2  Anxious Depression side effects low response r...   
3  Anxious Depression side effects low response r...   
4                Physical exercise treats Depression   

                          best_matching_ground_truth  \
0  although these data\nand the relatively linear...   
1  ham-d anxiety/somatization factor score versus...   
2  notably, the num-\nber of hospitalizations for...   
3  side effects and adverse events in patients in...   
4  Physical Exercise Finding an adjunct to other ...   

                                     best_evaluation  best_score  
0  Score: 0.85\nThe LLM output captures the essen...        0.85  
1  Score: 1.00\nThe LLM output accurately summari...        1.00  
2  Score: 

Evaluating relationships: 100%|███████████████| 116/116 [40:46<00:00, 21.09s/it]



Overall Average Score ../Results/Temperature1_WithoutExamples.csv: 0.91
Processed 116 evaluations

Summary DataFrame:
                                          llm_output  \
0        Electroconvulsive therapy Treats Depression   
1  Vagus Nerve Stimulation Treats resistant unila...   
2  Ketamine Side effects dizziness, neurotoxicity...   
3  SSRIs More commonly used Major Depressive Diso...   
4  Monoamine Oxidase Inhibitors Less commonly use...   

                          best_matching_ground_truth  \
0  combining medications, psychotherapy, and \nso...   
1  a treatment manual, initial didactic instructi...   
2  Ketamine Side effects dizziness, neurotoxicity...   
3  therefore, most \nguidelines currently recomme...   
4  wjcc\nhttps://www.wjgnet.com\n9350\nnovember 6...   

                                     best_evaluation  best_score  
0  Score: 0.75\nThe LLM output accurately identif...        0.75  
1  Score: 0.80\nThe LLM output accurately summari...        0.80  
2  Sco

Evaluating relationships: 100%|███████████████| 116/116 [40:24<00:00, 20.91s/it]


Overall Average Score ../Results/Temperature1_WithoutExamples.csv: 0.91
Processed 116 evaluations

Summary DataFrame:
                                          llm_output  \
0        Electroconvulsive therapy Treats Depression   
1  Vagus Nerve Stimulation Treats resistant unila...   
2  Ketamine Side effects dizziness, neurotoxicity...   
3  SSRIs More commonly used Major Depressive Diso...   
4  Monoamine Oxidase Inhibitors Less commonly use...   

                          best_matching_ground_truth  \
0  electrocon-\nvulsive therapy is the most effec...   
1  tdcs, as a \nrelatively simple and portable te...   
2  Ketamine Side effects dizziness, neurotoxicity...   
3  we used response \nrate instead of a continuou...   
4  http://ctr.gsk.co.uk/summary/paroxetine/iii_\n...   

                                     best_evaluation  best_score  
0  Score: 0.80\nThe LLM output captures the main ...        0.80  
1  Score: 0.80\nThe LLM output captures the main ...        0.80  
2  Sco




In [7]:
for i in range(len(final_evals)):
    print(f"Accuracy for {rel_docs[i]}: {final_evals[i]:.2f}")

Accuracy for NewRels_Skip2_cummulative.csv: 0.72
Accuracy for NewRels_Skip2_increments.csv: 0.67
Accuracy for NewRels_Skip3_cummulative.csv: 0.72
Accuracy for NewRels_Skip3_increments.csv: 0.68
Accuracy for NewRels_Skip4_increments.csv: 0.65
Accuracy for Temperature0point2.csv: 0.76
Accuracy for Temperature1_WithExamples.csv: 0.74
Accuracy for Temperature1_WithoutExamples_cleaned.csv: 0.76
Accuracy for Temperature1_WithoutExamples.csv: 0.76


In [31]:
t="""d’s anxiety escalated over the next few 
days, and a benzodiazepine was prescribed, which 
brought about a signiﬁcant improvement in her insomnia 
and agitation"""
p='benzodiazepine treats insomnia and agitation'
def vec(sentences):
    # Encode sentences
    embeddings = model.encode([sentences[0], sentences[1]])
    
    # Compute cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    return similarity.item() # Value close to 1 indicates high similarity
import fitz,os
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

# Initialize Python porter stemmer
ps = PorterStemmer()
def lemmatize(sent):
    return [ps.stem(word) for word in sent.split()]
pred=" ".join(lemmatize(p.lower()))
test=" ".join(lemmatize(t))
cos = vec([t,p])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rishikasrinivas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
def read_pdf(pdf_file):
    start=False
    sentences=[]
    start_idx=0
    with fitz.open(pdf_file) as pdf_file:
        for page_index, page in enumerate(pdf_file):
            text = page.get_text("text").lower()
            text=text.split(". ")
            for sub in text:
                if 'abstract' in sub or 'intro' in sub:
                    start=True
                    if 'abstract' in sub:
                        sub.index('abstract')
                    else:
                        sub.index('intro')
                        
                if start:
                    sentences.append(sub)
                
    return sentences
def read_files(root_dir, hand):
    
    lines=[]
    for files in os.listdir(root_dir):
        if files[-4:] != '.pdf':
            continue
        sentences = read_pdf(f"{root_dir}/{files}")
        lines.extend(sentences)

    # read in hand annotations
    for p in hand.iterrows():
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}" 
        lines.append(out)


    return lines
hand = pd.read_csv("../Results/ground_truth.csv")
ground_truth = read_files("../Docs",hand)

In [52]:
for i in ground_truth:
    if 'serotonin' in i:
        print(i)


pharmacotherapy, especially selective 
serotonin reuptake inhibitors antidepressants, remains the most frequent option 
for treating depression during the acute phase, while other promising pharmaco-
logical options are still competing for the attention of practitioners
in this article, we discuss 
various treatment options implemented by clinicians, highlighting the role that each 
option plays in actual psychiatric practice.
pharmacotherapy
while selective serotonin reuptake inhibitors (ssris) remain the gold-standard 
treatment for depression, new antidepressants are always being developed and tested

maois’ effectiveness is still unclear for treating depression in patients who are 
resistant to multiple sequential trials with ssris and serotonin-norepinephrine 
reuptake inhibitors (snris)[14]
therefore, most 
guidelines currently recommend ssris as the first-line treatment for patients with 
major depression[25].
norepinephrine reuptake inhibitors
other monoamine (norepinephrine, s

In [None]:
for i in range(len(final_evals)):
    print(f"Accuracy for {rel_docs[i]}: {final_evals[i]:.2f}")

In [None]:
Overall Average Score ../Results/NewRels_Skip2_increments.csv: 0.89
Overall Average Score ../Results/NewRels_Skip3_increments.csv: 0.86
Overall Average Score ../Results/NewRels_Skip4_increments.csv: 0.87
Overall Average Score ../Results/Temperature1_WithoutExamples.csv: 0.90
Overall Average Score ../Results/Temperature0point2.csv: 0.91