# Hallucination Evaluation

In [3]:
# Install necessary libraries
import pandas as pd
import os
import time
from tqdm import tqdm
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
import logging
import warnings
import deepeval
import nest_asyncio

In [None]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set your API keys
PUBMED_API_KEY = ""
OPENAI_API_KEY = ""  # Replace with your OpenAI API key
EMAIL = "KymariBratton@Gmail.com"

# Set up APIs
os.environ["PUBMED_API_KEY"] = PUBMED_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
Entrez.email = EMAIL
Entrez.api_key = PUBMED_API_KEY



def get_pubmed_context(subject, relationship, object_term):
    """Get PubMed context with improved search and error handling"""
    contexts = []
    try:
        # Construct search query
        query = f'("{subject}") AND ("{object_term}")'
        
        handle = Entrez.esearch(db="pubmed", term=query, retmax=3)
        record = Entrez.read(handle)
        handle.close()
        
        pubmed_ids = record.get("IdList", [])
        
        for pmid in pubmed_ids:
            try:
                handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
                article = Entrez.read(handle)['PubmedArticle'][0]
                handle.close()
                
                # Get both title and abstract
                article_data = article['MedlineCitation']['Article']
                title = article_data.get('ArticleTitle', '')
                abstract = article_data.get('Abstract', {}).get('AbstractText', [''])[0]
                
                if abstract:
                    context = f"PubMed ID {pmid} - Title: {title}\nAbstract: {abstract[:500]}"
                    contexts.append(context)
                
                time.sleep(0.34)  # Respect PubMed rate limits
            except Exception as e:
                logger.warning(f"Error fetching PMID {pmid}: {str(e)}")
                continue
                
        return contexts if contexts else ["No relevant PubMed context found"]
    
    except Exception as e:
        logger.error(f"PubMed search error: {str(e)}")
        return ["Error retrieving PubMed context"]

def create_and_evaluate_test_cases(annotations_file, new_rels_file):
    """Create and evaluate test cases with proper error handling"""
    try:
        # Load data
        annotations_df = pd.read_csv(annotations_file)
        newrels_df = pd.read_csv(new_rels_file)
        
        test_cases = []
        logger.info("Creating test cases...")
        
        # Create test cases
        for _, gt_row in tqdm(annotations_df.iterrows(), total=len(annotations_df)):
            # Find matching relationships
            matching_rels = newrels_df[
                (newrels_df['subj'].str.lower() == gt_row['subj'].lower()) |
                (newrels_df['subjSummary'].str.lower() == gt_row['subj'].lower())
            ]
            
            if matching_rels.empty:
                continue
                
            # Get PubMed context
            context = get_pubmed_context(
                gt_row['subj'],
                gt_row['rel'],
                gt_row['obj']
            )
            
            # Create test case for each matching relationship
            for _, gen_row in matching_rels.iterrows():
                test_case = LLMTestCase(
                    input=f"Evaluate the relationship between {gt_row['subj']} and {gt_row['obj']}",
                    actual_output=f"{gen_row['subj']} {gen_row['rel']} {gen_row['obj']}",
                    expected_output=f"{gt_row['subj']} {gt_row['rel']} {gt_row['obj']}",
                    context=context
                )
                test_cases.append(test_case)
        
        if not test_cases:
            logger.error("No test cases were created!")
            return
            
        logger.info(f"Created {len(test_cases)} test cases")
        
        # Set up hallucination metric
        hallucination_metric = HallucinationMetric(
            threshold=0.5,
            model="gpt-4-turbo"
        )
        
        # Evaluate test cases
        logger.info("Evaluating test cases...")
        evaluation_results = evaluate(test_cases, [hallucination_metric])
        
        # Process results
        scores = []
        for result in evaluation_results:
            if hasattr(result, 'metrics') and 'HallucinationMetric' in result.metrics:
                score = result.metrics['HallucinationMetric'].score
                scores.append(score)
        
        if not scores:
            logger.error("No valid scores were generated!")
            return
            
        # Calculate and print statistics
        total_cases = len(scores)
        avg_score = sum(scores) / total_cases
        low_hallucination = sum(1 for score in scores if score < 0.5)
        high_hallucination = sum(1 for score in scores if score >= 0.5)
        
        print("\n=== Hallucination Evaluation Results ===")
        print(f"Total Test Cases Evaluated: {total_cases}")
        print(f"Average Hallucination Score: {avg_score:.3f}")
        print(f"Low Hallucination Cases (<0.5): {low_hallucination} ({(low_hallucination/total_cases)*100:.1f}%)")
        print(f"High Hallucination Cases (≥0.5): {high_hallucination} ({(high_hallucination/total_cases)*100:.1f}%)")
        
        return scores
        
    except Exception as e:
        logger.error(f"Error in evaluation process: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        scores = create_and_evaluate_test_cases(
            annotations_file='Annotations.csv',
            new_rels_file='NewRels_Skip2.csv'
        )
        
        if not scores:
            print("No valid results were generated. Please check the logs for details.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [38]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def create_and_evaluate_test_cases(annotations_file, new_rels_file, docs_path):
    """Create and evaluate test cases with local document context"""
    try:
        # Load data
        annotations_df = pd.read_csv(annotations_file)
        newrels_df = pd.read_csv(new_rels_file)
        
        test_cases = []
        logger.info("Creating test cases...")
        
        # Create test cases
        for _, gt_row in tqdm(annotations_df.iterrows(), total=len(annotations_df)):
            # Find matching relationships
            matching_rels = newrels_df[
                (newrels_df['subj'].str.lower() == gt_row['subj'].lower()) |
                (newrels_df['subjSummary'].str.lower() == gt_row['subj'].lower())
            ]
            
            if matching_rels.empty:
                continue
                
            # Get local document context
            context = get_local_document_context(
                gt_row['subj'],
                gt_row['rel'],
                gt_row['obj'],
                docs_path
            )
            
            # Create test case for each matching relationship
            for _, gen_row in matching_rels.iterrows():
                test_case = LLMTestCase(
                    input=f"Evaluate the relationship between {gt_row['subj']} and {gt_row['obj']}",
                    actual_output=f"{gen_row['subj']} {gen_row['rel']} {gen_row['obj']}",
                    expected_output=f"{gt_row['subj']} {gt_row['rel']} {gt_row['obj']}",
                    context=context
                )
                test_cases.append(test_case)
        
        if not test_cases:
            logger.warning("No test cases were created!")
            return
            
        logger.info(f"Created {len(test_cases)} test cases")
        
        # Set up hallucination metric
        hallucination_metric = HallucinationMetric(
            threshold=0.5,
            model="gpt-4-turbo"
        )
        
        # Evaluate test cases
        logger.info("Evaluating test cases...")
        evaluation_result = evaluate(test_cases, [hallucination_metric])
        
        # Debug evaluation attributes only in DEBUG mode
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Evaluation result type: {type(evaluation_result)}")
            logger.debug(f"Evaluation result attributes: {dir(evaluation_result)}")
        
        # Process results
        scores = []
        # Check if the result has results attribute
        if hasattr(evaluation_result, 'results'):
            for single_result in evaluation_result.results:
                if hasattr(single_result, 'metrics'):
                    metric = single_result.metrics.get('HallucinationMetric')
                    if metric and hasattr(metric, 'score'):
                        scores.append(metric.score)
        
        if not scores:
            logger.warning("No valid scores were generated!")
            return
            
        # Calculate and print statistics
        total_cases = len(scores)
        avg_score = sum(scores) / total_cases
        low_hallucination = sum(1 for score in scores if score < 0.5)
        high_hallucination = sum(1 for score in scores if score >= 0.5)
        
        print("\n=== Hallucination Evaluation Results ===")
        print(f"Total Test Cases Evaluated: {total_cases}")
        print(f"Average Hallucination Score: {avg_score:.3f}")
        print(f"Low Hallucination Cases (<0.5): {low_hallucination} ({(low_hallucination/total_cases)*100:.1f}%)")
        print(f"High Hallucination Cases (≥0.5): {high_hallucination} ({(high_hallucination/total_cases)*100:.1f}%)")
        
        return scores
        
    except Exception as e:
        logger.error(f"Error in evaluation process: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        docs_path = "Docs/"
        
        scores = create_and_evaluate_test_cases(
            annotations_file='Annotations.csv',
            new_rels_file='NewRels_Skip2.csv',
            docs_path=docs_path
        )
        
        if not scores:
            print("No valid results were generated. Please check the logs for details.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")


2024-11-21 12:13:07,555 - INFO - Creating test cases...
100%|████████████████████████████████████████| 176/176 [00:00<00:00, 714.04it/s]
2024-11-21 12:13:07,804 - INFO - Created 224 test cases
2024-11-21 12:13:07,877 - INFO - Evaluating test cases...


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 224 test case(s) in parallel: | |  0% (0/224) [Time Taken: 00:00, ?te2024-11-21 12:13:12,691 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:12,859 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:13,471 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:13,634 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:13,636 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:13,637 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:13,638 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13:13,640 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 12:13



Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4-turbo, reason: The score is 0.00 because the actual output perfectly aligns with the provided context and there are no contradictions., error: None)

For test case:

  - input: Evaluate the relationship between SSRIs and Pharmacotherapy
  - actual output: Selective Serotonin Reuptake Inhibitors (SSRIs) first-line treatment for patients with major depression
  - expected output: SSRIs Is a part of Pharmacotherapy
  - context: ['File: WJCC.txt\nContext: te phase of a major depressive episode aims to help the patient reach a remission state and eventually return to their baseline level of functioning. pharmacotherapy, especially selective serotonin reuptake inhibitors antidepressants, remains the most frequent option for treating depression during the acute phase, while other promising pharmaco- logical options are still competing for the attention of practitioners. depression- focus



No valid results were generated. Please check the logs for details.


# Deep Eval

In [None]:
import fitz,os
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

# Initialize Python porter stemmer
ps = PorterStemmer()
def lemmatize(sent):
    return [ps.stem(word) for word in sent.split()]
#reading the pdf and hand annotation files
def read_pdf(pdf_file):
    start=False
    sentences=[]
    start_idx=0
    with fitz.open(pdf_file) as pdf_file:
        for page_index, page in enumerate(pdf_file):
            text = page.get_text("text").lower()
            text=text.split(". ")
            sentences.extend(text)
                
    return sentences
def read_files(root_dir, hand):
    
    lines=[]
    for files in os.listdir(root_dir):
        if files[-4:] != '.pdf':
            continue
        sentences = read_pdf(f"{root_dir}/{files}")
        lines.extend(sentences)

    # read in hand annotations
    for p in hand.iterrows():
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}" 
        lines.append(out)


    return lines

#computing cosine similarity
def vec(sentences):
    # Encode sentences
    embeddings = model.encode([sentences[0], sentences[1]])
    
    # Compute cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    return similarity.item() # Value close to 1 indicates high similarity
    
#finding if the target string (relation triplet) is in the src (pdf + hand annotation)
def find(target, src):
    found=False
    matching_sentence=""
 
    for idx,sentence in enumerate(src):
        pred=" ".join(lemmatize(target))
        test=" ".join(lemmatize(sentence))
        cos = vec([pred,test])
        if pred in test or cos > 0.7:
            if cos >0.65 and cos < 0.7:
                print(f"Got a match for {pred }: {sentence}")
            elif cos <=0.65:
                print(f"Closest match to {pred} was {test}")
            found=True
            st_idx=idx
            matching_sentence=sentence
            return found, matching_sentence, 
            
    return found, matching_sentence


In [None]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
import pandas as pd
pred_files = ["../Results/NewRels_Skip3_PassingInIncrements.csv", "../Results/NewRels_Skip4_increments.csv"]
hand = pd.read_csv("../Results/ground_truth.csv")
sentences = read_files("../Docs",hand)
def hallucination_test(actual_output, context, extractions):
    sub, obj = extractions[0], extractions[1]
    test_case = LLMTestCase(
        input=f"What is the relationship between {sub} and {obj}",
        actual_output=actual_output,
        context=context
    )
    metric = HallucinationMetric(threshold=0.5)

    metric.measure(test_case)
    return metric.score



def hallucination(sentences, predictions_file):
    predictions = pd.read_csv(predictions_file)
    total_halluncination_score = 0
    for p in predictions.iterrows():
        ref = p[1]['ref']
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}"
        
        found, match = find(out, sentences)
        if found:
            idx = sentences.index(match)
        else:
            continue
        sentence_window = [sentences[i] for i in range(idx-2, idx+2)]
        halluncination_score = hallucination_test(out,context=sentence_window, extractions = [subj, obj])
        total_halluncination_score += halluncination_score
    print(f"Average hallucination score for predictions_file: {predictions_file} is {total_halluncination_score/len(predictions)}")

for pred_file in pred_files:
    avg_score = hallucination(sentences, pred_file)
