# Hallucination Evaluation

In [1]:
# Install necessary libraries
!pip install deepeval
!pip install PyMuPDF
!pip install sentence-transformers
!pip install numpy==1.19.5 sentence-transformers 
import pandas as pd
import os
import time
from tqdm import tqdm
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
import logging
import warnings
import deepeval
import nest_asyncio

Collecting grpcio~=1.63.0 (from deepeval)
  Using cached grpcio-1.63.2-cp312-cp312-macosx_10_9_universal2.whl.metadata (3.2 kB)
Collecting protobuf (from deepeval)
  Using cached protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Using cached grpcio-1.63.2-cp312-cp312-macosx_10_9_universal2.whl (10.1 MB)
Using cached protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl (394 kB)
Installing collected packages: protobuf, grpcio
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.0
    Uninstalling protobuf-5.29.0:
      Successfully uninstalled protobuf-5.29.0
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.67.1
    Uninstalling grpcio-1.67.1:
      Successfully uninstalled grpcio-1.67.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.68.0 requires grpcio>=1.68.0, but 



In [None]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set your API keys
PUBMED_API_KEY = ""
OPENAI_API_KEY = ""  # Replace with your OpenAI API key
EMAIL = "KymariBratton@Gmail.com"

# Set up APIs
os.environ["PUBMED_API_KEY"] = PUBMED_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
Entrez.email = EMAIL
Entrez.api_key = PUBMED_API_KEY



def create_and_evaluate_test_cases(senteces, new_rels_file):
    """Create and evaluate test cases with proper error handling"""
    try:
        # Load data
        newrels_df = pd.read_csv(new_rels_file)
        
        test_cases = []
        logger.info("Creating test cases...")
        
        # Create test cases
        for _, gt_row in tqdm(annotations_df.iterrows(), total=len(annotations_df)):
            # Find matching relationships
            matching_rels = newrels_df[
                (newrels_df['subj'].str.lower() == gt_row['subj'].lower()) |
                (newrels_df['subjSummary'].str.lower() == gt_row['subj'].lower())
            ]
            
            if matching_rels.empty:
                continue
                
            # Get PubMed context
            context = get_pubmed_context(
                gt_row['subj'],
                gt_row['rel'],
                gt_row['obj']
            )
            
            # Create test case for each matching relationship
            for _, gen_row in matching_rels.iterrows():
                test_case = LLMTestCase(
                    input=f"Evaluate the relationship between {gt_row['subj']} and {gt_row['obj']}",
                    actual_output=f"{gen_row['subj']} {gen_row['rel']} {gen_row['obj']}",
                    expected_output=f"{gt_row['subj']} {gt_row['rel']} {gt_row['obj']}",
                    context=context
                )
                test_cases.append(test_case)
        
        if not test_cases:
            logger.error("No test cases were created!")
            return
            
        logger.info(f"Created {len(test_cases)} test cases")
        
        # Set up hallucination metric
        hallucination_metric = HallucinationMetric(
            threshold=0.5,
            model="gpt-4-turbo"
        )
        
        # Evaluate test cases
        logger.info("Evaluating test cases...")
        evaluation_results = evaluate(test_cases, [hallucination_metric])
        
        # Process results
        scores = []
        for result in evaluation_results:
            if hasattr(result, 'metrics') and 'HallucinationMetric' in result.metrics:
                score = result.metrics['HallucinationMetric'].score
                scores.append(score)
        
        if not scores:
            logger.error("No valid scores were generated!")
            return
            
        # Calculate and print statistics
        total_cases = len(scores)
        avg_score = sum(scores) / total_cases
        low_hallucination = sum(1 for score in scores if score < 0.5)
        high_hallucination = sum(1 for score in scores if score >= 0.5)
        
        print("\n=== Hallucination Evaluation Results ===")
        print(f"Total Test Cases Evaluated: {total_cases}")
        print(f"Average Hallucination Score: {avg_score:.3f}")
        print(f"Low Hallucination Cases (<0.5): {low_hallucination} ({(low_hallucination/total_cases)*100:.1f}%)")
        print(f"High Hallucination Cases (≥0.5): {high_hallucination} ({(high_hallucination/total_cases)*100:.1f}%)")
        
        return scores
        
    except Exception as e:
        logger.error(f"Error in evaluation process: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        scores = create_and_evaluate_test_cases(
            annotations_file='Annotations.csv',
            new_rels_file='NewRels_Skip2.csv'
        )
        
        if not scores:
            print("No valid results were generated. Please check the logs for details.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [4]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def create_and_evaluate_test_cases(sentences, predictions, docs_path):
    """Create and evaluate test cases with local document context"""
    try:
        # Load data
        
        test_cases = []
        logger.info("Creating test cases...")
        
        for p in predictions.iterrows():
    
            ref = p[1]['ref']
            rel = p[1]['rel']
            subj = p[1]['subj']
            obj = p[1]['obj']
            out=f"{subj} {rel} {obj}"
            
            found, match = find(out, sentences)
            if found:
                idx = sentences.index(match)
            else:
                continue
            sentence_window = [sentences[i] for i in range(idx-2, idx+2)]
            halluncination_score = hallucination_test(out,context=sentence_window, extractions = [subj, obj])
            total_halluncination_score += halluncination_score

            
            # Create test case for each matching relationship
            for _, gen_row in matching_rels.iterrows():
                test_case = LLMTestCase(
                    input=f"Evaluate the relationship between {gt_row['subj']} and {gt_row['obj']}",
                    actual_output=f"{gen_row['subj']} {gen_row['rel']} {gen_row['obj']}",
                    expected_output=f"{gt_row['subj']} {gt_row['rel']} {gt_row['obj']}",
                    context=context
                )
                test_cases.append(test_case)
        
        if not test_cases:
            logger.warning("No test cases were created!")
            return
            
        logger.info(f"Created {len(test_cases)} test cases")
        
        # Set up hallucination metric
        hallucination_metric = HallucinationMetric(
            threshold=0.5,
            model="gpt-4-turbo"
        )
        
        # Evaluate test cases
        logger.info("Evaluating test cases...")
        evaluation_result = evaluate(test_cases, [hallucination_metric])
        
        # Debug evaluation attributes only in DEBUG mode
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Evaluation result type: {type(evaluation_result)}")
            logger.debug(f"Evaluation result attributes: {dir(evaluation_result)}")
        
        # Process results
        scores = []
        # Check if the result has results attribute
        if hasattr(evaluation_result, 'results'):
            for single_result in evaluation_result.results:
                if hasattr(single_result, 'metrics'):
                    metric = single_result.metrics.get('HallucinationMetric')
                    if metric and hasattr(metric, 'score'):
                        scores.append(metric.score)
        
        if not scores:
            logger.warning("No valid scores were generated!")
            return
            
        # Calculate and print statistics
        total_cases = len(scores)
        avg_score = sum(scores) / total_cases
        low_hallucination = sum(1 for score in scores if score < 0.5)
        high_hallucination = sum(1 for score in scores if score >= 0.5)
        
        print("\n=== Hallucination Evaluation Results ===")
        print(f"Total Test Cases Evaluated: {total_cases}")
        print(f"Average Hallucination Score: {avg_score:.3f}")
        print(f"Low Hallucination Cases (<0.5): {low_hallucination} ({(low_hallucination/total_cases)*100:.1f}%)")
        print(f"High Hallucination Cases (≥0.5): {high_hallucination} ({(high_hallucination/total_cases)*100:.1f}%)")
        
        return scores
        
    except Exception as e:
        logger.error(f"Error in evaluation process: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        docs_path = "Docs/"
        
        scores = create_and_evaluate_test_cases(
            sentences,
            new_rels_file='NewRels_Skip2.csv',
            docs_path=docs_path
        )
        
        if not scores:
            print("No valid results were generated. Please check the logs for details.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")


An error occurred: name 'sentences' is not defined


# Deep Eval

In [30]:
!pip install numpy sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import fitz,os
import numpy as np
import deepeval
# Load a pre-trained model'
OPENAI_API_KEY=""
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

# Initialize Python porter stemmer
ps = PorterStemmer()
def lemmatize(sent):
    return [ps.stem(word) for word in sent.split()]
#reading the pdf and hand annotation files
def read_pdf(pdf_file):
    start=False
    sentences=[]
    start_idx=0
    with fitz.open(pdf_file) as pdf_file:
        for page_index, page in enumerate(pdf_file):
            text = page.get_text("text").lower()
            text=text.split(". ")
            for sub in text:
                if 'abstract' in sub or 'intro' in sub:
                    start=True
                    if 'abstract' in sub:
                        sub.index('abstract')
                    else:
                        sub.index('intro')
                        
                if start:
                    sentences.append(sub)
                
    return sentences
def read_files(root_dir, hand):
    
    lines=[]
    for files in os.listdir(root_dir):
        if files[-4:] != '.pdf':
            continue
        sentences = read_pdf(f"{root_dir}/{files}")
        lines.extend(sentences)

    # read in hand annotations
    for p in hand.iterrows():
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}" 
        lines.append(out)


    return lines

    return lines
hand = pd.read_csv("../Results/ground_truth.csv")
sentences = read_files("../Docs",hand)
#computing cosine similarity
def vec(sentences):
    # Encode sentences
    embeddings = model.encode([sentences[0], sentences[1]])
    
    # Compute cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    return similarity.item() # Value close to 1 indicates high similarity
    
#finding if the target string (relation triplet) is in the src (pdf + hand annotation)
def find(target, src):
    found=False
    matching_sentences=[]
    
    for idx,sentence in enumerate(src):
        subj, obj = f" {target.split()[0]} ", f" {target.split()[1]} "
        if subj in sentence and obj in sentence:
            idx = find_substring_index(sentences, subj)
            matching_sentence=sentence[idx:]
            matching_sentences.append(matching_sentence)
            found=True
            break 
            
        #pred=" ".join(lemmatize(target))
        #test=" ".join(lemmatize(sentence))
        #cos = vec([pred,test])
        
        elif subj in sentence:# or cos > 0.68:
            found=True
            st_idx=idx
            idx = find_substring_index(sentences, subj)
            matching_sentence=sentence[idx:]
            matching_sentences.append(matching_sentence)
            
    print(len(matching_sentences))
    return found, matching_sentences

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rishikasrinivas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
hall, files, pred_files

([],
 ['../Results/Temperature1_WithoutExamples.csv'],
 ['Temperature0point2.csv',
  'ground_truth.csv',
  'eval_results',
  'Temperature1_WithoutExamples.csv',
  'Temperature1_WithoutExamples_cleaned.csv',
  'NewRels_Skip2_cummulative.csv',
  '.ipynb_checkpoints',
  'NewRels_Skip3_cummulative.csv',
  'NewRels_Skip2_increments.csv',
  'NewRels_Skip4_increments.csv',
  'Temperature1_WithExamples.csv',
  'NewRels_Skip3_increments.csv'])

In [None]:
import deepeval
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
import pandas as pd
pred_files = os.listdir("../Results")
hand = pd.read_csv("../Results/ground_truth.csv")
sentences = read_files("../Docs",hand)
def hallucination_test(actual_output, context, extractions):
    sub, obj = extractions[0], extractions[1]
    metric = HallucinationMetric(threshold=0.5)
    scores=[]
    for c in context:
        test_case = LLMTestCase(
            input=f"What is the relationship between {sub} and {obj}",
            actual_output={actual_output},
            context=[c]
        )
        
        metric.measure(test_case)
        scores.append(metric.score)
    test_case = LLMTestCase(
        input=f"What is the relationship between {sub} and {obj}",
        actual_output={actual_output},
        context=context
    )
    metric.measure(test_case)
    scores.append(metric.score)
    metric.score = min(scores)
    
    return metric.score

import threading

import multiprocessing 
def find_substring_index(lst, substring):
    for i, s in enumerate(lst):
        if substring in s:  # Check if the substring exists in the string
            return i
    return -1 
def hallucination(sentences, pred_file):
    predictions = pd.read_csv(pred_file)
    total_halluncination_score = 0
    for i,p in enumerate(predictions.iterrows()):
        print(i)
    
        ref = p[1]['ref']
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}"
        
        found, matching_sentences = find(out.lower(), sentences)
        print(len(matching_sentences))
        if found:
            halluncination_scores=[]
            for match in matching_sentences:
                idx = find_substring_index(sentences, match)
                sentence_window = [sentences[i] for i in range(idx-1, idx+1)]
                halluncination_score = hallucination_test(out,context=sentence_window, extractions = [subj, obj])
                halluncination_scores.append(halluncination_score)
                
                
            total_halluncination_score += min(halluncination_scores)
        else:
            print(f"Could not find a match for {out}")
            total_halluncination_score += 1
            continue
        
        
       
        
    print(len(predictions), total_halluncination_score)
    print(f"Average hallucination score for predictions_file: {pred_file} is {total_halluncination_score/len(predictions)}")
    return total_halluncination_score/len(predictions)
import logging
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)
if __name__ == "__main__":
    hall=[]
    files=[]
    for pred_file in ['NewRels_Skip3_increments.csv', 'Temperature1_WithoutExamples.csv']:
        print(f"============={pred_file}=================")
        if any(substring in pred_file for substring in [ 'cumm', '.ipynb', 'ground', 'Clean', 'result']):
            continue
        pred_file=f"../Results/{pred_file}"
        files.append(pred_file)
        hall.append(hallucination(sentences, pred_file))
        
      
       

Output()

0
2
2


Output()

Output()

Output()

Output()

Output()

Output()

1
3
3


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

2
3
3


Output()

Output()

Output()

In [None]:
Average hallucination score for predictions_file: ../Results/Temperature0point2.csv is 0.5137614678899083
Average hallucination score for predictions_file: ../Results/Temperature1_WithoutExamples.csv is 0.4267241379310345
Average hallucination score for predictions_file: ../Results/NewRels_Skip2_increments.csv is 0.6424418604651163

Average hallucination score for predictions_file: ../Results/NewRels_Skip4_increments.csv is 0.6369047619047619
Average hallucination score for predictions_file: ../Results/NewRels_Skip3_increments.csv is 0.5863636363636363

In [46]:
!pip install --upgrade deepeval openai langchain httpx


Collecting deepeval
  Downloading deepeval-2.0.1-py3-none-any.whl.metadata (1.0 kB)
Collecting langchain
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting grpcio==1.60.1 (from deepeval)
  Downloading grpcio-1.60.1-cp312-cp312-macosx_10_10_universal2.whl.metadata (4.0 kB)
Collecting langchain-core (from deepeval)
  Using cached langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting numpy<3,>=1.26.2 (from langchain)
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading deepeval-2.0.1-py3-none-any.whl (474 kB)
Downloading grpcio-1.60.1-cp312-cp312-macosx_10_10_universal2.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading langchain-0.3.9-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached langc

In [8]:
69.5/189

0.36772486772486773