# Hallucination Evaluation

In [1]:
# Install necessary libraries
!pip install deepeval
!pip install PyMuPDF
!pip install sentence-transformers
!pip install numpy==1.19.5 sentence-transformers 
import pandas as pd
import os
import time
from tqdm import tqdm
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
import logging
import warnings
import deepeval
import nest_asyncio

Collecting grpcio~=1.63.0 (from deepeval)
  Using cached grpcio-1.63.2-cp312-cp312-macosx_10_9_universal2.whl.metadata (3.2 kB)
Collecting protobuf (from deepeval)
  Using cached protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Using cached grpcio-1.63.2-cp312-cp312-macosx_10_9_universal2.whl (10.1 MB)
Using cached protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl (394 kB)
Installing collected packages: protobuf, grpcio
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.0
    Uninstalling protobuf-5.29.0:
      Successfully uninstalled protobuf-5.29.0
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.67.1
    Uninstalling grpcio-1.67.1:
      Successfully uninstalled grpcio-1.67.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.68.0 requires grpcio>=1.68.0, but 



# Deep Eval

In [2]:
import fitz,os
import numpy as np
import deepeval
# Load a pre-trained model'
OPENAI_API_KEY=""
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

# Initialize Python porter stemmer
ps = PorterStemmer()
def lemmatize(sent):
    '''
        lemmatize each word in a sentence
        args:
            sent: list of sentences
        return:
            list of words lemmatized
    '''
    return [ps.stem(word) for word in sent.split()]
    
#reading the pdf and hand annotation files
def read_pdf(pdf_file):
    '''
        Reads PDF files
        args:
            pdf_file: name of pdf file
        return:
            list of sentences in pdf
    '''
    start=False
    sentences=[]
    start_idx=0
    with fitz.open(pdf_file) as pdf_file:
        for page_index, page in enumerate(pdf_file):
            text = page.get_text("text").lower()
            text=text.split(". ")
            for sub in text:
                if 'abstract' in sub or 'intro' in sub:
                    start=True
                    if 'abstract' in sub:
                        sub.index('abstract')
                    else:
                        sub.index('intro')
                        
                if start:
                    sentences.append(sub)
    return sentences
    
def read_files(root_dir, hand):
    '''
        reads files in root_dir and hand

        args:
            root_dir: directory containing results
            hand: hand annotations

        return:
            list of sentences in all files and hand annotations
    '''
    lines=[]
    for files in os.listdir(root_dir):
        if files[-4:] != '.pdf':
            continue
        sentences = read_pdf(f"{root_dir}/{files}")
        lines.extend(sentences)

    # read in hand annotations
    for p in hand.iterrows():
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}" 
        lines.append(out)


    return lines
    
#computing cosine similarity (not used anymore)
def vec(sentences):
    # Encode sentences
    embeddings = model.encode([sentences[0], sentences[1]])
    
    # Compute cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    return similarity.item() # Value close to 1 indicates high similarity
    
#finding if the target string (relation triplet) is in the src (pdf + hand annotation)
def find(target, src):
    '''
        Searches through src documents to find a close match to the target string

        args: 
            target: string to find
            src: strings to search through
        return:
            boolean of found/not found and matching sentence in src if found
    '''
    found=False
    matching_sentences=[]
    
    for idx,sentence in enumerate(src):
        subj, obj = f" {target.split()[0]} ", f" {target.split()[1]} "
        if subj in sentence and obj in sentence:
            idx = find_substring_index(sentences, subj)
            matching_sentence=sentence[idx:]
            matching_sentences.append(matching_sentence)
            found=True
            break 
            
        elif subj in sentence:# or cos > 0.68:
            found=True
            st_idx=idx
            idx = find_substring_index(sentences, subj)
            matching_sentence=sentence[idx:]
            matching_sentences.append(matching_sentence)
            
    print(len(matching_sentences))
    return found, matching_sentences

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rishikasrinivas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import deepeval
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
import pandas as pd

pred_files = os.listdir("../Results")
hand = pd.read_csv("../Results/ground_truth.csv")
sentences = read_files("../Docs",hand)

def hallucination_test(actual_output, context, extractions):
    '''
        Calculates hallucination likelihood between actual output and each sentence in the context and actual output and 
        context as a whole. Returns the lowest hallucination score. 

        args: 
            actual output: string representing the subj, rel, obj from the LLM
            context: reference LLM used to get that relationships plus few sentences around the reference in the src documents
            extractions: subject and object used to prompt the LLMTestCase call
        returns:
            Lowest hallucination score for this output
    '''
    
    sub, obj = extractions[0], extractions[1]
    metric = HallucinationMetric(threshold=0.5)
    scores=[]
    for c in context:
        test_case = LLMTestCase(
            input=f"What is the relationship between {sub} and {obj}",
            actual_output={actual_output},
            context=[c]
        )
        
        metric.measure(test_case)
        scores.append(metric.score)
    test_case = LLMTestCase(
        input=f"What is the relationship between {sub} and {obj}",
        actual_output={actual_output},
        context=context
    )
    metric.measure(test_case)
    scores.append(metric.score)
    metric.score = min(scores)
    
    return metric.score


def find_substring_index(lst, substring):
    '''
        Finds the index of the substring in the list lst

        args: 
            lst: List to find substring in
            substring: substring to find in the list
        returns:
            Index of the first instance of substring in lst
    '''
    for i, s in enumerate(lst):
        if substring in s:  # Check if the substring exists in the string
            return i
    return -1 
    
def hallucination(sentences, pred_file):
    '''
        Calculates the average hallucination between predictions and ground truth

        args:
            sentences: list of strings for each sentence in the src documents and each of hand annotations represented as the string
                "subj rel obj"
            pred_file: filename of predictions to evaluate

        return:
            average hallucination score
    '''
    predictions = pd.read_csv(pred_file)
    total_halluncination_score = 0
    for i,p in enumerate(predictions.iterrows()):
        print(i)
    
        ref = p[1]['ref']
        rel = p[1]['rel']
        subj = p[1]['subj']
        obj = p[1]['obj']
        out=f"{subj} {rel} {obj}"
        
        found, matching_sentences = find(out.lower(), sentences)
        print(len(matching_sentences))
        if found:
            halluncination_scores=[]
            for match in matching_sentences:
                idx = find_substring_index(sentences, match)
                sentence_window = [sentences[i] for i in range(idx-1, idx+1)]
                halluncination_score = hallucination_test(out,context=sentence_window, extractions = [subj, obj])
                halluncination_scores.append(halluncination_score)
                
                
            total_halluncination_score += min(halluncination_scores)
        else:
            print(f"Could not find a match for {out}")
            total_halluncination_score += 1
            continue
        
        
       
        
    print(len(predictions), total_halluncination_score)
    print(f"Average hallucination score for predictions_file: {pred_file} is {total_halluncination_score/len(predictions)}")
    return total_halluncination_score/len(predictions)


if __name__ == "__main__":
    hall=[]
    files=[]
    for pred_file in ['Temperature0point2.csv']:
        print(f"============={pred_file}=================")
        if any(substring in pred_file for substring in [ 'cumm', '.ipynb', 'ground', 'Clean', 'result']):
            continue
        pred_file=f"../Results/{pred_file}"
        files.append(pred_file)
        hall.append(hallucination(sentences, pred_file))
        
      
       

In [None]:
Average hallucination score for predictions_file: ../Results/Temperature0point2.csv is 0.252
Hallucination score for ./Results/Temperature1_WithoutExamples.csv is 0.1810344827586207
Average hallucination score for predictions_file: ../Results/NewRels_Skip2_increments.csv is 0.32
Average hallucination score for predictions_file: ../Results/NewRels_Skip4_increments.csv is 0.35
Average hallucination score for predictions_file: ../Results/NewRels_Skip3_increments.csv is 0.259