# IMPORTS

In [None]:
import os
import pandas as pd
import torch
import re

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# SETUP

In [None]:
# Input workbook with the rows to evaluate.
path = "outputs/Patunai-IR-Eval.xlsx"

# Headers of the spreadsheet columns read during evaluation.
col_facts, col_claims, col_generated = (
    "Premise/Facts",
    "Hypothesis/Claims",
    "Generated Premise",
)

# Per-row score accumulators, one list per metric.
entailment_facts_vs_claims, entailment_gen_vs_claims, similarity_facts_vs_gen = (
    [],
    [],
    [],
)

# Load the sheet into a DataFrame.
df = pd.read_excel(path)

# EVALUATION

In [None]:
# Name of the checkpoint passed to SentenceTransformer; the same string is
# reused as the score column name and inside the output file name.
embedding_model = "intfloat/e5-base-v2"

class Evaluator:
    """Scores pairs of texts by the cosine similarity of their embeddings.

    The NLI (entailment) scorer is currently switched off; its code is kept
    below as comments.
    """

    def __init__(self):
        """Pick the compute device and load the sentence-embedding model.

        The model name is read from the module-level ``embedding_model``.
        """
        has_gpu = torch.cuda.is_available()
        self.device = "cuda" if has_gpu else "cpu"
        print(f"Running on: {self.device}")

        # 1. NLI - entailment (disabled)
        # print("Loading NLI model (cross-encoder/nli-distilroberta-base)...")
        # self.nli_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/nli-distilroberta-base")
        # self.nli_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/nli-distilroberta-base").to(self.device)
        # self.entailment_idx = 1

        # 2. Semantic - similarity
        print(f"Loading Semantic model ({embedding_model})...")
        self.sim_model = SentenceTransformer(embedding_model, device=self.device)

    # Disabled NLI scorer, kept for reference:
    # def get_entailment_score(self, premise, hypothesis):
    #     if not premise or not hypothesis:
    #         return 0.0
    #
    #     inputs = self.nli_tokenizer(
    #         premise, hypothesis, return_tensors='pt', truncation=True, max_length=512
    #     ).to(self.device)
    #
    #     with torch.no_grad():
    #         outputs = self.nli_model(**inputs)
    #         probs = torch.softmax(outputs.logits, dim=1)
    #
    #     return probs[0][self.entailment_idx].item()

    def get_semantic_similarity(self, text1, text2):
        """Return the cosine similarity between the embeddings of two texts.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            0.0 when either argument is falsy (the model is not called in
            that case); otherwise the cosine similarity as a Python float.

        Both texts are handed to the encoder verbatim.
        NOTE(review): the E5 model card describes "query: " / "passage: "
        input prefixes; none is added here - confirm that is intended.
        """
        if not (text1 and text2):
            return 0.0

        vec_a, vec_b = (
            self.sim_model.encode(text, convert_to_tensor=True)
            for text in (text1, text2)
        )
        similarity = util.cos_sim(vec_a, vec_b)
        return similarity.item()

# Build the evaluator once; its constructor selects the device and loads the
# embedding model, so later cells reuse this single instance.
evaluator = Evaluator()

Running on: cuda
Using OpenAI embeddings (text-embedding-3-small)


In [None]:
# Headers of the spreadsheet columns consumed by the evaluation loop.
col_facts, col_claims, col_generated = (
    "Premise/Facts",
    "Hypothesis/Claims",
    "Generated Premise",
)

def safe_text(val):
    """Convert a cell value to a stripped string, mapping missing values to "".

    Args:
        val: A cell value, e.g. a str, a number, None, NaN, pd.NA or pd.NaT.

    Returns:
        "" when ``val`` is None or a scalar missing-value marker; otherwise
        ``str(val)`` with leading and trailing whitespace removed.
    """
    if val is None:
        return ""
    # pd.isna() works element-wise on list-likes, and using the resulting
    # array in a boolean context raises ValueError, so only scalars are
    # checked for missingness.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return ""
    return str(val).strip()

# Per-row score accumulators, one list per metric.
entailment_facts_vs_claims, entailment_gen_vs_claims, similarity_facts_vs_gen = (
    [],
    [],
    [],
)

print(f"Evaluating {len(df)} rows...")

for idx, row in df.iterrows():
    # Normalise the three text cells; row.get() yields None for an absent
    # column, which safe_text turns into "".
    facts, claims, generated = (
        safe_text(row.get(column))
        for column in (col_facts, col_claims, col_generated)
    )

    # Entailment scoring is disabled, so these keep their 0.0 placeholders.
    score_facts_claims = 0.0
    score_gen_claims = 0.0
    # if facts and claims:
    #     score_facts_claims = evaluator.get_entailment_score(
    #         premise=facts,
    #         hypothesis=claims
    #     )
    # if generated and claims:
    #     score_gen_claims = evaluator.get_entailment_score(
    #         premise=generated,
    #         hypothesis=claims
    #     )

    # Similarity is computed only when both texts are non-empty; else 0.0.
    score_sim = (
        evaluator.get_semantic_similarity(facts, generated)
        if facts and generated
        else 0.0
    )

    # entailment_facts_vs_claims.append(score_facts_claims)
    # entailment_gen_vs_claims.append(score_gen_claims)
    similarity_facts_vs_gen.append(score_sim)

# df['Entailment_Facts_vs_Claims'] = entailment_facts_vs_claims
# df['Entailment_Gen_vs_Claims'] = entailment_gen_vs_claims
# The similarity column is named after the embedding model in use.
df[embedding_model] = similarity_facts_vs_gen

In [None]:
# Score columns to summarise; the entailment columns are disabled.
cols_to_analyze = [
    embedding_model,
    # 'Entailment_Facts_vs_Claims',
    # 'Entailment_Gen_vs_Claims',
]

# The aggregate table itself is disabled; only its header is printed.
# stats = df[cols_to_analyze].agg(['mean', 'std', 'min', 'max', 'median'])
print("---Aggregate Statistics---")
# print(stats)
print("\n")

# Mean of the similarity column across all rows.
# avg_baseline = df['Entailment_Facts_vs_Claims'].mean()
# avg_validity = df['Entailment_Gen_vs_Claims'].mean()
fidelity_scores = df[embedding_model]
avg_fidelity = fidelity_scores.mean()

print("---Summary Report---")
# print(f"1. Baseline Truth   (Do Facts support Claims?):       {avg_baseline:.2%} confidence")
# print(f"2. AI Validity      (Does Gen support Claims?):       {avg_validity:.2%} confidence")
print(f"3. Fidelity         (Is Gen similar to Facts?):       {avg_fidelity:.2%} similarity")

In [None]:
# Output file name = input workbook stem + model name, with '/' in the model
# name replaced by '-' so it does not act as a path separator.
input_file = os.path.basename(path)
base_name = os.path.splitext(input_file)[0]
model_tag = embedding_model.replace('/', '-')
output_path = f"outputs/{base_name}_{model_tag}.xlsx"

# Write the DataFrame, including the new score column, without the index.
df.to_excel(output_path, index=False)
print(f"Saved results to: {output_path}")