# IMPORTS

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import re
import warnings
warnings.filterwarnings('ignore')

# EVALUATION

In [None]:
def match_score(df: pd.DataFrame, col_match: str = "Match") -> float:
    """Return the mean of the non-null values in the match column.

    Args:
        df: Frame holding the per-row match values.
        col_match: Name of the column to average.

    Returns:
        The mean of the non-null entries of ``df[col_match]``, or 0.0 when
        the column has no non-null entries.

    Raises:
        ValueError: If ``col_match`` is not a column of ``df``.
    """
    if col_match not in df.columns:
        # Name the column that was actually requested, not a fixed 'Match'.
        raise ValueError(f"DataFrame must contain a '{col_match}' column.")

    valid_matches = df[col_match].dropna()
    if valid_matches.empty:
        return 0.0

    # Convert the pandas/numpy scalar to the plain float the signature promises.
    return float(valid_matches.mean())

In [None]:
def normalize_text(text) -> str:
    """Return a lower-cased, punctuation-free, whitespace-collapsed string.

    Missing values (as judged by ``pd.isna``) become the empty string. Any
    other value is converted with ``str`` before it is cleaned.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # Drop every character that is neither a word character nor whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    # Squash each whitespace run to one space, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()

In [None]:
# Input workbook and the columns the evaluation reads from it.
path = "outputs/text_only_1.xlsx"
col_original_premise = "Premise/Facts"
col_generated_premise = "Generated Premise"
col_match = "Match"

df = pd.read_excel(path)
# Normalise both premise columns in place so later comparisons see cleaned text.
df[col_original_premise] = df[col_original_premise].apply(normalize_text)
df[col_generated_premise] = df[col_generated_premise].apply(normalize_text)

# Keep the result under its own name: assigning it to `match_score` would
# replace the function with a number, and running this cell a second time
# would then fail with "TypeError: ... object is not callable".
retrieval_accuracy = match_score(df, col_match=col_match)
print(f"Retrieval accuracy: {retrieval_accuracy:.2%}")

In [None]:
class PremiseEvaluator:
    """Text-similarity metrics comparing an original premise to a generated one.

    Every metric takes the two texts as plain strings and returns a number.
    """

    def __init__(self):
        # A single vectorizer object is kept on the instance; it is re-fitted
        # on each text pair inside cosine_similarity_score.
        self.vectorizer = TfidfVectorizer()

    def cosine_similarity_score(self, original: str, generated: str) -> float:
        """Return the cosine similarity between the TF-IDF vectors of the texts.

        The vectorizer is fitted on exactly these two texts, so the TF-IDF
        weights depend only on the pair being compared.

        Returns:
            The similarity value, or 0.0 if vectorising or comparing fails.
            NOTE(review): the fallback presumably targets texts that yield an
            empty vocabulary - confirm against the data.
        """
        try:
            vectors = self.vectorizer.fit_transform([original, generated])
            return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
        except Exception:
            # Deliberate best-effort fallback. Catching Exception instead of
            # using a bare `except:` lets KeyboardInterrupt and SystemExit
            # propagate, so a long scoring loop can still be interrupted.
            return 0.0

    def bleu_score(self, original: str, generated: str) -> float:
        """Return the sentence-level BLEU score of `generated` against `original`.

        `original` is tokenised and used as the only reference; `generated`
        is tokenised and used as the candidate.
        NOTE(review): word_tokenize presumably requires NLTK tokenizer data to
        be installed - confirm for the target environment.
        """
        reference = [word_tokenize(original)]
        candidate = word_tokenize(generated)
        return sentence_bleu(reference, candidate)

    def length_ratio(self, original: str, generated: str) -> float:
        """Return the word count of `generated` divided by that of `original`.

        Words are whitespace-separated tokens. Returns 0.0 when `original`
        contains no words, which avoids a division by zero.
        """
        len_orig = len(original.split())
        len_gen = len(generated.split())
        if len_orig == 0:
            return 0.0
        return len_gen / len_orig

# Module-level evaluator instance; the per-row scoring loop calls its metric methods.
evaluator = PremiseEvaluator()

In [None]:
# Score every row whose original and generated premises are both non-empty.
results = []
for idx, row in df.iterrows():
    original = row[col_original_premise]
    generated = row[col_generated_premise]

    # Rows with a missing side cannot be compared, so they are left out.
    if not (original and generated):
        continue

    cosine_value = evaluator.cosine_similarity_score(original, generated)
    bleu_value = evaluator.bleu_score(original, generated)
    ratio_value = evaluator.length_ratio(original, generated)
    results.append({
        'index': idx,
        'cosine_similarity': cosine_value,
        'bleu_score': bleu_value,
        'length_ratio': ratio_value,
    })

# One row per scored pair, keyed by the source row's index label.
results_df = pd.DataFrame(results)
print(results_df.head(10))

In [None]:
# Calculate aggregate statistics for evaluation metrics
metric_columns = ['cosine_similarity', 'bleu_score', 'length_ratio']

if results_df.empty:
    # When no row was scored, results_df can lack the metric columns entirely,
    # and selecting them would raise KeyError. Report the empty result instead.
    aggregate_stats = pd.DataFrame(columns=metric_columns)
    print("Summary Statistics:")
    print("Total valid results: 0")
    print("No valid premise pairs were scored; aggregate statistics skipped.")
else:
    aggregate_stats = results_df[metric_columns].agg(['mean', 'min', 'max', 'std', 'median'])
    print("Aggregate Statistics for Evaluation Metrics:")
    print(aggregate_stats)
    print("\n")

    # Additional summary statistics
    print("Summary Statistics:")
    print(f"Total valid results: {len(results_df)}")
    print(f"Average Cosine Similarity: {results_df['cosine_similarity'].mean():.4f}")
    print(f"Average BLEU Score: {results_df['bleu_score'].mean():.4f}")
    print(f"Average Length Ratio: {results_df['length_ratio'].mean():.4f}")