In [None]:
import sys
sys.path.append("..")

import pandas as pd
import matplotlib.pyplot as plt
from src.evaluation import run_evaluation

# --- 🧠 Define Evaluation Questions ---
# Fixed prompt set; the same list is handed to run_evaluation for every model.
questions = [
    "What are common complaints about credit card billing?",
    "How do customers describe poor communication?",
    "What issues are raised about account closures?",
    "Are there complaints about debt collection practices?",
    "Do customers report unauthorized charges?",
    "What are common issues with loan servicing?",
    "How do customers describe delays in fund transfers?",
    "Are there complaints about overdraft fees?",
    "Do customers mention being misled about interest rates?",
    "What problems are reported with mortgage servicing?",
]

# --- 🧪 Evaluate Multiple Models ---
models = ["gemma:2b", "llama2", "mistral"]
results = []

for model in models:
    # ":" in the model tag is replaced with "_" when building the report file name.
    output_path = f"../reports/eval_{model.replace(':', '_')}.csv"
    run_evaluation(
        questions,
        output_path=output_path,
        method="semantic",
        model=model,
    )

    # Read the per-model report back (presumably written by run_evaluation to
    # output_path) and tag every row with the model it belongs to.
    df = pd.read_csv(output_path).assign(model=model)
    results.append(df)

# --- 📊 Combine and Analyze ---
# Stack the per-model frames into one table, then discard rows that have no
# similarity score so they cannot affect the averages or plots below.
all_results = pd.concat(results, ignore_index=True)
all_results = all_results.dropna(subset=["similarity_score"])

# --- 🏆 Leaderboard ---
# Mean similarity score per model, best model first. The Series is named
# "avg_score" before reset_index so the resulting column already carries it.
leaderboard = (
    all_results.groupby("model")["similarity_score"]
    .agg("mean")
    .rename("avg_score")
    .reset_index()
    .sort_values(by="avg_score", ascending=False)
)

print("🏆 Model Leaderboard:")
display(leaderboard)

# --- 📈 Score Distribution ---
# Overlay one histogram per model. All histograms use the same value range so
# their 10 bins have identical edges; letting each call auto-range on its own
# data would give every model different bin widths and make the overlaid bars
# incomparable.
all_scores = all_results["similarity_score"]
# With no scores at all there is no min/max to share; fall back to auto-ranging.
shared_range = None if all_scores.empty else (all_scores.min(), all_scores.max())

plt.figure(figsize=(10, 5))
for model in models:
    model_scores = all_results.loc[all_results["model"] == model, "similarity_score"]
    plt.hist(model_scores, bins=10, range=shared_range, alpha=0.5, label=model)
plt.title("Score Distribution by Model")
plt.xlabel("Similarity Score")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.show()

# --- 📉 Score Trend (per question) ---
# One line per model, with questions on the x-axis in sorted order.
# Every question gets a single x position shared by all models. Numbering each
# model's rows independently (0..k-1) would shift a model's later points left
# whenever one of its rows was dropped for a missing score, so the same x would
# stop referring to the same question across lines.
question_order = all_results["question"].drop_duplicates().sort_values()
question_position = {question: idx for idx, question in enumerate(question_order)}

plt.figure(figsize=(10, 5))
for model in models:
    model_rows = all_results[all_results["model"] == model].sort_values("question")
    plt.plot(
        model_rows["question"].map(question_position),
        model_rows["similarity_score"],
        label=model,
        marker="o",
    )
plt.title("Score Trend Across Questions")
plt.xlabel("Question Index")
plt.ylabel("Similarity Score")
plt.legend()
plt.grid(True)
plt.show()

# --- 💾 Save Combined Results ---
# Persist the merged, NaN-filtered table for all models without the row index.
all_results.to_csv(path_or_buf="../reports/eval_all_models.csv", index=False)
print("✅ Combined evaluation results saved to eval_all_models.csv")


🔍 Evaluating: What are common complaints about credit card billing?
🔍 Evaluating: How do customers describe poor communication?
🔍 Evaluating: What issues are raised about account closures?
🔍 Evaluating: Are there complaints about debt collection practices?
🔍 Evaluating: Do customers report unauthorized charges?
🔍 Evaluating: What are common issues with loan servicing?
🔍 Evaluating: How do customers describe delays in fund transfers?
🔍 Evaluating: Are there complaints about overdraft fees?
🔍 Evaluating: Do customers mention being misled about interest rates?
🔍 Evaluating: What problems are reported with mortgage servicing?
✅ Evaluation results saved to ../reports/eval_gemma_2b.csv
🔍 Evaluating: What are common complaints about credit card billing?
