# Q&A Bot Response Evaluation

This notebook evaluates and compares responses from three Q&A bots using multiple metrics:
- BERTScore (using the DeepPavlov/rubert-base-cased model)
- ROUGE-1 and ROUGE-2 scores
- MRR (Mean Reciprocal Rank)
- NDCG (Normalized Discounted Cumulative Gain)

In [None]:
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import json
from typing import List, Dict, Any

# Create results directory (exist_ok makes re-running this cell harmless)
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

# Setup logging: each record is written both to results/evaluation.log and to
# the console/notebook output stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Pin the log file to UTF-8 instead of the platform default encoding,
        # so records that embed non-ASCII text (e.g. exception messages) are
        # written the same way on every OS. The other files this notebook
        # opens are explicitly UTF-8 as well.
        logging.FileHandler(results_dir / 'evaluation.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

In [None]:
def setup_path() -> None:
    """Make the sibling ``src`` directory importable and verify ``metrics`` loads.

    The directory ``<parent of the current working directory>/src`` is placed
    at the front of ``sys.path`` unless it is already listed there. Afterwards
    an import of the ``metrics`` module is attempted as a sanity check.

    Raises:
        ImportError: re-raised (after printing diagnostics, including the
            current ``sys.path``) when ``metrics`` cannot be imported.
    """
    src_dir = Path.cwd().parent / 'src'
    src_path = str(src_dir)

    already_listed = src_path in sys.path
    if not already_listed:
        # Front of the list so this directory wins over same-named modules
        # found later on the path.
        sys.path.insert(0, src_path)
        print(f"Added {src_path} to Python path")

    try:
        import metrics
    except ImportError as e:
        print(f"Error importing metrics module: {e}")
        print(f"Current sys.path: {sys.path}")
        raise
    else:
        print("Successfully imported metrics module")

# Run the path setup before the imports below: setup_path() puts ../src on
# sys.path and checks that the `metrics` package can be imported.
# NOTE(review): presumably `metrics` lives under ../src and is not installed
# as a package, which is why the ordering matters - confirm.
setup_path()
from metrics.evaluator import Evaluator
from metrics.retrieval_metrics import RetrievalMetrics

## Helper Functions

In [None]:
def convert_scores_to_relevance(scores: Dict[str, float]) -> float:
    """Collapse a similarity-score mapping into one weighted relevance value.

    The result is ``0.4 * BERTScore + 0.3 * ROUGE-1 F1 + 0.3 * ROUGE-2 F1``.

    Args:
        scores: Mapping read with the keys ``'bert_score'`` (a number) and
            ``'rouge'`` (a nested mapping of the form
            ``{'rouge-1': {'f1': ...}, 'rouge-2': {'f1': ...}}``). Any key
            that is absent contributes ``0.0``.

    Returns:
        The weighted sum of the three scores.
    """
    rouge_section = scores.get('rouge', {})

    def rouge_f1(variant: str) -> float:
        # Missing variant or missing 'f1' entry both fall back to 0.0.
        return rouge_section.get(variant, {}).get('f1', 0.0)

    weighted_bert = scores.get('bert_score', 0.0) * 0.4
    weighted_rouge1 = rouge_f1('rouge-1') * 0.3
    weighted_rouge2 = rouge_f1('rouge-2') * 0.3

    return weighted_bert + weighted_rouge1 + weighted_rouge2

## Load and Process Q&A Data

In [None]:
# Read the Q&A pairs from ../dataset/qa_pairs.json (relative to the current
# working directory); the records sit under the top-level 'qa_pairs' key.
qa_data_path = Path.cwd().parent / 'dataset' / 'qa_pairs.json'
with qa_data_path.open(encoding='utf-8') as f:
    payload = json.load(f)
qa_data = payload['qa_pairs']

# One DataFrame row per Q&A record; preview the first rows in the notebook.
qa_df = pd.DataFrame(qa_data)
qa_df.head()

## Evaluate Bot Answers

In [None]:
# Initialize evaluators
evaluator = Evaluator(bert_model_name='DeepPavlov/rubert-base-cased')
retrieval_metrics = RetrievalMetrics()

# Bots are identified by number; their answers are read from the
# 'bot<N>_answer' columns of qa_df.
BOT_NUMBERS = (1, 2, 3)

# Fixed column layout so that a bot with zero successfully evaluated answers
# still produces a DataFrame that has the metric columns (with no rows),
# instead of a column-less frame that breaks column selection later on.
RESULT_COLUMNS = ['question', 'BERTScore', 'ROUGE-1', 'ROUGE-2']

# Per-bot accumulators, keyed by bot number:
#   results_by_bot[n]   -> one dict per successfully evaluated answer
#   relevance_by_bot[n] -> one single-element relevance list per Q&A pair
results_by_bot = {bot_num: [] for bot_num in BOT_NUMBERS}
relevance_by_bot = {bot_num: [] for bot_num in BOT_NUMBERS}

# Process all answers. pair_no is a 1-based running number used consistently
# in both the progress and the error log messages.
total_pairs = len(qa_df)
for pair_no, (_, row) in enumerate(qa_df.iterrows(), start=1):
    logging.info(f"Processing pair {pair_no}/{total_pairs}")
    reference_answer = row['reference_answer']

    for bot_num in BOT_NUMBERS:
        bot_answer = row[f'bot{bot_num}_answer']

        # Relevance stays 0.0 when either text is missing/empty or when the
        # evaluation fails; such pairs add no row to the per-bot results.
        relevance = 0.0

        if bot_answer and reference_answer:
            try:
                scores = evaluator.evaluate_text_similarity(
                    bot_answer,
                    reference_answer
                )
                rouge_scores = scores.get('rouge', {})
                result_row = {
                    'question': row['question'],
                    'BERTScore': scores.get('bert_score', 0.0),
                    'ROUGE-1': rouge_scores.get('rouge-1', {}).get('f1', 0.0),
                    'ROUGE-2': rouge_scores.get('rouge-2', {}).get('f1', 0.0),
                }
                relevance = convert_scores_to_relevance(scores)
            except Exception as e:
                # Best-effort: one failing answer must not abort the whole run.
                logging.error(f"Error processing Bot {bot_num} pair {pair_no}: {e}")
                relevance = 0.0
            else:
                results_by_bot[bot_num].append(result_row)

        relevance_by_bot[bot_num].append([relevance])

# Expose the per-bot collections under the names used by the rest of the notebook.
bot1_results, bot2_results, bot3_results = (
    results_by_bot[bot_num] for bot_num in BOT_NUMBERS
)
bot1_relevance_lists, bot2_relevance_lists, bot3_relevance_lists = (
    relevance_by_bot[bot_num] for bot_num in BOT_NUMBERS
)

# Create DataFrames
bot1_df = pd.DataFrame(bot1_results, columns=RESULT_COLUMNS)
bot2_df = pd.DataFrame(bot2_results, columns=RESULT_COLUMNS)
bot3_df = pd.DataFrame(bot3_results, columns=RESULT_COLUMNS)

# Calculate retrieval metrics
# NOTE(review): every relevance list holds exactly one value (one answer per
# question), so MRR/NDCG are computed over one-item rankings - confirm this is
# what RetrievalMetrics is expected to receive.
bot1_mrr = retrieval_metrics.compute_mrr(bot1_relevance_lists)
bot2_mrr = retrieval_metrics.compute_mrr(bot2_relevance_lists)
bot3_mrr = retrieval_metrics.compute_mrr(bot3_relevance_lists)
bot1_ndcg = retrieval_metrics.compute_ndcg(bot1_relevance_lists)
bot2_ndcg = retrieval_metrics.compute_ndcg(bot2_relevance_lists)
bot3_ndcg = retrieval_metrics.compute_ndcg(bot3_relevance_lists)

print("Retrieval Metrics:")
print(f"Bot 1 - MRR: {bot1_mrr:.3f}, NDCG: {bot1_ndcg:.3f}")
print(f"Bot 2 - MRR: {bot2_mrr:.3f}, NDCG: {bot2_ndcg:.3f}")
print(f"Bot 3 - MRR: {bot3_mrr:.3f}, NDCG: {bot3_ndcg:.3f}")

## Analyze Results

In [None]:
# Print describe() statistics of the three score columns for each bot.
score_columns = ['BERTScore', 'ROUGE-1', 'ROUGE-2']
summary_sections = (
    ("Bot 1 Summary Statistics:", bot1_df),
    ("\nBot 2 Summary Statistics:", bot2_df),
    ("\nBot 3 Summary Statistics:", bot3_df),
)

for heading, frame in summary_sections:
    print(heading)
    print(frame[score_columns].describe())

In [None]:
# Create visualization: grouped bar chart of the average text-similarity
# scores plus the retrieval metrics, one bar group per metric, one bar per bot.
#
# The axes are created explicitly and handed to DataFrame.plot via `ax=`;
# without it pandas opens its own figure, leaving an empty extra figure behind
# and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(15, 6))
metrics = ['BERTScore', 'ROUGE-1', 'ROUGE-2', 'MRR', 'NDCG']

# Per bot: the per-answer score frame and the already-aggregated retrieval metrics.
bot_sources = {
    'Bot 1': (bot1_df, {'MRR': bot1_mrr, 'NDCG': bot1_ndcg}),
    'Bot 2': (bot2_df, {'MRR': bot2_mrr, 'NDCG': bot2_ndcg}),
    'Bot 3': (bot3_df, {'MRR': bot3_mrr, 'NDCG': bot3_ndcg}),
}


def _metric_value(frame, retrieval, metric):
    """Return the value plotted for one bot/metric pair."""
    if metric in retrieval:
        return retrieval[metric]
    if metric in frame.columns:
        return frame[metric].mean()
    # Score column missing (e.g. no answer was evaluated): plot nothing rather
    # than silently substituting a different metric.
    return float('nan')


plot_data = pd.DataFrame(
    {
        label: [_metric_value(frame, retrieval, m) for m in metrics]
        for label, (frame, retrieval) in bot_sources.items()
    },
    index=metrics,
)

plot_data.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Average Metrics Comparison Between Bots')
ax.set_ylabel('Score')
fig.tight_layout()
plt.show()

## Save Results

In [None]:
# Save results to files
output_dir = Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

bot_frames = [bot1_df, bot2_df, bot3_df]
# Score columns reported in the summary and in the best-answers analysis.
metrics = ['BERTScore', 'ROUGE-1', 'ROUGE-2']

# Save metrics (one CSV of per-answer scores per bot)
bot1_df.to_csv(output_dir / 'bot1_metrics.csv', index=False)
bot2_df.to_csv(output_dir / 'bot2_metrics.csv', index=False)
bot3_df.to_csv(output_dir / 'bot3_metrics.csv', index=False)

# Save retrieval metrics
retrieval_metrics_df = pd.DataFrame({
    'Metric': ['MRR', 'NDCG'],
    'Bot 1': [bot1_mrr, bot1_ndcg],
    'Bot 2': [bot2_mrr, bot2_ndcg],
    'Bot 3': [bot3_mrr, bot3_ndcg]
})
retrieval_metrics_df.to_csv(output_dir / 'retrieval_metrics.csv', index=False)

# Save detailed analysis to summary.txt
with open(output_dir / 'summary.txt', 'w', encoding='utf-8') as f:
    # Write statistics for each bot
    for bot_num, bot_df in enumerate(bot_frames, 1):
        f.write(f"Bot {bot_num} Summary Statistics:\n")
        if bot_df.empty:
            # A bot without any evaluated answer may have no score columns at
            # all; selecting them would raise KeyError, so report it instead.
            f.write("No evaluated answers.\n\n")
            continue
        f.write(f"{bot_df[metrics].describe().to_string()}\n\n")

    # Write retrieval metrics
    f.write("Retrieval Metrics:\n")
    f.write(f"Bot 1 - MRR: {bot1_mrr:.3f}, NDCG: {bot1_ndcg:.3f}\n")
    f.write(f"Bot 2 - MRR: {bot2_mrr:.3f}, NDCG: {bot2_ndcg:.3f}\n")
    f.write(f"Bot 3 - MRR: {bot3_mrr:.3f}, NDCG: {bot3_ndcg:.3f}\n\n")

    # Write best answers analysis: for each metric, each bot's top-scoring answer
    f.write("\nBest Answers Analysis:\n")
    f.write("=" * 80 + "\n")

    for metric in metrics:
        f.write(f"\nBest answers by {metric}:\n")
        f.write("-" * 40 + "\n")

        for bot_num, bot_df in enumerate(bot_frames, 1):
            if bot_df.empty:
                continue

            # idxmax() returns an index *label*, so the row is fetched with
            # label-based .loc rather than positional .iloc.
            best_row = bot_df.loc[bot_df[metric].idxmax()]
            question = best_row['question']
            f.write(f"\nBot {bot_num} best ({metric}: {best_row[metric]:.3f}):\n")
            f.write(f"Question: {question}\n")
            # NOTE(review): the source row is looked up by question text and
            # the first match is used; if the dataset repeats a question this
            # may show a different pair's answer - confirm questions are unique.
            qa_row = qa_df[qa_df['question'] == question].iloc[0]
            f.write(f"Answer: {qa_row[f'bot{bot_num}_answer']}\n")
            f.write(f"Reference: {qa_row['reference_answer']}\n")

        f.write("\n" + "=" * 80 + "\n")

print("Results saved successfully!")