# Comparison of Q&A Bot Responses

This notebook compares the performance of two Q&A bots using various metrics:
- BERTScore (using the DeepPavlov/rubert-base-cased model)
- ROUGE-1 and ROUGE-2 scores

We'll analyze and compare the answers from both bots against reference answers.

In [None]:
import json
import logging
import sys
from pathlib import Path
from typing import Optional, Union

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Emit records at INFO level and above, laid out as "time - level - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Set Up Path and Import Evaluator

First, we'll set up the Python path to include our source directory and import the necessary modules.

In [None]:
def setup_path() -> None:
    """Put the sibling ``src`` directory on ``sys.path`` and check ``metrics`` imports.

    The directory is ``<parent of the current working directory>/src``. It is
    prepended to ``sys.path`` only if it is not already listed, so calling this
    function repeatedly does not add duplicates.

    Raises:
        ImportError: Re-raised when ``metrics`` cannot be imported, after the
            error and the current ``sys.path`` have been printed.
    """
    source_dir = str(Path.cwd().parent / 'src')

    already_listed = source_dir in sys.path
    if not already_listed:
        sys.path.insert(0, source_dir)
        print(f"Added {source_dir} to Python path")

    # Import purely as a smoke test; the module object itself is not used here.
    try:
        import metrics
    except ImportError as import_error:
        print(f"Error importing metrics module: {import_error}")
        print(f"Current sys.path: {sys.path}")
        raise
    else:
        print("Successfully imported metrics module")

# Must run before the import below: setup_path() is what puts ../src on sys.path.
setup_path()
# Kept out of the top import cell because it presumably resolves only after the
# sys.path change above (confirm if `metrics` is also installed as a package).
from metrics.evaluator import Evaluator

## Load Q&A Data

Load the Q&A pairs from our JSON file containing both bots' answers and reference answers.

In [None]:
def load_qa_data(file_path: Optional[Union[str, Path]] = None) -> list:
    """Load the list of Q&A pairs from a JSON file.

    The file must contain a JSON object whose ``"qa_pairs"`` key holds a list
    (one entry per question).

    Args:
        file_path: Location of the JSON file, as a ``str`` or ``Path``.
            Defaults to ``../dataset/qa_pairs.json`` relative to the current
            working directory.

    Returns:
        The list stored under the top-level ``"qa_pairs"`` key.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
        KeyError: The top-level object has no ``"qa_pairs"`` key.
        TypeError: The top-level value is not a JSON object, or ``"qa_pairs"``
            is not a list.
    """
    if file_path is None:
        file_path = Path.cwd().parent / 'dataset' / 'qa_pairs.json'
    file_path = Path(file_path)

    logging.info(f"Loading Q&A data from {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Validate the shape here so a malformed file is reported with the file
        # name instead of failing later inside pandas with an unrelated message.
        if not isinstance(data, dict):
            raise TypeError(
                f"Expected a JSON object at the top level of {file_path}, "
                f"got {type(data).__name__}"
            )
        if 'qa_pairs' not in data:
            raise KeyError(f"'qa_pairs' key not found in {file_path}")

        qa_pairs = data['qa_pairs']
        if not isinstance(qa_pairs, list):
            raise TypeError(
                f"Expected 'qa_pairs' in {file_path} to be a list, "
                f"got {type(qa_pairs).__name__}"
            )
        return qa_pairs
    except Exception as e:
        # Log for the notebook reader, then propagate unchanged to the caller.
        logging.error(f"Error loading Q&A data: {e}")
        raise

# Load the data
qa_data = load_qa_data()
qa_df = pd.DataFrame(qa_data)

# Fail fast with a readable message if a column that the evaluation and
# analysis cells index by name is absent, rather than hitting a bare KeyError
# part-way through the (slow) scoring loop.
REQUIRED_COLUMNS = ['question', 'bot1_answer', 'bot2_answer', 'reference_answer']
missing_columns = [name for name in REQUIRED_COLUMNS if name not in qa_df.columns]
if missing_columns:
    raise ValueError(f"Q&A data is missing required columns: {missing_columns}")

qa_df.head()

## Evaluate Bot Answers

Now we'll evaluate the answers from both bots using our metrics.

In [None]:
# Initialize evaluator
evaluator = Evaluator(bert_model_name='DeepPavlov/rubert-base-cased')

# Layout of one result record. Passing it to pd.DataFrame() keeps the columns
# present even when a bot ends up with no scored answers.
RESULT_COLUMNS = ['question', 'bert_score', 'rouge1', 'rouge2']

# Internal record keys -> column labels used by the plotting/analysis cells.
DISPLAY_NAMES = {
    'bert_score': 'BERTScore',
    'rouge1': 'ROUGE-1',
    'rouge2': 'ROUGE-2'
}


def _is_present(value) -> bool:
    """Return True unless ``value`` is None, NaN or otherwise empty/falsy.

    pandas fills fields that are absent from some JSON records with NaN, and
    NaN is truthy, so a plain truthiness test would let missing answers through.
    """
    if value is None:
        return False
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return False
    return bool(value)


def _score_answer(bot_label, pair_id, row, answer_column):
    """Score one bot's answer for one Q&A row against the reference answer.

    Args:
        bot_label: Name used in log messages, e.g. ``'Bot 1'``.
        pair_id: Index label of the row, used in log messages only.
        row: The ``qa_df`` row being evaluated.
        answer_column: Column holding this bot's answer.

    Returns:
        A dict with the keys in ``RESULT_COLUMNS``, or ``None`` when the answer
        or reference is missing or the evaluator raised.
    """
    answer = row[answer_column]
    reference = row['reference_answer']
    if not (_is_present(answer) and _is_present(reference)):
        logging.warning(f"Skipping {bot_label} pair {pair_id}: missing answer or reference")
        return None

    try:
        scores = evaluator.evaluate_text_similarity(answer, reference)
        logging.info(f"{bot_label} scores for pair {pair_id}: {scores}")

        rouge_scores = scores.get('rouge', {})
        return {
            'question': row['question'],
            'bert_score': scores.get('bert_score', 0.0),
            'rouge1': rouge_scores.get('rouge-1', {}).get('f1', 0.0),
            'rouge2': rouge_scores.get('rouge-2', {}).get('f1', 0.0)
        }
    except Exception as e:
        # Best effort: one failing pair must not abort the whole evaluation.
        logging.error(f"Error processing {bot_label} pair {pair_id}: {e}")
        return None


# Calculate metrics for each bot
bot1_results = []
bot2_results = []
bot_runs = (
    ('Bot 1', 'bot1_answer', bot1_results),
    ('Bot 2', 'bot2_answer', bot2_results),
)

# enumerate() supplies the 1-based progress counter, so the log line stays
# correct even if qa_df's index is not 0..n-1.
for position, (idx, row) in enumerate(qa_df.iterrows(), start=1):
    logging.info(f"Processing pair {position}/{len(qa_df)}")

    # Each bot is scored independently: a failure for Bot 1 no longer skips
    # Bot 2 for the same question.
    for bot_label, answer_column, results in bot_runs:
        record = _score_answer(bot_label, idx, row, answer_column)
        if record is not None:
            results.append(record)

# Create DataFrames for both bots (with display-ready column names)
bot1_df = pd.DataFrame(bot1_results, columns=RESULT_COLUMNS).rename(columns=DISPLAY_NAMES)
bot2_df = pd.DataFrame(bot2_results, columns=RESULT_COLUMNS).rename(columns=DISPLAY_NAMES)

## Visualize Results

Let's create visualizations to compare the performance of both bots.

In [None]:
# Create comparison visualization
metrics = ['BERTScore', 'ROUGE-1', 'ROUGE-2']

# Prepare data for plotting: one row per metric, one column per bot
plot_data = pd.DataFrame({
    'Bot 1': bot1_df[metrics].mean(),
    'Bot 2': bot2_df[metrics].mean()
})

# Create the figure explicitly and hand its Axes to pandas. Without `ax`,
# DataFrame.plot() opens a figure of its own, which leaves a blank 15x6 figure
# in the output and draws the bars at the default size instead.
fig, ax = plt.subplots(figsize=(15, 6))
plot_data.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Average Metrics Comparison Between Bots')
ax.set_ylabel('Score')
fig.tight_layout()
plt.show()

## Analysis of Results

Let's look at the summary statistics and the best-performing answers for each bot.

In [None]:
# Print a heading followed by describe() output for each bot, Bot 1 first
summary_sections = (
    ("Bot 1 Summary Statistics:", bot1_df),
    ("\nBot 2 Summary Statistics:", bot2_df),
)
for heading, scores_frame in summary_sections:
    print(heading)
    print(scores_frame[metrics].describe())

In [None]:
def _print_best_answer(bot_label, bot_df, answer_column, metric):
    """Print the question, answer and reference of a bot's top-scoring row.

    Args:
        bot_label: Name shown in the output, e.g. ``'Bot 1'``.
        bot_df: That bot's per-question metrics frame.
        answer_column: Column of ``qa_df`` holding that bot's answers.
        metric: Metric column to rank by.

    Prints nothing when the frame is empty or the metric column holds no
    values, since there is no best row to report.
    """
    if bot_df.empty or bot_df[metric].isna().all():
        return

    # idxmax() returns an index *label*, so it must be resolved with .loc;
    # .iloc would only agree while the index happens to be 0..n-1.
    best_row = bot_df.loc[bot_df[metric].idxmax()]
    question = best_row['question']

    print(f"\n{bot_label} best ({metric}: {best_row[metric]:.3f}):")
    print(f"Question: {question}")
    # NOTE(review): the lookup is by question text, so if the dataset contains
    # the same question twice the first occurrence is shown — confirm questions
    # are unique.
    qa_row = qa_df[qa_df['question'] == question].iloc[0]
    print(f"Answer: {qa_row[answer_column]}")
    print(f"Reference: {qa_row['reference_answer']}")


# Compare best answers
for metric in metrics:
    print(f"\nBest answers by {metric}:")
    _print_best_answer('Bot 1', bot1_df, 'bot1_answer', metric)
    _print_best_answer('Bot 2', bot2_df, 'bot2_answer', metric)

## Save Results

Finally, let's save our results to files.

In [None]:
# Create results directory (relative to the current working directory)
output_dir = Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

# Save metrics
bot1_df.to_csv(output_dir / 'bot1_metrics.csv', index=False)
bot2_df.to_csv(output_dir / 'bot2_metrics.csv', index=False)

# Save plot. The figure is created explicitly and its Axes passed to pandas so
# that the 15x6 size applies to the saved image; without `ax`, DataFrame.plot()
# draws on a new default-size figure and the blank 15x6 one is never closed.
fig, ax = plt.subplots(figsize=(15, 6))
plot_data.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Average Metrics Comparison Between Bots')
ax.set_ylabel('Score')
fig.tight_layout()
fig.savefig(output_dir / 'bot_comparison.png')
plt.close(fig)  # close this specific figure so it is not rendered again or kept in memory