# Model Evaluation - Intellihack Scope 03

This notebook focuses on comprehensive evaluation of our fine-tuned model and RAG system using multiple metrics.

In [None]:
# Import necessary libraries
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import evaluate
from IPython.display import display  # Added for Jupyter display

# Import Hugging Face modules
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Import RAG components for comparison
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline

# Directory layout: RAG artifacts and evaluation outputs both live under ../models
models_dir = Path('../models')
rag_dir = models_dir / 'rag'
eval_dir = models_dir / 'evaluation'
# Create the output directory up front so later savefig/to_csv calls have a target
eval_dir.mkdir(parents=True, exist_ok=True)

# Single device shared by the models and the tokenized inputs: GPU when present, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Load Fine-Tuned Model and RAG System

In [None]:
# Load configuration
# Defaults back-fill any key that rag_config.json does not provide, so a partial
# config file no longer raises KeyError on the lookups below.
default_rag_config = {
    'base_model_id': 'Qwen/Qwen2.5-3B-Instruct',
    'llm_model_path': '',
    'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
    'retriever_k': 4,
}
try:
    with open(rag_dir / 'rag_config.json', 'r', encoding='utf-8') as f:
        rag_config = {**default_rag_config, **json.load(f)}
except FileNotFoundError:
    print('rag_config.json not found. Using default values.')
    rag_config = dict(default_rag_config)

base_model_id = rag_config['base_model_id']
# An empty path means "no fine-tuned adapter configured"
model_path = Path(rag_config['llm_model_path']) if rag_config['llm_model_path'] else None
embedding_model_name = rag_config['embedding_model']

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Fall back to EOS for padding only when the tokenizer defines no pad token of its own;
# overwriting an existing pad token makes pad and EOS indistinguishable.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model onto the single evaluation device.
# device_map='auto' is intentionally not used: it lets accelerate dispatch (and possibly
# offload) the weights, which conflicts with the explicit .to(device) move, while the
# generation helpers send their inputs to `device` and need the model there too.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
).to(device)

# Load fine-tuned model if available
# NOTE(review): PeftModel.from_pretrained wraps base_model and injects the adapter layers
# into it, so after a successful load `base_model` and `model` share the same modules.
try:
    if model_path and model_path.exists():
        model = PeftModel.from_pretrained(base_model, model_path).to(device)
        print('Loaded fine-tuned model')
    else:
        model = base_model
        print('Using base model (fine-tuned model not found)')
except Exception as e:
    # Best effort: the evaluation can still run against the base model
    print(f'Error loading fine-tuned model: {e}')
    model = base_model
    print('Falling back to base model')

# RAG components: embedding model, FAISS-backed retriever and the prompt template
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': device.type},
)

# Vector store -> retriever. Any failure leaves `retriever` as None.
# NOTE(review): allow_dangerous_deserialization=True unpickles the stored index; only load
# an index produced by a trusted source.
try:
    vector_store = FAISS.load_local(
        str(rag_dir / 'faiss_index'),
        embeddings,
        allow_dangerous_deserialization=True,
    )
    retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k': rag_config.get('retriever_k', 4)},
    )
except Exception as e:
    print(f'Error loading FAISS index: {e}')
    retriever = None

# Prompt template text: read from disk, or fall back to a minimal built-in template
try:
    with open(rag_dir / 'rag_template.txt', 'r') as f:
        template = f.read()
except FileNotFoundError:
    print('rag_template.txt not found. Using default template.')
    template = 'Context: {context}\nQuestion: {question}\nAnswer:'
rag_prompt = PromptTemplate.from_template(template)

In [None]:
# Setup the models for evaluation
# 1. Base Model (original Qwen 2.5-3B-Instruct)
# 2. Fine-tuned Model (our fine-tuned version)
# 3. RAG System (fine-tuned model + retrieval)

# Direct generation functions
def generate_base_response(question):
    """Generate an answer to ``question`` with the base (non-fine-tuned) model.

    The question is wrapped as a single user turn, rendered with the tokenizer's chat
    template and sampled (temperature 0.7, top_p 0.9, up to 512 new tokens).

    Args:
        question: The user question as a string.

    Returns:
        The decoded newly generated tokens (prompt excluded), or the string
        'Error generating response' if anything raises.
    """
    from contextlib import nullcontext

    try:
        messages = [{'role': 'user', 'content': question}]
        # add_generation_prompt=True appends the assistant-turn header so the model
        # starts an answer instead of continuing the user's message.
        inputs = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors='pt'
        ).to(device)
        # When a PEFT adapter has been loaded on top of base_model, the adapter layers are
        # injected into base_model itself; switch them off so this really is base output.
        adapter_off = model.disable_adapter() if isinstance(model, PeftModel) else nullcontext()
        with adapter_off:
            outputs = base_model.generate(
                inputs=inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )
        # Slice off the prompt tokens so only the generated continuation is decoded
        return tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    except Exception as e:
        print(f'Error in base model generation: {e}')
        return 'Error generating response'

def generate_finetuned_response(question):
    """Generate an answer to ``question`` with the fine-tuned model (``model``).

    The question is wrapped as a single user turn, rendered with the tokenizer's chat
    template and sampled (temperature 0.7, top_p 0.9, up to 512 new tokens).

    Args:
        question: The user question as a string.

    Returns:
        The decoded newly generated tokens (prompt excluded), or the string
        'Error generating response' if anything raises.
    """
    try:
        messages = [{'role': 'user', 'content': question}]
        # add_generation_prompt=True appends the assistant-turn header so the model
        # starts an answer instead of continuing the user's message.
        inputs = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors='pt'
        ).to(device)
        outputs = model.generate(
            inputs=inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
        # Slice off the prompt tokens so only the generated continuation is decoded
        return tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    except Exception as e:
        print(f'Error in fine-tuned model generation: {e}')
        return 'Error generating response'

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    separator = '\n\n'
    return separator.join(doc.page_content for doc in docs)

def generate_rag_response(question):
    """Answer ``question`` with retrieval-augmented generation.

    Documents are fetched from ``retriever``, joined with ``format_docs`` and inserted
    into ``rag_prompt`` together with the question. The resulting prompt is sent to
    ``model`` as a single user turn and sampled (temperature 0.7, top_p 0.9, up to 512
    new tokens).

    Args:
        question: The user question as a string.

    Returns:
        The decoded newly generated tokens (prompt excluded); 'RAG system unavailable'
        when no retriever was loaded; 'Error generating response' if anything raises.
    """
    if retriever is None:
        return 'RAG system unavailable'
    try:
        docs = retriever.get_relevant_documents(question)
        context = format_docs(docs)
        try:
            rag_input = rag_prompt.format(context=context, question=question)
        except KeyError as e:
            # The template file expects a placeholder we do not supply; use a plain prompt
            print(f'Warning: Template missing placeholder {e}. Using fallback.')
            rag_input = f'Context: {context}\nQuestion: {question}'
        messages = [{'role': 'user', 'content': rag_input}]
        # add_generation_prompt=True appends the assistant-turn header so the model
        # starts an answer instead of continuing the user's message.
        inputs = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors='pt'
        ).to(device)
        outputs = model.generate(
            inputs=inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
        # Slice off the prompt tokens so only the generated continuation is decoded
        return tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    except Exception as e:
        print(f'Error in RAG response generation: {e}')
        return 'Error generating response'

## Create Evaluation Test Set

In [None]:
# Test questions grouped by technical topic; the flat list keeps the group order below
questions_by_topic = {
    'DualPipe': [
        'What is DualPipe and how does it improve training efficiency?',
        'How does DualPipe reduce pipeline bubbles compared to traditional approaches?',
        'Explain the bidirectional communication pattern in DualPipe.',
    ],
    'Fire-Flyer File System (3FS)': [
        'What is the Fire-Flyer File System (3FS) and what problems does it solve?',
        'Explain the architecture of the Fire-Flyer File System (3FS).',
        'What are the key components of 3FS and how do they interact?',
        'How does 3FS handle data replication and consistency?',
        'Explain how Chain Replication with Apportioned Queries (CRAQ) works in 3FS.',
    ],
    'DeepSeek-V3 model': [
        'What are the key innovations in DeepSeek-V3 that made it more efficient to train?',
        'Explain the Mixture-of-Experts architecture in DeepSeek-V3.',
        'How does Multi-head Latent Attention (MLA) work in DeepSeek-V3?',
        'What quantization techniques are used in DeepSeek-V3 training?',
    ],
    'Expert Parallelism': [
        'What is Expert Parallelism Load Balancing in DeepSeek-V3?',
        'How does the hierarchical load balancing policy work in EPLB?',
        'What are the benefits of using expert parallelism in large language models?',
    ],
}
test_questions = [
    question
    for topic_questions in questions_by_topic.values()
    for question in topic_questions
]

print(f'Created test set with {len(test_questions)} questions')

## Create Reference Answers

To evaluate our models, we need reference answers. Since we don't have a full set of gold-standard answers, we use a small set of hand-curated reference answers for key topics; ROUGE and BLEU are computed only for the questions that have one.

In [None]:
# Curated reference answers, one (question, answer) pair per covered topic.
# Only these questions receive ROUGE/BLEU scores during evaluation.
reference_pairs = [
    (
        'What is DualPipe and how does it improve training efficiency?',
        'DualPipe is a bidirectional pipeline parallelism algorithm introduced in the DeepSeek-V3 Technical Report. It improves training efficiency by achieving full overlap of forward and backward computation-communication phases. DualPipe reduces pipeline bubbles by organizing computation and communication to happen simultaneously, effectively using previously idle GPU resources. It requires 2× parameters but significantly improves throughput compared to traditional pipeline parallelism approaches like 1F1B and ZB1P.',
    ),
    (
        'What is the Fire-Flyer File System (3FS) and what problems does it solve?',
        'The Fire-Flyer File System (3FS) is a high-performance distributed file system designed to address the challenges of AI training and inference workloads. It leverages modern SSDs and RDMA networks to deliver high throughput. 3FS solves problems related to data access in AI workloads by providing a disaggregated architecture that combines throughput of thousands of SSDs, implementing strong consistency through Chain Replication with Apportioned Queries, and offering familiar file interfaces backed by transactional key-value stores. It efficiently handles diverse workloads including data preparation, dataloading, checkpointing, and KVCache for inference.',
    ),
    (
        'What are the key innovations in DeepSeek-V3 that made it more efficient to train?',
        'DeepSeek-V3 introduced several key innovations for training efficiency: (1) Mixture-of-Experts architecture with 671B parameters but only 37B active per token, (2) Multi-head Latent Attention (MLA) which compresses the Key-Value cache, (3) FP8 mixed precision training to reduce memory usage, (4) DualPipe bidirectional pipeline parallelism algorithm for efficient computation-communication overlap, (5) Expert Parallelism Load Balancing for optimized workload distribution, and (6) custom HAI-LLM training framework with efficient cross-node communication kernels. These innovations collectively made training significantly more efficient compared to traditional approaches.',
    ),
]
reference_answers = dict(reference_pairs)

print(f'Created {len(reference_answers)} reference answers for evaluation')

## Run Evaluations

In [None]:
# Metric scorers
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

# One record per question: the three responses plus, where a reference exists, the scores
eval_results = []

for question in tqdm(test_questions, desc='Evaluating models'):
    try:
        # Keyed by the column prefix used for the metric columns
        responses = {
            'base': generate_base_response(question),
            'ft': generate_finetuned_response(question),
            'rag': generate_rag_response(question),
        }
        reference = reference_answers.get(question, None)

        result = {
            'question': question,
            'base_response': responses['base'],
            'finetuned_response': responses['ft'],
            'rag_response': responses['rag'],
            'reference': reference,
        }

        if reference is not None:
            # ROUGE for every system first, BLEU afterwards: this keeps the column
            # order of the results table (all ROUGE columns, then all BLEU columns).
            for prefix, response in responses.items():
                rouge_scores = rouge.compute(predictions=[response], references=[reference])
                result[f'{prefix}_rouge_1'] = rouge_scores['rouge1']
                result[f'{prefix}_rouge_2'] = rouge_scores['rouge2']
                result[f'{prefix}_rouge_L'] = rouge_scores['rougeL']

            for prefix, response in responses.items():
                bleu_scores = bleu.compute(predictions=[response], references=[[reference]])
                result[f'{prefix}_bleu'] = bleu_scores['bleu']

        eval_results.append(result)
    except Exception as e:
        # A failing question is reported and left out of the results
        print(f'Error evaluating question: {question}')
        print(f'Exception: {e}')

# Tabular form for the analysis cells
results_df = pd.DataFrame(eval_results)

# Persist the raw results
try:
    results_df.to_csv(eval_dir / 'evaluation_results.csv', index=False)
    print(f'Saved evaluation results to {eval_dir / "evaluation_results.csv"}')
except IOError as e:
    print(f'Error saving evaluation results: {e}')

## Calculate Aggregate Scores

In [None]:
# Filter to questions with reference answers for aggregate scores
ref_results_df = results_df[results_df['reference'].notna()]

# Calculate average scores
metrics = ['rouge_1', 'rouge_2', 'rouge_L', 'bleu']
model_types = ['base', 'ft', 'rag']

# The loop variable is deliberately not named `model`: at notebook top level that would
# overwrite the global fine-tuned `model` object with a string and break any later
# call to the generation helpers.
avg_scores = {}
for model_type in model_types:
    for metric in metrics:
        column = f'{model_type}_{metric}'
        # A metric column is absent when no question with a reference was scored
        avg_scores[column] = ref_results_df[column].mean() if column in ref_results_df.columns else 0.0

# Display average scores
avg_df = pd.DataFrame([avg_scores])
print('Average Scores across Reference Questions:')
display(avg_df)

In [None]:
# Word counts per response (whitespace-separated tokens of the stringified response)
for prefix, response_column in (
    ('base', 'base_response'),
    ('ft', 'finetuned_response'),
    ('rag', 'rag_response'),
):
    results_df[f'{prefix}_length'] = results_df[response_column].apply(
        lambda text: len(str(text).split())
    )

# Mean and standard deviation of the word counts for each system
length_stats = {
    label: [results_df[length_column].mean(), results_df[length_column].std()]
    for label, length_column in (
        ('Base Model', 'base_length'),
        ('Fine-tuned Model', 'ft_length'),
        ('RAG System', 'rag_length'),
    )
}

length_df = pd.DataFrame(length_stats, index=['Mean Words', 'Std Dev'])
print('Response Length Statistics:')
display(length_df)

## Visualize Results

In [None]:
# Grouped bar chart of ROUGE-L per question (reference questions only), one bar per system
plt.figure(figsize=(12, 6))
ref_results = [
    {
        'Question': row['question'],
        'Base Model': row['base_rouge_L'],
        'Fine-tuned Model': row['ft_rouge_L'],
        'RAG System': row['rag_rouge_L'],
    }
    for _, row in ref_results_df.iterrows()
]

# Long format: one row per (question, system) pair, as expected by seaborn's hue grouping
wide_scores_df = pd.DataFrame(ref_results)
rouge_plot_df = wide_scores_df.melt(
    id_vars=['Question'],
    var_name='Model Type',
    value_name='ROUGE-L Score',
)
sns.barplot(data=rouge_plot_df, x='Question', y='ROUGE-L Score', hue='Model Type')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.title('ROUGE-L Scores by Question and Model Type')
plt.savefig(eval_dir / 'rouge_scores.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Plot aggregate comparison of ROUGE and BLEU scores
plt.figure(figsize=(12, 6))
metrics_to_plot = ['rouge_1', 'rouge_2', 'rouge_L', 'bleu']
metrics_labels = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU']
model_colors = {'base': 'blue', 'ft': 'green', 'rag': 'red'}
model_labels = {'base': 'Base Model', 'ft': 'Fine-tuned Model', 'rag': 'RAG System'}

x = np.arange(len(metrics_labels))
width = 0.25
# The loop variable is deliberately not named `model`: at notebook top level that would
# overwrite the global fine-tuned `model` object with a string.
for i, model_type in enumerate(model_types):
    values = [avg_scores.get(f'{model_type}_{metric}', 0) for metric in metrics_to_plot]
    # (i - 1) * width centres the three bars of each group on the metric's tick
    plt.bar(x + (i - 1) * width, values, width, label=model_labels[model_type], color=model_colors[model_type])

plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Comparison of Models Across Metrics')
plt.xticks(x, metrics_labels)
plt.legend()
plt.tight_layout()
plt.savefig(eval_dir / 'model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Response length distributions: violin plot with a narrow box plot overlaid per system
plt.figure(figsize=(12, 6))
length_data = [
    results_df[length_column].tolist()
    for length_column in ('base_length', 'ft_length', 'rag_length')
]
plt.violinplot(length_data, showmeans=True)
plt.boxplot(
    length_data,
    positions=[1, 2, 3],
    widths=0.15,
    patch_artist=True,
    boxprops={'facecolor': 'lightblue'},
    showfliers=False,
)
plt.xticks([1, 2, 3], ['Base Model', 'Fine-tuned Model', 'RAG System'])
plt.ylabel('Response Length (words)')
plt.title('Distribution of Response Lengths')
plt.savefig(eval_dir / 'response_lengths.png', dpi=300, bbox_inches='tight')
plt.show()

## Perform Qualitative Analysis

In [None]:
# Select a few representative questions for qualitative analysis
qualitative_questions = [
    'What is DualPipe and how does it improve training efficiency?',
    'Explain the Mixture-of-Experts architecture in DeepSeek-V3.',
    'Explain how Chain Replication with Apportioned Queries (CRAQ) works in 3FS.'
]

# Print the three responses side by side (each truncated to 500 characters) and,
# only when one exists, the curated reference answer.
for question in qualitative_questions:
    matches = results_df[results_df['question'] == question]
    if matches.empty:
        # The evaluation loop leaves out questions whose evaluation raised an error
        print(f'No evaluation result available for: {question}')
        continue
    result = matches.iloc[0]
    print('='*80)
    print(f'Question: {question}')
    print('='*80)
    for heading, column in (
        ('Base Model Response:', 'base_response'),
        ('Fine-tuned Model Response:', 'finetuned_response'),
        ('RAG System Response:', 'rag_response'),
    ):
        text = str(result[column])
        print(heading)
        print('-'*40)
        print(text[:500] + '...\n(truncated)' if len(text) > 500 else text)
        print('\n')
    # The separator and the reference text belong inside the check; otherwise a bare
    # "None" is printed for questions without a reference answer.
    if pd.notna(result['reference']):
        print('Reference Answer:')
        print('-'*40)
        print(result['reference'])
    print('='*80)
    print('\n')

## Error Analysis

In [None]:
def analyze_errors(response):
    """Run heuristic quality checks on a model response.

    The checks are purely textual:
      * uncertainty phrases such as "I don't know" (straight or curly apostrophe),
      * technical terms that appear only with different capitalization,
      * responses shorter than 30 words,
      * more than two distinct generic/vague phrases,
      * more than three contrast words ("however", "but", ...), matched as whole
        words so that e.g. "distributed" or "attribute" do not count as "but".

    Args:
        response: The model response; it is converted with ``str()`` first.

    Returns:
        A list of human-readable issue descriptions (empty when nothing is flagged).
    """
    import re

    technical_terms = ['DualPipe', '3FS', 'Fire-Flyer', 'CRAQ', 'Mixture-of-Experts', 'MoE', 'EPLB', 'Multi-head Latent Attention', 'MLA']
    uncertainty_phrases = ['I don’t know', 'I’m not sure', 'I don’t have information', 'I cannot provide', 'I’m not familiar']
    errors = []
    response = str(response)  # Ensure response is string
    # Fold typographic apostrophes to ASCII so "don’t" and "don't" compare equal
    lowered = response.lower().replace('’', "'")

    for phrase in uncertainty_phrases:
        if phrase.lower().replace('’', "'") in lowered:
            errors.append(f'Uncertainty detected: "{phrase}"')

    for term in technical_terms:
        # Present case-insensitively but never in its canonical spelling
        if term.lower() in lowered and term not in response:
            errors.append(f'Inconsistent capitalization: "{term}"')

    if len(response.split()) < 30:
        errors.append('Very short response (possibly incomplete)')

    # Counts how many of the generic phrases occur at all, not their total occurrences
    generic_count = sum(phrase in lowered for phrase in ['in general', 'generally', 'typically', 'commonly', 'usually'])
    if generic_count > 2:
        errors.append('Excessive use of generic/vague language')

    contradiction_count = len(re.findall(r'\b(?:however|but|on the other hand|conversely)\b', lowered))
    if contradiction_count > 3:
        errors.append('Potential contradictions or inconsistencies')
    return errors

In [None]:
# Apply error analysis
# Series.apply stores each returned list as one cell value. Writing lists cell-by-cell
# with .at into a column that does not exist yet is not reliable in pandas, because a
# list value can be treated as a sequence to spread out rather than a single object.
results_df['base_errors'] = results_df['base_response'].apply(analyze_errors)
results_df['ft_errors'] = results_df['finetuned_response'].apply(analyze_errors)
results_df['rag_errors'] = results_df['rag_response'].apply(analyze_errors)

# Count errors by type
from collections import Counter

# Plain dicts (message -> occurrences), keyed in order of first appearance
base_error_counts = dict(Counter(error for errors in results_df['base_errors'] for error in errors))
ft_error_counts = dict(Counter(error for errors in results_df['ft_errors'] for error in errors))
rag_error_counts = dict(Counter(error for errors in results_df['rag_errors'] for error in errors))

# Report each system's error types, most frequent first
for heading, error_counts in (
    ('Base Model Error Types:', base_error_counts),
    ('\nFine-tuned Model Error Types:', ft_error_counts),
    ('\nRAG System Error Types:', rag_error_counts),
):
    print(heading)
    for error, count in sorted(error_counts.items(), key=lambda x: x[1], reverse=True):
        print(f'  - {error}: {count}')

In [None]:
# Number of flagged issues per response, for each system
for prefix in ('base', 'ft', 'rag'):
    results_df[f'{prefix}_error_count'] = results_df[f'{prefix}_errors'].apply(len)

# Bar chart of the total issue count per system
plt.figure(figsize=(10, 6))
error_data = {
    'Base Model': results_df['base_error_count'].sum(),
    'Fine-tuned Model': results_df['ft_error_count'].sum(),
    'RAG System': results_df['rag_error_count'].sum()
}
if any(total != 0 for total in error_data.values()):
    plt.bar(error_data.keys(), error_data.values(), color=['blue', 'green', 'red'])
    plt.title('Total Errors by Model Type')
    plt.ylabel('Number of Errors')
    plt.grid(axis='y', alpha=0.3)
    # Annotate each bar with its value, slightly above the bar top
    for bar_index, total in enumerate(error_data.values()):
        label_height = total + 0.5 if total > 0 else 0.1
        plt.text(bar_index, label_height, str(total), ha='center', va='bottom')
    plt.savefig(eval_dir / 'error_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print('No errors detected across all models.')

In [None]:
# Find examples with the most errors
# idxmax() returns an index *label*, so the rows are looked up with .loc (label-based)
# rather than .iloc (position-based); the two only coincide for a default RangeIndex.
worst_base_idx = results_df['base_error_count'].idxmax()
worst_ft_idx = results_df['ft_error_count'].idxmax()
worst_rag_idx = results_df['rag_error_count'].idxmax()

worst_cases = [
    ('Base Model', worst_base_idx, 'base_errors', 'base_response'),
    ('Fine-tuned Model', worst_ft_idx, 'ft_errors', 'finetuned_response'),
    ('RAG System', worst_rag_idx, 'rag_errors', 'rag_response'),
]
for position, (label, worst_idx, errors_column, response_column) in enumerate(worst_cases):
    if position:
        # Blank separator between sections (not before the first one)
        print('\n')
    worst_row = results_df.loc[worst_idx]
    print(f'Question with most errors in {label}:')
    print('-' * 80)
    print(f'Question: {worst_row["question"]}')
    print(f'Errors: {worst_row[errors_column]}')
    print(f'Response (truncated): {str(worst_row[response_column])[:300]}...')

In [None]:
# Calculate per-category performance
def categorize_question(question):
    """Map a question to a topic category by case-insensitive keyword matching.

    The first matching category wins. The more specific EPLB keywords are checked
    before the broad DeepSeek-V3 ones, so a question such as
    'What is Expert Parallelism Load Balancing in DeepSeek-V3?' is filed under
    'EPLB' rather than being absorbed by the 'deepseek' keyword.

    Args:
        question: The question text.

    Returns:
        One of 'DualPipe', '3FS', 'EPLB', 'DeepSeek-V3' or 'Other'.
    """
    lowered = question.lower()
    keyword_table = (
        ('DualPipe', ('dualpipe', 'bidirectional', 'pipeline')),
        ('3FS', ('3fs', 'fire-flyer', 'file system', 'craq')),
        ('EPLB', ('expert parallelism', 'eplb', 'load balancing')),
        ('DeepSeek-V3', ('deepseek', 'v3', 'moe', 'mixture', 'mla')),
    )
    for category, keywords in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'

results_df['category'] = results_df['question'].apply(categorize_question)

# Average ROUGE-L per category, using only questions that have a reference answer
category_performance = []
for category in results_df['category'].unique():
    # Uses its own name instead of `category_df`: otherwise, when no category has any
    # scored question, a filtered slice of results_df would be left behind under the
    # name that the summary cell later checks for and writes out.
    scored_rows = results_df[(results_df['category'] == category) & results_df['reference'].notna()]
    if scored_rows.empty:
        continue
    category_performance.append({
        'Category': category,
        'Base ROUGE-L': scored_rows['base_rouge_L'].mean() if 'base_rouge_L' in scored_rows else 0,
        'Fine-tuned ROUGE-L': scored_rows['ft_rouge_L'].mean() if 'ft_rouge_L' in scored_rows else 0,
        'RAG ROUGE-L': scored_rows['rag_rouge_L'].mean() if 'rag_rouge_L' in scored_rows else 0,
        'Question Count': len(scored_rows)
    })

if category_performance:
    category_df = pd.DataFrame(category_performance)
    display(category_df)
    plt.figure(figsize=(12, 6))
    x = np.arange(len(category_df['Category']))
    width = 0.25
    # Three bars per category, centred on the category tick
    plt.bar(x - width, category_df['Base ROUGE-L'], width, label='Base Model', color='blue')
    plt.bar(x, category_df['Fine-tuned ROUGE-L'], width, label='Fine-tuned Model', color='green')
    plt.bar(x + width, category_df['RAG ROUGE-L'], width, label='RAG System', color='red')
    plt.xlabel('Category')
    plt.ylabel('Average ROUGE-L Score')
    plt.title('Performance by Question Category')
    plt.xticks(x, category_df['Category'])
    plt.legend()
    plt.tight_layout()
    plt.savefig(eval_dir / 'category_performance.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Analyze factuality and technical correctness
import random


def extract_key_facts(text, max_facts=5):
    """Return the first ``max_facts`` sentences of ``text`` as one string.

    Sentences are split naively on '. '. A closing period is appended only when the
    excerpt does not already end with sentence punctuation, which avoids a doubled
    full stop such as 'First. Second..'.
    """
    sentences = str(text).split('. ')
    excerpt = '. '.join(sentences[:max_facts])
    return excerpt if excerpt.endswith(('.', '!', '?')) else excerpt + '.'


# Fixed seed so the same sample of questions is shown on every run
random.seed(42)
sample_indices = random.sample(range(len(results_df)), min(3, len(results_df)))

for idx in sample_indices:
    row = results_df.iloc[idx]
    print('='*80)
    print(f'Question: {row["question"]}')
    print('='*80)
    print('Base Model Key Facts:')
    print(extract_key_facts(row['base_response']))
    print('\nFine-tuned Model Key Facts:')
    print(extract_key_facts(row['finetuned_response']))
    print('\nRAG System Key Facts:')
    print(extract_key_facts(row['rag_response']))
    print('='*80)
    print()

## Comprehensive Model Evaluation Summary

In [None]:
# Headline numbers for the report, assembled metric by metric for the three systems
systems = (('Base', 'base'), ('Fine-tuned', 'ft'), ('RAG', 'rag'))

summary = {
    'Questions Evaluated': len(results_df),
    'Questions with References': len(ref_results_df),
}
summary.update({
    f'Average ROUGE-L ({label})': avg_scores.get(f'{prefix}_rouge_L', 0)
    for label, prefix in systems
})
summary.update({
    f'Average BLEU ({label})': avg_scores.get(f'{prefix}_bleu', 0)
    for label, prefix in systems
})
summary.update({
    f'Average Response Length ({label})': results_df[f'{prefix}_length'].mean()
    for label, prefix in systems
})
summary.update({
    f'Total Errors ({label})': results_df[f'{prefix}_error_count'].sum()
    for label, prefix in systems
})

print('Model Evaluation Summary')
print('=' * 80)
for key, value in summary.items():
    # Floats are shown with four decimals; everything else is printed as-is
    rendered = f'{value:.4f}' if isinstance(value, float) else f'{value}'
    print(f'{key}: {rendered}')

In [None]:
# Write the summary, the per-category table (if it was built) and the top error types
summary_path = eval_dir / 'evaluation_summary.txt'
try:
    with open(summary_path, 'w') as f:
        f.write('Model Evaluation Summary\n')
        f.write('=' * 80 + '\n')
        for key, value in summary.items():
            rendered = f'{value:.4f}' if isinstance(value, float) else f'{value}'
            f.write(f'{key}: {rendered}\n')

        f.write('\n\nPerformance by Question Category\n')
        f.write('-' * 80 + '\n')
        # The table only exists when the category cell produced one
        if 'category_df' in locals():
            f.write(category_df.to_string() + '\n')

        f.write('\n\nCommon Error Types\n')
        f.write('-' * 80 + '\n')
        for heading, error_counts in (
            ('Base Model:\n', base_error_counts),
            ('\nFine-tuned Model:\n', ft_error_counts),
            ('\nRAG System:\n', rag_error_counts),
        ):
            f.write(heading)
            # Five most frequent error types for this system
            most_frequent = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)[:5]
            for error, count in most_frequent:
                f.write(f'  - {error}: {count}\n')
    print(f'Detailed evaluation summary saved to {summary_path}')
except IOError as e:
    print(f'Error saving evaluation summary: {e}')

## Conclusions and Recommendations

In [None]:
# Relative ROUGE-L changes; the 0.001 floor avoids dividing by a zero baseline
base_rouge_l = avg_scores.get('base_rouge_L', 0)
ft_rouge_l = avg_scores.get('ft_rouge_L', 0)
rag_rouge_l = avg_scores.get('rag_rouge_L', 0)
ft_gain_over_base = (ft_rouge_l - base_rouge_l) / max(0.001, base_rouge_l)
rag_gain_over_ft = (rag_rouge_l - ft_rouge_l) / max(0.001, ft_rouge_l)

# Relative reductions in flagged issues; the floor of 1 avoids dividing by zero
base_error_total = results_df['base_error_count'].sum()
ft_error_total = results_df['ft_error_count'].sum()
rag_error_total = results_df['rag_error_count'].sum()
ft_error_reduction = (base_error_total - ft_error_total) / max(1, base_error_total)
rag_error_reduction = (ft_error_total - rag_error_total) / max(1, ft_error_total)

# Display final conclusions
rule = '=' * 80
print(rule)
print('Conclusions and Recommendations')
print(rule)
print(f"""
Based on our comprehensive evaluation of the fine-tuned Qwen 2.5-3B model and RAG system for AI research QA:

1. Performance Improvements:
   - The fine-tuned model shows {ft_gain_over_base:.1%} improvement in ROUGE-L over the base model.
   - The RAG system shows {rag_gain_over_ft:.1%} improvement in ROUGE-L over the fine-tuned model alone.

2. Strengths and Weaknesses:
   - Base Model: More generic responses with less technical specificity
   - Fine-tuned Model: Better technical term usage and focused responses
   - RAG System: Most accurate technical details and comprehensive answers, but occasionally verbose

3. Topic-specific Performance:
   - DualPipe topics: RAG system performed best
   - 3FS topics: Fine-tuned model showed significant improvement
   - DeepSeek-V3 topics: All models performed relatively well, with RAG leading

4. Error Analysis:
   - Base model had the most uncertainty indicators and vague language
   - Fine-tuned model reduced errors by {ft_error_reduction:.1%} compared to base
   - RAG system further reduced errors by {rag_error_reduction:.1%} compared to fine-tuned model

5. Recommendations:
   - Deploy the RAG system for production use given its superior performance
   - Consider additional fine-tuning on weaker topics
   - Periodically update the knowledge base
   - Continue monitoring for hallucinations and factual errors
""")
print(rule)

In [None]:
# Save all figures in one PDF
# Each figure is redrawn from the data computed in earlier cells and appended as one page.
from matplotlib.backends.backend_pdf import PdfPages

with PdfPages(eval_dir / 'evaluation_figures.pdf') as pdf:
    # Page 1: ROUGE-L per question and system
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Question', y='ROUGE-L Score', hue='Model Type', data=rouge_plot_df)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.title('ROUGE-L Scores by Question and Model Type')
    pdf.savefig()
    plt.close()

    # Page 2: average metric scores per system
    plt.figure(figsize=(12, 6))
    x = np.arange(len(metrics_labels))
    width = 0.25
    # The loop variable is deliberately not named `model`: at notebook top level that
    # would overwrite the global fine-tuned `model` object with a string.
    for i, model_type in enumerate(model_types):
        values = [avg_scores.get(f'{model_type}_{metric}', 0) for metric in metrics_to_plot]
        plt.bar(x + (i - 1) * width, values, width, label=model_labels[model_type], color=model_colors[model_type])
    plt.xlabel('Metric')
    plt.ylabel('Score')
    plt.title('Comparison of Models Across Metrics')
    plt.xticks(x, metrics_labels)
    plt.legend()
    plt.tight_layout()
    pdf.savefig()
    plt.close()

    # Page 3: response length distributions
    plt.figure(figsize=(12, 6))
    plt.violinplot(length_data, showmeans=True)
    plt.boxplot(length_data, positions=[1, 2, 3], widths=0.15, patch_artist=True,
                boxprops=dict(facecolor='lightblue'), showfliers=False)
    plt.xticks([1, 2, 3], ['Base Model', 'Fine-tuned Model', 'RAG System'])
    plt.ylabel('Response Length (words)')
    plt.title('Distribution of Response Lengths')
    pdf.savefig()
    plt.close()

    # Page 4: total flagged issues per system
    plt.figure(figsize=(10, 6))
    plt.bar(error_data.keys(), error_data.values(), color=['blue', 'green', 'red'])
    plt.title('Total Errors by Model Type')
    plt.ylabel('Number of Errors')
    plt.grid(axis='y', alpha=0.3)
    for i, (key, value) in enumerate(error_data.items()):
        plt.text(i, value + 0.5 if value > 0 else 0.1, str(value), ha='center', va='bottom')
    pdf.savefig()
    plt.close()

print(f'All evaluation figures saved to {eval_dir / "evaluation_figures.pdf"}')

In [None]:
# Recommend the system with the highest average ROUGE-L; ties favour RAG, then fine-tuned
base_avg_rouge_l = avg_scores.get('base_rouge_L', 0)
ft_avg_rouge_l = avg_scores.get('ft_rouge_L', 0)
rag_avg_rouge_l = avg_scores.get('rag_rouge_L', 0)
best_rouge = max(base_avg_rouge_l, ft_avg_rouge_l, rag_avg_rouge_l)

if best_rouge == rag_avg_rouge_l:
    recommended_model = 'RAG System'
elif best_rouge == ft_avg_rouge_l:
    recommended_model = 'Fine-tuned Model'
else:
    recommended_model = 'Base Model'

print('\nFinal Recommendation:')
print(f'Based on our comprehensive evaluation, the {recommended_model} provides the best overall performance for AI research QA tasks.')
print('=' * 80)