# 04 - Comprehensive Model Evaluation

**Models:** Base, SFT Trial 1 & 2, DPO Trial 1 & 2

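**Output:** `results/evaluation_results.json` (BLEU scores and model responses for every evaluated model) and `results/manual_evaluation_template.json` (blank scoring template for manual review of the DPO models)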

In [None]:
!pip install -q datasets transformers peft trl bitsandbytes accelerate sacrebleu nltk

In [None]:
import torch
import json
import os
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from sacrebleu.metrics import BLEU
import warnings
warnings.filterwarnings('ignore')

# Seed torch's global random number generator.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Directory that receives the JSON files written by this notebook.
os.makedirs('results', exist_ok=True)
print(f"Device: {device}")

## 1. Load Evaluation Prompts

In [None]:
# Read the evaluation prompts from disk. The encoding is pinned to UTF-8 so
# non-ASCII prompt text is decoded the same way on every platform instead of
# depending on the locale's default encoding.
# Entries are later read through the keys 'id', 'category', 'prompt' and
# 'target_response'.
with open('evaluation/eval_prompts.json', 'r', encoding='utf-8') as f:
    eval_prompts = json.load(f)
print(f"Loaded {len(eval_prompts)} evaluation prompts")

## 2. Setup Models

In [None]:
# Hub identifier shared by the base model and its tokenizer.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Quantization settings passed to from_pretrained: 4-bit NF4 weights with
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# The end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Adapter directory for each evaluated model. "base" maps to None, which
# load_model treats as "no adapter".
MODEL_PATHS = {"base": None}
MODEL_PATHS.update({
    trial: f"./outputs/{trial}/final"
    for trial in ("sft_trial1", "sft_trial2", "dpo_trial1", "dpo_trial2")
})

In [None]:
def load_model(adapter_path=None):
    """Load the quantized base model, optionally wrapped with a PEFT adapter.

    Args:
        adapter_path: Directory of a saved adapter. When falsy (``None`` or
            an empty string) the plain base model is returned.

    Returns:
        The base model loaded from ``MODEL_NAME`` with ``bnb_config``, or a
        ``PeftModel`` wrapping it when ``adapter_path`` is given.
    """
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
    )
    if not adapter_path:
        return base
    return PeftModel.from_pretrained(base, adapter_path)

def generate_response(model, prompt, max_tokens=256):
    """Generate one sampled chat reply from ``model`` for ``prompt``.

    The prompt is wrapped in a system/user/assistant chat layout with a fixed
    system message. Only the tokens produced after the prompt are decoded, so
    the returned text never contains the template or the prompt itself.

    Sampling uses ``temperature=0.7`` and ``top_p=0.9``, so repeated calls can
    return different text.

    Args:
        model: Model exposing ``generate`` and ``device`` (as returned by
            ``load_model``).
        prompt: User message text inserted into the chat template.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated reply with end-of-sequence markers removed and
        surrounding whitespace stripped.
    """
    # Chat-format marker strings.
    sys_tok = "<" + "|system|" + ">"
    usr_tok = "<" + "|user|" + ">"
    ast_tok = "<" + "|assistant|" + ">"
    eos = "<" + "/s" + ">"

    formatted = f"{sys_tok}\nYou are a helpful assistant.{eos}\n{usr_tok}\n{prompt}{eos}\n{ast_tok}\n"
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Slicing by token
    # count isolates the reply without searching the decoded text for the
    # assistant tag, which mis-fires when the reply itself contains that tag
    # and returns the whole prompt when the tag is not found.
    new_tokens = outputs[0][prompt_len:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # Also drop an end-of-sequence marker emitted as plain text.
    return response.replace(eos, "").strip()

# Shared scorer instance. effective_order=True is the setting sacreBLEU
# recommends when scoring single sentences.
bleu = BLEU(effective_order=True)
def calc_bleu(hyp, ref):
    """Return the sentence-level BLEU score of ``hyp`` against ``ref``.

    Args:
        hyp: Hypothesis text (the model response).
        ref: Single reference text.

    Returns:
        The ``score`` attribute of sacreBLEU's sentence-level result.
    """
    references = [ref]
    sentence_result = bleu.sentence_score(hyp, references)
    return sentence_result.score

## 3. Evaluate All Models

In [None]:
# Evaluate every configured model in turn.
# all_results maps model name -> {'results': [...], 'avg_bleu': float} on
# success, or {'error': str} when loading or generation raised.
all_results = {}

for model_name, model_path in MODEL_PATHS.items():
    print(f"\n{'='*50}")
    print(f"Evaluating: {model_name}")
    print('='*50)

    # Bound up front so the cleanup in `finally` is valid even when
    # load_model itself fails.
    model = None
    try:
        model = load_model(model_path)
        results = []

        for p in eval_prompts:
            response = generate_response(model, p['prompt'])
            bleu_score = calc_bleu(response, p['target_response'])

            results.append({
                'prompt_id': p['id'],
                'category': p['category'],
                'prompt': p['prompt'],
                'target_response': p['target_response'],
                'model_response': response,
                'bleu_score': bleu_score
            })
            print(f"  Prompt {p['id']}: BLEU={bleu_score:.2f}")

        # An empty prompt list would otherwise surface as an opaque
        # "division by zero" error below.
        if not results:
            raise ValueError("no evaluation prompts were scored")

        all_results[model_name] = {
            'results': results,
            'avg_bleu': sum(r['bleu_score'] for r in results) / len(results)
        }

    except Exception as e:
        # Deliberately best-effort: a model that fails to load or generate is
        # recorded and the remaining models are still evaluated.
        print(f"Error: {e}")
        all_results[model_name] = {'error': str(e)}

    finally:
        # Release the model on failure as well as success, so a failed model
        # does not stay resident while the next one is loaded.
        del model
        torch.cuda.empty_cache()

## 4. Generate Summary JSON

In [None]:
# Build the JSON document written to results/evaluation_results.json.
#   summary          -> per-model average BLEU and per-prompt scores
#   detailed_results -> per-model list of full result records
#   errors           -> per-model error message for models that failed
evaluation_output = {
    "timestamp": datetime.now().isoformat(),
    "num_prompts": len(eval_prompts),
    "models_evaluated": list(all_results.keys()),
    "summary": {},
    "detailed_results": {},
    "errors": {}
}

print("\n" + "="*60)
print("BLEU SCORE SUMMARY")
print("="*60)
print(f"{'Model':<20} {'Avg BLEU':<15}")
print("-"*40)

for model_name, data in all_results.items():
    if 'error' in data:
        # Keep the failure visible in both the table and the saved file:
        # the model is listed under models_evaluated, so its absence from
        # the summary would otherwise be unexplained.
        evaluation_output["errors"][model_name] = data['error']
        print(f"{model_name:<20} {'FAILED':<15}")
        continue

    avg = data['avg_bleu']
    evaluation_output["summary"][model_name] = {
        "avg_bleu": avg,
        "per_prompt_scores": {r['prompt_id']: r['bleu_score'] for r in data['results']}
    }
    evaluation_output["detailed_results"][model_name] = data['results']
    print(f"{model_name:<20} {avg:<15.2f}")

# Save comprehensive JSON
with open('results/evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump(evaluation_output, f, indent=2)
print(f"\nResults saved to results/evaluation_results.json")

## 5. Model Comparison Table

In [None]:
import pandas as pd

# Flatten the successful models' results into one row per (model, prompt).
rows = []
for model_name, data in all_results.items():
    if 'results' in data:
        for r in data['results']:
            rows.append({
                'Model': model_name,
                'Prompt ID': r['prompt_id'],
                'Category': r['category'],
                'BLEU': r['bleu_score']
            })

df = pd.DataFrame(rows)
print("\nBLEU Scores by Prompt:")
if df.empty:
    # With no rows the frame has no columns, and pivot would raise KeyError.
    # This happens when every model failed to evaluate.
    pivot = pd.DataFrame()
    print("No successful model evaluations to compare.")
else:
    # One row per prompt, one column per model. pivot requires each
    # (Prompt ID, Model) pair to be unique.
    pivot = df.pivot(index='Prompt ID', columns='Model', values='BLEU')
    print(pivot.to_string())

## 6. Sample Response Comparison

In [None]:
# Show the first recorded response of every successfully evaluated model.
banner = "=" * 60
print("\n" + banner)
print("SAMPLE RESPONSES (Prompt 1)")
print(banner)

for model_name, data in all_results.items():
    # Models that failed carry only an 'error' entry and are skipped.
    if 'results' not in data:
        continue
    first = data['results'][0]
    print(f"\n### {model_name} (BLEU: {first['bleu_score']:.2f}) ###")
    # Preview is capped at 400 characters.
    print(first['model_response'][:400])
    print("-" * 40)

## 7. Manual Evaluation Template (for DPO models)

In [None]:
# Build a blank manual-scoring template: one entry per (prompt, DPO model)
# with the score fields left as None for a human evaluator to fill in.
manual_eval_template = {
    "evaluator": "YOUR_NAME",
    "date": datetime.now().strftime("%Y-%m-%d"),
    "evaluation_criteria": {
        "helpfulness": "How helpful is the response? (1-5)",
        "harmlessness": "Is the response safe and appropriate? (1-5)",
        "relevance": "How well does it address the prompt? (1-5)"
    },
    "evaluations": []
}

# Index each successfully evaluated DPO model's results by prompt id once,
# instead of rescanning the result list for every prompt. setdefault keeps
# the first record if an id repeats.
dpo_results_by_id = {}
for dpo_name in ["dpo_trial1", "dpo_trial2"]:
    if dpo_name in all_results and 'results' in all_results[dpo_name]:
        by_id = {}
        for r in all_results[dpo_name]['results']:
            by_id.setdefault(r['prompt_id'], r)
        dpo_results_by_id[dpo_name] = by_id

# Entries are emitted prompt by prompt, models in the order listed above.
for p in eval_prompts:
    for dpo_name, by_id in dpo_results_by_id.items():
        response = by_id.get(p['id'])
        if response is None:
            # No recorded response for this prompt: skip rather than crash.
            continue
        manual_eval_template["evaluations"].append({
            "prompt_id": p['id'],
            "model": dpo_name,
            "response_preview": response['model_response'][:200],
            "helpfulness": None,
            "harmlessness": None,
            "relevance": None,
            "notes": ""
        })

with open('results/manual_evaluation_template.json', 'w', encoding='utf-8') as f:
    json.dump(manual_eval_template, f, indent=2)
print("Manual evaluation template saved to results/manual_evaluation_template.json")
print("Fill in the scores (1-5) for each response!")

## 8. Final Summary

In [None]:
# Closing banner followed by the list of result files.
banner = "=" * 60
summary_lines = [
    "\n" + banner,
    "EVALUATION COMPLETE",
    banner,
    "\nGenerated files:",
    "  - results/evaluation_results.json (all BLEU scores & responses)",
    "  - results/manual_evaluation_template.json (for DPO manual eval)",
    "\nAll training results:",
    "  - results/sft_trial1_results.json",
    "  - results/sft_trial2_results.json",
    "  - results/dpo_trial1_results.json",
    "  - results/dpo_trial2_results.json",
]
for line in summary_lines:
    print(line)