# Evaluation & Benchmarking: Fine-tuned vs Base Model

**Objective**: Comprehensively evaluate the fine-tuned model against the base model

## Evaluation Metrics

1. **Syntax Correctness**: Can the model generate valid Python code?
2. **BLEU Score**: How similar is generated code to reference?
3. **Exact Match**: Does generated code exactly match reference?
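4. **Pass@k**: Functional correctness, i.e. whether at least one of k generated samples passes the test cases (requires test cases; not computed below)
5. **Perplexity**: How well the model predicts the reference solution (lower is better)
6. **Human Evaluation**: Qualitative assessment of the example outputs printed in the Qualitative Examples section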

---

## Setup

In [None]:
import torch
import json
import ast
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported")

## Load Models

In [None]:
# Hub id of the base checkpoint, and the directory the LoRA adapter is loaded from.
base_model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
finetuned_model_path = "./sft_results"

# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The tokenizer comes from the base checkpoint and is shared by both models.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Generation needs a pad token; reuse EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Device: {device}")

In [None]:
# Keyword arguments shared by both loads of the base checkpoint.
# `dtype` is used instead of the deprecated `torch_dtype`.
_base_load_kwargs = {
    "dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}

# 1. Reference model: the base checkpoint with no adapter attached.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **_base_load_kwargs)
base_model.eval()
print("Base model loaded")

# 2. Fine-tuned model: training used LoRA, so a second copy of the base
#    checkpoint is loaded and the adapter weights are attached to it.
finetuned_model = AutoModelForCausalLM.from_pretrained(base_model_name, **_base_load_kwargs)

print(f"Loading LoRA adapter from: {finetuned_model_path}")
# NOTE(review): `torch_dtype` is passed through as an extra keyword argument;
# confirm that the installed PEFT version actually uses it for adapter weights.
finetuned_model = PeftModel.from_pretrained(
    finetuned_model,
    finetuned_model_path,
    torch_dtype=torch.float16,
)
finetuned_model.eval()
print("Fine-tuned model loaded successfully")
    

## Load Test Data

In [None]:
# Build the evaluation set by holding out 10% of the Magicoder train split
# (fixed seed so the held-out examples are reproducible).
# NOTE(review): this only measures generalization if fine-tuning used the same
# split/seed and never saw the held-out 10% - confirm against the training run.
dataset = load_dataset("ise-uiuc/Magicoder-Evol-Instruct-110K", split="train")
dataset = dataset.train_test_split(test_size=0.1, seed=42)
test_dataset = dataset['test']

# Take subset for evaluation (adjust size as needed)
num_eval_samples = 500
# Clamp to the held-out split size so a larger setting cannot make
# `select` fail with an out-of-range index.
test_dataset = test_dataset.select(range(min(num_eval_samples, len(test_dataset))))

print(f"Test dataset: {len(test_dataset)} examples")
print(f"Sample: {test_dataset[0]['instruction'][:100]}...")

## Evaluation Metrics Implementation

In [None]:
# Metrics functions for evaluation
def check_syntax(code: str) -> bool:
    """Return True when ``code`` parses as Python source, False otherwise.

    The text is only parsed with ``ast.parse``; it is never executed. An
    empty string parses successfully and therefore counts as valid.

    Args:
        code: Candidate Python source text.

    Returns:
        True if parsing succeeds, False if the parser rejects the text.
    """
    try:
        ast.parse(code)
    except (SyntaxError, ValueError, RecursionError, MemoryError):
        # SyntaxError: ordinary invalid code (includes IndentationError/TabError).
        # ValueError: source containing null bytes on older Python versions.
        # RecursionError/MemoryError: pathologically deep nesting in the parser.
        # Model output is arbitrary text, so none of these may abort the run.
        return False
    return True


def calculate_bleu(reference: str, hypothesis: str) -> float:
    """Sentence-level BLEU of ``hypothesis`` against a single ``reference``.

    Both strings are tokenized by splitting on whitespace. A hypothesis with
    no tokens scores 0.0 without calling NLTK; otherwise the score comes from
    ``sentence_bleu`` using ``SmoothingFunction().method1``.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # Nothing was generated: score it as zero instead of asking NLTK.
    if len(hyp_words) == 0:
        return 0.0

    return sentence_bleu(
        [ref_words],
        hyp_words,
        smoothing_function=SmoothingFunction().method1,
    )


def exact_match(reference: str, hypothesis: str) -> bool:
    """Tell whether the two texts are identical once outer whitespace is removed.

    Only leading and trailing whitespace is ignored; interior whitespace and
    letter case must match exactly.
    """
    expected = reference.strip()
    produced = hypothesis.strip()
    return expected == produced


def calculate_perplexity(model, input_ids, labels):
    """Run one forward pass and return ``exp(loss)`` as a Python float.

    The model is called with ``input_ids`` and ``labels`` under
    ``torch.no_grad()``. Returns None when the model output carries no loss.
    """
    with torch.no_grad():
        loss = model(input_ids=input_ids, labels=labels).loss
        if loss is None:
            return None
        return torch.exp(loss).item()


print("Metrics functions defined")

## Generation Function

In [None]:
def generate_code(model, instruction: str, max_new_tokens: int = 256, temperature: float = 0.7):
    """Generate a completion for ``instruction`` with ``model``.

    The instruction is wrapped in a system + user chat prompt via the shared
    ``tokenizer``'s chat template, and only the newly generated tokens are
    decoded and returned.

    Args:
        model: Causal LM exposing ``generate``.
        instruction: User request to put in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values > 0 use nucleus sampling
            (top_p=0.95); values <= 0 switch to greedy decoding, since
            sampling requires a strictly positive temperature.

    Returns:
        The generated text with special tokens removed and outer whitespace
        stripped.
    """
    # Format prompt
    messages = [
        {"role": "system", "content": "You are an expert Python programmer."},
        {"role": "user", "content": instruction}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The models are loaded with device_map="auto", so the inputs follow the
    # model's own device; the global `device` is only a fallback.
    target_device = getattr(model, "device", device)

    # NOTE(review): right-truncating to 1024 tokens can cut off the trailing
    # generation prompt for very long instructions - confirm this is acceptable.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(target_device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Greedy decoding: sampling parameters must not be passed.
        generation_kwargs["do_sample"] = False

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The output holds prompt + continuation; drop the prompt tokens.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return generated_text.strip()


# Smoke test: a single short generation from the base model before the full run.
test_instruction = "Write a Python function to calculate factorial"
test_output = generate_code(
    model=base_model,
    instruction=test_instruction,
    max_new_tokens=100,
)
# Only the first 200 characters are printed.
print(f"Test generation:\n{test_output[:200]}...")

## Run Evaluation

In [None]:
# Evaluate model function 
def evaluate_model(model, dataset, model_name: str, num_samples: int = 500):
    """
    Generate a completion for each evaluated example and aggregate the metrics.

    For every example, code is generated from ``example['instruction']`` and
    compared with ``example['response']`` (syntax validity, BLEU, exact
    match). Perplexity is computed from the model's loss on the templated
    conversation that contains the reference answer.

    Args:
        model: Causal LM to evaluate.
        dataset: Dataset with 'instruction' and 'response' fields, supporting
            ``len`` and ``select``.
        model_name: Label used in the progress message.
        num_samples: Maximum number of examples to evaluate. Clamped to
            ``[0, len(dataset)]``.

    Returns:
        dict: Raw per-sample metrics ('bleu_scores', 'perplexities'),
        counters ('syntax_correct', 'exact_matches'), the first 10
        'examples', and a 'summary' dict with 'syntax_accuracy',
        'avg_bleu', 'exact_match_rate', 'avg_perplexity',
        'perplexity_samples' and 'total_samples'.
    """
    results = {
        'syntax_correct': 0,
        'bleu_scores': [],
        'exact_matches': 0,
        'perplexities': [],
        'examples': []
    }

    print(f"\nEvaluating {model_name}...")

    # Number of examples actually evaluated. Clamping avoids indexing past the
    # dataset and negative counts; computed once and reused for the summary.
    total = max(0, min(num_samples, len(dataset)))
    if total == 0:
        # Nothing to evaluate: return zeros rather than dividing by zero or
        # averaging empty lists.
        results['summary'] = {
            'syntax_accuracy': 0.0,
            'avg_bleu': 0.0,
            'exact_match_rate': 0.0,
            'avg_perplexity': 0.0,
            'perplexity_samples': 0,
            'total_samples': 0
        }
        return results

    # Models are loaded with device_map="auto"; follow the model's own device
    # and fall back to the global `device` only when it exposes none.
    target_device = getattr(model, "device", device)

    for i, example in enumerate(tqdm(dataset.select(range(total)))):
        instruction = example['instruction']
        reference = example['response']

        generated = generate_code(model, instruction)

        # Syntax correctness
        is_valid = check_syntax(generated)
        if is_valid:
            results['syntax_correct'] += 1

        # BLEU score
        bleu = calculate_bleu(reference, generated)
        results['bleu_scores'].append(bleu)

        # Exact match
        if exact_match(reference, generated):
            results['exact_matches'] += 1

        # Perplexity (calculate on reference). The labels are the input ids
        # themselves, so every token of the templated conversation (system,
        # instruction and reference) contributes to the loss.
        messages = [
            {"role": "system", "content": "You are an expert Python programmer."},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": reference}
        ]
        full_text = tokenizer.apply_chat_template(messages, tokenize=False)
        inputs = tokenizer(full_text, return_tensors="pt", truncation=True, max_length=1024).to(target_device)

        try:
            ppl = calculate_perplexity(model, inputs.input_ids, inputs.input_ids)
        except (RuntimeError, ValueError, IndexError):
            # Best effort: a failed forward pass only drops this sample's
            # perplexity; 'perplexity_samples' in the summary shows how many
            # values were kept.
            ppl = None
        # Missing, NaN and extreme (>= 1000) values are excluded so one
        # outlier cannot dominate the mean.
        if ppl is not None and not np.isnan(ppl) and ppl < 1000:
            results['perplexities'].append(ppl)

        # Save first 10 examples
        if i < 10:
            results['examples'].append({
                'instruction': instruction,
                'reference': reference,
                'generated': generated,
                'syntax_correct': is_valid,
                'bleu': bleu
            })

    # Calculate summary statistics
    perplexities = results['perplexities']
    results['summary'] = {
        'syntax_accuracy': results['syntax_correct'] / total * 100,
        'avg_bleu': float(np.mean(results['bleu_scores'])),
        'exact_match_rate': results['exact_matches'] / total * 100,
        'avg_perplexity': float(np.mean(perplexities)) if perplexities else 0.0,
        'perplexity_samples': len(perplexities),
        'total_samples': total
    }

    return results


print("Evaluation function defined")

In [None]:
# Run evaluation on both models over the same prepared test set.
# The sample count follows the size of `test_dataset` instead of a hard-coded
# 500, so changing the evaluation subset size is picked up here as well.
eval_sample_count = len(test_dataset)
base_results = evaluate_model(base_model, test_dataset, "Base Model", num_samples=eval_sample_count)
finetuned_results = evaluate_model(finetuned_model, test_dataset, "Fine-tuned Model", num_samples=eval_sample_count)

print("\n Evaluation complete!")

## Results Comparison

In [None]:
import pandas as pd


def _relative_pct(delta, baseline):
    """Format ``delta`` as a percentage of ``baseline`` ('n/a' when baseline is 0)."""
    if not baseline:
        # A zero baseline (e.g. no perplexity recorded, or BLEU of 0) makes a
        # relative change undefined; avoid dividing by zero.
        return "n/a"
    return f"{delta / baseline * 100:.2f}%"


base_summary = base_results['summary']
finetuned_summary = finetuned_results['summary']

# Create comparison table (values are pre-formatted strings, except the counts)
comparison = pd.DataFrame({
    'Metric': ['Syntax Accuracy (%)', 'Average BLEU', 'Exact Match (%)', 'Average Perplexity', 'Total Samples'],
    'Base Model': [
        f"{base_summary['syntax_accuracy']:.2f}",
        f"{base_summary['avg_bleu']:.4f}",
        f"{base_summary['exact_match_rate']:.2f}",
        f"{base_summary['avg_perplexity']:.2f}",
        base_summary['total_samples']
    ],
    'Fine-tuned Model': [
        f"{finetuned_summary['syntax_accuracy']:.2f}",
        f"{finetuned_summary['avg_bleu']:.4f}",
        f"{finetuned_summary['exact_match_rate']:.2f}",
        f"{finetuned_summary['avg_perplexity']:.2f}",
        finetuned_summary['total_samples']
    ]
})

print("\n Model Comparison:")
print(comparison.to_string(index=False))

# Calculate improvements.
# Syntax accuracy and exact match are absolute differences between two
# percentages; BLEU and perplexity are changes relative to the base model.
print("\n📈 Improvements:")
print(f"Syntax Accuracy: {finetuned_summary['syntax_accuracy'] - base_summary['syntax_accuracy']:.2f}% improvement")
print(f"BLEU Score: {_relative_pct(finetuned_summary['avg_bleu'] - base_summary['avg_bleu'], base_summary['avg_bleu'])} improvement")
print(f"Exact Match: {finetuned_summary['exact_match_rate'] - base_summary['exact_match_rate']:.2f}% improvement")
print(f"Perplexity: {_relative_pct(base_summary['avg_perplexity'] - finetuned_summary['avg_perplexity'], base_summary['avg_perplexity'])} reduction")

## Qualitative Examples

In [None]:
def display_comparison(example_idx: int):
    """Print one stored example for both models, one after the other.

    Reads entry ``example_idx`` from ``base_results['examples']`` and
    ``finetuned_results['examples']``. The instruction and reference are
    taken from the base model's entry; long fields are cut to a fixed number
    of characters and always followed by "...".
    """
    base_entry = base_results['examples'][example_idx]
    tuned_entry = finetuned_results['examples'][example_idx]
    rule = '=' * 80

    # Banner with a 1-based example number.
    print(f"\n{rule}")
    print(f"Example {example_idx + 1}")
    print(rule)

    print(f"\n Instruction:\n{base_entry['instruction'][:200]}...")

    print(f"\n Reference (Ground Truth):\n{base_entry['reference'][:300]}...")

    # Same layout for each model: heading, metric line, output preview.
    for heading, entry in (
        ("\n Base Model Output:", base_entry),
        ("\n Fine-tuned Model Output:", tuned_entry),
    ):
        print(heading)
        print(f"Syntax Valid: {entry['syntax_correct']} | BLEU: {entry['bleu']:.4f}")
        print(f"{entry['generated'][:300]}...")


# Show at most five of the stored examples (fewer if fewer were saved).
num_to_show = min(5, len(base_results['examples']))
for i in range(num_to_show):
    display_comparison(i)

## Save Results

In [None]:
# Write both models' summary metrics and their differences to a JSON file.
import json

base_summary = base_results['summary']
finetuned_summary = finetuned_results['summary']

# Differences are fine-tuned minus base, except perplexity, which is stored
# as base minus fine-tuned so that a positive number means a reduction.
improvements = {
    'syntax_accuracy': finetuned_summary['syntax_accuracy'] - base_summary['syntax_accuracy'],
    'bleu_score': finetuned_summary['avg_bleu'] - base_summary['avg_bleu'],
    'exact_match': finetuned_summary['exact_match_rate'] - base_summary['exact_match_rate'],
    'perplexity_reduction': base_summary['avg_perplexity'] - finetuned_summary['avg_perplexity'],
}

results_summary = {
    'base_model': base_summary,
    'finetuned_model': finetuned_summary,
    'improvements': improvements,
}

with open('evaluation_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("Results saved to evaluation_results.json")

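- **Syntax Correctness**: The fine-tuned model generates syntactically valid code X percentage points more often than the base model
- **BLEU Score**: Improved by X% relative to the base model, indicating closer alignment with the reference code
- **Perplexity**: Reduced by X% relative to the base model, meaning the fine-tuned model assigns higher probability to the reference solutions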

### Next Steps:
1. Human evaluation on complex coding tasks
2. Test on real-world scenarios (FastAPI, Django, React)
3. Deploy for production testing
4. Consider GRPO stage for further alignment