# 05 - BERT Fine-tuning Experiments

This notebook contains BERT fine-tuning experiments for ticket classification, comparing different hyperparameter configurations:

**Experiment Dimensions:**
- Learning Rate: [2e-5, 3e-5, 5e-5]
- Freeze Strategy: [Full fine-tuning, Frozen encoder, Partial unfreezing] — note: none of the experiments defined below uses partial unfreezing yet
- Epochs: [3, 5]

**Goal:** Find the optimal configuration. Models are not saved; only experiment results are recorded.

In [None]:
import sys
import os
# Put the parent of the current working directory at the front of sys.path so
# that the `src.*` imports further down this cell can be resolved.
# NOTE(review): presumably the kernel is started from a subdirectory of the
# repository (e.g. the notebooks folder) — confirm before running elsewhere.
project_root = os.path.dirname(os.getcwd())
sys.path.insert(0, project_root)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm.auto import tqdm
import json
from datetime import datetime

from src.data_utils import load_text_classification_data
from src.text_preprocess import basic_clean
from src.model.bert_model import BertClassifier, get_tokenizer

# Use the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

## 1. Load and Prepare Data

In [None]:
# Load the three splits. Only the label mappings returned for the 'train'
# split are kept; the ones returned for 'val' and 'test' are discarded.
train_texts, train_labels, label2id, id2label = load_text_classification_data('train')
val_texts, val_labels, _, _ = load_text_classification_data('val')
test_texts, test_labels, _, _ = load_text_classification_data('test')

# Run every text of every split through basic_clean
train_texts = list(map(basic_clean, train_texts))
val_texts = list(map(basic_clean, val_texts))
test_texts = list(map(basic_clean, test_texts))

# Quick sanity report on split sizes and the label mapping
print(f"Train samples: {len(train_texts)}")
print(f"Val samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")
print(f"Label mapping: {label2id}")

In [None]:
# Dataset class
class TicketDataset(Dataset):
    """Map-style dataset that tokenizes one text per requested item.

    Args:
        texts: Indexable collection of texts (each item is passed through ``str``).
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the texts collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened 'input_ids', 'attention_mask' and a long 'labels' tensor."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so the DataLoader can stack items.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

## 2. Training Function

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one training pass over ``dataloader`` and report training metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its output
            is used as the input of a cross-entropy loss against the batch labels.
        dataloader: Iterable of dict batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        optimizer: Optimizer whose gradients are zeroed and stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy, f1_macro)`` where ``avg_loss`` is the mean
        of the per-batch losses and the two scores are computed over all
        predictions made during the pass.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    # Loop-invariant: build the loss module once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    predictions = []
    true_labels = []

    progress_bar = tqdm(dataloader, desc='Training')
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        preds = outputs.argmax(dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': batch_loss})

    # Count the batches actually seen rather than relying on len(dataloader):
    # this also works for loaders without a length and gives a clear error
    # instead of a bare ZeroDivisionError when nothing was iterated.
    if num_batches == 0:
        raise ValueError("train_epoch received a dataloader that yielded no batches")

    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')

    return avg_loss, accuracy, f1_macro

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; the class
            prediction is the argmax over the last dimension of its output.
        dataloader: Iterable of dict batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, f1_macro, f1_weighted, predictions, true_labels)``;
        the last two are flat lists accumulated over all batches.
    """
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, mask)
            batch_preds = torch.argmax(logits, dim=-1)

            all_preds.extend(batch_preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    acc = accuracy_score(all_targets, all_preds)
    macro_f1 = f1_score(all_targets, all_preds, average='macro')
    weighted_f1 = f1_score(all_targets, all_preds, average='weighted')

    return acc, macro_f1, weighted_f1, all_preds, all_targets

## 3. Experiment Runner

In [None]:
def run_experiment(exp_config, train_texts, train_labels, val_texts, val_labels):
    """
    Run a single BERT fine-tuning experiment.

    Relies on the notebook-level ``device``, ``get_tokenizer``, ``BertClassifier``,
    ``TicketDataset``, ``train_epoch`` and ``evaluate``.

    Args:
        exp_config: dict with required keys: name, lr, epochs, freeze_bert,
            model_name, batch_size. Optional keys:
              - num_classes: size of the classifier output; when absent it is
                derived from the labels as ``max(label) + 1``.
              - seed: value passed to ``torch.manual_seed`` (default 42) so that
                all configurations start from the same random state.
        train_texts, train_labels: training texts and their integer labels.
        val_texts, val_labels: validation texts and their integer labels.

    Returns:
        dict with keys 'config' (the dict passed in), 'seed', 'num_classes',
        'epochs' (one metrics dict per epoch), 'best_val_f1_macro' and
        'best_epoch' (1-based epoch of the best validation macro F1, or None
        if no epoch scored above 0).
    """
    print(f"\n{'='*80}")
    print(f"Experiment: {exp_config['name']}")
    print(f"{'='*80}")
    print(f"Config: {exp_config}")

    # Seed before model creation and data shuffling so that runs differ only
    # in their configuration, not in their random initialisation/batch order.
    seed = exp_config.get('seed', 42)
    torch.manual_seed(seed)

    # Derive the number of classes instead of hard-coding it. Labels are used
    # as class indices by the loss, so the head needs max(label) + 1 outputs.
    num_classes = exp_config.get('num_classes')
    if num_classes is None:
        num_classes = int(max(max(train_labels), max(val_labels))) + 1

    # Initialize tokenizer and datasets
    tokenizer = get_tokenizer(exp_config['model_name'])
    train_dataset = TicketDataset(train_texts, train_labels, tokenizer)
    val_dataset = TicketDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=exp_config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=exp_config['batch_size'], shuffle=False)

    # Initialize model
    model = BertClassifier(
        model_name=exp_config['model_name'],
        num_classes=num_classes,
        dropout=0.3,
        freeze_bert=exp_config['freeze_bert']
    ).to(device)

    # Count trainable parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total params: {total_params:,}")
    print(f"Trainable params: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=exp_config['lr'])

    # Training loop
    best_val_f1 = 0
    best_epoch = None
    results = {
        'config': exp_config,
        'seed': seed,
        'num_classes': num_classes,
        'epochs': []
    }

    for epoch in range(exp_config['epochs']):
        print(f"\nEpoch {epoch+1}/{exp_config['epochs']}")

        # Train
        train_loss, train_acc, train_f1 = train_epoch(model, train_loader, optimizer, device)

        # Validate
        val_acc, val_f1_macro, val_f1_weighted, _, _ = evaluate(model, val_loader, device)

        epoch_results = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'train_f1_macro': train_f1,
            'val_acc': val_acc,
            'val_f1_macro': val_f1_macro,
            'val_f1_weighted': val_f1_weighted
        }
        results['epochs'].append(epoch_results)

        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}")
        print(f"Val Acc: {val_acc:.4f} | Val F1 Macro: {val_f1_macro:.4f} | Val F1 Weighted: {val_f1_weighted:.4f}")

        # Remember which epoch produced the best validation score
        if val_f1_macro > best_val_f1:
            best_val_f1 = val_f1_macro
            best_epoch = epoch + 1

    results['best_val_f1_macro'] = best_val_f1
    results['best_epoch'] = best_epoch
    print(f"\nBest Val F1 Macro: {best_val_f1:.4f}")

    return results

## 4. Define Experiments

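The experiments below each change one factor relative to the baseline (EXP1): EXP2 and EXP3 raise the learning rate, and EXP4 freezes the encoder. EXP5 extends EXP2 (LR=3e-5) from 3 to 5 epochs.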

In [None]:
# Experiment configurations
# Each row: (name, lr, epochs, freeze_bert, description). The model name and
# batch size are the same for every experiment and are filled in below.
_experiment_table = [
    # Baseline: Full fine-tuning with standard LR
    ('EXP1_baseline_lr2e5_epoch3', 2e-5, 3, False,
     'Baseline: Full fine-tuning, LR=2e-5, 3 epochs'),
    # Experiment with higher LR
    ('EXP2_lr3e5_epoch3', 3e-5, 3, False,
     'Higher LR: LR=3e-5, 3 epochs'),
    # Experiment with even higher LR
    ('EXP3_lr5e5_epoch3', 5e-5, 3, False,
     'Even higher LR: LR=5e-5, 3 epochs'),
    # Frozen encoder (only train classifier)
    ('EXP4_frozen_lr2e5_epoch3', 2e-5, 3, True,
     'Frozen encoder: only train classifier head'),
    # More epochs with best LR
    ('EXP5_lr3e5_epoch5', 3e-5, 5, False,
     'More epochs: LR=3e-5, 5 epochs'),
]

# Expand every row into the config dict consumed by run_experiment
experiments = [
    {
        'name': exp_name,
        'model_name': 'distilbert-base-uncased',
        'lr': learning_rate,
        'epochs': num_epochs,
        'freeze_bert': freeze,
        'batch_size': 32,
        'description': summary,
    }
    for exp_name, learning_rate, num_epochs, freeze, summary in _experiment_table
]

print(f"Total experiments: {len(experiments)}")
for exp in experiments:
    print(f"  - {exp['name']}: {exp['description']}")

## 5. Run All Experiments

**Note:** This will take some time. It is recommended to run on a server with GPU.

In [None]:
# Run all experiments
all_results = []
failed_experiments = []  # (experiment name, "ErrorType: message") for every run that raised

for exp_config in experiments:
    try:
        result = run_experiment(exp_config, train_texts, train_labels, val_texts, val_labels)
        all_results.append(result)
    except Exception as e:
        # Best-effort sweep: a failing configuration must not abort the rest,
        # but it is recorded so the failure is visible after the loop.
        print(f"Error in experiment {exp_config['name']}: {e}")
        failed_experiments.append((exp_config['name'], f"{type(e).__name__}: {e}"))
    finally:
        # Free memory after every run. This sits in `finally` so that it also
        # happens after a failed run (e.g. out-of-memory), which is when the
        # next experiment needs the cached GPU memory back the most.
        torch.cuda.empty_cache()

print("\nAll experiments completed!")
if failed_experiments:
    print(f"{len(failed_experiments)} of {len(experiments)} experiments failed:")
    for failed_name, failure in failed_experiments:
        print(f"  - {failed_name}: {failure}")

## 6. Results Summary

In [None]:
# Create summary dataframe
# The run loop swallows per-experiment errors, so all_results can be empty.
# Fail with a clear message instead of an opaque KeyError from sort_values.
if not all_results:
    raise RuntimeError("No experiment results to summarize: every experiment failed or none was run.")

summary_data = []

for result in all_results:
    config = result['config']
    last_epoch = result['epochs'][-1]

    summary_data.append({
        'Experiment': config['name'],
        'LR': config['lr'],
        'Epochs': config['epochs'],
        'Freeze': config['freeze_bert'],
        'Best Val F1': result['best_val_f1_macro'],
        'Final Val Acc': last_epoch['val_acc'],
        'Final Train Acc': last_epoch['train_acc'],
        'Description': config['description']
    })

summary_df = pd.DataFrame(summary_data)
# Best experiment first; later cells rely on this ordering (head / iloc[0]).
summary_df = summary_df.sort_values('Best Val F1', ascending=False)

print("\n" + "="*100)
print("EXPERIMENT RESULTS SUMMARY")
print("="*100)
print(summary_df.to_string(index=False))

# Save results
# Make sure the output directory exists so that results from a long run are
# not lost to a FileNotFoundError at the very end.
os.makedirs('../data', exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_file = f'../data/bert_experiments_{timestamp}.json'
with open(results_file, 'w') as f:
    json.dump(all_results, f, indent=2)
print(f"\nDetailed results saved to: {results_file}")

summary_csv = f'../data/bert_experiments_summary_{timestamp}.csv'
summary_df.to_csv(summary_csv, index=False)
print(f"Summary saved to: {summary_csv}")

## 7. Visualize Results

In [None]:
import matplotlib.pyplot as plt

# One figure, two panels: ranking on the left, learning curves on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, curve_ax = axes

# Left panel: horizontal bars, best experiment at the top
ranked = summary_df.sort_values('Best Val F1', ascending=True)
bar_positions = range(len(ranked))
bar_ax.barh(bar_positions, ranked['Best Val F1'])
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(ranked['Experiment'])
bar_ax.set_xlabel('Best Validation F1 Macro')
bar_ax.set_title('Experiment Comparison: Best Val F1')
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: validation F1 per epoch for the first three rows of summary_df.
# Index the detailed results by experiment name (first occurrence wins).
results_by_name = {}
for result in all_results:
    results_by_name.setdefault(result['config']['name'], result)

for exp_name in summary_df.head(3)['Experiment']:
    matched = results_by_name.get(exp_name)
    if matched is None:
        continue
    history = matched['epochs']
    curve_ax.plot(
        [entry['epoch'] for entry in history],
        [entry['val_f1_macro'] for entry in history],
        marker='o',
        label=exp_name,
    )

curve_ax.set_xlabel('Epoch')
curve_ax.set_ylabel('Validation F1 Macro')
curve_ax.set_title('Top 3 Experiments: Learning Curves')
curve_ax.legend()
curve_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Analysis & Insights

In [None]:
print("\n" + "="*80)
print("KEY INSIGHTS")
print("="*80)

# Best experiment: relies on summary_df being sorted by 'Best Val F1'
# descending, so the first row is the winner.
best_exp = summary_df.iloc[0]
print(f"\n1. BEST CONFIGURATION:")
print(f"   Experiment: {best_exp['Experiment']}")
print(f"   Learning Rate: {best_exp['LR']}")
print(f"   Epochs: {best_exp['Epochs']}")
print(f"   Freeze BERT: {best_exp['Freeze']}")
print(f"   Best Val F1: {best_exp['Best Val F1']:.4f}")

# Split once into frozen / unfrozen runs; the sections below reuse both groups.
is_frozen = summary_df['Freeze'].astype(bool)
frozen = summary_df[is_frozen]
unfrozen = summary_df[~is_frozen]

# Learning rate comparison (unfrozen models only). The epoch count is shown
# because the same LR can appear with different numbers of epochs.
print(f"\n2. LEARNING RATE IMPACT (unfrozen models):")
for _, row in unfrozen.iterrows():
    print(f"   LR={row['LR']:.0e} ({row['Epochs']} epochs): Val F1={row['Best Val F1']:.4f}")

# Frozen vs Unfrozen
frozen_avg = frozen['Best Val F1'].mean()
unfrozen_avg = unfrozen['Best Val F1'].mean()
print(f"\n3. FREEZE STRATEGY:")
if frozen.empty or unfrozen.empty:
    # A group can be missing when its experiment failed; avoid printing nan.
    print("   Comparison unavailable: need at least one frozen and one unfrozen result")
else:
    print(f"   Frozen encoder avg F1: {frozen_avg:.4f}")
    print(f"   Full fine-tuning avg F1: {unfrozen_avg:.4f}")
    if frozen_avg > 0:
        print(f"   Difference: {unfrozen_avg - frozen_avg:.4f} ({(unfrozen_avg/frozen_avg - 1)*100:.1f}% improvement)")
    else:
        # A relative improvement over a zero baseline is undefined.
        print(f"   Difference: {unfrozen_avg - frozen_avg:.4f}")

# Epoch impact: computed over unfrozen runs only, so the frozen-encoder run
# does not pull down the average of the epoch group it belongs to.
print(f"\n4. EPOCH IMPACT (unfrozen models):")
for n_epochs in sorted(unfrozen['Epochs'].unique()):
    subset = unfrozen[unfrozen['Epochs'] == n_epochs]
    avg_f1 = subset['Best Val F1'].mean()
    print(f"   {n_epochs} epochs: avg F1={avg_f1:.4f}")

## 9. Recommendations

Based on the experiments above:

1. **Optimal Configuration**: Use the best performing setup identified above
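2. **Learning Rate**: **TODO** — fill in the best-performing LR from the section 8 output once the experiments have run
3. **Freezing Strategy**: Full fine-tuning is expected to outperform the frozen encoder; confirm this against the section 8 comparison, and note that only one frozen-encoder run (EXP4) is included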
4. **Training Duration**: Monitor validation metrics to avoid overfitting

**Next Steps for Production:**
- Train final model with best config on train+val data
- Evaluate on test set
- Consider ensemble methods if needed
- Implement early stopping for efficiency