# Advanced Models: Ensemble and BERT

This notebook tests whether combining the trained LSTM and CNN models (an averaging ensemble) or fine-tuning a pretrained transformer (BERT) improves multi-label classification.

In [None]:
import sys
sys.path.append('..')

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pickle
import time

from src.models import LSTMClassifier, SimpleCNN
from src.utils import get_predictions, calculate_metrics
from src.train import clean_text, build_vocab, TextDataset
from torch.utils.data import DataLoader

In [None]:
# Restore the pickled vocabulary and label-column list from ../outputs.
# NOTE(review): pickle.load can execute arbitrary code from a crafted file, so
# these paths should only ever point at artifacts produced by this project.
with open('../outputs/vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
with open('../outputs/label_cols.pkl', 'rb') as labels_file:
    label_cols = pickle.load(labels_file)

# Quick sanity check on what was loaded.
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")
print(f"Labels: {label_cols}")

In [None]:
# Re-read the raw training CSV and attach a cleaned copy of every comment.
df = pd.read_csv("../data/raw/train.csv").assign(
    clean_text=lambda frame: frame["comment_text"].apply(clean_text)
)

# Hold out 20% of the rows with a fixed seed.
# NOTE(review): presumably this reproduces the split used when the LSTM/CNN
# were trained, so the test rows are unseen by those models - confirm.
texts = df["clean_text"].values
targets = df[label_cols].values
X_train, X_test, y_train, y_test = train_test_split(
    texts, targets, test_size=0.2, random_state=42
)

# Wrap the held-out rows for batched evaluation of the vocab-based models.
test_dataset = TextDataset(X_test, y_test, vocab, max_len=100)
test_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

n_tokens = len(vocab)
n_labels = len(label_cols)

# NOTE(review): the hyperparameters below presumably have to match the ones
# used at training time for the saved state dicts to load - confirm.
lstm_model = LSTMClassifier(
    vocab_size=n_tokens,
    embedding_dim=100,
    hidden_dim=128,
    output_dim=n_labels,
    n_layers=2,
    dropout=0.3,
)
lstm_model = lstm_model.to(device)
lstm_state = torch.load('../outputs/lstm_model.pt', map_location=device)
lstm_model.load_state_dict(lstm_state)
lstm_model.eval()  # inference mode (e.g. disables dropout)

cnn_model = SimpleCNN(
    vocab_size=n_tokens,
    embedding_dim=100,
    n_filters=100,
    filter_sizes=[3, 4, 5],
    output_dim=n_labels,
    dropout=0.5,
)
cnn_model = cnn_model.to(device)
cnn_state = torch.load('../outputs/cnn_model.pt', map_location=device)
cnn_model.load_state_dict(cnn_state)
cnn_model.eval()  # inference mode (e.g. disables dropout)

print("Models loaded")

## Ensemble: Averaging Predictions

Average the per-label probabilities predicted by the LSTM and the CNN, then score the averaged predictions.

In [None]:
# Run both trained models over the same held-out loader.
# The second element returned by get_predictions is kept as y_true from the
# LSTM call and discarded for the CNN call.
lstm_preds, y_true = get_predictions(lstm_model, test_loader, device)
cnn_preds, _ = get_predictions(cnn_model, test_loader, device)

pred_shape = lstm_preds.shape
print(f"Predictions shape: {pred_shape}")

In [None]:
# Unweighted average of the two models' outputs, element by element.
ensemble_preds = 0.5 * (lstm_preds + cnn_preds)

metrics_ensemble = calculate_metrics(y_true, ensemble_preds, label_cols)

# Report only the aggregate ('overall') metrics here.
print("Ensemble Results:")
for metric_name, metric_value in metrics_ensemble['overall'].items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Score each base model on its own so the ensemble can be compared per label.
metrics_lstm = calculate_metrics(y_true, lstm_preds, label_cols)
metrics_cnn = calculate_metrics(y_true, cnn_preds, label_cols)

# Column title -> metrics dict; insertion order fixes the column order.
f1_sources = {
    'LSTM F1': metrics_lstm,
    'CNN F1': metrics_cnn,
    'Ensemble F1': metrics_ensemble,
}

comparison = pd.DataFrame({
    'Label': label_cols,
    **{
        column: [model_metrics[label]['f1'] for label in label_cols]
        for column, model_metrics in f1_sources.items()
    },
})

print(comparison.to_string(index=False))

## BERT Fine-tuning

Fine-tune a pretrained transformer (`bert-base-uncased`) for multi-label classification on a subset of the training data.

In [None]:
# Optional dependency: the BERT section only runs when `transformers` imports.
try:
    from transformers import BertTokenizer, BertForSequenceClassification
    from torch.utils.data import TensorDataset
    BERT_AVAILABLE = True
except ImportError:
    print("transformers not installed. Run: pip install transformers")
    BERT_AVAILABLE = False

# AdamW is taken from PyTorch rather than from `transformers`: the
# `transformers.AdamW` class was deprecated and later removed, so importing it
# inside the try-block above made recent installs look like "transformers not
# installed" and silently skipped every BERT cell.
# NOTE(review): torch.optim.AdamW's defaults (e.g. weight_decay, eps) differ
# from the old transformers implementation; pass them explicitly where the
# optimizer is built if exact parity with earlier runs matters.
from torch.optim import AdamW

In [None]:
if BERT_AVAILABLE:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Fine-tune on only the first `train_size` training rows to keep the
    # runtime manageable; the test split is used in full.
    train_size = 50000
    X_train_bert, y_train_bert = X_train[:train_size], y_train[:train_size]

    # Same tokenizer settings for both splits: truncate, pad, cap at 128.
    tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
    train_encodings = tokenizer(list(X_train_bert), **tokenize_kwargs)
    test_encodings = tokenizer(list(X_test), **tokenize_kwargs)

    def _as_tensor_dataset(encodings, labels):
        """Bundle input ids, attention mask and float labels into a TensorDataset."""
        return TensorDataset(
            torch.tensor(encodings['input_ids']),
            torch.tensor(encodings['attention_mask']),
            torch.tensor(labels, dtype=torch.float),
        )

    train_dataset_bert = _as_tensor_dataset(train_encodings, y_train_bert)
    test_dataset_bert = _as_tensor_dataset(test_encodings, y_test)

    # Only the training loader is shuffled.
    train_loader_bert = DataLoader(train_dataset_bert, batch_size=16, shuffle=True)
    test_loader_bert = DataLoader(test_dataset_bert, batch_size=16)

    print(f"BERT datasets ready (using {train_size} training samples)")

In [None]:
if BERT_AVAILABLE:
    # One output logit per label column; problem_type selects the multi-label
    # setup (see the transformers docs for the loss this implies).
    bert_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_cols),
        problem_type="multi_label_classification",
    )
    bert_model = bert_model.to(device)

    # All model parameters are fine-tuned with a single learning rate.
    optimizer_bert = AdamW(bert_model.parameters(), lr=2e-5)

    n_params = sum(param.numel() for param in bert_model.parameters())
    print(f"BERT params: {n_params}")

In [None]:
if BERT_AVAILABLE:
    n_epochs_bert = 3
    n_train_batches = len(train_loader_bert)

    # Wall-clock timer for the whole fine-tuning run.
    start_time = time.time()

    for epoch in range(n_epochs_bert):
        bert_model.train()
        epoch_loss = 0

        for batch in train_loader_bert:
            # Each batch is (input_ids, attention_mask, labels); move all to device.
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            optimizer_bert.zero_grad()
            # Passing labels makes the model return its own loss.
            loss = bert_model(
                input_ids, attention_mask=attention_mask, labels=labels
            ).loss

            loss.backward()
            optimizer_bert.step()

            epoch_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_train_loss = epoch_loss / n_train_batches
        print(f'Epoch {epoch+1}/{n_epochs_bert} | Train Loss: {avg_train_loss:.3f}')

    bert_train_time = time.time() - start_time
    print(f"\nBERT training time: {bert_train_time:.2f}s")

In [None]:
if BERT_AVAILABLE:
    bert_model.eval()
    pred_rows = []
    label_rows = []

    # No gradients are needed while scoring the held-out set.
    with torch.no_grad():
        for batch in test_loader_bert:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            logits = bert_model(input_ids, attention_mask=attention_mask).logits
            # NOTE(review): BERT outputs are hard-thresholded at 0.5 here, while
            # the LSTM/CNN/ensemble scores go to calculate_metrics unthresholded;
            # confirm calculate_metrics treats both forms the same way.
            hard_preds = (torch.sigmoid(logits) > 0.5).float()

            pred_rows.extend(hard_preds.cpu().numpy())
            label_rows.extend(labels.cpu().numpy())

    # Stack the per-sample rows into 2-D arrays.
    bert_preds = np.array(pred_rows)
    bert_labels = np.array(label_rows)

    metrics_bert = calculate_metrics(bert_labels, bert_preds, label_cols)

    print("BERT Results:")
    for metric_name, metric_value in metrics_bert['overall'].items():
        print(f"{metric_name}: {metric_value:.4f}")

## Final Comparison

In [None]:
# (display name, metrics dict) for every model that was evaluated above.
scored_models = [
    ('LSTM', metrics_lstm),
    ('CNN', metrics_cnn),
    ('Ensemble', metrics_ensemble),
]
# BERT is only included when its optional section actually ran.
if BERT_AVAILABLE:
    scored_models.append(('BERT', metrics_bert))

results = {
    'Model': [model_name for model_name, _ in scored_models],
    'F1': [model_metrics['overall']['f1'] for _, model_metrics in scored_models],
}

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

In [None]:
# Bar chart of the overall F1 per model, saved to ../outputs and shown inline.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(results_df['Model'], results_df['F1'], alpha=0.7)
ax.set_ylabel('F1 Score')
ax.set_title('Model Comparison - Multi-label Classification')

# A fixed lower limit of 0.5 hid the bar (and stranded the label) of any model
# scoring below it. Derive the floor from the data instead, never above 0.5
# and never below 0.
lowest_f1 = float(results_df['F1'].min())
y_floor = min(0.5, max(0.0, lowest_f1 - 0.05))
ax.set_ylim([y_floor, 1.0])

ax.grid(axis='y', alpha=0.3)
# Print each score just above its bar.
for bar_index, f1_value in enumerate(results_df['F1']):
    ax.text(bar_index, f1_value + 0.01, f'{f1_value:.3f}', ha='center')
fig.tight_layout()
fig.savefig('../outputs/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Persist only the fine-tuned weights (state dict), and only if BERT was run.
if BERT_AVAILABLE:
    bert_state = bert_model.state_dict()
    torch.save(bert_state, '../outputs/bert_model.pt')
    print("BERT model saved")

## Summary

**Ensemble findings:**
- [Add observations after running]

**BERT findings:**
- [Add observations after running]

**Multi-label insights:**
- Some labels are harder to predict because they are rare (e.g. `threat`, `identity_hate`)
- Models might benefit from class weighting
- Threshold tuning per label could help

**Best model:**
- [Note which performed best]