# 05 - Error Analysis: Baseline Models vs BERT

本 notebook 对比 **Logistic Regression**、**TextCNN** 和 **BERT** 三个模型的性能，用于作业提交。

**目标:**
- 展示从传统机器学习 → 深度学习 → Transformer 的性能提升
- 分析各模型的错误模式
- 满足数据处理、模型对比的作业要求

In [None]:
import sys
import os
# Put the parent of the current working directory at the front of the import
# path; the `src.*` imports in this notebook are resolved relative to it.
project_root = os.path.split(os.getcwd())[0]
sys.path.insert(0, project_root)

import numpy as np
import pandas as pd
import joblib
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt

from src.data_utils import load_text_classification_data
from src.text_preprocess import basic_clean
from src.features import build_tfidf_vectorizer
from src.model.text_cnn import TextCNN
from src.model.bert_model import BertClassifier, get_tokenizer
from src.evaluate import evaluate_classification

# Use the GPU when CUDA reports itself available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1. Load Test Data

In [None]:
# Fetch the data returned for the 'test' argument, together with the
# label <-> id lookup tables.
test_texts, test_labels, label2id, id2label = load_text_classification_data('test')
# Keep a cleaned copy of every text; the raw texts are retained for display.
test_texts_clean = list(map(basic_clean, test_texts))

print(f"Test samples: {len(test_texts)}")
print(f"Label mapping: {label2id}")
print(f"Class distribution: {Counter(test_labels)}")

## 2. Load Baseline Model (Logistic Regression)

In [None]:
# Restore the persisted TF-IDF vectorizer and Logistic Regression model.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifacts
# that were produced by this project.
tfidf_vectorizer = joblib.load('../src/model/tfidf_vectorizer.joblib')
logreg_model = joblib.load('../src/model/baseline_logreg.joblib')

# Vectorise the cleaned test texts, then classify them
X_test_tfidf = tfidf_vectorizer.transform(test_texts_clean)
logreg_pred = logreg_model.predict(X_test_tfidf)

print("Logistic Regression predictions obtained")
logreg_results = evaluate_classification(test_labels, logreg_pred)
for metric_label, metric_key in (("Accuracy", "accuracy"), ("F1 Macro", "f1_macro")):
    print(f"{metric_label}: {logreg_results[metric_key]:.4f}")

## 3. Load TextCNN Model

In [None]:
# Restore the TextCNN checkpoint, mapping its tensors onto the active device.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects -- only use it on checkpoints produced by this project.
checkpoint = torch.load('../src/model/textcnn.pt', map_location=device, weights_only=False)
vocab, model_config = checkpoint['vocab'], checkpoint['model_config']

# Rebuild the network from its saved constructor kwargs, restore the stored
# weights, and switch to evaluation mode for inference
textcnn_model = TextCNN(**model_config)
textcnn_model = textcnn_model.to(device)
textcnn_model.load_state_dict(checkpoint['model_state_dict'])
textcnn_model.eval()

# Create dataset and dataloader
class TextDataset(Dataset):
    """Map whitespace-tokenised texts to fixed-length tensors of vocabulary ids.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of integer labels aligned with ``texts``.
        vocab: Mapping from token to integer id. Must contain the ``'<UNK>'``
            and ``'<PAD>'`` entries.
        max_len: Number of ids in every returned sequence; longer texts are
            truncated and shorter ones are right-padded.

    Raises:
        KeyError: If ``vocab`` lacks ``'<UNK>'`` or ``'<PAD>'``.
    """

    def __init__(self, texts, labels, vocab, max_len=128):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
        # Resolve the special ids once here instead of on every token lookup;
        # this also surfaces a vocabulary without them immediately.
        self.unk_id = vocab['<UNK>']
        self.pad_id = vocab['<PAD>']

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        # Truncate to max_len words before mapping them to ids
        words = self.texts[idx].split()[:self.max_len]
        token_ids = [self.vocab.get(word, self.unk_id) for word in words]
        # Right-pad so every sequence has exactly max_len entries
        token_ids += [self.pad_id] * (self.max_len - len(token_ids))
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

# Wrap the cleaned test texts; shuffling stays off so the predictions keep
# the same order as test_labels
test_dataset_cnn = TextDataset(test_texts_clean, test_labels, vocab, max_len=128)
test_loader_cnn = DataLoader(test_dataset_cnn, batch_size=64, shuffle=False)

# Run inference batch by batch with gradient tracking disabled
textcnn_pred = []
with torch.no_grad():
    for token_ids, _batch_labels in test_loader_cnn:
        logits = textcnn_model(token_ids.to(device))
        # Index of the largest score along dim 1 is the predicted class id
        batch_pred = torch.max(logits, dim=1).indices
        textcnn_pred.extend(batch_pred.cpu().numpy())
textcnn_pred = np.array(textcnn_pred)

print("TextCNN predictions obtained")
textcnn_results = evaluate_classification(test_labels, textcnn_pred)
for metric_label, metric_key in (("Accuracy", "accuracy"), ("F1 Macro", "f1_macro")):
    print(f"{metric_label}: {textcnn_results[metric_key]:.4f}")

## 4. Load BERT Model (from HuggingFace)

In [None]:
# Restore the fine-tuned transformer checkpoint onto the active device.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects -- only use it on checkpoints produced by this project.
checkpoint_bert = torch.load('../src/model/bert_finetuned.pt', map_location=device, weights_only=False)
# Checkpoints without a stored 'model_name' fall back to DistilBERT
model_name = checkpoint_bert.get('model_name', 'distilbert-base-uncased')

# Rebuild the classifier (per the original note, the backbone is presumably
# fetched from the HuggingFace Hub), restore the stored weights, and switch
# to evaluation mode.
# NOTE(review): num_classes is fixed at 3 here -- presumably it has to match
# the number of entries in label2id; confirm against the training script.
bert_model = BertClassifier(model_name=model_name, num_classes=3)
bert_model = bert_model.to(device)
bert_model.load_state_dict(checkpoint_bert['model_state_dict'])
bert_model.eval()

# Tokenizer for the same model name, used by the dataset defined below
tokenizer = get_tokenizer(model_name)

class BertDataset(Dataset):
    """Tokenise texts on access into fixed-length inputs plus a label tensor.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenisation.
        labels: Indexable collection of integer labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` whose
            result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and ``labels``."""
        target = self.labels[idx]
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Flatten each returned tensor to one dimension so the DataLoader
        # can stack the samples into a batch
        sample = {
            field: torch.flatten(encoded[field])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap the cleaned test texts; shuffling stays off so the predictions keep
# the same order as test_labels
test_dataset_bert = BertDataset(test_texts_clean, test_labels, tokenizer, max_length=128)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=16, shuffle=False)

# Run inference batch by batch with gradient tracking disabled
bert_pred = []
with torch.no_grad():
    for batch in test_loader_bert:
        logits = bert_model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        # Index of the largest score along dim 1 is the predicted class id
        batch_pred = torch.max(logits, dim=1).indices
        bert_pred.extend(batch_pred.cpu().numpy())
bert_pred = np.array(bert_pred)

print("BERT predictions obtained")
bert_results = evaluate_classification(test_labels, bert_pred)
for metric_label, metric_key in (("Accuracy", "accuracy"), ("F1 Macro", "f1_macro")):
    print(f"{metric_label}: {bert_results[metric_key]:.4f}")

## 5. Model Comparison Summary

In [None]:
# Collect each model's metric dict in display order, then build one table
# with a row per model and a column per metric
model_results = {
    'Logistic Regression': logreg_results,
    'TextCNN': textcnn_results,
    'BERT': bert_results,
}
metric_columns = (
    ('Accuracy', 'accuracy'),
    ('F1 Macro', 'f1_macro'),
    ('F1 Micro', 'f1_micro'),
    ('F1 Weighted', 'f1_weighted'),
)
table = {'Model': list(model_results)}
for column, metric_key in metric_columns:
    table[column] = [scores[metric_key] for scores in model_results.values()]
comparison_df = pd.DataFrame(table)

separator = "=" * 80
print("\n" + separator)
print("MODEL COMPARISON (Assignment)")
print(separator)
print(comparison_df.to_string(index=False))

# Left panel: accuracy; right panel: macro vs weighted F1 side by side
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
models = comparison_df['Model']
x = np.arange(len(models))
width = 0.35
acc_ax, f1_ax = ax

acc_ax.bar(x, comparison_df['Accuracy'], width, label='Accuracy')

# Offset the two F1 series by half a bar width on either side of each tick
f1_ax.bar(x - width/2, comparison_df['F1 Macro'], width, label='F1 Macro')
f1_ax.bar(x + width/2, comparison_df['F1 Weighted'], width, label='F1 Weighted')
f1_ax.legend()

# Axis cosmetics shared by both panels
for panel, y_label, panel_title in (
    (acc_ax, 'Score', 'Model Accuracy Comparison'),
    (f1_ax, 'F1 Score', 'F1 Score Comparison'),
):
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_xticks(x)
    panel.set_xticklabels(models, rotation=15, ha='right')
    panel.set_ylim([0, 1])
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Confusion Matrices

In [None]:
# Create confusion matrices.
# The class order is fixed explicitly from id2label so every matrix has one
# row/column per known class; without `labels=` a class that is missing from
# both the true labels and the predictions would shrink the matrix and break
# the tick labels and the cell annotations below.
label_ids = sorted(id2label)
label_names = [id2label[i] for i in label_ids]

cm_logreg = confusion_matrix(test_labels, logreg_pred, labels=label_ids)
cm_textcnn = confusion_matrix(test_labels, textcnn_pred, labels=label_ids)
cm_bert = confusion_matrix(test_labels, bert_pred, labels=label_ids)

# Plot confusion matrices, one panel per model
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
tick_positions = np.arange(len(label_names))

for ax, cm, title in zip(axes, [cm_logreg, cm_textcnn, cm_bert],
                         ['Logistic Regression', 'TextCNN', 'BERT']):
    im = ax.imshow(cm, cmap='Blues')
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(label_names)
    ax.set_yticklabels(label_names)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'{title}\nConfusion Matrix')

    # Annotate every cell with its count; use white text on the darker
    # half of the colour scale so the numbers stay readable
    half_max = cm.max() / 2
    for (i, j), count in np.ndenumerate(cm):
        ax.text(j, i, count, ha="center", va="center",
                color="black" if count < half_max else "white")

    plt.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

## 7. Per-Class Performance Analysis

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Calculate per-class metrics for each model.
# Class ids and names are both derived from id2label (instead of a hard-coded
# [0, 1, 2] and `* 3`) so each metric row is guaranteed to line up with the
# class name printed next to it.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]
num_classes = len(class_ids)

precision_logreg, recall_logreg, f1_logreg, _ = precision_recall_fscore_support(
    test_labels, logreg_pred, average=None, labels=class_ids
)
precision_textcnn, recall_textcnn, f1_textcnn, _ = precision_recall_fscore_support(
    test_labels, textcnn_pred, average=None, labels=class_ids
)
precision_bert, recall_bert, f1_bert, _ = precision_recall_fscore_support(
    test_labels, bert_pred, average=None, labels=class_ids
)

# Create comparison dataframe: one row per (model, class) pair, grouped by model
per_class_df = pd.DataFrame({
    'Class': class_names * 3,
    'Model': ['LogReg'] * num_classes + ['TextCNN'] * num_classes + ['BERT'] * num_classes,
    'Precision': list(precision_logreg) + list(precision_textcnn) + list(precision_bert),
    'Recall': list(recall_logreg) + list(recall_textcnn) + list(recall_bert),
    'F1': list(f1_logreg) + list(f1_textcnn) + list(f1_bert)
})

print("\nPer-Class Performance:")
print(per_class_df.to_string(index=False))

## 8. Error Analysis

In [None]:
# Element-wise hit/miss masks per model, computed once and reused below
wrong_logreg = test_labels != logreg_pred
wrong_textcnn = test_labels != textcnn_pred
wrong_bert = test_labels != bert_pred
right_logreg = test_labels == logreg_pred
right_textcnn = test_labels == textcnn_pred
right_bert = test_labels == bert_pred

# Positions of the misclassified samples for each model
misclassified_logreg = np.where(wrong_logreg)[0]
misclassified_textcnn = np.where(wrong_textcnn)[0]
misclassified_bert = np.where(wrong_bert)[0]

n_total = len(test_labels)
print("Misclassified samples:")
for model_label, wrong_idx in (
    ("Logistic Regression", misclassified_logreg),
    ("TextCNN", misclassified_textcnn),
    ("BERT", misclassified_bert),
):
    print(f"  {model_label}: {len(wrong_idx)} ({len(wrong_idx)/n_total*100:.2f}%)")

# Samples on which the three models are unanimously right / unanimously wrong
all_correct = np.where(right_logreg & right_textcnn & right_bert)[0]
all_wrong = np.where(wrong_logreg & wrong_textcnn & wrong_bert)[0]

print("\nModel Agreement:")
for outcome_label, agree_idx in (("All correct", all_correct), ("All wrong", all_wrong)):
    print(f"  {outcome_label}: {len(agree_idx)} ({len(agree_idx)/n_total*100:.2f}%)")

## 9. Sample Error Cases

In [None]:
# Show up to five samples that every model misclassified
separator = "=" * 80
print("\n" + separator)
print("Examples where ALL models failed:")
print(separator)

for example_no, idx in enumerate(all_wrong[:5], start=1):
    print(f"\n[Example {example_no}]")
    # Only the first 150 characters of the raw (uncleaned) text are shown
    print(f"Text: {test_texts[idx][:150]}...")
    print(f"True: {id2label[test_labels[idx]]}")
    guesses = ", ".join(
        f"{tag}→{id2label[preds[idx]]}"
        for tag, preds in (("LogReg", logreg_pred), ("CNN", textcnn_pred), ("BERT", bert_pred))
    )
    print(guesses)

## 10. Summary & Insights

### Key Findings:

1. **Model Performance Progression:**
   - Traditional ML (TF-IDF + LogReg) provides a solid baseline
   - Deep learning (TextCNN) may struggle without sufficient data/tuning
   - Transformer models (BERT) generally achieve the best performance

2. **Data Processing Requirements:**
   - Text cleaning and preprocessing are crucial for all models
   - Different models require different input formats (TF-IDF vs embeddings vs subword tokens)

3. **Error Patterns:**
   - Some samples are inherently difficult across all models
   - Class imbalance may affect minority class performance
   - Ambiguous or short texts pose challenges

### Assignment Deliverables:
- ✅ Data preprocessing pipeline
- ✅ Multiple model implementations (LogReg, CNN, BERT)
- ✅ Comprehensive evaluation metrics
- ✅ Error analysis and insights