# Error Analysis: Sentiment Classification Model

This notebook performs comprehensive error analysis on the trained sentiment analysis model.

## Analysis Components:
1. Load predictions from evaluation
2. Identify misclassified examples
3. Analyze error patterns
4. Performance by text length
5. Confidence score analysis
6. Visualizations (confusion matrix, ROC curve)
7. Example misclassifications
8. Recommendations for improvement

In [None]:
# Imports
import sys
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import json

# Put the parent of the current working directory at the front of the import
# search path (presumably the repository root when the notebook is launched
# from its own folder - confirm).
sys.path[:0] = [str(Path.cwd().parent)]

# Plot defaults for every figure in this notebook: seaborn grid theme and a
# 12x8 inch default canvas.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Imports successful!")

## 1. Load Evaluation Results

In [None]:
# Load the per-sample predictions written by the evaluation step.
predictions_df = pd.read_csv('../evaluation_results/predictions.csv')

# Load the aggregate metrics. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open('../evaluation_results/metrics.json', 'r', encoding='utf-8') as f:
    metrics = json.load(f)

# Derive the incorrect count by subtraction rather than with `~`: if the CSV
# stores `correct` as 0/1 integers, `~` is a bitwise NOT (-1/-2) and its sum
# would be a meaningless negative number. Subtraction is right for both
# boolean and 0/1 encodings.
n_total = len(predictions_df)
n_correct = predictions_df['correct'].sum()
n_incorrect = n_total - n_correct

print(f"Total samples: {n_total}")
print(f"Correct predictions: {n_correct}")
print(f"Incorrect predictions: {n_incorrect}")
print(f"\nOverall Accuracy: {metrics['accuracy']:.4f}")
print(f"F1-Score: {metrics['f1_score']:.4f}")

# Preview the first rows as the cell's rich output.
predictions_df.head()

## 2. Load Test Data with Text

In [None]:
# Rebuild the test split so every prediction row can be paired with its raw text.
from src.sentiment_analyzer.data.data_loader import SentimentDataLoader

# NOTE(review): the split fractions and seed presumably have to match the ones
# used when predictions.csv was produced - confirm against the evaluation run.
loader_config = {
    'data_path': '../data/raw/amazon_polarity_20k.csv',
    'train_split': 0.6,
    'test_split': 0.2,
    'random_seed': 42,
}
loader = SentimentDataLoader(**loader_config)

# Only the third element returned by the loader (the test split) is needed.
_, _, test_dataset = loader.load_and_prepare()
test_df = test_dataset.to_pandas()

# Attach the text positionally: row i of the test split goes to row i of the
# predictions. This assumes both are in the same order - TODO confirm.
review_text = test_df['text'].values
predictions_df['text'] = review_text

# Character count and whitespace-separated token count per sample.
text_series = predictions_df['text']
predictions_df['text_length'] = text_series.str.len()
predictions_df['word_count'] = text_series.str.split().str.len()

print(f"Text added to {len(predictions_df)} predictions")
predictions_df.head()

## 3. Confusion Matrix Visualization

In [None]:
# Build the confusion matrix with the label order pinned to [0, 1]
# (0 = negative, 1 = positive, as used by the error filters later on).
# Without `labels`, scikit-learn only includes classes that actually occur, so
# a run where one class is missing yields a smaller matrix and both the
# hard-coded tick labels and the cm[0, 1] / cm[1, 0] lookups below would break.
class_names = ['Negative', 'Positive']
cm = confusion_matrix(predictions_df['true_label'],
                      predictions_df['predicted_label'],
                      labels=[0, 1])

# Plot: rows are true labels, columns are predicted labels.
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.savefig('../evaluation_results/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Each percentage is relative to its row, i.e. to the number of samples whose
# TRUE label is that class.
print(f"\nConfusion Matrix:")
print(f"True Negatives:  {cm[0,0]} ({cm[0,0]/cm[0].sum()*100:.1f}%)")
print(f"False Positives: {cm[0,1]} ({cm[0,1]/cm[0].sum()*100:.1f}%)")
print(f"False Negatives: {cm[1,0]} ({cm[1,0]/cm[1].sum()*100:.1f}%)")
print(f"True Positives:  {cm[1,1]} ({cm[1,1]/cm[1].sum()*100:.1f}%)")

## 4. ROC Curve

In [None]:
# ROC points and the area under the curve, both computed from the
# positive-class probability column.
fpr, tpr, thresholds = roc_curve(
    predictions_df['true_label'],
    predictions_df['prob_positive'],
)
roc_auc = roc_auc_score(
    predictions_df['true_label'],
    predictions_df['prob_positive'],
)

# Draw the model's curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label=f'ROC curve (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
        label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('Receiver Operating Characteristic (ROC) Curve',
             fontsize=16, fontweight='bold')
ax.legend(loc="lower right", fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../evaluation_results/roc_curve.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"ROC-AUC Score: {roc_auc:.4f}")

## 5. Error Analysis by Text Length

In [None]:
# Bucket samples by character length.
# - The last edge is +inf so texts longer than 10,000 characters land in
#   "Very Long (1000+)" instead of silently becoming NaN.
# - include_lowest=True keeps zero-length texts in the first bucket
#   (pd.cut intervals are right-closed and would otherwise exclude 0).
bins = [0, 50, 200, 500, 1000, np.inf]
labels = ['Very Short\n(0-50)', 'Short\n(51-200)', 'Medium\n(201-500)',
          'Long\n(501-1000)', 'Very Long\n(1000+)']
predictions_df['length_bucket'] = pd.cut(predictions_df['text_length'],
                                          bins=bins, labels=labels,
                                          include_lowest=True)

# Per-bucket sample count, number correct, accuracy and mean confidence.
# The former "f1_score" placeholder entry was removed: it aggregated a column
# that this notebook never creates (KeyError if absent) and would only have
# repeated the global F1 value in every row.
# observed=False is passed explicitly so empty buckets still appear in the
# table and plots, and so the result does not depend on pandas' changing
# default for categorical groupers.
length_analysis = (
    predictions_df
    .groupby('length_bucket', observed=False)
    .agg(
        total=('correct', 'count'),
        correct=('correct', 'sum'),
        accuracy=('correct', 'mean'),
        avg_confidence=('confidence', 'mean'),
    )
    .round(4)
)

print("\nPerformance by Text Length:")
print(length_analysis)

# Visualize: accuracy per bucket (left) and bucket sizes (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Accuracy by length, with the overall accuracy as a reference line.
length_analysis['accuracy'].plot(kind='bar', ax=ax1, color='skyblue', edgecolor='black')
ax1.set_title('Accuracy by Text Length', fontsize=14, fontweight='bold')
ax1.set_ylabel('Accuracy', fontsize=12)
ax1.set_xlabel('Text Length Category', fontsize=12)
ax1.axhline(y=metrics['accuracy'], color='r', linestyle='--', label='Overall Accuracy')
ax1.legend()
ax1.set_ylim([0, 1])

# Sample count by length; small buckets make their accuracy estimate noisy.
length_analysis['total'].plot(kind='bar', ax=ax2, color='lightcoral', edgecolor='black')
ax2.set_title('Sample Distribution by Text Length', fontsize=14, fontweight='bold')
ax2.set_ylabel('Number of Samples', fontsize=12)
ax2.set_xlabel('Text Length Category', fontsize=12)

plt.tight_layout()
plt.savefig('../evaluation_results/performance_by_length.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Confidence Score Analysis

In [None]:
# 2x2 dashboard of confidence diagnostics:
#   [0,0] overall histogram        [0,1] correct vs. incorrect histogram
#   [1,0] accuracy per conf. bin   [1,1] sample count per conf. bin
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Overall confidence distribution with the mean marked.
mean_confidence = predictions_df['confidence'].mean()
axes[0, 0].hist(predictions_df['confidence'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Overall Confidence Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Confidence', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)
axes[0, 0].axvline(mean_confidence, color='r',
                   linestyle='--', label=f'Mean: {mean_confidence:.3f}')
axes[0, 0].legend()

# Confidence split by correctness. `.loc` with a boolean mask replaces the
# chained `df[mask]['confidence']` lookup.
is_correct = predictions_df['correct']
correct_conf = predictions_df.loc[is_correct, 'confidence']
incorrect_conf = predictions_df.loc[~is_correct, 'confidence']

axes[0, 1].hist([correct_conf, incorrect_conf], bins=30, label=['Correct', 'Incorrect'],
               alpha=0.7, edgecolor='black')
axes[0, 1].set_title('Confidence: Correct vs Incorrect', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Confidence', fontsize=12)
axes[0, 1].set_ylabel('Frequency', fontsize=12)
axes[0, 1].legend()

# Accuracy by confidence bins. The first bin spans 0 to 0.6 but is labelled
# "0.5-0.6" (presumably because a two-class winner's confidence is >= 0.5 -
# confirm). include_lowest=True keeps a value of exactly 0 inside that bin
# instead of turning it into NaN.
conf_bins = [0, 0.6, 0.7, 0.8, 0.9, 1.0]
conf_labels = ['0.5-0.6', '0.6-0.7', '0.7-0.8', '0.8-0.9', '0.9-1.0']
predictions_df['conf_bucket'] = pd.cut(predictions_df['confidence'],
                                        bins=conf_bins, labels=conf_labels,
                                        include_lowest=True)

# observed=False is explicit so empty bins stay visible in the charts and the
# result does not depend on pandas' changing default for categorical groupers.
conf_analysis = (
    predictions_df
    .groupby('conf_bucket', observed=False)
    .agg(count=('correct', 'count'), accuracy=('correct', 'mean'))
)

conf_analysis['accuracy'].plot(kind='bar', ax=axes[1, 0], color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Accuracy by Confidence Range', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Confidence Range', fontsize=12)
axes[1, 0].set_ylabel('Accuracy', fontsize=12)
axes[1, 0].set_ylim([0, 1])

# Sample count by confidence
conf_analysis['count'].plot(kind='bar', ax=axes[1, 1], color='orange', edgecolor='black')
axes[1, 1].set_title('Sample Distribution by Confidence', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Confidence Range', fontsize=12)
axes[1, 1].set_ylabel('Number of Samples', fontsize=12)

plt.tight_layout()
plt.savefig('../evaluation_results/confidence_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nConfidence Statistics:")
print(f"Mean confidence (correct): {correct_conf.mean():.4f}")
print(f"Mean confidence (incorrect): {incorrect_conf.mean():.4f}")
print(f"\nAccuracy by Confidence Range:")
print(conf_analysis)

## 7. Misclassification Examples

In [None]:
# All misclassified rows, most confident mistakes first.
errors = (
    predictions_df.loc[~predictions_df['correct']]
    .sort_values('confidence', ascending=False)
)

print(f"\nTotal Errors: {len(errors)}")
print(f"Error Rate: {len(errors)/len(predictions_df)*100:.2f}%")

# False Positives (predicted positive, actually negative)
false_positives = errors[(errors['true_label'] == 0) & (errors['predicted_label'] == 1)]
print(f"\nFalse Positives: {len(false_positives)}")

# False Negatives (predicted negative, actually positive)
false_negatives = errors[(errors['true_label'] == 1) & (errors['predicted_label'] == 0)]
print(f"False Negatives: {len(false_negatives)}")

# Show hardest errors (high confidence but wrong)
print("\n" + "="*80)
print("HARDEST ERRORS (High Confidence but Wrong)")
print("="*80)

# Number the examples by rank (1..10). After sorting, the DataFrame index is
# the original row label, so it is shown separately as "row N" for lookup
# instead of being passed off as the example number.
for rank, (row_idx, row) in enumerate(errors.head(10).iterrows(), start=1):
    true_label = "POSITIVE" if row['true_label'] == 1 else "NEGATIVE"
    pred_label = "POSITIVE" if row['predicted_label'] == 1 else "NEGATIVE"

    # str() guards against a missing (NaN) text, which has no len() or slicing.
    text = str(row['text'])
    snippet = f"{text[:200]}..." if len(text) > 200 else text

    print(f"\nExample {rank} (row {row_idx}):")
    print(f"Text: {snippet}")
    print(f"True Label: {true_label}")
    print(f"Predicted: {pred_label} (confidence: {row['confidence']:.4f})")
    print("-" * 80)

## 8. Error Pattern Analysis

In [None]:
# Compare misclassified rows with correctly classified rows on text length
# and confidence.
correct_rows = predictions_df.loc[predictions_df['correct']]

print("\nError Statistics:")
print(f"Mean text length (errors): {errors['text_length'].mean():.0f} chars")
print(f"Mean text length (correct): {correct_rows['text_length'].mean():.0f} chars")
print(f"\nMean confidence (errors): {errors['confidence'].mean():.4f}")
print(f"Mean confidence (correct): {correct_rows['confidence'].mean():.4f}")

# Error rate per length bucket, in percent: 100 * (1 - accuracy).
# - Using the group mean avoids `~` inside a lambda, which is only a logical
#   negation for boolean dtype.
# - observed=False is explicit so empty buckets are still listed (as NaN) and
#   the result does not depend on pandas' changing default for categorical
#   groupers.
bucket_accuracy = predictions_df.groupby('length_bucket', observed=False)['correct'].mean()
error_by_length = ((1 - bucket_accuracy) * 100).round(2).to_frame('error_rate_%')

print("\nError Rate by Text Length:")
print(error_by_length)

## 9. Recommendations for Improvement

In [None]:
# Build a list of rule-based recommendations from the metrics and the error
# tables computed earlier, print it, and save it to a text file.
# The emoji prefixes were repaired from mis-decoded UTF-8 bytes (mojibake).
recommendations = []

# Check for low accuracy
if metrics['accuracy'] < 0.90:
    recommendations.append("🔴 Overall accuracy below 90% target - consider additional training epochs or larger dataset")

# Check for class imbalance in errors. Each rate is that error type's share of
# ALL errors (in percent). A run with zero errors would otherwise raise
# ZeroDivisionError, so both rates fall back to 0.0.
n_errors = len(errors)
fp_rate = len(false_positives) / n_errors * 100 if n_errors else 0.0
fn_rate = len(false_negatives) / n_errors * 100 if n_errors else 0.0
if abs(fp_rate - fn_rate) > 20:
    recommendations.append(f"⚠️  Imbalanced error types (FP: {fp_rate:.1f}%, FN: {fn_rate:.1f}%) - consider class weights")

# Check confidence calibration: flag when more than 30% of the errors were
# made with confidence below 0.7.
low_conf_errors = errors[errors['confidence'] < 0.7]
if len(low_conf_errors) > n_errors * 0.3:
    recommendations.append("⚠️  Many low-confidence errors - model is uncertain, consider more training data")

# Check for length bias. If there are no texts shorter than 50 characters the
# mean is NaN, the comparison is False, and no recommendation is added.
short_text_acc = predictions_df[predictions_df['text_length'] < 50]['correct'].mean()
if short_text_acc < metrics['accuracy'] - 0.05:
    recommendations.append(f"⚠️  Poor performance on short texts ({short_text_acc:.2%}) - consider data augmentation")

# General recommendations
recommendations.append("✅ Analyze hardest errors above to identify failure patterns (sarcasm, negation, etc.)")
recommendations.append("✅ Consider ensemble methods or model distillation for improvement")
recommendations.append("✅ Collect more examples similar to frequent error cases")

print("\n" + "="*80)
print("RECOMMENDATIONS FOR IMPROVEMENT")
print("="*80)
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

# Save recommendations. The encoding is explicit because the lines contain
# emoji, which a non-UTF-8 platform default (e.g. cp1252 on Windows) cannot
# encode and would reject with UnicodeEncodeError.
with open('../evaluation_results/recommendations.txt', 'w', encoding='utf-8') as f:
    f.write("Recommendations for Model Improvement\n")
    f.write("=" * 50 + "\n\n")
    for i, rec in enumerate(recommendations, 1):
        f.write(f"{i}. {rec}\n")

print("\nRecommendations saved to evaluation_results/recommendations.txt")

## Summary

This error analysis notebook has:
1. ✅ Visualized confusion matrix and ROC curve
2. ✅ Analyzed performance by text length
3. ✅ Investigated confidence score calibration
4. ✅ Identified hardest misclassifications
5. ✅ Provided actionable recommendations

**Next Steps:**
- Review hardest errors to understand failure modes
- Implement recommended improvements
- Re-train and re-evaluate
- Document findings in `docs/ERROR_ANALYSIS.md`