# 5. Evaluation and Prediction

This notebook performs final model evaluation, generates test predictions, and creates the submission file.

## Steps:
1. Load trained model and test data
2. Generate predictions on test set
3. Create submission file
4. Comprehensive performance analysis
5. Generate visualizations and reports

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    roc_curve, auc, precision_recall_curve, average_precision_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import json
from tqdm import tqdm
import warnings
# Keep notebook output readable by hiding library warnings
warnings.filterwarnings('ignore')

# Fix the seed of every random number generator for reproducibility
torch.cuda.manual_seed(42)
torch.manual_seed(42)
np.random.seed(42)

# Pick the GPU when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Plot appearance: matplotlib style sheet first, then the seaborn palette on top
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print("✅ Libraries imported successfully!")

## 5.1 Load Model and Data

In [None]:
# Load model architecture (same as training notebook)
class HybridDistilBERTClassifier(nn.Module):
    """
    Two-branch classifier that fuses a DistilBERT text encoding with a small
    dense projection of numeric meta-features.

    Args:
        bert_model_name: Name or path handed to ``DistilBertModel.from_pretrained``.
        num_meta_features: Number of meta-feature values supplied per sample.
        num_labels: Number of logits produced by the classification head.
        dropout: Dropout probability used in the meta branch and in the head.
    """
    def __init__(self, bert_model_name, num_meta_features, num_labels=2, dropout=0.2):
        super().__init__()

        # Text branch: pretrained encoder and the width of its hidden states
        self.bert = DistilBertModel.from_pretrained(bert_model_name)
        self.bert_dim = self.bert.config.hidden_size

        # Meta branch: normalise, project to 32 units, activate, regularise
        self.meta_bn = nn.BatchNorm1d(num_meta_features)
        self.meta_fc = nn.Linear(num_meta_features, 32)
        self.meta_activation = nn.ReLU()
        self.meta_dropout = nn.Dropout(dropout)

        # Head: MLP over the concatenation of both branches
        head_layers = [
            nn.Linear(self.bert_dim + 32, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, meta_features):
        """Return the classification logits for one batch."""
        # The hidden state at position 0 (CLS token) summarises the sequence
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = encoder_out.last_hidden_state[:, 0, :]

        # BatchNorm -> Linear -> ReLU -> Dropout on the meta-features
        meta_embedding = self.meta_dropout(
            self.meta_activation(self.meta_fc(self.meta_bn(meta_features)))
        )

        # Concatenate along the feature axis and classify
        fused = torch.cat([cls_embedding, meta_embedding], dim=1)
        return self.classifier(fused)

# Load the checkpoint written by the training notebook.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load('models/best_model.pth', map_location=device)
model_info = checkpoint.get('config', {})
model_config = model_info.get('model_config', {})
state_dict = checkpoint['model_state_dict']

# Size the meta-feature branch from the saved BatchNorm weights instead of a
# hard-coded constant, so the rebuilt model always matches the checkpoint.
# The previous default of 10 is kept only as a fallback.
meta_bn_weight = state_dict.get('meta_bn.weight')
num_meta_features = int(meta_bn_weight.shape[0]) if meta_bn_weight is not None else 10

# Recreate the architecture, then restore the trained weights
model = HybridDistilBERTClassifier(
    bert_model_name=model_config.get('model_name', 'distilbert-base-uncased'),
    num_meta_features=num_meta_features,
    num_labels=2,
    dropout=model_config.get('dropout', 0.2)
)

model.load_state_dict(state_dict)
model = model.to(device)
model.eval()  # inference mode: disables dropout, BatchNorm uses running statistics

print("✅ Model loaded successfully!")
print(f"Best validation F1: {checkpoint.get('val_f1', 'N/A')}")
# Epochs are stored zero-based; only add 1 when the value is actually present
# (adding 1 to the 'N/A' placeholder would raise a TypeError).
best_epoch = checkpoint.get('epoch')
best_epoch_label = best_epoch + 1 if best_epoch is not None else 'N/A'
print(f"Best epoch: {best_epoch_label}")

# Load test data
df_test = pd.read_csv('Data/test_cleaned.csv')
print(f"\nTest data shape: {df_test.shape}")

# Meta-feature columns are identified by their '_meta' suffix
meta_cols = [col for col in df_test.columns if col.endswith('_meta')]
print(f"Meta-features found: {len(meta_cols)}")
print(f"Meta-feature columns: {meta_cols}")

# Fail early with a clear message instead of a shape error deep inside the
# forward pass several cells later.
if len(meta_cols) != num_meta_features:
    raise ValueError(
        f"Model expects {num_meta_features} meta-features but "
        f"Data/test_cleaned.csv provides {len(meta_cols)}: {meta_cols}"
    )

## 5.2 Prepare Test Data

In [None]:
# Imports used by this cell. torch and DistilBertTokenizerFast are already
# imported in the first cell; Dataset is new here. The dataset class itself is
# defined below in this cell, not imported from the training notebook.
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset

# Load the pretrained fast tokenizer.
# NOTE(review): the tokenizer name is hard-coded here while the model name is
# read from the checkpoint config when the model is rebuilt — confirm they match.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Custom Dataset class
class DisasterTweetDataset(Dataset):
    """
    Dataset pairing tokenizer output with per-sample meta-features and,
    optionally, labels.

    Args:
        encodings: Mapping providing indexable 'input_ids' and 'attention_mask'.
        meta_features: Per-sample meta-feature values, converted to a float tensor.
        labels: Optional per-sample class ids, converted to a long tensor.
    """
    def __init__(self, encodings, meta_features, labels=None):
        self.encodings = encodings
        self.meta_features = torch.FloatTensor(meta_features)
        if labels is None:
            self.labels = None
        else:
            self.labels = torch.LongTensor(labels)

    def __len__(self):
        # One sample per row of meta-features
        return len(self.meta_features)

    def __getitem__(self, idx):
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['meta_features'] = self.meta_features[idx]

        # Unlabelled data (e.g. the test set) yields no 'labels' entry
        if self.labels is None:
            return sample

        sample['labels'] = self.labels[idx]
        return sample

# Prepare test data.
# pandas reads empty CSV cells back as NaN, so a tweet whose cleaned text is
# empty would reach the tokenizer as a float and be rejected; map those back
# to empty strings and make sure every entry is a str.
test_texts = df_test['text_clean'].fillna('').astype(str).values
test_meta = df_test[meta_cols].values
test_ids = df_test['id'].values

# Tokenize test texts (padded/truncated to a fixed length of 160 tokens)
print("Tokenizing test data...")
test_encodings = tokenizer(
    test_texts.tolist(),
    truncation=True,
    padding='max_length',
    max_length=160,
    return_tensors='pt'
)

# Create test dataset (no labels for the test set)
test_dataset = DisasterTweetDataset(test_encodings, test_meta)

# Create test dataloader. shuffle=False keeps predictions aligned with test_ids.
# num_workers=0: the data is already tokenized in memory, so worker processes
# add no throughput, and a Dataset class defined inside a notebook cannot be
# pickled for workers on spawn-based platforms. This also matches the
# validation loader, which uses the default.
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0
)

print(f"Test dataset size: {len(test_dataset)}")
print(f"Test batches: {len(test_loader)}")

# Verify data integrity: all of these should agree on the number of samples
print(f"\nTest data shapes:")
print(f"Texts: {test_texts.shape}")
print(f"Meta features: {test_meta.shape}")
print(f"IDs: {test_ids.shape}")
print(f"Token encodings: {test_encodings['input_ids'].shape}")

## 5.3 Generate Test Predictions

In [None]:
# Run the model over the test set and collect labels and class probabilities
print("Generating test predictions...")

all_predictions, all_probabilities = [], []

# Inference only: no gradients needed
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        # Forward pass with the batch tensors moved to the model's device
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['meta_features'].to(device),
        )

        # Hard labels come from the arg-max, probabilities from the softmax
        all_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        all_probabilities.extend(torch.softmax(logits, dim=1).cpu().numpy())

# Stack the per-sample results into arrays
predictions = np.array(all_predictions)
probabilities = np.array(all_probabilities)

print(f"\n✅ Predictions generated!")
print(f"Predictions shape: {predictions.shape}")
print(f"Probabilities shape: {probabilities.shape}")

# How many samples were assigned to each class
for cls, count in zip(*np.unique(predictions, return_counts=True)):
    label = 'Disaster' if cls != 0 else 'Non-Disaster'
    percentage = count / len(predictions) * 100
    print(f"{label}: {count} ({percentage:.1f}%)")

# Confidence = probability of whichever class was predicted
confidence_scores = probabilities.max(axis=1)
print(f"\nConfidence statistics:")
print(f"Mean confidence: {confidence_scores.mean():.4f}")
print(f"Min confidence: {confidence_scores.min():.4f}")
print(f"Max confidence: {confidence_scores.max():.4f}")
print(f"Std confidence: {confidence_scores.std():.4f}")

## 5.4 Create Submission File

In [None]:
# Assemble the submission table: the competition columns plus two diagnostic
# columns (confidence of the predicted class, probability of the disaster class)
submission_df = pd.DataFrame({
    'id': test_ids,
    'target': predictions,
    'confidence': confidence_scores,
    'disaster_probability': probabilities[:, 1],
})

# Preview the first rows
print("Sample submission:")
display(submission_df.head(10))

# Sanity checks on the submission contents
print(f"\nSubmission verification:")
print(f"Submission shape: {submission_df.shape}")
print(f"ID range: {submission_df['id'].min()} to {submission_df['id'].max()}")
print(f"Target values: {sorted(submission_df['target'].unique())}")
print(f"Missing values: {submission_df.isnull().sum().sum()}")

# Every test row must have exactly one prediction
expected_test_size = len(df_test)
actual_test_size = len(submission_df)
print(f"Expected test samples: {expected_test_size}")
print(f"Actual predictions: {actual_test_size}")
print(f"Match: {expected_test_size == actual_test_size}")

# Competition file: only the id and target columns
submission_path = 'Data/submission.csv'
competition_columns = ['id', 'target']
submission_df[competition_columns].to_csv(submission_path, index=False)

# Enhanced file: all columns, including the confidence information
enhanced_submission_path = 'Data/submission_enhanced.csv'
submission_df.to_csv(enhanced_submission_path, index=False)

print(f"\n✅ Submission files created:")
print(f"- {submission_path} (for competition)")
print(f"- {enhanced_submission_path} (with confidence scores)")

## 5.5 Load Validation Data for Comparison

In [None]:
# Imports needed only by this cell
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Load training data for validation analysis
df_train = pd.read_csv('Data/train_cleaned.csv')

# Prepare validation data.
# NOTE(review): this assumes the training notebook produced its split with the
# same arrays and the same test_size / random_state / stratify arguments;
# otherwise the "validation" rows below overlap the rows the model was
# trained on — confirm.
meta_cols = [col for col in df_train.columns if col.endswith('_meta')]
# Empty cleaned texts are read back from CSV as NaN; the tokenizer needs str.
X_text = df_train['text_clean'].fillna('').astype(str).values
y = df_train['target'].values
X_meta = df_train[meta_cols].values

# Split to get validation set
X_text_train, X_text_val, X_meta_train, X_meta_val, y_train, y_val = train_test_split(
    X_text, X_meta, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Tokenize validation data with the same settings as the test data
val_encodings = tokenizer(
    X_text_val.tolist(),
    truncation=True,
    padding='max_length',
    max_length=160,
    return_tensors='pt'
)

# Create validation dataset and loader (unshuffled so that row i of the
# predictions corresponds to X_text_val[i])
val_dataset = DisasterTweetDataset(val_encodings, X_meta_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Generate validation predictions
print("Generating validation predictions for analysis...")

val_predictions = []
val_probabilities = []
val_labels_list = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating"):
        # Move inputs to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        meta_features = batch['meta_features'].to(device)
        labels = batch['labels']

        # Forward pass
        outputs = model(input_ids, attention_mask, meta_features)

        # Use batch-local names here: the globals `predictions` and
        # `probabilities` hold the TEST-set arrays that the final summary
        # reads, and must not be rebound to the last validation batch.
        batch_probs = torch.softmax(outputs, dim=1)
        batch_preds = torch.argmax(outputs, dim=1)

        val_predictions.extend(batch_preds.cpu().numpy())
        val_probabilities.extend(batch_probs.cpu().numpy())
        val_labels_list.extend(labels.numpy())

val_predictions = np.array(val_predictions)
val_probabilities = np.array(val_probabilities)
val_labels = np.array(val_labels_list)

print(f"\n✅ Validation predictions generated!")
print(f"Validation samples: {len(val_labels)}")

# Calculate validation metrics (support-weighted averages over both classes)
val_f1 = f1_score(val_labels, val_predictions, average='weighted')
val_accuracy = accuracy_score(val_labels, val_predictions)
val_precision = precision_score(val_labels, val_predictions, average='weighted')
val_recall = recall_score(val_labels, val_predictions, average='weighted')

print(f"\n📊 Validation Performance:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1 Score: {val_f1:.4f}")

## 5.6 Comprehensive Performance Analysis

In [None]:
# Detailed classification report
print("Detailed Classification Report (Validation):")
print("="*60)
report = classification_report(
    val_labels, val_predictions,
    target_names=['Non-Disaster', 'Disaster'],
    output_dict=True
)

# Print formatted report
print(f"{'Class':<15} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
print("-" * 60)
for class_name in ['Non-Disaster', 'Disaster']:
    metrics = report[class_name]
    print(f"{class_name:<15} {metrics['precision']:<10.4f} {metrics['recall']:<10.4f} {metrics['f1-score']:<10.4f} {metrics['support']:<10.0f}")

# Accuracy is one number, not a precision or recall; show it once under the
# F1-Score column (as scikit-learn's text report does) and leave the
# Precision/Recall columns blank.
print(f"{'accuracy':<15} {'':<10} {'':<10} {report['accuracy']:<10.4f} {report['macro avg']['support']:<10.0f}")
print(f"{'macro avg':<15} {report['macro avg']['precision']:<10.4f} {report['macro avg']['recall']:<10.4f} {report['macro avg']['f1-score']:<10.4f} {report['macro avg']['support']:<10.0f}")
print(f"{'weighted avg':<15} {report['weighted avg']['precision']:<10.4f} {report['weighted avg']['recall']:<10.4f} {report['weighted avg']['f1-score']:<10.4f} {report['weighted avg']['support']:<10.0f}")

# Per-class analysis
print("\n" + "="*60)
print("Per-Class Performance Analysis:")
print("="*60)

for i, class_name in enumerate(['Non-Disaster', 'Disaster']):
    # One-vs-rest view of class i. All masks span the full validation set, so
    # they can be combined directly without length mismatches.
    class_mask = val_labels == i
    predicted_mask = val_predictions == i

    # Probability the model assigned to class i on samples that truly are class i
    class_probs = val_probabilities[class_mask, i]

    tp = (class_mask & predicted_mask).sum()    # actual i, predicted i
    fp = (~class_mask & predicted_mask).sum()   # actual other, predicted i
    fn = (class_mask & ~predicted_mask).sum()   # actual i, predicted other
    tn = (~class_mask & ~predicted_mask).sum()  # actual other, predicted other

    # Specificity (true negative rate)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    print(f"\n{class_name} Class:")
    print(f"  Samples: {class_mask.sum()}")
    print(f"  True Positives: {tp}")
    print(f"  False Positives: {fp}")
    print(f"  False Negatives: {fn}")
    print(f"  True Negatives: {tn}")
    print(f"  Specificity: {specificity:.4f}")
    print(f"  Avg Confidence: {class_probs.mean():.4f}")
    print(f"  Confidence Std: {class_probs.std():.4f}")

## 5.7 Advanced Performance Visualization

In [None]:
# Local import: used only to make sure the output directory exists
import os

# Create comprehensive performance visualizations on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# 1. Confusion Matrix
cm = confusion_matrix(val_labels, val_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0],
            xticklabels=['Non-Disaster', 'Disaster'],
            yticklabels=['Non-Disaster', 'Disaster'])
axes[0, 0].set_title('Confusion Matrix')
axes[0, 0].set_xlabel('Predicted')
axes[0, 0].set_ylabel('Actual')

# 2. ROC Curve (scored with the probability of the Disaster class)
fpr, tpr, _ = roc_curve(val_labels, val_probabilities[:, 1])
roc_auc = auc(fpr, tpr)
axes[0, 1].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
axes[0, 1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance level
axes[0, 1].set_xlim([0.0, 1.0])
axes[0, 1].set_ylim([0.0, 1.05])
axes[0, 1].set_xlabel('False Positive Rate')
axes[0, 1].set_ylabel('True Positive Rate')
axes[0, 1].set_title('ROC Curve')
axes[0, 1].legend(loc="lower right")
axes[0, 1].grid(True, alpha=0.3)

# 3. Precision-Recall Curve
precision, recall, _ = precision_recall_curve(val_labels, val_probabilities[:, 1])
pr_auc = average_precision_score(val_labels, val_probabilities[:, 1])
axes[0, 2].plot(recall, precision, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.3f})')
axes[0, 2].set_xlabel('Recall')
axes[0, 2].set_ylabel('Precision')
axes[0, 2].set_title('Precision-Recall Curve')
axes[0, 2].legend(loc="lower left")
axes[0, 2].grid(True, alpha=0.3)

# 4. Confidence Distribution by Class
# Confidence is the probability of the predicted class; pick it per row with
# fancy indexing, then split by whether the prediction was correct.
predicted_confidence = val_probabilities[np.arange(len(val_predictions)), val_predictions]
is_correct = val_predictions == val_labels
confidence_correct = predicted_confidence[is_correct]
confidence_incorrect = predicted_confidence[~is_correct]

axes[1, 0].hist(confidence_correct, bins=30, alpha=0.7, label='Correct Predictions', color='green')
axes[1, 0].hist(confidence_incorrect, bins=30, alpha=0.7, label='Incorrect Predictions', color='red')
axes[1, 0].set_xlabel('Confidence Score')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Confidence Distribution by Prediction Correctness')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 5. Prediction Distribution on Test Set
test_disaster_probs = submission_df['disaster_probability']
axes[1, 1].hist(test_disaster_probs, bins=30, alpha=0.7, color='purple')
axes[1, 1].set_xlabel('Disaster Probability')
axes[1, 1].set_ylabel('Count')
axes[1, 1].set_title('Test Set Prediction Distribution')
axes[1, 1].grid(True, alpha=0.3)

# 6. Class Performance Comparison (grouped bars from the classification report)
classes = ['Non-Disaster', 'Disaster']
precision_scores = [report[name]['precision'] for name in classes]
recall_scores = [report[name]['recall'] for name in classes]
f1_scores = [report[name]['f1-score'] for name in classes]

x = np.arange(len(classes))
width = 0.25

axes[1, 2].bar(x - width, precision_scores, width, label='Precision', alpha=0.8)
axes[1, 2].bar(x, recall_scores, width, label='Recall', alpha=0.8)
axes[1, 2].bar(x + width, f1_scores, width, label='F1-Score', alpha=0.8)

axes[1, 2].set_xlabel('Class')
axes[1, 2].set_ylabel('Score')
axes[1, 2].set_title('Class Performance Comparison')
axes[1, 2].set_xticks(x)
axes[1, 2].set_xticklabels(classes)
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()

# Save BEFORE showing: once plt.show() has displayed the figure it is no
# longer the current figure, so a later plt.savefig() writes an empty canvas.
# Saving through `fig` also guarantees the right figure is written.
visualization_path = 'results/visualizations/comprehensive_performance.png'
os.makedirs(os.path.dirname(visualization_path), exist_ok=True)
fig.savefig(visualization_path, dpi=300, bbox_inches='tight')
print("✅ Performance visualizations saved to results/visualizations/comprehensive_performance.png")

plt.show()

## 5.8 Error Analysis

In [None]:
# Perform error analysis
print("Error Analysis on Validation Set:")
print("="*60)

# Find misclassified examples
misclassified_mask = val_predictions != val_labels
misclassified_indices = np.where(misclassified_mask)[0]

print(f"Total misclassified: {len(misclassified_indices)} out of {len(val_labels)} ({len(misclassified_indices)/len(val_labels)*100:.1f}%)")

# Split the errors by direction. `confidence` is the probability the model
# gave to the (wrong) class it predicted.
false_positives = []
false_negatives = []

for idx in misclassified_indices:
    true_label = val_labels[idx]
    pred_label = val_predictions[idx]
    confidence = val_probabilities[idx, pred_label]

    if true_label == 0 and pred_label == 1:  # False positive
        false_positives.append({
            'index': idx,
            'confidence': confidence,
            'true_label': 'Non-Disaster',
            'pred_label': 'Disaster'
        })
    elif true_label == 1 and pred_label == 0:  # False negative
        false_negatives.append({
            'index': idx,
            'confidence': confidence,
            'true_label': 'Disaster',
            'pred_label': 'Non-Disaster'
        })

print(f"\nFalse Positives (Non-Disaster predicted as Disaster): {len(false_positives)}")
print(f"False Negatives (Disaster predicted as Non-Disaster): {len(false_negatives)}")

# Show examples of errors
print("\n" + "="*60)
print("Sample Error Analysis:")
print("="*60)

# The validation loader is not shuffled, so position idx in the prediction
# arrays corresponds directly to X_text_val[idx]; no index mapping is needed.

# Show false positives
print("\nFalse Positives (Non-Disaster → Disaster):")
print("-" * 50)
for i, error in enumerate(false_positives[:3]):  # Show first 3
    idx = error['index']
    print(f"Example {i+1}:")
    # str() keeps the slice safe if a text was read back from CSV as NaN
    print(f"  Text: {str(X_text_val[idx])[:100]}...")
    print(f"  Confidence: {error['confidence']:.4f}")
    print(f"  True: {error['true_label']}, Predicted: {error['pred_label']}")
    print()

# Show false negatives
print("False Negatives (Disaster → Non-Disaster):")
print("-" * 50)
for i, error in enumerate(false_negatives[:3]):  # Show first 3
    idx = error['index']
    print(f"Example {i+1}:")
    print(f"  Text: {str(X_text_val[idx])[:100]}...")
    print(f"  Confidence: {error['confidence']:.4f}")
    print(f"  True: {error['true_label']}, Predicted: {error['pred_label']}")
    print()

# Confidence analysis for errors
fp_confidences = [error['confidence'] for error in false_positives]
fn_confidences = [error['confidence'] for error in false_negatives]

print(f"\nError Confidence Analysis:")
# np.mean/np.std of an empty list is NaN (with a warning that this notebook
# suppresses), so report the absence of errors explicitly instead.
for error_name, confidences in (("False Positives", fp_confidences),
                                ("False Negatives", fn_confidences)):
    if confidences:
        print(f"{error_name} - Mean confidence: {np.mean(confidences):.4f}, Std: {np.std(confidences):.4f}")
    else:
        print(f"{error_name} - none found, no confidence statistics")

## 5.9 Save Final Results and Summary

In [None]:
# Local import: used only to make sure the output directory exists
import os

# Read the test-set labels from the submission table rather than from the
# global `predictions`: loops in other cells can rebind that name to a single
# batch of validation outputs, which would corrupt the statistics below.
test_targets = submission_df['target'].to_numpy()
num_disaster = int(np.sum(test_targets == 1))
num_non_disaster = int(np.sum(test_targets == 0))
disaster_fraction = float(np.mean(test_targets))

# Values reused by both the JSON file and the printed summary
total_parameters = sum(p.numel() for p in model.parameters())
best_epoch = int(checkpoint.get('epoch', 0)) + 1  # stored zero-based

# Compile final results
final_results = {
    'model_performance': {
        'validation_accuracy': float(val_accuracy),
        'validation_precision': float(val_precision),
        'validation_recall': float(val_recall),
        'validation_f1': float(val_f1),
        'roc_auc': float(roc_auc),
        'pr_auc': float(pr_auc)
    },
    'per_class_performance': {
        'non_disaster': {
            'precision': float(report['Non-Disaster']['precision']),
            'recall': float(report['Non-Disaster']['recall']),
            'f1_score': float(report['Non-Disaster']['f1-score']),
            'support': int(report['Non-Disaster']['support'])
        },
        'disaster': {
            'precision': float(report['Disaster']['precision']),
            'recall': float(report['Disaster']['recall']),
            'f1_score': float(report['Disaster']['f1-score']),
            'support': int(report['Disaster']['support'])
        }
    },
    'error_analysis': {
        'total_errors': len(misclassified_indices),
        'error_rate': len(misclassified_indices) / len(val_labels),
        'false_positives': len(false_positives),
        'false_negatives': len(false_negatives),
        'fp_mean_confidence': float(np.mean(fp_confidences)) if fp_confidences else 0,
        'fn_mean_confidence': float(np.mean(fn_confidences)) if fn_confidences else 0
    },
    'test_predictions': {
        'total_samples': len(test_targets),
        'disaster_predictions': num_disaster,
        'non_disaster_predictions': num_non_disaster,
        # Stored on a 0-100 scale so the value matches the key name and the
        # percentage printed in the summary below.
        'disaster_percentage': disaster_fraction * 100,
        'mean_confidence': float(np.mean(confidence_scores)),
        'confidence_std': float(np.std(confidence_scores))
    },
    'model_info': {
        'model_type': 'Hybrid DistilBERT + Meta-Features',
        'bert_model': 'distilbert-base-uncased',
        'num_meta_features': len(meta_cols),
        'total_parameters': total_parameters,
        'best_epoch': best_epoch
    }
}

# Save comprehensive results (create the target directory on first run)
results_path = 'results/metrics/final_results.json'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'w') as f:
    json.dump(final_results, f, indent=2)

# Print summary
print("🎯 FINAL MODEL SUMMARY")
print("="*60)
print(f"\n📊 Validation Performance:")
print(f"  Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
print(f"  F1 Score: {val_f1:.4f}")
print(f"  Precision: {val_precision:.4f}")
print(f"  Recall: {val_recall:.4f}")
print(f"  ROC AUC: {roc_auc:.4f}")

print(f"\n🎯 Per-Class Performance:")
print(f"  Non-Disaster - F1: {report['Non-Disaster']['f1-score']:.4f}, Precision: {report['Non-Disaster']['precision']:.4f}, Recall: {report['Non-Disaster']['recall']:.4f}")
print(f"  Disaster - F1: {report['Disaster']['f1-score']:.4f}, Precision: {report['Disaster']['precision']:.4f}, Recall: {report['Disaster']['recall']:.4f}")

print(f"\n📈 Test Predictions:")
print(f"  Total samples: {len(test_targets)}")
print(f"  Disaster predictions: {num_disaster} ({disaster_fraction*100:.1f}%)")
print(f"  Mean confidence: {np.mean(confidence_scores):.4f}")

print(f"\n🔍 Error Analysis:")
print(f"  Total errors: {len(misclassified_indices)} ({len(misclassified_indices)/len(val_labels)*100:.1f}%)")
print(f"  False positives: {len(false_positives)}")
print(f"  False negatives: {len(false_negatives)}")

print(f"\n🤖 Model Architecture:")
print(f"  Type: Hybrid DistilBERT + Meta-Features")
print(f"  Total parameters: {total_parameters:,}")
print(f"  Meta features: {len(meta_cols)}")
print(f"  Best epoch: {best_epoch}")

print(f"\n📁 Files Created:")
print(f"  - Data/submission.csv (competition submission)")
print(f"  - Data/submission_enhanced.csv (with confidence scores)")
print(f"  - results/metrics/final_results.json (comprehensive metrics)")
print(f"  - results/visualizations/comprehensive_performance.png")
print(f"  - models/best_model.pth (trained model)")

print("\n" + "="*60)
print("🎉 Disaster Tweet Classification Project Completed!")
print("="*60)

## 5.10 Next Steps and Recommendations

In [None]:
# Print recommendations based on analysis
print("🚀 RECOMMENDATIONS FOR FUTURE IMPROVEMENT")
print("="*60)

# Static recommendations: (section heading, numbered items)
recommendation_sections = [
    ("🔧 Model Improvements:", [
        "Try larger BERT models (BERT-base, RoBERTa)",
        "Implement ensemble methods (multiple models)",
        "Add more sophisticated meta-features",
        "Experiment with different architectures (LSTM + BERT)",
    ]),
    ("📊 Data Enhancements:", [
        "Data augmentation techniques (back-translation, synonym replacement)",
        "Handle class imbalance with weighted loss or sampling",
        "Collect more labeled data for minority class",
        "Implement cross-validation for robust evaluation",
    ]),
    ("⚡ Training Optimizations:", [
        "Hyperparameter tuning (learning rate, batch size, dropout)",
        "Advanced regularization techniques",
        "Learning rate scheduling and warmup",
        "Mixed precision training for faster training",
    ]),
    ("🌐 Deployment Ready:", [
        "Create REST API for model inference",
        "Implement real-time Twitter stream processing",
        "Add model monitoring and drift detection",
        "Containerize with Docker for easy deployment",
    ]),
]

for heading, items in recommendation_sections:
    print(f"\n{heading}")
    for number, item in enumerate(items, start=1):
        print(f"  {number}. {item}")

# Data-driven recommendation: target whichever error type dominates on the
# validation set. (The original line had a stray `")` after the call, a
# SyntaxError that prevented the whole cell from running.)
print("\n📈 Based on Error Analysis:")
if len(false_negatives) > len(false_positives):
    print("  - Focus on improving disaster recall (missed disasters)")
    print("  - Consider using focal loss to prioritize hard examples")
else:
    print("  - Focus on reducing false alarms")
    print("  - Adjust classification threshold if needed")

print(f"\n✅ Project successfully completed with {val_f1:.4f} F1 score!")
print("Ready for production deployment and further improvements.")