# 4. Model Training

This notebook builds the hybrid DistilBERT + meta-features model and trains it on the preprocessed data.

## Model Architecture:
- **Base Model**: DistilBERT encoder (`DistilBertModel`); the hidden state of the first (`[CLS]`) token is used as the text embedding
- **Meta Features**: 10 normalized engineered features
- **Hybrid Approach**: Concatenate the DistilBERT text embedding with a 32-dimensional projection of the meta-features, then classify with a small feed-forward head
- **Classification**: Binary classification (disaster vs non-disaster)

In [None]:
# Status message for the setup cell.
# NOTE(review): no import statements are visible in this cell, yet later cells use
# pd, np, torch, nn, Dataset, DataLoader, AdamW, get_scheduler, tqdm, f1_score,
# train_test_split and the DistilBERT classes — confirm those imports (and the
# `config`, `meta_cols` and `device` definitions) run before this line on a fresh kernel.
print("Libraries imported successfully!")

## 4.1 Load Preprocessed Data

In [None]:
# Read the cleaned train and test splits from ../Data (train first, then test)
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train_cleaned.csv', '../Data/test_cleaned.csv')
)

## 4.2 Data Preparation

In [None]:
# Pull the text, the meta-feature matrix and the target out of the training frame
X_text, X_meta, y = (
    df_train['text_clean'].values,
    df_train[meta_cols].values,
    df_train['target'].values,
)

print(
    f"Text data shape: {X_text.shape}",
    f"Meta features shape: {X_meta.shape}",
    f"Target shape: {y.shape}",
    sep="\n",
)

# Hold out a validation set, stratified on the target so both parts keep
# the same class balance; the three arrays are split with the same indices
(
    X_text_train, X_text_val,
    X_meta_train, X_meta_val,
    y_train, y_val,
) = train_test_split(
    X_text, X_meta, y,
    stratify=y,
    test_size=config['training_config']['val_split'],
    random_state=config['training_config']['random_seed'],
)

print(f"\nTraining set size: {len(X_text_train)}")
print(f"Validation set size: {len(X_text_val)}")
# np.bincount reports how many samples fall into each integer class id
print(f"Training class distribution: {np.bincount(y_train)}")
print(f"Validation class distribution: {np.bincount(y_val)}")

## 4.3 Tokenization Setup

In [None]:
# Initialize tokenizer
# Loads the fast DistilBERT tokenizer for the checkpoint named in
# config['model_config']['model_name'] — the same config entry that is passed
# to the model constructor further down, so tokenizer and encoder stay matched.
tokenizer = DistilBertTokenizerFast.from_pretrained(config['model_config']['model_name'])

# Tokenization function
def tokenize_function(texts, max_length=None):
    """
    Tokenize texts into fixed-length PyTorch tensors with the module-level tokenizer.

    Args:
        texts: A single string, or a sequence of texts (list, NumPy array,
            pandas Series, ...). Missing entries (None or float NaN) are
            replaced by empty strings; all other entries are passed through
            unchanged.
        max_length: Length every sequence is padded/truncated to. When None,
            config['model_config']['max_length'] is used.

    Returns:
        The tokenizer output for the batch, produced with truncation=True,
        padding='max_length' and return_tensors='pt'.
    """
    if max_length is None:
        max_length = config['model_config']['max_length']

    if isinstance(texts, str):
        # A lone string is handed to the tokenizer as-is (iterating it would
        # split it into characters).
        batch = texts
    else:
        items = texts.tolist() if isinstance(texts, np.ndarray) else texts
        # An empty text field in a CSV is read back by pandas as NaN, and the
        # tokenizer only accepts strings, so missing values are mapped to "".
        # `t != t` is the NaN test; it needs no extra imports.
        batch = [
            "" if t is None or (isinstance(t, float) and t != t) else t
            for t in items
        ]

    return tokenizer(
        batch,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

# Encode each split in turn: train, validation, then test
print("Tokenizing training data...")
train_encodings = tokenize_function(X_text_train)

print("Tokenizing validation data...")
val_encodings = tokenize_function(X_text_val)

print("Tokenizing test data...")
# The test frame has not been split into arrays yet, so extract text and meta-features here
test_texts, test_meta = df_test['text_clean'].values, df_test[meta_cols].values
test_encodings = tokenize_function(test_texts)

print("\nTokenization completed!")
print(
    f"Training encodings shape: {train_encodings['input_ids'].shape}",
    f"Validation encodings shape: {val_encodings['input_ids'].shape}",
    f"Test encodings shape: {test_encodings['input_ids'].shape}",
    sep="\n",
)

## 4.4 Custom Dataset Class

In [None]:
class DisasterTweetDataset(Dataset):
    """
    Dataset that pairs tokenized tweets with their meta-features and, optionally, labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries, each
            indexable by sample position.
        meta_features: Array-like with one row of meta-features per sample;
            stored as a float32 tensor.
        labels: Optional array-like with one integer class id per sample;
            stored as an int64 tensor. Leave as None for unlabeled data.

    Raises:
        ValueError: If encodings, meta_features and labels do not all contain
            the same number of samples.
    """
    def __init__(self, encodings, meta_features, labels=None):
        self.encodings = encodings
        # Explicit dtypes replace the legacy FloatTensor/LongTensor constructors.
        self.meta_features = torch.tensor(meta_features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None

        # Fail early on misaligned inputs: indexing would otherwise pair a
        # tweet with another tweet's meta-features or label without any error.
        n_samples = len(self.meta_features)
        n_encoded = len(encodings['input_ids'])
        if n_encoded != n_samples:
            raise ValueError(
                f"encodings contain {n_encoded} samples but meta_features has {n_samples}"
            )
        if self.labels is not None and len(self.labels) != n_samples:
            raise ValueError(
                f"labels contain {len(self.labels)} samples but meta_features has {n_samples}"
            )

    def __len__(self):
        """Return the number of samples (rows of meta_features)."""
        return len(self.meta_features)

    def __getitem__(self, idx):
        """
        Return one sample as a dict with 'input_ids', 'attention_mask' and
        'meta_features', plus 'labels' when the dataset was built with labels.
        """
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'meta_features': self.meta_features[idx]
        }

        if self.labels is not None:
            item['labels'] = self.labels[idx]

        return item

# Wrap each split in a dataset; the test split carries no labels
train_dataset = DisasterTweetDataset(train_encodings, X_meta_train, labels=y_train)
val_dataset = DisasterTweetDataset(val_encodings, X_meta_val, labels=y_val)
test_dataset = DisasterTweetDataset(test_encodings, test_meta)

print(
    f"Training dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(val_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

# Inspect the first training item to check what a single sample looks like
sample = train_dataset[0]
print(f"\nSample data keys: {list(sample)}")
print(f"Input IDs shape: {sample['input_ids'].shape}")
print(f"Meta features shape: {sample['meta_features'].shape}")
print(f"Label: {sample['labels']}")

## 4.5 Hybrid Model Architecture

In [None]:
class HybridDistilBERTClassifier(nn.Module):
    """
    Two-branch classifier: a DistilBERT text encoder plus a small meta-feature
    branch, whose outputs are concatenated and fed to a feed-forward head.

    Args:
        bert_model_name: Name or path handed to DistilBertModel.from_pretrained.
        num_meta_features: Number of meta-feature columns per sample.
        num_labels: Number of output logits (default 2).
        dropout: Dropout probability used in the meta branch and the head.
    """
    def __init__(self, bert_model_name, num_meta_features, num_labels=2, dropout=0.2):
        super().__init__()

        # Text branch: pretrained encoder; its hidden size sets the embedding width
        self.bert = DistilBertModel.from_pretrained(bert_model_name)
        self.bert_dim = self.bert.config.hidden_size

        # Meta branch: batch-norm -> linear projection to 32 units -> ReLU -> dropout
        self.meta_bn = nn.BatchNorm1d(num_meta_features)
        self.meta_fc = nn.Linear(num_meta_features, 32)
        self.meta_activation = nn.ReLU()
        self.meta_dropout = nn.Dropout(dropout)

        # Head over the concatenated [text embedding | projected meta-features]
        head_layers = [
            nn.Linear(self.bert_dim + 32, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, meta_features):
        """Return the class logits for a batch of token ids, masks and meta-features."""
        # Text embedding = hidden state at sequence position 0 (the CLS token)
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = encoder_out.last_hidden_state[:, 0, :]

        # Run the meta-features through their branch in one pass
        meta_hidden = self.meta_dropout(
            self.meta_activation(self.meta_fc(self.meta_bn(meta_features)))
        )

        # Join both branches along the feature axis and classify
        return self.classifier(torch.cat([cls_embedding, meta_hidden], dim=1))

# Build the hybrid classifier from the config values and move it to the target device
model = HybridDistilBERTClassifier(
    bert_model_name=config['model_config']['model_name'],
    num_meta_features=len(meta_cols),
    num_labels=config['model_config']['num_labels'],
    dropout=config['model_config']['dropout'],
).to(device)

print("Model created successfully!")
print(f"Total parameters: {sum(param.numel() for param in model.parameters()):,}")
print(f"Trainable parameters: {sum(param.numel() for param in model.parameters() if param.requires_grad):,}")

# Summarise the architecture; the 32 below mirrors the meta projection width
# hard-coded in HybridDistilBERTClassifier
print(
    "\nModel Architecture:",
    f"- BERT model: {config['model_config']['model_name']}",
    f"- BERT dimension: {model.bert_dim}",
    f"- Meta features: {len(meta_cols)}",
    "- Meta processed dimension: 32",
    f"- Combined dimension: {model.bert_dim + 32}",
    f"- Output classes: {config['model_config']['num_labels']}",
    sep="\n",
)

## 4.6 Training Setup

In [None]:
# One DataLoader per split, built in train/val/test order; only training is shuffled
train_loader, val_loader, test_loader = (
    DataLoader(
        split_dataset,
        batch_size=config['model_config']['batch_size'],
        shuffle=reshuffle,
        num_workers=2,
    )
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

print(
    "Data loaders created:",
    f"Train batches: {len(train_loader)}",
    f"Validation batches: {len(val_loader)}",
    f"Test batches: {len(test_loader)}",
    sep="\n",
)

# AdamW over all model parameters
optimizer = AdamW(
    model.parameters(),
    weight_decay=0.01,
    lr=config['model_config']['learning_rate'],
)

# The scheduler is stepped once per batch, so the horizon is batches-per-epoch x epochs
num_training_steps = config['model_config']['num_epochs'] * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=config['training_config']['warmup_steps'],
)

# Cross-entropy over the class logits, placed on the same device as the model
criterion = nn.CrossEntropyLoss().to(device)

print(
    "\nTraining setup completed:",
    f"Optimizer: AdamW with lr={config['model_config']['learning_rate']}",
    f"Scheduler: Linear decay over {num_training_steps} steps",
    "Loss function: CrossEntropyLoss",
    sep="\n",
)

## 4.7 Training Functions

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    """
    Train model for one epoch.

    Each batch is moved to `device`, run through the model, and used for one
    optimizer step followed by one scheduler step. Gradients are clipped to
    the norm given by config['training_config']['gradient_clipping'].

    Args:
        model: Module called as model(input_ids, attention_mask, meta_features).
        dataloader: Iterable of dict batches with 'input_ids', 'attention_mask',
            'meta_features' and 'labels' entries; must support len().
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        criterion: Loss called as criterion(outputs, labels).
        device: Device the batch tensors are moved to.

    Returns:
        (avg_loss, accuracy): the sum of the batch losses divided by
        len(dataloader), and the fraction of correctly predicted samples.

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(dataloader, desc="Training")

    # `batches_done` counts processed batches so the progress bar can show a
    # true running mean.
    for batches_done, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        # Move inputs to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        meta_features = batch['meta_features'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask, meta_features)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['training_config']['gradient_clipping'])
        optimizer.step()
        scheduler.step()

        # Statistics
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        # Update progress bar. The running loss is averaged over the batches
        # seen so far; dividing by len(dataloader) here would under-report
        # the loss until the final batch.
        current_loss = total_loss / batches_done
        current_acc = correct_predictions / total_predictions
        progress_bar.set_postfix({
            'loss': f'{current_loss:.4f}',
            'acc': f'{current_acc:.4f}'
        })

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions

    return avg_loss, accuracy

def evaluate_epoch(model, dataloader, criterion, device):
    """
    Run one gradient-free pass over a labeled dataloader and score the model.

    Returns:
        (avg_loss, accuracy, f1, all_predictions, all_labels), where avg_loss
        is the summed batch loss divided by len(dataloader), f1 is the
        weighted-average F1 score, and the last two entries are flat lists
        of predicted and true class ids.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    all_predictions, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Pull the four tensors out of the batch and move them to the device
            input_ids, attention_mask, meta_features, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'meta_features', 'labels')
            )

            logits = model(input_ids, attention_mask, meta_features)
            loss_sum += criterion(logits, labels).item()

            # Predicted class = index of the largest logit in each row
            predicted = torch.max(logits, 1).indices
            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)

            # Collect per-sample results on the CPU for the F1 computation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return (
        loss_sum / len(dataloader),
        n_correct / n_seen,
        f1_score(all_labels, all_predictions, average='weighted'),
        all_predictions,
        all_labels,
    )

## 4.8 Training Loop

In [None]:
      print(f"New best model saved! (F1: {val_f1:.4f})")

## 4.9 Training Visualization

In [None]:
# Confirmation message naming the path the training history is reported under.
# NOTE(review): the code that writes this file (and any plotting) is not visible
# in this cell — confirm it runs before this line.
print("Training history saved to results/metrics/training_history.json")

## 4.10 Model Evaluation

In [None]:
# Confirmation message naming the path the confusion-matrix image is reported under.
# NOTE(review): the code that computes and saves the figure is not visible in
# this cell — confirm it runs before this line.
print("Confusion matrix saved to results/visualizations/confusion_matrix.png")

## 4.11 Save Model Artifacts

In [None]:
# List the artifacts this notebook reports as written
print(
    "Model artifacts saved:",
    "- models/best_model.pth (best checkpoint)",
    "- models/final_model.pth (final model)",
    "- models/model_info.json (model metadata)",
    "- results/metrics/training_history.json",
    "- results/visualizations/confusion_matrix.png",
    sep="\n",
)

# Closing banner: blank line, rule, summary lines, rule
print("", "=" * 60, "Model training completed successfully!", sep="\n")
print(f"Best validation F1 score: {best_val_f1:.4f}")
print("Model saved for next notebook: 05_evaluation_prediction.ipynb", "=" * 60, sep="\n")