In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import joblib

# Helper function for timing info
def print_timer_info(message):
    print(f"[TIMER INFO] {message}")

# Set up device
def setup_device():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print_timer_info(f"Using device: {device}")
    torch.cuda.empty_cache()
    return device

# Load and preprocess dataset
def load_data(data_path):
    print_timer_info("Loading preprocessed dataset...")
    X_train, y_train, X_val, y_val = joblib.load(data_path)
    print_timer_info("Preprocessed dataset loaded")
    return X_train, y_train, X_val, y_val

# Convert data to tensors
def convert_to_tensors(X_train, y_train, X_val, y_val, device):
    X_train_tensor = torch.tensor(np.array(X_train), dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(np.array(y_train), dtype=torch.long).to(device)
    X_val_tensor = torch.tensor(np.array(X_val), dtype=torch.float32).to(device)
    y_val_tensor = torch.tensor(np.array(y_val), dtype=torch.long).to(device)
    return X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor

# Dataset and DataLoader creation
class PreprocessedDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def create_data_loaders(X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor, batch_size):
    train_dataset = PreprocessedDataset(X_train_tensor, y_train_tensor)
    val_dataset = PreprocessedDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    print_timer_info("Data preparation completed")
    return train_loader, val_loader

# Define CNN-Bidirectional LSTM Model with Dropout Regularization
class CNNLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_lstm_layers=2, kernel_size=5, num_filters=128, dropout_prob=0.5):
        super(CNNLSTMClassifier, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=num_filters, kernel_size=kernel_size)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(dropout_prob)
        
        # Bidirectional LSTM
        self.lstm = nn.LSTM(input_size=num_filters, hidden_size=hidden_dim, num_layers=num_lstm_layers, 
                            batch_first=True, bidirectional=True)
        
        # Adjust the fully connected layer for bidirectional output
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # Convert to shape (batch_size, input_dim, seq_len)
        x = self.conv1(x)
        x = F.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = x.permute(0, 2, 1)  # Convert back to shape (batch_size, seq_len, num_filters)
        
        # Forward pass through bidirectional LSTM
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # (batch_size, hidden_dim * 2) for bidirectional LSTM
        
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Training loop with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience, device):
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_state = None

    print_timer_info("Starting model training...")
    training_start_time = time.time()

    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        model.train()
        epoch_loss = 0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                predictions = model(X_batch)
                loss = criterion(predictions, y_batch)
                val_loss += loss.item()
                _, predicted_classes = torch.max(predictions, 1)
                correct_predictions += (predicted_classes == y_batch).sum().item()
                total_predictions += y_batch.size(0)

        val_acc = correct_predictions / total_predictions
        avg_val_loss = val_loss / len(val_loader)
        print(f"[EPOCH {epoch + 1}/{num_epochs}] Training Loss: {epoch_loss / len(train_loader):.4f}, "
              f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, "
              f"Epoch Time: {time.time() - epoch_start_time:.2f} seconds")

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print_timer_info(f"Early stopping triggered after {epoch + 1} epochs")
            break

    print_timer_info(f"Total training time: {time.time() - training_start_time:.2f} seconds")
    model.load_state_dict(best_model_state)

# Model evaluation
def evaluate_model(model, val_loader, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

    print(f"\n=== Evaluation Results ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    report = classification_report(all_labels, all_predictions)
    print(report)

# Main function to run the entire process
def main():
    device = setup_device()
    data_path = '/kaggle/input/sha-longs-ai6103-bert-train-test-data/bert_train_test_data.pkl'
    X_train, y_train, X_val, y_val = load_data(data_path)

    X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor = convert_to_tensors(X_train, y_train, X_val, y_val, device)

    batch_size = 16
    train_loader, val_loader = create_data_loaders(X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor, batch_size)

    input_dim = 768
    hidden_dim = 256
    output_dim = len(np.unique(y_train))

    model = CNNLSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 200
    patience = 10
    train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience, device)

    evaluate_model(model, val_loader, device)

if __name__ == "__main__":
    main()


[TIMER INFO] Using device: cuda
[TIMER INFO] Loading preprocessed dataset...
[TIMER INFO] Preprocessed dataset loaded
[TIMER INFO] Data preparation completed
[TIMER INFO] Starting model training...
[EPOCH 1/200] Training Loss: 2.0113, Validation Loss: 1.5077, Validation Accuracy: 0.4674, Epoch Time: 14.78 seconds
[EPOCH 2/200] Training Loss: 1.4232, Validation Loss: 1.3197, Validation Accuracy: 0.5395, Epoch Time: 13.31 seconds
[EPOCH 3/200] Training Loss: 1.2215, Validation Loss: 1.2882, Validation Accuracy: 0.5637, Epoch Time: 12.85 seconds
[EPOCH 4/200] Training Loss: 1.1251, Validation Loss: 1.1637, Validation Accuracy: 0.6138, Epoch Time: 13.04 seconds
[EPOCH 5/200] Training Loss: 1.0633, Validation Loss: 1.1109, Validation Accuracy: 0.6363, Epoch Time: 12.97 seconds
[EPOCH 6/200] Training Loss: 0.9925, Validation Loss: 1.1453, Validation Accuracy: 0.6438, Epoch Time: 12.65 seconds
[EPOCH 7/200] Training Loss: 0.9495, Validation Loss: 1.1002, Validation Accuracy: 0.6541, Epoch Tim