In [24]:
import time
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter

# Helper function for timing info
def print_timer_info(message):
    """Print *message* on stdout, prefixed with the ``[TIMER INFO]`` tag."""
    tagged_line = "[TIMER INFO] {}".format(message)
    print(tagged_line)

# Pick the compute device: CUDA when PyTorch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print_timer_info("Using device: {}".format(device))

# === Load Dataset ===
start_time = time.time()
print_timer_info("Loading the 20 Newsgroups dataset...")
# Request the 'all' subset with headers, footers and quotes removed.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# One row per document: the text plus its target value.
df = pd.DataFrame({'Text': newsgroups.data, 'Category': newsgroups.target})
load_seconds = time.time() - start_time
print_timer_info(f"Dataset loaded in {load_seconds:.2f} seconds")

# === Pre-process Data ===
print_timer_info("Starting data pre-processing...")
preprocess_start_time = time.time()

# Fetch every NLTK resource used below before any of them is needed.
# word_tokenize depends on the Punkt tokenizer models (packaged as 'punkt' in
# older NLTK releases and 'punkt_tab' in newer ones); without them it raises
# LookupError on a fresh environment. quiet=True suppresses downloader output.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Text cleaning: lower-case, then drop every character that is neither an
# ASCII letter nor whitespace. The pattern is compiled once and reused per row.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
df['Text'] = df['Text'].str.lower()
df['Text'] = df['Text'].apply(lambda x: non_letter_pattern.sub('', x))

# Tokenization
df['Tokens'] = df['Text'].apply(word_tokenize)

# Stopwords removal (set membership keeps the per-token check cheap)
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Stemming
stemmer = PorterStemmer()
df['Tokens'] = df['Tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
print_timer_info(f"Pre-processing completed in {time.time() - preprocess_start_time:.2f} seconds")

# === Vocabulary and Sequence Preparation ===
print_timer_info("Building vocabulary and preparing sequences...")
vocab_build_start_time = time.time()

# Count every token of every document in one pass.
vocab = Counter(token for doc_tokens in df['Tokens'] for token in doc_tokens)
# Keep only the vocab_size most frequent tokens, as a plain {token: count} dict.
vocab_size = 10000
vocab = dict(vocab.most_common(vocab_size))
# Number the kept tokens in frequency order, starting at 1 (0 stays unassigned).
word_to_index = {word: rank for rank, word in enumerate(vocab, start=1)}

# Uniformly random embedding table with one extra row for index 0.
embedding_dim = 200
embedding_shape = (vocab_size + 1, embedding_dim)
embedding_matrix = torch.tensor(
    np.random.uniform(-0.05, 0.05, embedding_shape),
    dtype=torch.float32,
).to(device)
vocab_seconds = time.time() - vocab_build_start_time
print_timer_info(f"Vocabulary and embedding matrix created in {vocab_seconds:.2f} seconds")

# Convert text to sequences and pad sequences
max_seq_length = 120


def _tokens_to_ids(tokens):
    """Map each token to its index in word_to_index, using 0 for unknown tokens."""
    return [word_to_index.get(word, 0) for word in tokens]


def _fit_to_length(seq):
    """Truncate seq to max_seq_length, or right-pad it with zeros up to that length."""
    clipped = seq[:max_seq_length]
    return clipped + [0] * (max_seq_length - len(clipped))


def _as_long_tensor(values):
    """Build a torch.long tensor from values."""
    return torch.tensor(values, dtype=torch.long)


df['Sequences'] = df['Tokens'].apply(_tokens_to_ids)
df['Padded_Sequences'] = df['Sequences'].apply(_fit_to_length)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

# Split the data: 20% held out for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    df['Padded_Sequences'],
    y,
    test_size=0.2,
    random_state=42,
)

# Convert to tensors
X_train_tensor = _as_long_tensor(X_train.tolist())
y_train_tensor = _as_long_tensor(y_train)
X_val_tensor = _as_long_tensor(X_val.tolist())
y_val_tensor = _as_long_tensor(y_val)

# Create PyTorch Dataset
class SequenceDataset(Dataset):
    """Pair an indexable collection of inputs with an indexable collection of labels."""

    def __init__(self, X, y):
        """Store the inputs ``X`` and labels ``y`` as given, without copying."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Wrap the train/validation tensors in datasets and mini-batch loaders.
batch_size = 16
train_dataset = SequenceDataset(X=X_train_tensor, y=y_train_tensor)
val_dataset = SequenceDataset(X=X_val_tensor, y=y_val_tensor)
# Only the training loader shuffles; validation batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

print_timer_info("Data preparation completed")

# === Define CNN-LSTM Model with Dropout Regularization ===
class CNNLSTMClassifier(nn.Module):
    """Classifier built as embedding -> Conv1d -> ReLU -> max-pool -> LSTM -> linear.

    Args:
        vocab_size: number of rows in the embedding table (row 0 is the padding row).
        embedding_dim: width of each embedding vector; also the conv input channels.
        hidden_dim: LSTM hidden size and input width of the final linear layer.
        output_dim: number of logits produced per sample.
        num_lstm_layers: number of stacked LSTM layers.
        kernel_size: width of the convolution kernel.
        num_filters: conv output channels, which is also the LSTM input size.
        dropout_prob: drop probability of the dropout layer (used twice in forward).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_lstm_layers=2, kernel_size=5, num_filters=128, dropout_prob=0.5):
        super().__init__()

        # Token lookup; index 0 is declared as padding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Convolution over the sequence axis followed by pooling with window 2.
        self.conv1 = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=num_filters,
            kernel_size=kernel_size,
        )
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(dropout_prob)
        # Recurrent layer over the pooled conv features, batch dimension first.
        self.lstm = nn.LSTM(
            input_size=num_filters,
            hidden_size=hidden_dim,
            num_layers=num_lstm_layers,
            batch_first=True,
        )
        # Classification head.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return one row of ``output_dim`` logits per input sequence of token ids."""
        embedded = self.embedding(x)
        # Conv1d expects channels before length, so swap the last two axes.
        conv_in = embedded.transpose(1, 2)
        pooled = self.pool(F.relu(self.conv1(conv_in)))
        # Swap back so the LSTM (batch_first=True) sees features on the last axis.
        lstm_in = self.dropout(pooled).transpose(1, 2)
        lstm_out, _ = self.lstm(lstm_in)
        # Only the output at the final time step feeds the classifier.
        last_step = lstm_out[:, -1, :]
        return self.fc(self.dropout(last_step))

# Model hyper-parameters; the output width is the number of encoded classes.
hidden_dim = 256
output_dim = len(label_encoder.classes_)

# Build the network (embedding table has vocab_size + 1 rows) and move it to the device.
model = CNNLSTMClassifier(
    vocab_size=vocab_size + 1,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
model = model.to(device)
# Cross-entropy loss optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# === Train Model with Early Stopping ===
num_epochs = 100
patience = 10  # stop after this many consecutive epochs without a better validation loss
best_val_loss = float('inf')
epochs_no_improve = 0
# Frozen copy of the weights from the epoch with the lowest validation loss.
# Stays None if no epoch ever beats the initial infinite loss (e.g. NaN losses).
best_model_state = None

print_timer_info("Starting model training...")
training_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    model.train()
    epoch_loss = 0

    # Training phase: one optimizer step per mini-batch.
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            val_loss += loss.item()
            _, predicted_classes = torch.max(predictions, 1)
            correct_predictions += (predicted_classes == y_batch).sum().item()
            total_predictions += y_batch.size(0)

    # Accuracy is per sample; both losses are averaged per batch.
    val_acc = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)
    print(f"[EPOCH {epoch + 1}/{num_epochs}] Training Loss: {epoch_loss / len(train_loader):.4f}, "
          f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, "
          f"Epoch Time: {time.time() - epoch_start_time:.2f} seconds")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # state_dict() returns references to the live parameter tensors, which the
        # optimizer keeps updating in place. Clone them so this snapshot really
        # holds the best epoch's weights rather than whatever training ends on.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print_timer_info(f"Early stopping triggered after {epoch + 1} epochs")
        break

print_timer_info(f"Total training time: {time.time() - training_start_time:.2f} seconds")

# Load best model state (skipped when no snapshot was ever taken)
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# === Evaluate Model ===
print_timer_info("Evaluating model on validation set...")
eval_start_time = time.time()

# Gather predicted and true class ids over the whole validation loader.
all_predictions, all_labels = [], []
model.eval()

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(X_batch)
        # Index of the largest logit along dim 1 is the predicted class.
        predicted = torch.max(outputs, 1).indices
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Overall accuracy plus weighted-average precision / recall / F1.
accuracy = accuracy_score(all_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels,
    all_predictions,
    average='weighted',
)

print("\n=== Evaluation Results ===")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")
print_timer_info(f"Evaluation completed in {time.time() - eval_start_time:.2f} seconds")

# === Classification Report ===
print_timer_info("Generating classification report...")
report_start_time = time.time()
# Class names are the string form of the encoder's classes.
class_names = [str(cls) for cls in label_encoder.classes_]
report = classification_report(all_labels, all_predictions, target_names=class_names)
print(report)
print_timer_info(f"Classification report generated in {time.time() - report_start_time:.2f} seconds")


[TIMER INFO] Using device: cuda
[TIMER INFO] Loading the 20 Newsgroups dataset...
[TIMER INFO] Dataset loaded in 2.96 seconds
[TIMER INFO] Starting data pre-processing...
[TIMER INFO] Pre-processing completed in 85.44 seconds
[TIMER INFO] Building vocabulary and preparing sequences...
[TIMER INFO] Vocabulary and embedding matrix created in 0.43 seconds
[TIMER INFO] Data preparation completed
[TIMER INFO] Starting model training...
[EPOCH 2/100] Training Loss: 2.5411, Validation Loss: 2.4478, Validation Accuracy: 0.1554, Epoch Time: 7.46 seconds
[EPOCH 3/100] Training Loss: 2.3646, Validation Loss: 2.3198, Validation Accuracy: 0.1732, Epoch Time: 7.32 seconds
[EPOCH 4/100] Training Loss: 2.2260, Validation Loss: 2.2464, Validation Accuracy: 0.1984, Epoch Time: 7.36 seconds
[EPOCH 5/100] Training Loss: 2.0807, Validation Loss: 2.1431, Validation Accuracy: 0.2146, Epoch Time: 7.62 seconds
[EPOCH 6/100] Training Loss: 1.9364, Validation Loss: 2.0460, Validation Accuracy: 0.2676, Epoch Time