# Preparation that all models would require

In [8]:
import time
import pandas as pd
import numpy as np
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import Dataset, DataLoader

# Custom print functions
def print_timer_info(message):
    print(f"[TIMER INFO] {message}")

def print_output_data(message):
    print(f"[OUTPUT DATA] {message}")

# Automatically download missing NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Check CUDA availability and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print_output_data('Using device: {}'.format(device))


# Load the dataset
start_time = time.time()
print_timer_info("Loading the 20 Newsgroups dataset...")
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({'Text': newsgroups.data, 'Category': newsgroups.target})
df['Category Name'] = df['Category'].apply(lambda x: newsgroups.target_names[x])
print_timer_info(f"Time taken to load dataset: {time.time() - start_time:.2f} seconds")

# Pre-processing
print_timer_info("Starting pre-processing...")
preprocess_start_time = time.time()

df['Text'] = df['Text'].str.lower()  # Lowercasing
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))  # Remove punctuation/special characters
df['Tokens'] = df['Text'].apply(word_tokenize)  # Tokenization

stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])  # Stopwords removal

stemmer = PorterStemmer()
df['Tokens'] = df['Tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])  # Stemming

print_timer_info(f"Total time for pre-processing: {time.time() - preprocess_start_time:.2f} seconds")

# Move processed data to GPU if available (Example - converting text data to tensor)
max_token_length = max(df['Tokens'].apply(len))
print_output_data(f"Maximum length of tokenized text: {max_token_length}")

# Data Preparation
data_prep_start_time = time.time()
print_timer_info("Starting data preparation...")

# Define and fit the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=6620, stop_words='english')  # Set max_features to limit vocab size if necessary
X_tfidf = tfidf_vectorizer.fit_transform(df['Text']).toarray()  # Assuming you are using the original 'Text' column

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create PyTorch Dataset
class TfidfDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TfidfDataset(X_train_tensor, y_train_tensor)
val_dataset = TfidfDataset(X_val_tensor, y_val_tensor)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print_timer_info(f"Total time for data preparation: {time.time() - data_prep_start_time:.2f} seconds")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shalo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shalo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[OUTPUT DATA] Using device: cuda
[TIMER INFO] Loading the 20 Newsgroups dataset...
[TIMER INFO] Time taken to load dataset: 1.68 seconds
[TIMER INFO] Starting pre-processing...
[TIMER INFO] Total time for pre-processing: 28.19 seconds
[OUTPUT DATA] Maximum length of tokenized text: 6620
[TIMER INFO] Starting data preparation...
[TIMER INFO] Total time for data preparation: 2.11 seconds


# Baseline Model

In [9]:
import torch.nn as nn
import torch.optim as optim

class LogisticRegressionClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegressionClassifier, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

# Model Training
# Model Parameters
input_dim = X_train.shape[1]
output_dim = len(label_encoder.classes_)

# Instantiate the model
model = LogisticRegressionClassifier(input_dim, output_dim)
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Move model to GPU if available
model = model.to(device)
criterion = criterion.to(device)

num_epochs = 50  
patience = 5  # Number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')
epochs_no_improve = 0

# Training loop with early stopping
print_timer_info("Starting model training...")
training_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    model.train()
    epoch_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(X_batch)

        # Calculate loss
        loss = criterion(predictions, y_batch)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate epoch loss
        epoch_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted_classes = torch.max(predictions, 1)
            correct_predictions += (predicted_classes == y_batch).sum().item()
            total_predictions += y_batch.size(0)

    val_acc = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)

    # Print the results for the current epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, Time Taken: {time.time() - epoch_start_time:.2f} seconds')

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # You can also save the model if it has the best validation loss so far
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs.')
        break

print_timer_info(f"Total training time: {time.time() - training_start_time:.2f} seconds")

# Load the best model state if needed
model.load_state_dict(best_model_state)

# Model Evaluation
eval_start_time = time.time()
print_timer_info("Starting model evaluation...")

# Switch model to evaluation mode
model.eval()

# Initialize lists to store the predictions and true labels
all_predictions = []
all_labels = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # Get model predictions
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        
        # Store predictions and labels
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Convert to NumPy arrays for evaluation
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

# Print evaluation results
print(f'Validation Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

print_timer_info(f"Total evaluation time: {time.time() - eval_start_time:.2f} seconds")

# Print report
print_timer_info("Generating classification report...")
report_start_time = time.time()

# Generate a classification report
report = classification_report(all_labels, all_predictions, target_names=list(map(str, label_encoder.classes_)))
print(report)

print_timer_info(f"Time taken to generate classification report: {time.time() - report_start_time:.2f} seconds")

[TIMER INFO] Starting model training...
Epoch [1/50], Training Loss: 2.5029, Validation Loss: 2.1053, Validation Accuracy: 0.6655, Time Taken: 0.57 seconds
Epoch [2/50], Training Loss: 1.6989, Validation Loss: 1.6707, Validation Accuracy: 0.6878, Time Taken: 0.56 seconds
Epoch [3/50], Training Loss: 1.2814, Validation Loss: 1.4424, Validation Accuracy: 0.6960, Time Taken: 0.55 seconds
Epoch [4/50], Training Loss: 1.0333, Validation Loss: 1.3094, Validation Accuracy: 0.6973, Time Taken: 0.55 seconds
Epoch [5/50], Training Loss: 0.8692, Validation Loss: 1.2244, Validation Accuracy: 0.6981, Time Taken: 0.53 seconds
Epoch [6/50], Training Loss: 0.7530, Validation Loss: 1.1675, Validation Accuracy: 0.7005, Time Taken: 0.56 seconds
Epoch [7/50], Training Loss: 0.6649, Validation Loss: 1.1274, Validation Accuracy: 0.7034, Time Taken: 0.54 seconds
Epoch [8/50], Training Loss: 0.5947, Validation Loss: 1.0979, Validation Accuracy: 0.7024, Time Taken: 0.55 seconds
Epoch [9/50], Training Loss: 0.5