# Import Packages

In [26]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

import torch
from torch.utils.data import Dataset, DataLoader


# Load Data

In [2]:
# Load the dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [3]:
df = pd.DataFrame({
    'Text': newsgroups.data,
    'Category': newsgroups.target
})
df['Category Name'] = df['Category'].apply(lambda x: newsgroups.target_names[x])
df

Unnamed: 0,Text,Category,Category Name
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware
...,...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13,sci.med
18842,\nNot in isolated ground recepticles (usually ...,12,sci.electronics
18843,I just installed a DX2-66 CPU in a clone mothe...,3,comp.sys.ibm.pc.hardware
18844,\nWouldn't this require a hyper-sphere. In 3-...,1,comp.graphics


# Preprocessing

In [4]:
# Lowercasing
df['Text'] = df['Text'].str.lower()

# Remove Punctuation and special characters
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Tokenization
df['Tokens'] = df['Text'].apply(word_tokenize)

# Removing Stopwords
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Stemming
stemmer = PorterStemmer()
df['Tokens'] = df['Tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

In [8]:
max(df['Tokens'].apply(lambda x: len(x)))

6620

# Baseline Model

## Data Preperation

In [10]:
# Define and fit the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=6620, stop_words='english')  # Set max_features to limit vocab size if necessary
X_tfidf = tfidf_vectorizer.fit_transform(df['Text']).toarray()  # Assuming you are using the original 'Text' column

# Encode the labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [12]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create PyTorch Dataset
class TfidfDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TfidfDataset(X_train_tensor, y_train_tensor)
val_dataset = TfidfDataset(X_val_tensor, y_val_tensor)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Baseline Model

In [17]:
import torch.nn as nn
import torch.optim as optim

class LogisticRegressionClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegressionClassifier, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

## Model Training

In [21]:
# Model Parameters
input_dim = X_train.shape[1]
output_dim = len(label_encoder.classes_)

# Instantiate the model
model = LogisticRegressionClassifier(input_dim, output_dim)
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

num_epochs = 50  
patience = 5  # Number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')
epochs_no_improve = 0

# Training loop with early stopping
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(X_batch)

        # Calculate loss
        loss = criterion(predictions, y_batch)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate epoch loss
        epoch_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted_classes = torch.max(predictions, 1)
            correct_predictions += (predicted_classes == y_batch).sum().item()
            total_predictions += y_batch.size(0)

    val_acc = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)

    # Print the results for the current epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # You can also save the model if it has the best validation loss so far
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs.')
        break

# Load the best model state if needed
model.load_state_dict(best_model_state)

Epoch [1/50], Training Loss: 2.5063, Validation Loss: 2.1089, Validation Accuracy: 0.6629
Epoch [2/50], Training Loss: 1.7015, Validation Loss: 1.6734, Validation Accuracy: 0.6867
Epoch [3/50], Training Loss: 1.2818, Validation Loss: 1.4430, Validation Accuracy: 0.6947
Epoch [4/50], Training Loss: 1.0336, Validation Loss: 1.3097, Validation Accuracy: 0.6987
Epoch [5/50], Training Loss: 0.8700, Validation Loss: 1.2250, Validation Accuracy: 0.7029
Epoch [6/50], Training Loss: 0.7538, Validation Loss: 1.1674, Validation Accuracy: 0.7027
Epoch [7/50], Training Loss: 0.6645, Validation Loss: 1.1264, Validation Accuracy: 0.7021
Epoch [8/50], Training Loss: 0.5948, Validation Loss: 1.0984, Validation Accuracy: 0.6997
Epoch [9/50], Training Loss: 0.5382, Validation Loss: 1.0769, Validation Accuracy: 0.6989
Epoch [10/50], Training Loss: 0.4904, Validation Loss: 1.0613, Validation Accuracy: 0.6987
Epoch [11/50], Training Loss: 0.4506, Validation Loss: 1.0497, Validation Accuracy: 0.6960
Epoch [1

<All keys matched successfully>

## Model Evaluation

In [24]:
# Switch model to evaluation mode
model.eval()

# Initialize lists to store the predictions and true labels
all_predictions = []
all_labels = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # Get model predictions
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        
        # Store predictions and labels
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Convert to NumPy arrays for evaluation
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

# Print evaluation results
print(f'Validation Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Validation Accuracy: 0.6865
Precision: 0.6949
Recall: 0.6865
F1 Score: 0.6882


In [30]:
# Initialize lists to store the predictions and true labels
all_predictions = []
all_labels = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Get model predictions
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)

        # Store predictions and labels
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Convert to NumPy arrays for evaluation
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Convert label encoder classes to a list of strings
target_names = list(map(str, label_encoder.classes_))

# Generate a classification report
report = classification_report(all_labels, all_predictions, target_names=target_names)
print(report)

              precision    recall  f1-score   support

           0       0.53      0.52      0.52       151
           1       0.65      0.62      0.63       202
           2       0.63      0.59      0.61       195
           3       0.59      0.64      0.61       183
           4       0.71      0.61      0.66       205
           5       0.78      0.78      0.78       215
           6       0.73      0.69      0.71       193
           7       0.47      0.73      0.57       196
           8       0.68      0.69      0.69       168
           9       0.84      0.80      0.82       211
          10       0.92      0.85      0.88       198
          11       0.85      0.75      0.79       201
          12       0.58      0.63      0.60       202
          13       0.78      0.80      0.79       194
          14       0.76      0.74      0.75       189
          15       0.76      0.75      0.76       202
          16       0.65      0.67      0.66       188
          17       0.79    