In [7]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from torchtext.vocab import GloVe
import torchtext
from torchtext.data import get_tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [2]:
# Location of the processed dataset, relative to the notebook's working directory.
file_path = '../data/processed/full_2k.csv'
# Read the CSV into the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Map each string category to an integer class id, stored in the new `label` column.
le = LabelEncoder()
le.fit(df['Category'])
df['label'] = le.transform(df['Category'])

# One class per distinct category seen by the encoder.
num_classes = len(le.classes_)

# Preview the first rows (head() shows 5 rows by default).
df.head()

Unnamed: 0,Description,Category,label
0,The Russian author offers an affectionate chro...,Biography,0
1,"""The Bible and the social and moral consequenc...",Religion,8
2,A New York Times Notable Book of the YearThis ...,Biography,0
3,"It was a wonderful summer, a great memory, the...",General,3
4,"For the first time ever, veteran World War II ...",History,4


In [4]:
def clean_text(text):
    """Lowercase `text` and strip every character that is not a-z, 0-9 or whitespace.

    Args:
        text: The raw description. Non-string values (e.g. the float NaN that
            pandas uses for missing cells, or None) are treated as empty text.

    Returns:
        str: The cleaned text, or '' when `text` is not a string.
    """
    # Missing descriptions arrive as NaN (a float); calling .lower() on them
    # would raise AttributeError, so map them to an empty string instead.
    if not isinstance(text, str):
        return ''

    # Characters are removed, not replaced by a space, so "well-known" -> "wellknown".
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Element-wise cleaning of every description into a new `cleaned_desc` column.
df['cleaned_desc'] = df['Description'].map(clean_text)

In [5]:
# Stage 1: hold out 10% of all rows as the test set, stratified by label.
X_temp, X_test, y_temp, y_test = train_test_split(
    df['cleaned_desc'],
    df['label'],
    test_size=0.1,
    random_state=13,
    stratify=df['label'],
)

# Stage 2: carve the validation set out of the remaining 90%.
# 0.111... (~1/9) of the remainder is roughly 10% of the full dataset,
# which gives validation and test sets of the same size.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.11111111111111,
    random_state=13,
    stratify=y_temp,
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

Train: 27543, Val: 3443, Test: 3443


In [25]:
# Pretrained GloVe word vectors from torchtext: the '6B' set at 300 dimensions.
# `glove.dim` and `glove.stoi` are read by sentence_to_sequence below.
glove = GloVe(name='6B', dim=300)

def sentence_to_sequence(sentence, glove, max_len=300):
    """
    Convert a sentence into a tensor of GloVe vectors of shape (max_len, glove.dim).

    The sentence is split on whitespace; words missing from `glove.stoi` are
    skipped. The first `max_len` known words are kept (truncation) and any
    remaining rows are left as zeros (padding at the end).

    Args:
        sentence: Text to encode. Non-string values (e.g. NaN) and empty or
            whitespace-only strings produce an all-zero tensor.
        glove: Embedding object exposing `dim`, `stoi` (word -> index mapping)
            and `glove[word]` lookup returning a vector of length `dim`.
        max_len: Number of rows (time steps) in the returned tensor.

    Returns:
        torch.Tensor: shape (max_len, glove.dim).
    """
    # Start from zeros: every row that is not overwritten below is the padding.
    sequence = torch.zeros(max_len, glove.dim)

    # Handle NaN / non-string input (empty strings fall through: split() yields nothing)
    if not isinstance(sentence, str):
        return sequence

    filled = 0
    for word in sentence.split():
        # Stop as soon as the buffer is full instead of looking up the words
        # that truncation would discard anyway.
        if filled >= max_len:
            break
        if word in glove.stoi:
            sequence[filled] = glove[word]
            filled += 1

    return sequence


In [33]:
def encode_descriptions(texts, glove, max_len=300):
    """Encode every text in `texts` into one (len(texts), max_len, glove.dim) tensor.

    Each row is produced by sentence_to_sequence and written straight into a
    preallocated output tensor, so no intermediate Python list of per-row
    tensors is kept alive next to the stacked result.

    Args:
        texts: Sized iterable of sentences (e.g. a pandas Series), in the
            order the rows should appear in the output.
        glove: Embedding object passed through to sentence_to_sequence.
        max_len: Sequence length each text is padded/truncated to.

    Returns:
        torch.Tensor: shape (len(texts), max_len, glove.dim).
    """
    encoded = torch.zeros(len(texts), max_len, glove.dim)
    for row, text in enumerate(texts):
        encoded[row] = sentence_to_sequence(text, glove, max_len=max_len)
    return encoded


X_train_seq = encode_descriptions(X_train, glove, max_len=300)
X_val_seq = encode_descriptions(X_val, glove, max_len=300)
X_test_seq = encode_descriptions(X_test, glove, max_len=300)

In [34]:
# Sanity check: display the shape of the encoded training tensor.
X_train_seq.shape

torch.Size([27543, 300, 300])

In [38]:
class TextDataset(Dataset):
    """Map-style dataset pairing each encoded input `X[i]` with its label `y[i]`.

    Args:
        X: Indexable collection of inputs (e.g. a tensor whose first
            dimension is the sample dimension).
        y: Indexable collection of labels, one per input.

    Raises:
        ValueError: If `X` and `y` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Fail early: a mismatch would otherwise either silently ignore the
        # extra inputs or surface as an IndexError deep inside a DataLoader.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the `(input, label)` pair at position `idx`."""
        return self.X[idx], self.y[idx]

# Build the label tensors here so this cell does not depend on variables left
# over from a deleted cell (a fresh kernel would otherwise raise NameError).
# int64 is the dtype nn.CrossEntropyLoss expects for class-index targets.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Only the training data is shuffled; validation/test keep a fixed order.
train_loader = DataLoader(TextDataset(X_train_seq, y_train_tensor), batch_size=32, shuffle=True)
val_loader = DataLoader(TextDataset(X_val_seq, y_val_tensor), batch_size=32)
test_loader = DataLoader(TextDataset(X_test_seq, y_test_tensor), batch_size=32)


In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm 

# ✅ 1. Pick the compute device: CUDA when available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ✅ 2. Define GRU model
class GRUNet(nn.Module):
    """GRU encoder followed by a linear classifier over the GRU's final hidden state.

    Args:
        input_dim: Size of each input vector (per time step).
        hidden_dim: Size of the GRU hidden state (per direction).
        num_classes: Number of output logits.
        num_layers: Number of stacked GRU layers.
        bidirectional: If True, run the GRU in both directions; the classifier
            then sees the concatenated final states of both directions.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, num_layers=1, bidirectional=False):
        super().__init__()
        self.gru = nn.GRU(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        # A bidirectional GRU yields two hidden states per sample, so the
        # classifier input is twice as wide.
        direction_factor = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * direction_factor, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Batched input of shape (batch_size, seq_len, input_dim).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalized logits.
        """
        # h_n: (num_layers * num_directions, batch_size, hidden_dim)
        _, h_n = self.gru(x)

        if self.gru.bidirectional:
            # Last layer: h_n[-2] is the forward direction after the last time
            # step, h_n[-1] the backward direction after the first time step.
            # Indexing the output at the last time step instead would give a
            # backward state that has only processed a single step.
            final_state = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            # Equal to the GRU output at the last time step.
            final_state = h_n[-1]

        # NOTE(review): if sequences are zero-padded at the end, the forward
        # final state is computed after the padding steps; consider
        # pack_padded_sequence if true lengths become available.
        return self.fc(final_state)

# ✅ 3. Initialize model, loss, and optimizer
input_dim = 300       # GloVe dimension
hidden_dim = 128
# Size the output layer from the label encoder rather than from the labels
# present in the training split: it covers every known class and does not rely
# on a tensor that may not exist in a fresh kernel.
num_classes = len(le.classes_)

# Seed PyTorch's global RNG (same value as the split's random_state) so weight
# initialisation and DataLoader shuffling are repeatable across runs.
torch.manual_seed(13)

model = GRUNet(input_dim, hidden_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


Using device: cuda


In [40]:
def run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one full pass over `loader` and return `(mean_loss, accuracy)`.

    When `optimizer` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled.

    Args:
        model: Module mapping an input batch to class logits.
        loader: Iterable yielding `(X_batch, y_batch)` pairs.
        criterion: Loss taking `(logits, targets)` and returning a batch-mean loss.
        device: Device the batches are moved to before the forward pass.
        optimizer: Optimizer for the model parameters, or None to evaluate only.
        desc: Label shown on the tqdm progress bar.

    Returns:
        tuple[float, float]: Per-sample mean loss and accuracy over the pass.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in tqdm(loader, desc=desc):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            if is_training:
                loss.backward()
                optimizer.step()

            # The criterion returns a batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            loss_sum += loss.item() * X_batch.size(0)
            _, predicted = torch.max(outputs, dim=1)
            n_correct += (predicted == y_batch).sum().item()
            n_seen += y_batch.size(0)

    return loss_sum / n_seen, n_correct / n_seen


num_epochs = 10

for epoch in range(num_epochs):
    # ---- Training ----
    train_loss, train_acc = run_epoch(
        model, train_loader, criterion, device,
        optimizer=optimizer,
        desc=f"Epoch {epoch+1}/{num_epochs} [Train]",
    )

    # ---- Validation ----
    val_loss, val_acc = run_epoch(
        model, val_loader, criterion, device,
        desc=f"Epoch {epoch+1}/{num_epochs} [Val]",
    )

    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} "
          f"| Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")


Epoch 1/10 [Train]: 100%|██████████| 861/861 [00:27<00:00, 31.68it/s]
Epoch 1/10 [Val]: 100%|██████████| 108/108 [00:03<00:00, 29.26it/s]


Epoch [1/10] Train Loss: 1.9226 | Train Acc: 0.3711 | Val Loss: 0.9504 | Val Acc: 0.6634


Epoch 2/10 [Train]: 100%|██████████| 861/861 [00:38<00:00, 22.22it/s]
Epoch 2/10 [Val]: 100%|██████████| 108/108 [00:04<00:00, 24.66it/s]


Epoch [2/10] Train Loss: 0.8123 | Train Acc: 0.7123 | Val Loss: 0.7312 | Val Acc: 0.7383


Epoch 3/10 [Train]: 100%|██████████| 861/861 [00:22<00:00, 38.97it/s]
Epoch 3/10 [Val]: 100%|██████████| 108/108 [00:02<00:00, 37.09it/s]


Epoch [3/10] Train Loss: 0.6453 | Train Acc: 0.7724 | Val Loss: 0.6720 | Val Acc: 0.7595


Epoch 4/10 [Train]: 100%|██████████| 861/861 [00:11<00:00, 73.21it/s]
Epoch 4/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 159.89it/s]


Epoch [4/10] Train Loss: 0.5533 | Train Acc: 0.8060 | Val Loss: 0.6600 | Val Acc: 0.7685


Epoch 5/10 [Train]: 100%|██████████| 861/861 [00:12<00:00, 66.69it/s]
Epoch 5/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 151.98it/s]


Epoch [5/10] Train Loss: 0.4722 | Train Acc: 0.8349 | Val Loss: 0.6551 | Val Acc: 0.7807


Epoch 6/10 [Train]: 100%|██████████| 861/861 [00:21<00:00, 39.23it/s]
Epoch 6/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 161.88it/s]


Epoch [6/10] Train Loss: 0.4007 | Train Acc: 0.8595 | Val Loss: 0.6946 | Val Acc: 0.7778


Epoch 7/10 [Train]: 100%|██████████| 861/861 [00:09<00:00, 90.21it/s]
Epoch 7/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 151.69it/s]


Epoch [7/10] Train Loss: 0.3298 | Train Acc: 0.8858 | Val Loss: 0.7379 | Val Acc: 0.7729


Epoch 8/10 [Train]: 100%|██████████| 861/861 [00:13<00:00, 64.66it/s]
Epoch 8/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 152.88it/s]


Epoch [8/10] Train Loss: 0.2645 | Train Acc: 0.9094 | Val Loss: 0.8211 | Val Acc: 0.7630


Epoch 9/10 [Train]: 100%|██████████| 861/861 [00:09<00:00, 89.17it/s]
Epoch 9/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 162.81it/s]


Epoch [9/10] Train Loss: 0.2139 | Train Acc: 0.9304 | Val Loss: 0.8846 | Val Acc: 0.7668


Epoch 10/10 [Train]: 100%|██████████| 861/861 [00:09<00:00, 89.17it/s]
Epoch 10/10 [Val]: 100%|██████████| 108/108 [00:00<00:00, 157.08it/s]

Epoch [10/10] Train Loss: 0.1652 | Train Acc: 0.9466 | Val Loss: 0.9588 | Val Acc: 0.7671



