In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

In [7]:
# Location of the processed CSV, relative to the notebook's working directory.
file_path = '../data/processed/full_2k.csv'

# Read the whole CSV into a DataFrame; later cells use its
# 'Category' and 'Description' columns.
full_2k = pd.read_csv(filepath_or_buffer=file_path)

In [12]:
# Map each distinct 'Category' string to an integer class id.
le = LabelEncoder()
le.fit(full_2k['Category'])
full_2k['label'] = le.transform(full_2k['Category'])

# One output unit per distinct category seen by the encoder.
num_classes = len(le.classes_)

# Quick sanity check of the category -> label mapping.
print(full_2k[['Category', 'label']].head())

    Category  label
0  Biography      0
1   Religion      8
2  Biography      0
3    General      3
4    History      4


In [15]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    full_2k['Description'],
    full_2k['label'],
    random_state=13,
    test_size=0.2,
)

In [22]:
def clean_text(text):
    """Lowercase ``text`` and drop every character that is not a-z, 0-9 or whitespace.

    Parameters
    ----------
    text : str or missing value
        Raw description. ``None`` and float NaN (e.g. an empty CSV cell as
        loaded by pandas) are treated as an empty description instead of
        raising ``AttributeError``. Any other non-string value is converted
        with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = text.lower()
    # Disallowed characters are deleted, not replaced by a space, so
    # "well-known" becomes "wellknown".
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Normalise the train/test descriptions element by element.
X_train = X_train.map(clean_text)
X_test = X_test.map(clean_text)

# Keep a cleaned copy of every description on the full frame as well;
# the embedding step reads this column.
full_2k['cleaned_desc'] = full_2k['Description'].map(clean_text)

In [24]:
# Pretrained GloVe vectors from the "6B" set, 50 dimensions per word.
glove = GloVe(dim=50, name='6B')

def sentence_to_vec(sentence, glove):
    """Embed a sentence as the mean of its known word vectors.

    Parameters
    ----------
    sentence : str
        Text to embed; it is split on whitespace.
    glove : object
        Embedding lookup exposing ``stoi`` (membership test for known words),
        ``dim`` (vector size) and ``glove[word]`` (vector for a word).

    Returns
    -------
    torch.Tensor
        The element-wise mean of the vectors of all known words, or a zero
        vector of length ``glove.dim`` when no word is known.
    """
    known_vectors = []
    for token in sentence.split():
        # Words outside the vocabulary contribute nothing to the average.
        if token in glove.stoi:
            known_vectors.append(glove[token])

    if not known_vectors:
        return torch.zeros(glove.dim)

    stacked = torch.stack(known_vectors)
    return torch.mean(stacked, dim=0)

# One averaged embedding per cleaned description, stacked row-wise in the
# same order as the rows of full_2k.
X_vectors = torch.stack(
    [sentence_to_vec(desc, glove) for desc in full_2k['cleaned_desc']]
)

In [25]:
class TextDataset(Dataset):
    """Pairs a collection of feature rows with a collection of labels.

    ``X`` and ``y`` are stored as given; item ``i`` of the dataset is the
    tuple ``(X[i], y[i])``.
    """

    def __init__(self, X, y):
        """Keep references to the features ``X`` and the labels ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of labels, which defines the dataset size."""
        labels = self.y
        return len(labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

In [26]:
# Convert labels to tensors
y_tensor = torch.tensor(full_2k['label'].values, dtype=torch.long)

# Split X_vectors into train/test using the SAME rows that train_test_split
# put into X_train / X_test. A positional slice (first 80% / last 20%) would
# ignore that shuffled split, so the vectors would not correspond to
# X_train / X_test / y_train / y_test.
# X_vectors is built row by row from full_2k, so a row's position in full_2k
# is also its position in X_vectors and y_tensor.
train_pos = full_2k.index.get_indexer(X_train.index)
test_pos = full_2k.index.get_indexer(X_test.index)
# get_indexer returns -1 for labels it cannot find; fail loudly in that case.
assert (train_pos >= 0).all() and (test_pos >= 0).all(), \
    "train/test split index not found in full_2k"

train_idx = torch.as_tensor(train_pos, dtype=torch.long)
test_idx = torch.as_tensor(test_pos, dtype=torch.long)

N_TRAIN = len(train_idx)
X_train_vectors = X_vectors[train_idx]
X_test_vectors = X_vectors[test_idx]
y_train_tensor = y_tensor[train_idx]
y_test_tensor = y_tensor[test_idx]

train_dataset = TextDataset(X_train_vectors, y_train_tensor)
test_dataset = TextDataset(X_test_vectors, y_test_tensor)

# Shuffle only the training batches; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [27]:
class SimpleNN(nn.Module):
    """Feed-forward classifier with one hidden layer.

    Layout: Linear(input_dim -> 128) -> ReLU -> Linear(128 -> num_classes).
    The output is the raw result of the last linear layer; no softmax is
    applied here.
    """

    def __init__(self, input_dim, num_classes):
        """Create the two linear layers and the activation between them."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Run ``x`` through fc1, ReLU and fc2 and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


In [31]:
# Take the input width from the feature tensor instead of repeating the
# embedding size as a literal, so changing the GloVe dimension cannot
# leave the first layer with a mismatched shape.
model = SimpleNN(input_dim=X_train_vectors.shape[1], num_classes=num_classes)

# CrossEntropyLoss takes the model's raw outputs together with integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


ModuleNotFoundError: No module named 'optree._C'

2.8.0+cu129
12.9
