In [1]:
# define import
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# Data Loading Pre-processing

In [2]:
import pickle
import random

with open("text_classification_train_words", "rb") as f:
    train = pickle.load(f)

with open("text_classification_test_words", "rb") as f:
    test = pickle.load(f)

print(f"{len(train)} samples in train")
print(f"{train[random.randint(0, len(train) - 1)]}")
print(f"{len(test)} samples in test")
print(f"{test[random.randint(0, len(test) - 1)]}")

25000 samples in train
{'id': '4303', 'rating': 1, 'label': 0, 'text': "Previous comment made me write this. It says that Muslims are blonde and Serbs are dark (because our blood is mixed). This comment just says that this opinion can be made by racist.Look,race is nothing.I'm color blind.I look like Pierce Brosnan but I'm no Irish. So what?I might add that I am not 100% Serb,that I have some Austrian and Croat blood within me but whats the point.I'm dark, half-breed?Is that so? Anyone using racial prejudices with such bad intent like Lantos(producer9and director is racist for me.Karadzhic, Izetbegovich, Milosevic, Tudjman they are all monsters and I blame them for destroying my life, my family, my country, Yuggoslavia. Hope they will be all in hell but that wont return our dead relatives back. I am proud of being Serb and I am proud of my cousins, Austrians,Croats,Muslims, Hungarians, Arabs (yes I am from Serbia and I have multiethnical family).This movie doesn't show sufferings of Se

In [3]:
# get vocab size
vocab = set()
i = 0
for sample in train + test:
    for word in sample['words']:
        vocab.add(word)

vocab_size = len(vocab)
print(f"Found a vocab size of {vocab_size}")

Found a vocab size of 133264


## Prepare to be embeddings

In [4]:
torch.manual_seed(1)

word_to_ix = {}

for i, word in enumerate(vocab):
    word_to_ix[word] = i

##  Split Training Data

In [5]:
def create_dataset(dataset):
    X, Y = [], []
    for data in dataset:
        embeddings = torch.tensor([torch.tensor([word_to_ix[word]], dtype=torch.long) for word in data['words']])
        X.append(embeddings)
        Y.append(data['label'])

    return pad_sequence(X, batch_first=True), torch.tensor(Y)


X_train, Y_train = create_dataset(train)
X_test, Y_test = create_dataset(test)

# Prepare For Training

In [6]:
# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device: {device}")

using device: cpu


In [7]:
# define our model class
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, dropout_p=0.2):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout_embed = nn.Dropout(dropout_p)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.dropout_rnn_out = nn.Dropout(dropout_p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout_embed(x)
        out, _ = self.rnn(x)
        last_out = out[:, -1, :]
        last_out = self.dropout_rnn_out(last_out)
        out = self.fc(last_out)  # Get output from the last time step
        return out

In [11]:
embedding_dim = 64  # tunable
hidden_size = 128  # tunable
output_size = 2 # binary classification
learning_rate = 0.001
sequence_length = 100  # Length of input sequences
epochs = 2  # Number of epochs

# Instantiate the model
model = RNN(vocab_size, embedding_dim, hidden_size, output_size).to(device)
train_data = torch.utils.data.TensorDataset(X_train, Y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)

val_data = torch.utils.data.TensorDataset(X_test, Y_test)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=64, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss_epoch = 0
    num_batches = len(train_loader)
    for i, (batch_X, batch_y) in enumerate(train_loader):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss_epoch += loss.item()
        print(f"Batch [{i}/{num_batches}]\r", end="")
    avg_train_loss = train_loss_epoch / len(train_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Training Loss: {avg_train_loss:.4f}\r')

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X_val, batch_y_val in val_loader:
            batch_X_val, batch_y_val = batch_X_val.to(device), batch_y_val.to(device) # Uncomment if using GPU
            outputs_val = model(batch_X_val)
            loss_val = criterion(outputs_val, batch_y_val)
            val_loss += loss_val.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Validation Loss: {avg_val_loss:.4f}')

print("Training complete.")

Epoch [1/2], Training Loss: 0.6958
Epoch [1/2], Validation Loss: 0.6957
Epoch [2/2], Training Loss: 0.6949
Epoch [2/2], Validation Loss: 0.6933
Training complete.
