In [1]:
# define import
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# Data Loading Pre-processing

In [2]:
import pickle
import random

with open("text_classification_train_words", "rb") as f:
    train = pickle.load(f)

with open("text_classification_test_words", "rb") as f:
    test = pickle.load(f)

print(f"{len(train)} samples in train")
print(f"{train[random.randint(0, len(train) - 1)]}")
print(f"{len(test)} samples in test")
print(f"{test[random.randint(0, len(test) - 1)]}")
print(len(train[0]['text']))

25000 samples in train
{'id': '9235', 'rating': 4, 'label': 0, 'text': 'This woman never stops talking throughout the movie. She memorized every line, and delivered all without a bit of natural emotion. She also has a most uncharming lisp, and the pitch of her voice sounds like nails on a blackboard. This film has WAY too much Betsy Drake, and not enough Cary Grant, who carried what little was left of the film entirely on his own.', 'words': ['woman', 'never', 'stops', 'talking', 'throughout', 'movie', 'memorized', 'every', 'line', 'delivered', 'without', 'bit', 'natural', 'emotion', 'also', 'uncharming', 'lisp', 'pitch', 'voice', 'sounds', 'like', 'nails', 'blackboard', 'film', 'way', 'much', 'betsy', 'drake', 'enough', 'cary', 'grant', 'carried', 'little', 'left', 'film', 'entirely']}
25000 samples in test
{'id': '12250', 'rating': 10, 'label': 1, 'text': "Not only do I think this was the best film of 1987, it's probably in my own amorphous list as one of the 10-20 best films I've ev

In [3]:
# get vocab size
vocab = set()
i = 0
for sample in train + test:
    for word in sample['words']:
        vocab.add(word)

vocab_size = len(vocab)
print(f"Found a vocab size of {vocab_size}")

Found a vocab size of 133264


## Prepare to be embeddings

In [4]:
torch.manual_seed(1)

word_to_ix = {}

for i, word in enumerate(vocab):
    word_to_ix[word] = i

##  Split Training Data

In [5]:
def create_dataset(dataset):
    X, Y = [], []
    for data in dataset:
        embeddings = torch.tensor([torch.tensor([word_to_ix[word]], dtype=torch.long) for word in data['words']])
        X.append(embeddings)
        Y.append(data['label'])

    return pad_sequence(X, batch_first=True), torch.tensor(Y)


X_train, Y_train = create_dataset(train)
X_test, Y_test = create_dataset(test)

# Prepare For Training

In [6]:
# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device: {device}")

using device: cuda


In [7]:
# define our model class
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm1_hidden_size, lstm2_hidden_size, dense_hidden_size, output_size, dropout_p=0.5):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout_embed = nn.Dropout(dropout_p)

        self.lstm1 = nn.LSTM(embedding_dim,
                             lstm1_hidden_size,
                             num_layers=1,
                             batch_first=True,
                             bidirectional=True)
        self.dropout_lstm1 = nn.Dropout(dropout_p)

        self.lstm2 = nn.LSTM(lstm1_hidden_size * 2,
                             lstm2_hidden_size,
                             num_layers=1,
                             batch_first=True,
                             bidirectional=True)
        self.dropout_lstm2 = nn.Dropout(dropout_p)

        self.fc1 = nn.Linear(lstm2_hidden_size * 2, dense_hidden_size)
        self.relu = nn.ReLU()
        self.dropout_fc1 = nn.Dropout(dropout_p)

        # Final output layer
        self.fc2 = nn.Linear(dense_hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout_embed(x) # Shape: (batch_size, seq_len, embedding_dim)

        lstm1_out, _ = self.lstm1(x)
        lstm1_out = self.dropout_lstm1(lstm1_out)

        _, (hn_lstm2, cn_lstm2) = self.lstm2(lstm1_out)

        hidden_combined = torch.cat((hn_lstm2[-2,:,:], hn_lstm2[-1,:,:]), dim=1)

        out_fc1 = self.fc1(hidden_combined)
        out_relu = self.relu(out_fc1)
        out_dropout_fc1 = self.dropout_fc1(out_relu)

        # Final output layer
        out = self.fc2(out_dropout_fc1) # Shape: (batch_size, output_size)
        return out

In [9]:
embedding_dim = 256
lstm1_hidden_size = 128 
lstm2_hidden_size = 64
dense_hidden_size = 64
dropout_rate = 0.2

output_size = 2 # binary classification (remains the same)
learning_rate = 0.001 # Keep as is, or tune
epochs = 20  # Keep as is, or tune

# Instantiate the model with new parameters
model = RNN(vocab_size,
            embedding_dim,
            lstm1_hidden_size,
            lstm2_hidden_size,
            dense_hidden_size,
            output_size,
            dropout_p=dropout_rate).to(device)

# 1. Create your initial TensorDataset
full_train_data = torch.utils.data.TensorDataset(X_train, Y_train)

# 2. Define the sizes for your training and validation sets
total_size = len(full_train_data)
train_size = int(0.9 * total_size)  # 90% for training
val_size = total_size - train_size   # Remaining 10% for validation

# 3. Split the dataset
train_subset, val_subset = torch.utils.data.random_split(full_train_data, [train_size, val_size])

# 4. Create DataLoaders for your training and validation sets
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_subset, batch_size=64, shuffle=False) # No need to shuffle validation data

print(f"Original dataset size: {total_size}")
print(f"Training subset size: {len(train_subset)}")
print(f"Validation subset size: {len(val_subset)}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_params}")

for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss_epoch = 0
    num_batches = len(train_loader)
    for i, (batch_X, batch_y) in enumerate(train_loader):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss_epoch += loss.item()
        print(f"Batch [{i}/{num_batches}]\r", end="")
    avg_train_loss = train_loss_epoch / len(train_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Training Loss: {avg_train_loss:.4f}\r')

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X_val, batch_y_val in val_loader:
            batch_X_val, batch_y_val = batch_X_val.to(device), batch_y_val.to(device) # Uncomment if using GPU
            outputs_val = model(batch_X_val)
            loss_val = criterion(outputs_val, batch_y_val)
            val_loss += loss_val.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Validation Loss: {avg_val_loss:.4f}')

print("Training complete.")

Original dataset size: 25000
Training subset size: 22500
Validation subset size: 2500
Number of trainable parameters: 34684098
Epoch [1/20], Training Loss: 0.6199
Epoch [1/20], Validation Loss: 0.6827
Epoch [2/20], Training Loss: 0.4863
Epoch [2/20], Validation Loss: 0.4151
Epoch [3/20], Training Loss: 0.3312
Epoch [3/20], Validation Loss: 0.3368
Epoch [4/20], Training Loss: 0.2483
Epoch [4/20], Validation Loss: 0.3790
Epoch [5/20], Training Loss: 0.1846
Epoch [5/20], Validation Loss: 0.3731
Epoch [6/20], Training Loss: 0.1324
Epoch [6/20], Validation Loss: 0.3778
Epoch [7/20], Training Loss: 0.0931
Epoch [7/20], Validation Loss: 0.3789
Epoch [8/20], Training Loss: 0.0680
Epoch [8/20], Validation Loss: 0.4354
Epoch [9/20], Training Loss: 0.0463
Epoch [9/20], Validation Loss: 0.5011
Epoch [10/20], Training Loss: 0.0354
Epoch [10/20], Validation Loss: 0.5169
Epoch [11/20], Training Loss: 0.0317
Epoch [11/20], Validation Loss: 0.5119
Epoch [12/20], Training Loss: 0.0291
Epoch [12/20], Val

In [10]:
# Calculate accuracy on the test set
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    correct = 0
    total = 0
    # Create a DataLoader for the test set
    test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test, Y_test), batch_size=64, shuffle=False)
    for batch_X_test, batch_y_test in test_loader:
        batch_X_test, batch_y_test = batch_X_test.to(device), batch_y_test.to(device)
        outputs_test = model(batch_X_test)
        _, predicted = torch.max(outputs_test.data, 1)
        total += batch_y_test.size(0)
        correct += (predicted == batch_y_test).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy of the model on the test data: {accuracy:.2f}%')

Accuracy of the model on the test data: 85.27%
