In [1]:
# define import
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# Data Loading and Pre-processing

In [12]:
import pickle
import random

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load these files when they come from a trusted source.
with open("text_classification_train_words", "rb") as f:
    train = pickle.load(f)

with open("text_classification_test_words", "rb") as f:
    test = pickle.load(f)

# Sanity check: size of each split plus one randomly chosen sample from it.
print(f"{len(train)} samples in train")
print(f"{random.choice(train)}")
print(f"{len(test)} samples in test")
print(f"{random.choice(test)}")
# Character length of the raw text of the first training sample.
print(len(train[0]['text']))

25000 samples in train
{'id': '2194', 'rating': 10, 'label': 1, 'text': "One of the best records of Israel's response to the murder of Rabin.Extremely true and natural, it captured the spirit of the nation.Especially important was the response of young people to the trauma of Israel's loss and the feeling that we shall overcome.", 'words': ['one', 'best', 'records', 'israel', 'response', 'murder', 'rabinextremely', 'true', 'natural', 'captured', 'spirit', 'nationespecially', 'important', 'response', 'young', 'people', 'trauma', 'israel', 'loss', 'feeling', 'shall', 'overcome']}
25000 samples in test
{'id': '804', 'rating': 1, 'label': 0, 'text': "Where do I begin? The story was so bad, it must have been written in a high school film club! The acting was so wooden I felt sorry for the actors! One actor even reminded me of what a deer must look like when staring into a car's headlights! Another actor has this constant look of being constipated! But it was the dialog that takes the cake! 

In [3]:
# get vocab size
# Collect every distinct token from both splits. The test split is included
# because create_dataset looks each test token up in word_to_ix as well.
vocab = set()
for split in (train, test):
    for sample in split:
        vocab.update(sample['words'])

vocab_size = len(vocab)
print(f"Found a vocab size of {vocab_size}")

Found a vocab size of 133264


## Prepare Word Indices for the Embedding Layer

In [4]:
torch.manual_seed(1)

# Map every vocabulary word to a unique integer id in [0, len(vocab)).
# The vocabulary is sorted first so the mapping is deterministic: the iteration
# order of a set of strings can change between interpreter runs (string hash
# randomization), which would silently renumber the words on every kernel
# restart and break any saved embedding weights.
# NOTE(review): id 0 belongs to a real word, while pad_sequence pads with 0 by
# default, so padding positions are indistinguishable from that word.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

## Build Padded Training and Test Tensors

In [5]:
def create_dataset(dataset):
    """Convert tokenised samples into a padded index tensor and a label tensor.

    Args:
        dataset: iterable of dicts, each providing a 'words' list whose tokens
            are keys of the module-level ``word_to_ix`` and a 'label' value.

    Returns:
        A tuple ``(X, Y)``. ``X`` is a ``torch.long`` tensor of shape
        ``(num_samples, longest_sequence)`` in which shorter sequences are
        right-padded with 0. ``Y`` is a tensor with one label per sample.

    Raises:
        KeyError: if a token is not present in ``word_to_ix``.
    """
    X, Y = [], []
    for data in dataset:
        # Build one index tensor per sample directly from Python ints. The
        # explicit dtype keeps samples with an empty word list as long tensors
        # (torch.tensor([]) would otherwise default to float32).
        indices = torch.tensor([word_to_ix[word] for word in data['words']], dtype=torch.long)
        X.append(indices)
        Y.append(data['label'])

    # batch_first=True -> (num_samples, seq_len); padding value defaults to 0.
    return pad_sequence(X, batch_first=True), torch.tensor(Y)


# Index and pad each split independently; each X tensor is padded to the
# longest sequence of its own split, so the two may differ in sequence length.
X_train, Y_train = create_dataset(train)
X_test, Y_test = create_dataset(test)

# Prepare For Training

In [6]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"using device: {device}")

using device: cpu


In [14]:
# define our model class
class RNN(nn.Module):
    """Text classifier: embedding -> two stacked bidirectional LSTMs -> dense head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word vector.
        lstm1_hidden_size: per-direction hidden size of the first BiLSTM.
        lstm2_hidden_size: per-direction hidden size of the second BiLSTM.
        dense_hidden_size: width of the ReLU dense layer before the output.
        output_size: number of output logits (one per class).
        dropout_p: dropout probability shared by all dropout layers.
        padding_idx: optional index forwarded to ``nn.Embedding``; its vector is
            initialised to zeros and receives no gradient. ``None`` (default)
            keeps every index trainable.
    """

    def __init__(self, vocab_size, embedding_dim, lstm1_hidden_size, lstm2_hidden_size,
                 dense_hidden_size, output_size, dropout_p=0.5, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.dropout_embed = nn.Dropout(dropout_p)

        # First bidirectional LSTM: consumes the embedded tokens.
        self.lstm1 = nn.LSTM(embedding_dim,
                             lstm1_hidden_size,
                             num_layers=1,
                             batch_first=True,
                             bidirectional=True)
        self.dropout_lstm1 = nn.Dropout(dropout_p)

        # Second bidirectional LSTM: its input width is lstm1_hidden_size * 2
        # because the first layer concatenates both directions.
        self.lstm2 = nn.LSTM(lstm1_hidden_size * 2,
                             lstm2_hidden_size,
                             num_layers=1,
                             batch_first=True,
                             bidirectional=True)
        # Not used by forward(); kept so code that references this attribute
        # keeps working. nn.Dropout holds no parameters.
        self.dropout_lstm2 = nn.Dropout(dropout_p)

        # Dense head: both final hidden states of lstm2 are concatenated, hence
        # an input width of lstm2_hidden_size * 2.
        self.fc1 = nn.Linear(lstm2_hidden_size * 2, dense_hidden_size)
        self.relu = nn.ReLU()
        self.dropout_fc1 = nn.Dropout(dropout_p)

        # Final output layer producing one logit per class.
        self.fc2 = nn.Linear(dense_hidden_size, output_size)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: integer tensor of shape (batch_size, seq_len) with token indices.

        Returns:
            Tensor of shape (batch_size, output_size) holding unnormalised logits.
        """
        x = self.embedding(x)
        x = self.dropout_embed(x)  # (batch_size, seq_len, embedding_dim)

        # lstm1_out: (batch_size, seq_len, lstm1_hidden_size * 2); the returned
        # (h_n, c_n) pair is not needed here.
        lstm1_out, _ = self.lstm1(x)
        lstm1_out = self.dropout_lstm1(lstm1_out)

        # Only the final hidden states of lstm2 are used.
        # hn_lstm2: (num_layers * num_directions, batch_size, lstm2_hidden_size),
        # i.e. (2, batch_size, lstm2_hidden_size) for this configuration.
        # NOTE(review): sequences are not packed, so the forward direction's
        # final state is read after any trailing padding positions.
        _, (hn_lstm2, cn_lstm2) = self.lstm2(lstm1_out)
        # No dropout is applied to the lstm2 states; dropout_lstm2 is unused.

        # hn_lstm2[-2] is the final state of the forward direction and
        # hn_lstm2[-1] the final state of the backward direction.
        # Result: (batch_size, lstm2_hidden_size * 2)
        hidden_combined = torch.cat((hn_lstm2[-2, :, :], hn_lstm2[-1, :, :]), dim=1)

        # Dense layer with ReLU, followed by dropout.
        out_fc1 = self.fc1(hidden_combined)
        out_relu = self.relu(out_fc1)
        out_dropout_fc1 = self.dropout_fc1(out_relu)

        # Final output layer: (batch_size, output_size)
        out = self.fc2(out_dropout_fc1)
        return out

In [15]:
# --- Hyperparameters (all tunable) ---
embedding_dim = 256        # size of each word vector
lstm1_hidden_size = 128    # per-direction hidden size of the first BiLSTM
lstm2_hidden_size = 64     # per-direction hidden size of the second BiLSTM
dense_hidden_size = 64     # width of the dense ReLU layer
dropout_rate = 0.5         # probability used by every dropout layer

output_size = 2            # one logit per class (binary classification)
learning_rate = 0.001
epochs = 20

# --- Model ---
model = RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    lstm1_hidden_size=lstm1_hidden_size,
    lstm2_hidden_size=lstm2_hidden_size,
    dense_hidden_size=dense_hidden_size,
    output_size=output_size,
    dropout_p=dropout_rate,
).to(device)

# --- Data loaders ---
# Training batches are reshuffled every epoch. The evaluation loader keeps a
# fixed order; it is built from the test split.
train_data = torch.utils.data.TensorDataset(X_train, Y_train)
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=64,  # lower this if memory becomes an issue
    shuffle=True,
)

val_data = torch.utils.data.TensorDataset(X_test, Y_test)
val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=64,
    shuffle=False,
)

# --- Loss and optimiser ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Report how many scalar weights will be updated during training.
num_params = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
print(f"Number of trainable parameters: {num_params}")

# Train for `epochs` passes over train_loader and report the mean training and
# validation loss after every epoch.
num_batches = len(train_loader)
for epoch in range(epochs):
    # Training phase: dropout is active and weights are updated per batch.
    model.train()
    train_loss_epoch = 0
    # 1-based batch counter so the progress line ends at [N/N].
    for i, (batch_X, batch_y) in enumerate(train_loader, start=1):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        train_loss_epoch += loss.item()
        # "\r" with end="" redraws the progress on the same output line.
        print(f"Batch [{i}/{num_batches}]\r", end="")
    avg_train_loss = train_loss_epoch / num_batches
    print(f'Epoch [{epoch + 1}/{epochs}], Training Loss: {avg_train_loss:.4f}')

    # Validation phase: dropout disabled, no gradients tracked.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X_val, batch_y_val in val_loader:
            batch_X_val, batch_y_val = batch_X_val.to(device), batch_y_val.to(device)
            outputs_val = model(batch_X_val)
            loss_val = criterion(outputs_val, batch_y_val)
            val_loss += loss_val.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Validation Loss: {avg_val_loss:.4f}')

print("Training complete.")

Number of trainable parameters: 34684098
Batch [24/391]

KeyboardInterrupt: 