In [1]:
!pip install torch datasets numpy

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

In [3]:
dataset = load_dataset("imdb")
train_data = dataset["train"].shuffle(seed=42)
test_data = dataset["test"].shuffle(seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)  # Remove HTML line breaks
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

# Sample subset for demonstration (full dataset takes longer to train)
max_samples = 5000
train_texts = [preprocess_text(text) for text in train_data["text"][:max_samples]]
train_labels = train_data["label"][:max_samples]
test_texts = [preprocess_text(text) for text in test_data["text"][:max_samples//5]]
test_labels = test_data["label"][:max_samples//5]

In [5]:

word_to_index = {"<PAD>": 0, "<UNK>": 1}
index = 2
max_vocab = 10000  # Limit vocabulary size

for text in train_texts:
    for word in text.split():
        if word not in word_to_index and index < max_vocab:
            word_to_index[word] = index
            index += 1

In [6]:


max_length = 200
def text_to_sequence(text):
    return [word_to_index.get(word, 1) for word in text.split()[:max_length]] + \
           [0]*(max_length - len(text.split()))

X_train = torch.tensor([text_to_sequence(text) for text in train_texts], dtype=torch.long)
X_test = torch.tensor([text_to_sequence(text) for text in test_texts], dtype=torch.long)
y_train = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(test_labels, dtype=torch.float32).unsqueeze(1)

In [7]:


# Create DataLoaders
batch_size = 64
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [8]:
# Model with regularization
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, model_type="LSTM"):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.dropout1 = nn.Dropout(0.3)

        if model_type == "RNN":
            self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        elif model_type == "LSTM":
            self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        else:
            self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)

        self.dropout2 = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.dropout1(self.embedding(x))
        rnn_out, _ = self.rnn(embedded)
        last_out = self.dropout2(rnn_out[:, -1, :])
        return self.sigmoid(self.fc(last_out))

In [9]:
# Training configuration
def train_model(model, train_loader, test_loader, epochs=10, lr=0.001):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)  # Gradient clipping
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        with torch.no_grad():
            correct, total = 0, 0
            for inputs, labels in test_loader:
                outputs = model(inputs)
                predicted = (outputs > 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        acc = 100 * correct / total
        if acc > best_acc:
            best_acc = acc

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {total_loss/len(train_loader):.4f}")
        print(f"Val Accuracy: {acc:.2f}%")
        print("------------------------")

    return best_acc



In [11]:
# Hyperparameters
vocab_size = len(word_to_index)
embed_size = 128
hidden_size = 64
output_size = 1

# Initialize and train models
print("Training LSTM Model...")
lstm_model = SentimentModel(vocab_size, embed_size, hidden_size, output_size, "LSTM")
lstm_acc = train_model(lstm_model, train_loader, test_loader, epochs=20)
print(f"\nBest LSTM Validation Accuracy: {lstm_acc:.2f}%")


Training LSTM Model...
Epoch 1/20
Train Loss: 0.6949
Val Accuracy: 49.60%
------------------------
Epoch 2/20
Train Loss: 0.6902
Val Accuracy: 50.30%
------------------------
Epoch 3/20
Train Loss: 0.6868
Val Accuracy: 49.90%
------------------------
Epoch 4/20
Train Loss: 0.6825
Val Accuracy: 50.40%
------------------------
Epoch 5/20
Train Loss: 0.6767
Val Accuracy: 49.70%
------------------------
Epoch 6/20
Train Loss: 0.6660
Val Accuracy: 51.30%
------------------------
Epoch 7/20
Train Loss: 0.6539
Val Accuracy: 50.80%
------------------------
Epoch 8/20
Train Loss: 0.6349
Val Accuracy: 50.70%
------------------------
Epoch 9/20
Train Loss: 0.6162
Val Accuracy: 52.00%
------------------------
Epoch 10/20
Train Loss: 0.5958
Val Accuracy: 52.60%
------------------------
Epoch 11/20
Train Loss: 0.5787
Val Accuracy: 51.90%
------------------------
Epoch 12/20
Train Loss: 0.5552
Val Accuracy: 52.20%
------------------------
Epoch 13/20
Train Loss: 0.5370
Val Accuracy: 52.80%
----------

In [12]:

print("Training RNN Model...")
rnn_model = SentimentModel(vocab_size, embed_size, hidden_size, output_size, "RNN")
rnn_acc = train_model(rnn_model, train_loader, test_loader, epochs=20)
print(f"\nBest RNN Validation Accuracy: {rnn_acc:.2f}%")

Training RNN Model...
Epoch 1/20
Train Loss: 0.7050
Val Accuracy: 49.80%
------------------------
Epoch 2/20
Train Loss: 0.6932
Val Accuracy: 50.20%
------------------------
Epoch 3/20
Train Loss: 0.6889
Val Accuracy: 50.50%
------------------------
Epoch 4/20
Train Loss: 0.6859
Val Accuracy: 51.20%
------------------------
Epoch 5/20
Train Loss: 0.6828
Val Accuracy: 50.40%
------------------------
Epoch 6/20
Train Loss: 0.6710
Val Accuracy: 51.20%
------------------------
Epoch 7/20
Train Loss: 0.6667
Val Accuracy: 50.60%
------------------------
Epoch 8/20
Train Loss: 0.6578
Val Accuracy: 51.20%
------------------------
Epoch 9/20
Train Loss: 0.6522
Val Accuracy: 52.30%
------------------------
Epoch 10/20
Train Loss: 0.6438
Val Accuracy: 51.00%
------------------------
Epoch 11/20
Train Loss: 0.6897
Val Accuracy: 49.90%
------------------------
Epoch 12/20
Train Loss: 0.6731
Val Accuracy: 51.80%
------------------------
Epoch 13/20
Train Loss: 0.6664
Val Accuracy: 51.40%
-----------

In [13]:
print("Training GRU Model...")
gru_model = SentimentModel(vocab_size, embed_size, hidden_size, output_size, "GRU")
gru_acc = train_model(gru_model, train_loader, test_loader, epochs=20)
print(f"\nBest GRU Validation Accuracy: {gru_acc:.2f}%")

Training GRU Model...
Epoch 1/20
Train Loss: 0.6993
Val Accuracy: 49.60%
------------------------
Epoch 2/20
Train Loss: 0.6902
Val Accuracy: 50.70%
------------------------
Epoch 3/20
Train Loss: 0.6845
Val Accuracy: 50.90%
------------------------
Epoch 4/20
Train Loss: 0.6792
Val Accuracy: 50.80%
------------------------
Epoch 5/20
Train Loss: 0.6771
Val Accuracy: 50.70%
------------------------
Epoch 6/20
Train Loss: 0.6645
Val Accuracy: 51.30%
------------------------
Epoch 7/20
Train Loss: 0.6531
Val Accuracy: 52.20%
------------------------
Epoch 8/20
Train Loss: 0.6413
Val Accuracy: 52.10%
------------------------
Epoch 9/20
Train Loss: 0.6121
Val Accuracy: 53.50%
------------------------
Epoch 10/20
Train Loss: 0.5980
Val Accuracy: 51.40%
------------------------
Epoch 11/20
Train Loss: 0.5757
Val Accuracy: 53.70%
------------------------
Epoch 12/20
Train Loss: 0.5592
Val Accuracy: 54.10%
------------------------
Epoch 13/20
Train Loss: 0.5346
Val Accuracy: 53.10%
-----------

In [18]:
from transformers import pipeline
explainer =pipeline("text-generation", model="gpt2")
# Load the model for question answering
lstm_acc = lstm_acc  # example accuracy for LSTM
rnn_acc = rnn_acc   # example accuracy for RNN
gru_acc = gru_acc   # example accuracy for GRU

# Define the context and the question with filled-in values
context = f"""
After training on the dataset, the LSTM achieved {lstm_acc}%, RNN achieved {rnn_acc}%,
          and GRU achieved {gru_acc}% accuracy. Explain why GRU performed the best.
"""
prompt = "Which model performs best"

explanation = explainer(prompt, max_length=100)

print(explanation[0]["generated_text"])

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which model performs best when you only have to run one test that contains some data, and not all models, but if at all. When you run 10 test models in the same test suite, it is very efficient to run the 10 multiple tests that contain some data. This can be true for the other 5 tests as well.

The good news is that with the right type checks, you don't need to change anything. This will help you to run your tests as well, thus making
