In [None]:
# !pip install torchtext
# !pip install datasets

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset

In [2]:
# Report the installed PyTorch build, then choose the compute device once so
# later cells can move the model and batches with `.to(device)`.
print('Using PyTorch version:', torch.__version__)
gpu_available = torch.cuda.is_available()
device = torch.device('cuda' if gpu_available else 'cpu')
if gpu_available:
    print('Using GPU, device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU found, using CPU instead.')

Using PyTorch version: 2.8.0+cu126
Using GPU, device name: NVIDIA L4


Implementation of Transformer Block as a Layer

In [3]:
class TransformerBlock(nn.Module):
    """Single post-norm Transformer encoder block.

    Multi-head self-attention followed by a position-wise feed-forward
    network. Each sub-layer output goes through dropout, is added back to the
    sub-layer input (residual connection) and is then layer-normalised.

    Args:
        embed_size: Size of the feature (last) dimension of the input.
        num_heads: Number of attention heads; must divide ``embed_size``.
        dropout: Dropout probability applied to each sub-layer output.
        forward_expansion: Width of the feed-forward hidden layer, expressed
            as a multiple of ``embed_size``.
    """

    def __init__(self, embed_size, num_heads, dropout, forward_expansion):
        super().__init__()
        # batch_first=True makes the attention layer read its input as
        # (batch, seq, embed). With the default (False) the first axis is
        # treated as the sequence, so tokens would attend across examples.
        self.attention = nn.MultiheadAttention(embed_dim=embed_size,
                                               num_heads=num_heads,
                                               batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.layernorm1 = nn.LayerNorm(embed_size)
        self.layernorm2 = nn.LayerNorm(embed_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq, embed_size).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention: query, key and value are all the input sequence.
        attention_output, _ = self.attention(x, x, x)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(x + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

# transformer = TransformerBlock(embed_size=512, num_heads=8, dropout=0.1, forward_expansion=4)

Implement Embedding Layer

In [4]:
class TokenAndPositionEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of positions in the position-embedding table; input
            sequences must not be longer than this.
        vocab_size: Number of rows in the token-embedding table.
        embed_size: Dimension of each embedding vector.
    """

    def __init__(self, max_len, vocab_size, embed_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)

    def forward(self, x):
        """Embed integer token ids ``x`` of shape (batch, seq).

        Returns:
            Tensor of shape (batch, seq, embed_size).
        """
        seq_len = x.size(1)
        # Build the position indices on the same device as the input;
        # a CPU index tensor cannot be looked up in an embedding table
        # that lives on the GPU.
        positions = torch.arange(seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
        # positions is (1, seq), so its embedding broadcasts over the batch.
        return self.token_embedding(x) + self.position_embedding(positions)

# embedding = TokenAndPositionEmbedding(max_len=100, vocab_size=10000, embed_size=512)

Load Dataset

In [5]:
from tokenizers import Tokenizer
from tokenizers import models, trainers, pre_tokenizers, normalizers

# Load the 'train' and 'test' splits of the IMDB dataset in one statement.
train_dataset, test_dataset = (
    load_dataset('imdb', split=split_name) for split_name in ('train', 'test')
)

# Show the text of the first training example as the cell output.
train_dataset[0]['text']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [6]:
vocab_size = 20000 # size of the tokenizer vocabulary, special tokens included
max_len = 200 # we only consider the first 200 tokens of each movie review

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer


# Word-level tokenizer: text is NFD-normalised, lower-cased and stripped of
# accents, then split by the Whitespace pre-tokenizer. Words missing from the
# vocabulary map to '[UNK]'.
tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(),
                                             normalizers.Lowercase(),
                                             normalizers.StripAccents()])

# Special tokens receive ids in the order listed, starting at 0. '[PAD]' is
# listed first so that it owns id 0, which is the id Encoding.pad() writes by
# default. With '[UNK]' alone, padding and unknown words shared one id and
# therefore one embedding vector.
trainer = trainers.WordLevelTrainer(vocab_size=vocab_size,
                                    min_frequency=1,
                                    special_tokens=['[PAD]', '[UNK]'])

tokenizer.train_from_iterator(train_dataset['text'], trainer)

# tokenizer.encode(train_dataset[0]['text']).tokens

In [7]:
def transform_text(text):
    """Encode ``text`` with the notebook's tokenizer into ``max_len`` token ids.

    Longer encodings are cut to ``max_len`` and shorter ones are padded up to
    ``max_len``, so the returned list of ids has a fixed length.
    """
    encoding = tokenizer.encode(text)
    # Both calls modify `encoding` itself; their return values are not used.
    encoding.truncate(max_len)
    encoding.pad(max_len)
    return encoding.ids

def apply_transform(x):
    """Turn one raw example into model-ready fields.

    Reads ``x['text']`` and ``x['label']`` and returns a dict with the
    fixed-length token ids under 'input_ids' and the label converted to float
    under 'label_id'.
    """
    token_ids = transform_text(x['text'])
    label = float(x['label'])
    return {'input_ids': token_ids, 'label_id': label}

# Replace the raw 'text'/'label' columns with the tokenised fields produced by
# apply_transform, and have the datasets hand back torch tensors.
raw_columns = ['text', 'label']
train_dataset = (
    train_dataset
    .map(apply_transform, remove_columns=raw_columns)
    .with_format('torch')
)
test_dataset = (
    test_dataset
    .map(apply_transform, remove_columns=raw_columns)
    .with_format('torch')
)

print(len(train_dataset), "Training samples")
print(len(test_dataset), "Testing samples")

25000 Training samples
25000 Testing samples


In [8]:
batch_size = 32

# Options common to both loaders. drop_last=True discards a final batch that
# has fewer than batch_size examples.
loader_options = dict(batch_size=batch_size, drop_last=True)

# Only the training data is shuffled; the test order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_options)

Create classifier model using transformer layer

In [9]:
emb_dim = 32 # Embedding size for each token
num_heads = 2 # Number of attention heads
# ff_dim = 32 # Feedforward dimension
num_transformer_blocks = 1 # Number of transformer blocks

class TransformerModel(nn.Module):
    """Two-class sequence classifier built on Transformer encoder blocks.

    Pipeline: token + position embedding -> ``num_transformer_blocks``
    Transformer blocks -> mean over the sequence axis -> dropout ->
    Linear(emb_dim, 20) + ReLU -> dropout -> Linear(20, 2).

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so passing it probabilities would normalise
    twice. Use ``self.softmax(logits)`` when probabilities are needed;
    ``argmax`` gives the same prediction on either.
    """

    def __init__(self):
        super().__init__()
        self.embedding = TokenAndPositionEmbedding(max_len, vocab_size, emb_dim)
        # A ModuleList registers every block's parameters and can be iterated
        # in forward(); a bare TransformerBlock is not iterable.
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(emb_dim, num_heads, 0.1, 4) for _ in range(num_transformer_blocks)]
        )
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(emb_dim, 20)
        self.fc2 = nn.Linear(20, 2)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities.
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq).

        Returns:
            Logits of shape (batch, 2).
        """
        x = self.embedding(x)  # (batch, seq, emb_dim)
        for transformer in self.transformer_blocks:
            x = transformer(x)
        # AdaptiveAvgPool1d averages over the LAST axis, so move the sequence
        # axis there: (batch, emb_dim, seq) -> (batch, emb_dim, 1) -> (batch, emb_dim).
        x = self.global_pool(x.transpose(1, 2)).squeeze(2)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc2(x)

model = TransformerModel().to(device)
# model

In [None]:
from tqdm import tqdm

def train(model, train_loader, criterion, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module mapping a batch of token ids to per-class scores of
            shape (batch, num_classes).
        train_loader: Iterable of dict batches holding 'input_ids' and
            'label_id' tensors.
        criterion: Loss callable taking (scores, integer class targets).
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        Tuple ``(train_loss, train_acc)`` of Python floats: the mean loss per
        batch and the fraction of correctly classified examples.
    """
    model.train()
    num_batches = 0
    num_items = 0

    total_loss = 0.0
    total_correct = 0
    for batch in tqdm(train_loader):
        # `device` is the notebook-level device selected at the top of the file.
        input_ids = batch['input_ids'].to(device)
        # Labels arrive as floats; cast once to integer class indices so the
        # loss and the accuracy comparison use the same tensor.
        labels = batch['label_id'].to(device).long()

        # Forward pass
        optimizer.zero_grad()
        output = model(input_ids)

        # Calculate the loss and backpropagate
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Calculate the accuracy
        predicted = torch.argmax(output, dim=1)
        total_correct += (predicted == labels).sum().item()
        num_items += labels.size(0)

    # total_loss accumulates loss.item() values, so these are plain floats.
    train_loss = total_loss / num_batches
    train_acc = total_correct / num_items
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    return train_loss, train_acc

# Loss and optimizer for the classifier.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch history of the values returned by train().
epochs = 5
losses, accuracies = [], []

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_loss, train_acc = train(model, train_loader, criterion, optimizer)
    losses += [train_loss]
    accuracies += [train_acc]