In [1]:
import sys
!{sys.executable} -m pip uninstall -y torch torchvision torchaudio torchtext
!{sys.executable} -m pip cache purge
!{sys.executable} -m pip install -U pip
!{sys.executable} -m pip install torch torchvision torchaudio torchtext


Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: torchvision 0.20.1
Uninstalling torchvision-0.20.1:
  Successfully uninstalled torchvision-0.20.1
Found existing installation: torchaudio 2.5.1
Uninstalling torchaudio-2.5.1:
  Successfully uninstalled torchaudio-2.5.1
Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0
Files removed: 1356 (2302.6 MB)
Collecting torch
  Downloading torch-2.10.0-cp311-none-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting torchvision
  Downloading torchvision-0.25.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting torchaudio
  Downloading torchaudio-2.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.9 kB)
Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (7.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl

In [2]:
import sys
print(sys.executable)


/opt/homebrew/opt/python@3.11/bin/python3.11


Installing libraries

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


  from .autonotebook import tqdm as notebook_tqdm


device(type='cpu')

Dataset Loading & Preprocessing (IMDB)
- Tokenization
- Vocabulary Construction
- Encoding & Padding
- DataLoaders


In [None]:
dataset = load_dataset("imdb")

def basic_tokenize(text):
    return text.lower().split()

from collections import Counter

max_vocab = 20000
counter = Counter()
for ex in dataset["train"].select(range(20000)):
    counter.update(basic_tokenize(ex["text"]))

vocab = {"<pad>": 0, "<unk>": 1}
for i, (tok, _) in enumerate(counter.most_common(max_vocab - len(vocab)), start=len(vocab)):
    vocab[tok] = i

pad_id = vocab["<pad>"]
unk_id = vocab["<unk>"]

# Encode function to convert text to list of token IDs
def encode(text, max_len=256):
    toks = basic_tokenize(text)[:max_len]
    ids = [vocab.get(t, unk_id) for t in toks]
    return ids

# Collate function to create batches with padding
def collate(batch, max_len=256):
    # batch has a list of dicts with text and label
    ids_list = [encode(x["text"], max_len=max_len) for x in batch]
    labels = torch.tensor([x["label"] for x in batch], dtype=torch.long)

    lengths = torch.tensor([len(ids) for ids in ids_list], dtype=torch.long)
    max_in_batch = max(lengths).item()

    x = torch.full((len(ids_list), max_in_batch), pad_id, dtype=torch.long)
    for i, ids in enumerate(ids_list):
        x[i, :len(ids)] = torch.tensor(ids, dtype=torch.long)

    return x, lengths, labels

train_n = min(25000, len(dataset["train"]))   # IMDB train set has 25k examples
test_n  = min(5000,  len(dataset["test"]))

# DataLoaders for training and testing
train_loader = DataLoader(
    dataset["train"].shuffle(seed=0).select(range(train_n)),
    batch_size=32,
    shuffle=True,
    collate_fn=lambda b: collate(b, max_len=256)
)

test_loader = DataLoader(
    dataset["test"].select(range(test_n)),
    batch_size=64,
    shuffle=False,
    collate_fn=lambda b: collate(b, max_len=256)
)

Model Architecture
- Positional Encoding
- Token Embedding
- Transformer Encoder Layers
- Sequence Pooling
- Classification Head

In [9]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        # x: (B, T, D)
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TransformerSentiment(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4, dim_ff=512, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos = PositionalEncoding(d_model, dropout=dropout)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead,
            dim_feedforward=dim_ff, dropout=dropout,
            batch_first=True, activation="gelu"
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model, 2)

    def forward(self, x):
        # x: (B, T)
        mask = (x == pad_id)  # True where pad, for src_key_padding_mask
        h = self.emb(x)       # (B, T, D)
        h = self.pos(h)
        h = self.encoder(h, src_key_padding_mask=mask)  # (B, T, D)

        # simple pooling: mean over non-pad tokens
        nonpad = (~mask).float().unsqueeze(-1)          # (B, T, 1)
        pooled = (h * nonpad).sum(dim=1) / nonpad.sum(dim=1).clamp(min=1.0)
        return self.classifier(pooled)

model = TransformerSentiment(vocab_size=len(vocab)).to(device)


Training and Testing the Model

In [None]:
# instantiate the model 
model = TransformerSentiment(vocab_size=len(vocab)).to(device)
trainable_params = [p for p in model.parameters() if p.requires_grad]
opt = torch.optim.Adam(trainable_params, lr=3e-4)
criterion = nn.CrossEntropyLoss()


def run_epoch(loader, train=True):
    model.train(train)
    total_loss, correct, total = 0.0, 0, 0
    pbar = tqdm(loader, leave=False)

    for xb, lengths, yb in pbar:
        xb, yb = xb.to(device), yb.to(device)

        if train:
            opt.zero_grad(set_to_none=True)

        logits = model(xb)
        loss = criterion(logits, yb)

        if train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        bs = xb.size(0)
        total_loss += loss.item() * bs
        correct += (logits.argmax(-1) == yb).sum().item()
        total += bs

        pbar.set_postfix(loss=total_loss/total, acc=correct/total)

    return total_loss/total, correct/total

for epoch in range(3):
    train_loss, train_acc = run_epoch(train_loader, train=True)
    test_loss, test_acc = run_epoch(test_loader, train=False)
    print(f"Epoch {epoch+1}: train acc={train_acc:.3f} | test acc={test_acc:.3f}")


                                                                        

Epoch 1: train acc=0.727 | test acc=0.756


                                                                        

Epoch 2: train acc=0.819 | test acc=0.830


                                                                        

Epoch 3: train acc=0.848 | test acc=0.810




In [7]:
import torch; print(torch.__version__); print(torch.__file__)


2.10.0
/opt/homebrew/lib/python3.11/site-packages/torch/__init__.py
