In [1]:
# some commands in th is notebook require torchtext 0.12.0
# !pip install  torchtext --upgrade --quiet
# !pip install torchdata --quiet
# !pip install torchinfo --quiet

In [3]:
import collections
import math
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as functional
import torchtext
import torchdata
from torch.utils.data import DataLoader
from tqdm import tqdm
import torchinfo

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


## Data processing

In [4]:
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB()
num_classes = len(set([label for (label, text) in train_iter]))
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [5]:
# see an example of the dateset
next(iter(train_iter))

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [6]:
# convert the labels to be in range(0, num_classes)
y_train = torch.tensor([label-1 for (label, text) in train_iter])
y_test  = torch.tensor([label-1 for (label, text) in test_iter])

# tokenize the texts, and truncate the number of words in each text to max_seq_len
max_seq_len = 512
x_train_texts = [tokenizer(text.lower())[0:max_seq_len]
                 for (label, text) in train_iter]
x_test_texts  = [tokenizer(text.lower())[0:max_seq_len]
                 for (label, text) in test_iter]

In [7]:
# build the vocabulary and word-to-integer map
counter = collections.Counter()
for text in x_train_texts:
    counter.update(text)

vocab_size = 95833
most_common_words = np.array(counter.most_common(vocab_size - 2))
vocab = most_common_words[:,0]

# indexes for the padding token, and unknown tokens
PAD = 0
UNK = 1
word_to_id = {vocab[i]: i + 2 for i in range(len(vocab))}

In [8]:
# map the words in the training and test texts to integers
x_train = [torch.tensor([word_to_id.get(word, UNK) for word in text],dtype=torch.int32)
           for text in x_train_texts]
x_test  = [torch.tensor([word_to_id.get(word, UNK) for word in text],dtype=torch.int32)
          for text in x_test_texts]
x_train = torch.nn.utils.rnn.pad_sequence(x_train,batch_first=True, padding_value = PAD)
x_test = torch.nn.utils.rnn.pad_sequence(x_test,batch_first=True, padding_value = PAD)

In [9]:
x_train.shape

torch.Size([25000, 512])

In [10]:
# constructing the dataset in order to be compatible with torch.utils.data.Dataloader
class IMDBDataset:
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        return self.features[item], self.labels[item]


train_dataset = IMDBDataset(x_train, y_train)
test_dataset  = IMDBDataset(x_test, y_test)

In [11]:
def collate_stack_fn(batch):
    xx, yy = zip(*batch)
    return  torch.stack(xx), torch.stack(yy)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn = collate_stack_fn)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True, collate_fn = collate_stack_fn)

## Building the encoder-only transformer model for text classification

In [12]:
class MultiHeadedSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        n_heads, d_embed, dropout_attn, dropout_proj, device = config.n_heads, config.d_embed, config.dropout_attn, config.dropout_proj, config.device

        assert d_embed % n_heads == 0        
        self.hid_dim = d_embed
        self.n_heads = n_heads
        self.head_dim = d_embed // n_heads
        
        self.fc_q = nn.Linear(d_embed, d_embed)
        self.fc_k = nn.Linear(d_embed, d_embed)
        self.fc_v = nn.Linear(d_embed, d_embed)
        self.proj = nn.Linear(d_embed, d_embed)
        
        self.dropout_attn = nn.Dropout(dropout_attn)
        self.dropout_proj = nn.Dropout(dropout_proj)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, x, mask = None):       
        batch_size = x.shape[0]

        Q,K,V = self.fc_q(x), self.fc_k(x), self.fc_v(x) #[batch size, query len, hid dim
    
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch size, n heads, query len, head dim]
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)  
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 
                
        attention = torch.matmul(Q, K.permute(0, 1, 3, 2)) /  self.scale  #attention = [batch size, n heads, query len, key len]
        
        if mask is not None:
            attention = attention.masked_fill(mask == 0, float('-inf'))
        
        attention = self.dropout_attn(torch.softmax(attention, dim = -1)) # Attention Dropout;  attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(attention, V) #x = [batch size, n heads, query len, head dim]
        x = x.permute(0, 2, 1, 3).contiguous() #x = [batch size, query len, n heads, head dim]
        x = x.view(batch_size, -1, self.hid_dim) #x = [batch size, query len, hid dim] (hidden concat of all heads)
        x = self.dropout_proj(self.proj(x)) #x = [batch size, query len, hid dim]

        return x #,attention

class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_embed = nn.Embedding(config.enc_vocab_size, config.d_embed)
        self.pos_embed = nn.Parameter(torch.zeros(1, config.context_len, config.d_embed))
        self.enc_block = nn.ModuleList([EncoderBlock(config) for _ in range(config.enc_blocks)])
        self.dropout_pos = nn.Dropout(config.dropout_pos)
        self.norm = nn.LayerNorm(config.d_embed)

    def forward(self, input, mask=None):
        x = self.tok_embed(input)
        x_pos = self.pos_embed[:, :x.size(1), :]
        x = self.dropout_pos(x + x_pos)
        for layer in self.enc_block:
            x = layer(x, mask)
        return self.norm(x)


class EncoderBlock(nn.Module):
    def __init__(self, config):
        super(EncoderBlock, self).__init__()
        
        self.mhsa = MultiHeadedSelfAttention(config)
        self.pwff = nn.Sequential(
            nn.Linear(config.d_embed, config.d_ff),
            nn.ReLU(),
            nn.Linear(config.d_ff, config.d_embed),
            nn.Dropout(config.dropout_pwff)
        )
        self.ln_mhsa = nn.LayerNorm(config.d_embed)
        self.ln_pwff = nn.LayerNorm(config.d_embed)
        self.dropout = nn.Dropout(config.dropout_res)

    def forward(self, x, mask=None):
        x = x + self.dropout(self.mhsa(self.ln_mhsa(x)))   # Pre-LN
        x = x + self.dropout(self.pwff(self.ln_pwff(x)))
        return x


class Transformer(nn.Module):
    def __init__(self, config, num_classes):
        super().__init__()
        self.encoder = Encoder(config)
        self.linear = nn.Linear(config.d_embed, num_classes)

    def forward(self, x, pad_mask=None):
        x = self.encoder(x, pad_mask)
        return  self.linear(torch.mean(x,-2))

In [13]:
@dataclass
class ModelConfig:
    enc_vocab_size: int
    d_embed: int
    d_ff: int
    n_heads: int
    enc_blocks: int
    context_len: int
    dropout_res: float
    dropout_attn: float
    dropout_proj: float
    dropout_pos: float
    dropout_pwff: float
    device: str
  
def make_model(config):
    model = Transformer(config, num_classes).to(DEVICE)
    # initialize model parameters
    # it seems that this initialization is very important!
    for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    return model

## Train the model

In [14]:
def train_epoch(model, dataloader):
    model.train()
    losses, acc, count = [], 0, 0
    pbar = tqdm(enumerate(dataloader), total=len(dataloader))
    for idx, (x, y)  in  pbar:
        optimizer.zero_grad()
        features= x.to(DEVICE)
        labels  = y.to(DEVICE)
        pad_mask = (features == PAD).view(features.size(0), 1, 1, features.size(-1))
        pred = model(features, pad_mask)

        loss = loss_fn(pred, labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        acc += (pred.argmax(1) == labels).sum().item()
        count += len(labels)
        # report progress
        if idx>0 and idx%50 == 0:
            pbar.set_description(f'train loss={loss.item():.4f}, train_acc={acc/count:.4f}')
    return np.mean(losses), acc/count

def train(model, train_loader, test_loader, epochs):
    for ep in range(epochs):
        train_loss, train_acc = train_epoch(model, train_loader)
        val_loss, val_acc = evaluate(model, test_loader)
        print(f'ep {ep}: val_loss={val_loss:.4f}, val_acc={val_acc:.4f}')
        
def evaluate(model, dataloader):
    model.eval()
    losses, acc, count = [], 0, 0
    with torch.no_grad():
        for x, y in tqdm(dataloader):
            features = x.to(DEVICE)
            labels  = y.to(DEVICE)
            pad_mask = (features == PAD).view(features.size(0), 1, 1, features.size(-1))
            pred = model(features, pad_mask)
            loss = loss_fn(pred,labels).to("cpu")
            losses.append(loss.item())
            acc += (pred.argmax(1) == labels).sum().item()
            count += len(labels)
    return np.mean(losses), acc/count

In [15]:
for x, y in tqdm(test_loader):
  pass

100%|██████████| 196/196 [00:00<00:00, 1266.82it/s]


In [16]:
config = ModelConfig(enc_vocab_size = vocab_size,
                     d_embed = 32,
                     d_ff = 4*32,
                     n_heads = 4,
                     enc_blocks = 1,
                     context_len = max_seq_len,
                     dropout_res = 0.1,
                     dropout_attn = 0.1,
                     dropout_proj= 0.1,
                     dropout_pos=  0.1,
                     dropout_pwff = 0.1,
                     device="cuda"
                     )

In [17]:
model = make_model(config)
print(torchinfo.summary(model))
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

Layer (type:depth-idx)                             Param #
Transformer                                        --
├─Encoder: 1-1                                     16,384
│    └─Embedding: 2-1                              3,066,656
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-1                      12,704
│    └─Dropout: 2-3                                --
│    └─LayerNorm: 2-4                              64
├─Linear: 1-2                                      66
Total params: 3,095,874
Trainable params: 3,095,874
Non-trainable params: 0


In [18]:
model

Transformer(
  (encoder): Encoder(
    (tok_embed): Embedding(95833, 32)
    (enc_block): ModuleList(
      (0): EncoderBlock(
        (mhsa): MultiHeadedSelfAttention(
          (fc_q): Linear(in_features=32, out_features=32, bias=True)
          (fc_k): Linear(in_features=32, out_features=32, bias=True)
          (fc_v): Linear(in_features=32, out_features=32, bias=True)
          (proj): Linear(in_features=32, out_features=32, bias=True)
          (dropout_attn): Dropout(p=0.1, inplace=False)
          (dropout_proj): Dropout(p=0.1, inplace=False)
        )
        (pwff): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
        (ln_mhsa): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (ln_pwff): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
 

In [19]:
train(model, train_loader, test_loader, epochs=4)

train loss=0.4075, train_acc=0.7828: 100%|██████████| 196/196 [00:14<00:00, 13.14it/s]
100%|██████████| 196/196 [00:04<00:00, 45.03it/s]


ep 0: val_loss=0.2996, val_acc=0.8790


train loss=0.1572, train_acc=0.9618: 100%|██████████| 196/196 [00:14<00:00, 13.64it/s]
100%|██████████| 196/196 [00:04<00:00, 44.61it/s]


ep 1: val_loss=0.4848, val_acc=0.8582


train loss=0.0070, train_acc=0.9928: 100%|██████████| 196/196 [00:14<00:00, 13.51it/s]
100%|██████████| 196/196 [00:04<00:00, 43.93it/s]


ep 2: val_loss=0.5999, val_acc=0.8489


train loss=0.0010, train_acc=0.9978: 100%|██████████| 196/196 [00:14<00:00, 13.32it/s]
100%|██████████| 196/196 [00:04<00:00, 42.91it/s]

ep 3: val_loss=0.7828, val_acc=0.8436





In [21]:
review_label = ["neg","pos"]

def classify_review(news):
    x_text = tokenizer(news.lower())[0:max_seq_len]
    x_int = torch.tensor([[word_to_id.get(word, UNK) for word in x_text]]).to(DEVICE)

    model.eval()
    with torch.no_grad():
        pred = model(x_int).argmax(1).item()
    print(f"This is a {review_label[pred]} review")

# The model correctly classifies a theoretical physics news as Sci/Tec news, :-)
review_pos = """Confidently directed, dark, brooding, and packed with impressive action sequences and a complex story,
 The Dark Knight includes a career-defining turn from Heath Ledger as well as other Oscar worthy performances,
  TDK remains not only the best Batman movie, but comic book movie ever created.
"""
classify_review(review_pos)


review_neg = """Plot holes the size of the grand canyon,
 overall terrible acting, and about 45 minutes of useless fluff at the end.
 Had this been the real world,
 The Joker would have been caught about 15 minutes into the movie and the credits would have rolled.
 Too bad that didn't happen.
"""
classify_review(review_neg)

This is a pos review
This is a neg review
