# Classification with Transformer Encoder

## Sarthak Rastogi

I implemented the encoder part of a transformer using pytorch and torchtext.

A big part of the code in this notebook is taken from the torchtext docs directly.

In [None]:
%%capture
!pip install torch=='1.7.1'
!pip install torchtext=='0.8.1'

In [None]:
import numpy as np 
import pandas as pd
import math
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from torch import nn
import torch
from torchtext import data
from torch.nn  import functional as F
import torch.optim as  optim 
import torchtext

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import warnings
warnings.filterwarnings('ignore')

In [None]:
dummy_data = ['you won a billion dollars , great work !',
'click here for cs685 midterm answers',
'read important cs685 news',
'send me your bank account info asap']

dummy_labels = torch.tensor([1, 1, 0,1])#, dtype=torch.float64, requires_grad=True)

In [None]:
import pandas as pd
traindata = pd.DataFrame()
traindata['text'] = dummy_data
traindata['label'] = dummy_labels
traindata

Unnamed: 0,text,label
0,"you won a billion dollars , great work !",1
1,click here for cs685 midterm answers,1
2,read important cs685 news,0
3,send me your bank account info asap,1


Instatiating data field classes from torchtext

In [None]:
TEXT = torchtext.data.Field(
    lower=True, include_lengths=False, batch_first=True
)
LABEL = torchtext.data.Field(sequential=False)

In [None]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for i, row in df.iterrows():
            label = row.label
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)
train_data = DataFrameDataset(traindata, TEXT,LABEL)

def nppred(l):
  return np.array([0 if i==1 else 1 for i in l])

Building the vocabulary from Glove pre-trained word embeddings.

In [None]:
TEXT.build_vocab(
    train_data,
    vectors=torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=50_000),
    max_size=50_000,
)

LABEL.build_vocab(train_data)

train_iter= data.BucketIterator(train_data, batch_size = 4)

Code for Encoder and the fully connected layers that come after it.

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)

In [None]:
class Net(nn.Module):
    def __init__(
        self,
        embeddings,
        nhead=8,
        dim_feedforward=2048,
        num_layers=2,
        dropout=0.1,
        activation="relu",
        classifier_dropout=0.1,
    ):

        super().__init__()

        vocab_size, d_model = embeddings.size()
        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)

        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            dropout=dropout,
            vocab_size=vocab_size,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        self.classifier = nn.Linear(d_model, 2)
        self.d_model = d_model

    def forward(self, x):
        x = self.emb(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.classifier(x)

        return x

Calculating four metrics

In [None]:
def calculateMetrics(ypred,ytrue):
  print("true", ytrue)
  print("predicted", ypred)
  print("accuracy:", accuracy_score(ytrue,ypred), "f1:", f1_score(ytrue,ypred), "precision:", precision_score(ytrue,ypred), "recall:", recall_score(ytrue,ypred))
def nppred(l):
  return np.array([0 if i==1 else 1 for i in l])

## Task 1
Transformer encoder with 2 blocks and 5-head self attention

Model definition

In [None]:
epochs = 6
criterion = nn.CrossEntropyLoss()

model = Net(
    TEXT.vocab.vectors,
    nhead=5,
    dim_feedforward=50,
    num_layers=2,
    dropout=0.0,
    classifier_dropout=0.0,
).to(device)

optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=0.0001
)

Training

In [None]:
for epoch in range(epochs):
    print(epoch)
    epoch_loss = 0
    epoch_correct = 0
    epoch_count = 0
    for idx, batch in enumerate(iter(train_iter)):
        predictions = model(batch.text.to(device))
        labels = batch.label.to(device) - 1
        loss = criterion(predictions, labels)
        epoch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    print(epoch_loss)
    print(nppred(predictions.argmax(axis=1)))
    print(nppred(labels))

    print(calculateMetrics(nppred(predictions.argmax(axis=1)), nppred(labels)))

0
0.7129457592964172
[1 1 1 0]
[1 0 1 1]
true [1 0 1 1]
predicted [1 1 1 0]
accuracy: 0.5 f1: 0.6666666666666666 precision: 0.6666666666666666 recall: 0.6666666666666666
None
1
0.700797438621521
[1 1 1 0]
[1 0 1 1]
true [1 0 1 1]
predicted [1 1 1 0]
accuracy: 0.5 f1: 0.6666666666666666 precision: 0.6666666666666666 recall: 0.6666666666666666
None
2
0.6892946362495422
[1 1 1 1]
[1 1 1 0]
true [1 1 1 0]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
3
0.6784327030181885
[1 1 1 1]
[0 1 1 1]
true [0 1 1 1]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
4
0.6682129502296448
[1 1 1 1]
[0 1 1 1]
true [0 1 1 1]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
5
0.6586313843727112
[1 1 1 1]
[1 1 1 0]
true [1 1 1 0]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None



### accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0

## Task 2

Increase the number of heads to 10

In [None]:
epochs = 6
criterion = nn.CrossEntropyLoss()

model = Net(
    TEXT.vocab.vectors,
    nhead=10,
    dim_feedforward=50,
    num_layers=2,
    dropout=0.0,
    classifier_dropout=0.0,
).to(device)



lr = 1e-4
optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=lr
)

In [None]:
for epoch in range(epochs):
    print(epoch)
    epoch_loss = 0
    epoch_correct = 0
    epoch_count = 0
    for idx, batch in enumerate(iter(train_iter)):
        predictions = model(batch.text.to(device))
        labels = batch.label.to(device) - 1
        loss = criterion(predictions, labels)
        epoch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    print(epoch_loss)
    print(nppred(predictions.argmax(axis=1)))
    print(nppred(labels))

    print(calculateMetrics(nppred(predictions.argmax(axis=1)), nppred(labels)))

0
0.5497027039527893
[1 1 1 1]
[1 1 0 1]
true [1 1 0 1]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
1
0.543393075466156
[1 1 1 1]
[0 1 1 1]
true [0 1 1 1]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
2
0.5372782945632935
[1 1 1 1]
[1 1 1 0]
true [1 1 1 0]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
3
0.5313563942909241
[1 1 1 1]
[1 1 1 0]
true [1 1 1 0]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
4
0.5256075263023376
[1 1 1 1]
[0 1 1 1]
true [0 1 1 1]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None
5
0.5200061798095703
[1 1 1 1]
[1 1 1 0]
true [1 1 1 0]
predicted [1 1 1 1]
accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0
None


### accuracy: 0.75 f1: 0.8571428571428571 precision: 0.75 recall: 1.0

On increasing the no. of heads, the model achieves 0.86 f1 score in the first epoch itself.

The model is identifying the spam sentences well, but it's not able to differentiate non-spam sentences.