# Transformers – Improving Natural
# Language Processing with
# Attention Mechanisms

# Adding an attention mechanism to RNNs

In [1]:
import torch

sentence = torch.tensor(
    [0, # can
     7, # you
     1, # help
     2, # me
     5, # to
     6, # translate
     4, # this
     3]) # sentence

sentence

  from .autonotebook import tqdm as notebook_tqdm


tensor([0, 7, 1, 2, 5, 6, 4, 3])

In [2]:
torch.manual_seed(123)
embed = torch.nn.Embedding(10, 16)
embedded_sentence = embed(sentence).detach()
embedded_sentence

tensor([[ 3.3737e-01, -1.7778e-01, -3.0353e-01, -5.8801e-01,  3.4861e-01,
          6.6034e-01, -2.1964e-01, -3.7917e-01,  7.6711e-01, -1.1925e+00,
          6.9835e-01, -1.4097e+00,  1.7938e-01,  1.8951e+00,  4.9545e-01,
          2.6920e-01],
        [-9.4053e-01, -4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01,
         -1.4078e-02, -2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01,
         -1.5822e-03,  1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00,
         -2.1595e+00],
        [-7.7020e-02, -1.0205e+00, -1.6896e-01,  9.1776e-01,  1.5810e+00,
          1.3010e+00,  1.2753e+00, -2.0095e-01,  4.9647e-01, -1.5723e+00,
          9.6657e-01, -1.1481e+00, -1.1589e+00,  3.2547e-01, -6.3151e-01,
         -2.8400e+00],
        [-1.3250e+00,  1.7843e-01, -2.1338e+00,  1.0524e+00, -3.8848e-01,
         -9.3435e-01, -4.9914e-01, -1.0867e+00,  8.8054e-01,  1.5542e+00,
          6.2662e-01, -1.7549e-01,  9.8284e-02, -9.3507e-02,  2.6621e-01,
         -5.8504e-01],
        [ 2.5529e-01

In [3]:
omega = torch.empty(8, 8)
for i, x_i in enumerate(embedded_sentence):
    for j, x_j in enumerate(embedded_sentence):
        omega[i, j] = torch.dot(x_i, x_j)
omega

tensor([[ 9.7601,  1.7326,  4.7543, -1.3587,  0.4752, -1.6717,  1.0227, -0.1286],
        [ 1.7326, 16.0787,  9.0642, -0.3370,  1.1368,  1.1972,  1.6485, -1.2789],
        [ 4.7543,  9.0642, 22.6615, -0.8519,  7.7799,  2.7483, -0.6832,  1.6236],
        [-1.3587, -0.3370, -0.8519, 13.9473, -1.4198, 10.9659, -0.5887,  2.3869],
        [ 0.4752,  1.1368,  7.7799, -1.4198, 13.7511, -6.8568, -2.5114, -3.3468],
        [-1.6717,  1.1972,  2.7483, 10.9659, -6.8568, 24.6738, -3.8294,  4.9581],
        [ 1.0227,  1.6485, -0.6832, -0.5887, -2.5114, -3.8294, 15.8691,  2.0269],
        [-0.1286, -1.2789,  1.6236,  2.3869, -3.3468,  4.9581,  2.0269, 18.7382]])

In [4]:
omega_mat = embedded_sentence.matmul(embedded_sentence.T)

In [5]:
torch.allclose(omega_mat, omega)

True

In [6]:
import torch.nn.functional as F

attention_weights = F.softmax(omega, dim=1)
attention_weights.shape

torch.Size([8, 8])

In [7]:
attention_weights.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [8]:
x_2 = embedded_sentence[1, :]
context_vec_2 = torch.zeros(x_2.shape)

for j in range(8):
    x_j = embedded_sentence[j, :]
    context_vec_2 += attention_weights[1, j] * x_j
context_vec_2

tensor([-9.3975e-01, -4.6856e-01,  1.0311e+00, -2.8192e-01,  4.9373e-01,
        -1.2896e-02, -2.7327e-01, -7.6358e-01,  1.3958e+00, -9.9543e-01,
        -7.1287e-04,  1.2449e+00, -7.8077e-02,  1.2765e+00, -1.4589e+00,
        -2.1601e+00])

In [9]:
context_vectors = torch.matmul(
    attention_weights, embedded_sentence)

In [10]:
torch.allclose(context_vec_2, context_vectors[1])

True

# Parameterizing the self-attention mechanism: scaled dot-product attention

In [11]:
torch.manual_seed(123)

d = embedded_sentence.shape[1]
U_query = torch.rand(d, d)
U_key = torch.rand(d, d)
U_value = torch.rand(d, d)

In [12]:
x_2 = embedded_sentence[1]
query_2 = U_query.matmul(x_2)

key_2 = U_key.matmul(x_2)
value_2 = U_value.matmul(x_2)

keys = U_key.matmul(embedded_sentence.T).T
values = U_value.matmul(embedded_sentence.T).T

keys = U_key.matmul(embedded_sentence.T).T
torch.allclose(key_2, keys[1])
values = U_value.matmul(embedded_sentence.T).T
torch.allclose(value_2, values[1])


True

In [13]:
omega_23 = query_2.dot(keys[2])
omega_23

tensor(14.3667)

In [14]:
omega_2 = query_2.matmul(keys.T)
omega_2

tensor([-25.1623,   9.3602,  14.3667,  32.1482,  53.8976,  46.6626,  -1.2131,
        -32.9392])

In [15]:
attention_weights_2 = F.softmax(omega_2 / d**0.5, dim=0)
attention_weights_2

tensor([2.2317e-09, 1.2499e-05, 4.3696e-05, 3.7242e-03, 8.5596e-01, 1.4026e-01,
        8.8897e-07, 3.1935e-10])

In [16]:
torch.manual_seed(123)

d = embedded_sentence.shape[1]
one_U_querry = torch.rand(d, d)
h = 8
multihead_U_query = torch.rand(h, d, d)
multihead_U_key = torch.rand(h, d, d)
multihead_U_value = torch.rand(h, d, d)

In [17]:
multihead_query_2 = multihead_U_query.matmul(x_2)
multihead_query_2.shape

torch.Size([8, 16])

In [18]:
multihead_key_2 = multihead_U_key.matmul(x_2)
multihead_value_2 = multihead_U_value.matmul(x_2)
multihead_key_2[2]

tensor([-1.9619, -0.7701, -0.7280, -1.6840, -1.0801, -1.6778,  0.6763,  0.6547,
         1.4445, -2.7016, -1.1364, -1.1204, -2.4430, -0.5982, -0.8292, -1.4401])

In [19]:
stacked_inputs = embedded_sentence.T.repeat(8, 1, 1)
stacked_inputs.shape

torch.Size([8, 16, 8])

In [20]:
multihead_keys = torch.bmm(multihead_U_key, stacked_inputs)
multihead_keys.shape

torch.Size([8, 16, 8])

In [21]:
multihead_keys = multihead_keys.permute(0, 2, 1)
multihead_keys.shape

torch.Size([8, 8, 16])

In [22]:
multihead_keys[2, 1]

tensor([-1.9619, -0.7701, -0.7280, -1.6840, -1.0801, -1.6778,  0.6763,  0.6547,
         1.4445, -2.7016, -1.1364, -1.1204, -2.4430, -0.5982, -0.8292, -1.4401])

In [23]:
multihead_values = torch.matmul(
    multihead_U_value, stacked_inputs)
multihead_values = multihead_values.permute(0, 2, 1)

In [24]:
multihead_z_2 = torch.rand(8, 16)

linear = torch.nn.Linear(8 * 16, 16)
context_vector_2 = linear(multihead_z_2.flatten())
context_vector_2.shape


torch.Size([16])

# Building large-scale language models by leveraging unlabeled data

# GPT

In [25]:
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model='gpt2')
set_seed(123)
generator('Hey readers, today is',
          max_lenght=20,
          num_return_sequences=3)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hey readers, today is not the last time we'll be seeing one of our favorite indie rock bands playing…\n\nSo we do have something news for you!\n\nHuge thanks to our friends at Cencept, the official soundtrack artist"},
 {'generated_text': 'Hey readers, today is Christmas. This is not Christmas, because Christmas is so long and I hope everyone still has the peace of mind of the two million years ago, but rather, this is a year of great things for you on board your journey'},
 {'generated_text': "Hey readers, today is CTA Day!\n\nWe're proud to be hosting a special event on July 26th. Here are all sorts of fun facts you can learn about CTA at your local CTA stop (but don't think that"}]

In [26]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
text = 'Let us encode this sentence'
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[ 5756,   514, 37773,   428,  6827]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [27]:
from transformers import GPT2Model

model = GPT2Model.from_pretrained('gpt2')
output = model(**encoded_input)
output['last_hidden_state'].shape

torch.Size([1, 5, 768])

# Loading the IMDb movie review dataset

In [28]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification


torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

NUM_EPOCHS = 3

# url = ("https://github.com/rasbt/"
#        "machine-learning-book/raw/"
#        "main/ch08/movie_data.csv.gz")
# filename = url.split('/')[-1]

# with open(filename, "wb") as f:
#     r = requests.get(url)
#     f.write(r.content)
    
# with gzip.open('movie_data.csv.gz', 'rb') as f_in:
#     with open('movie_data.csv', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)
        

df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [29]:
train_texts = df.iloc[:35000]['review'].values
train_labels = df.iloc[:35000]['sentiment'].values

valid_texts = df.iloc[35000:40000]['review'].values


valid_labels = df.iloc[35000:40000]['sentiment'].values

test_texts = df.iloc[40000:]['review'].values
test_labels = df.iloc[40000:]['sentiment'].values

# Tokenizing the dataset

In [30]:
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [31]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    

train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=16, shuffle=False)

# Loading and fine-tuning a pre-trained BERT model

In [32]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased')
# model.to(DEVICE)
# model.train()

# optim = torch.optim.Adam(model.parameters(), lr=5e-5)

# def compute_accuracy(model, data_loader, device):
#     with torch.no_grad():
#         correct_pred, num_examples = 0, 0
#         for batch_idx, batch in enumerate(data_loader):
#             ### Prepare data
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = \
#                 batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)
            
#             outputs = model(input_ids, 
#                 attention_mask=attention_mask)
#             logits = outputs['logits']
#             predicted_labels = torch.argmax(logits, 1)
#             num_examples += labels.size(0)
#             correct_pred += \
#                 (predicted_labels == labels).sum()
#         return correct_pred.float()/num_examples * 100
    

# start_time = time.time()

# for epoch in range(NUM_EPOCHS):
    
#     model.train()
    
#     for batch_idx, batch in enumerate(train_loader):
        
#         ### Prepare data
#         input_ids = batch['input_ids'].to(DEVICE)
#         attention_mask = batch['attention_mask'].to(DEVICE)
#         labels = batch['labels'].to(DEVICE)
        
#         ### Forward pass
#         outputs = model(input_ids,
#                         attention_mask=attention_mask,
#                         labels=labels)
#         loss, logits = outputs['loss'], outputs['logits']
        
#         ### Backward pass
#         optim.zero_grad()
#         loss.backward()
#         optim.step()
        
#         ### Logging
#         if not batch_idx % 250:
#             print(f'Epoch: {epoch + 1:04d}/{NUM_EPOCHS:04d}'
#                   f' | Batch'
#                   f'{batch_idx:04d}/'
#                   f'{len(train_loader):04d} | '
#                   f'Loss: {loss:.4f}')

# model.eval()

# with torch.set_grad_enabled(False):
#     print(f'Training accuracy: '
#           f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
#           f'\nValid accuracy: '
#           f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

#     print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
# print(f'Total Traning Time: {(time.time() - start_time)/60:.2f} min')
# print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

# Fine-tuning a transformer more conveniently using the Trainer
# API

In [33]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optim, None) # optim and learning rate scheduler
)


from datasets import load_metric
import numpy as np

metric = load_metric('accuracy')

def compute_metrics(eval_pres):
    logits, labels = eval_pred
    
    #note:logits are a numpy array, not a pytorch tensor
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions, references=labels)

trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optim, None) # optim and learning rate scheduler
)

        
start_time = time.time()
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 1.95 GiB total capacity; 814.69 MiB already allocated; 200.31 MiB free; 844.00 MiB reserved in total by PyTorch)

In [None]:
print(f'Total TrainingTime:'
      f'{(time.time() - start_time)/60:.2f} min')

In [None]:
print(trainer.evaluate())

In [None]:
model.eval()
model.to(DEVICE)

print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments('test_trainer', evaluation_strategy='epoch')

In [None]:
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)