In [1]:
# show the GPU attached to this runtime (model, driver / CUDA version, memory usage)
!nvidia-smi

Wed Apr  7 11:42:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 16.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 50.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 54.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=48292b8d1c

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy

import transformers
from transformers import BertTokenizer, BertModel

print(f'torch : {torch.__version__}  |  torchtext : {torchtext.__version__}  |  spacy : {spacy.__version__}  |  np : {np.__version__}  |  transformers : {transformers.__version__}')

torch : 1.8.1+cu101  |  torchtext : 0.9.1  |  spacy : 2.2.4  |  np : 1.19.5  |  transformers : 4.5.0


## Preparing Data

In [4]:
# download & check pre-trained BertTokenizer
# (the vocabulary files are fetched on first use, hence the download progress bars)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# the printed repr lists vocab size, maximum model length and the special tokens
print(type(tokenizer))
print(tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>
PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [5]:
# mixed-case sample: the output below shows the uncased tokenizer lower-casing every token
sent =  'hello World How ARE yoU?'

# tokenize sample sentence
# (tokenize() returns plain word pieces; no special tokens are added at this point)
tokens = tokenizer.tokenize(sent)
print(type(tokens))
print(tokens)
print(tokenizer.convert_tokens_to_ids(tokens))
print()

# convert sample sentence into BatchEncoding
# (calling the tokenizer directly wraps the ids in [CLS] (101) ... [SEP] (102), see output)
batch_indices = tokenizer(sent)
print(type(batch_indices))
print(batch_indices)
print(tokenizer.decode(batch_indices['input_ids']))

<class 'list'>
['hello', 'world', 'how', 'are', 'you', '?']
[7592, 2088, 2129, 2024, 2017, 1029]

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': [101, 7592, 2088, 2129, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] hello world how are you? [SEP]


In [6]:
# save special tokens
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

# save special token ids
# (equivalent shortcut: tokenizer.cls_token_id, tokenizer.sep_token_id, ...)
cls_token_id = tokenizer.convert_tokens_to_ids(cls_token)
sep_token_id = tokenizer.convert_tokens_to_ids(sep_token)
pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_id = tokenizer.convert_tokens_to_ids(unk_token)

# check each special token & id
# (the pairs are listed explicitly instead of assembling variable names for eval())
special_token_pairs = [
    (cls_token, cls_token_id),
    (sep_token, sep_token_id),
    (pad_token, pad_token_id),
    (unk_token, unk_token_id),
]
for tmp_token, tmp_id in special_token_pairs:
  print(f'{tmp_token} - {tmp_id}', end='\t')

[CLS] - 101	[SEP] - 102	[PAD] - 0	[UNK] - 100	

In [7]:
# maximum input length (in tokens) registered for the 'bert-base-uncased' checkpoint
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print('max_input_length :', max_input_length)


def tokenize_and_cut(sentence, tokenizer=tokenizer, maxlen=max_input_length-2):
  """Split `sentence` into tokens with `tokenizer` and keep at most `maxlen` of them.

  The default `maxlen` leaves two positions free below `max_input_length`, which is the
  room taken by the [CLS] and [SEP] ids that are added around the sequence later on.
  """
  return tokenizer.tokenize(sentence)[:maxlen]

max_input_length : 512


In [8]:
%%time

# set random seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# download and split dataset (train, valid, test)
# TEXT holds ids produced by the BERT tokenizer (use_vocab=False), so the special
# "tokens" handed to the field are already ids as well
TEXT = data.Field(
    tokenize=tokenize_and_cut, preprocessing=tokenizer.convert_tokens_to_ids,
    batch_first=True, use_vocab=False,
    init_token=cls_token_id, eos_token=sep_token_id, pad_token=pad_token_id, unk_token=unk_token_id
)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so passing its result as random_state only worked through the
# side effect of re-seeding; seed explicitly and hand over the resulting RNG state instead
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 21.8MB/s]


CPU times: user 4min 32s, sys: 9.54 s, total: 4min 41s
Wall time: 4min 45s


In [9]:
# check the type and size of dataset
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# check one sample data
# (vars() exposes the fields stored on the Example as a plain dict)
tmp_ex = train_data[6]
tmp_dict = vars(tmp_ex)

print('< sample data >')
print('>>> type :', type(tmp_ex))
print('>>> length :', len(tmp_dict['text']))
# 'text' already holds ids, so they are mapped back to word pieces for reading
print('>>> tokens :', tokenizer.convert_ids_to_tokens(tmp_dict['text'][:50]))
# every field is sliced to its first 50 items to keep the printout short
for key in tmp_dict:
  print(f'>>> {key} : {tmp_dict[key][:50]}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< sample data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> length : 493
>>> tokens : ['it', "'", 's', 'got', 'christopher', 'lee', ',', 'it', "'", 's', 'got', 'huge', 'banks', 'of', '1970s', 'computers', 'that', 'make', 'tel', '##ety', '##pe', 'noises', 'as', 'letters', 'appear', 'on', 'the', 'screen', ',', 'it', "'", 's', 'got', 'radioactive', 'isotope', '##s', 'that', 'not', 'only', 'glow', 'in', 'the', 'dark', 'but', 'emi', '##t', 'pulsing', 'thru', '##mming', 'noises']
>>> text : [2009, 1005, 1055, 2288, 5696, 3389, 1010, 2009, 1005, 1055, 2288, 4121, 5085, 1997, 3955, 7588, 2008, 2191, 10093, 27405, 5051, 14950, 2004, 4144, 3711, 2006, 1996, 3898, 1010, 2009, 1005, 1055, 2288, 17669, 28846, 2015, 2008, 2025, 2069, 8652, 1999, 1996, 2601, 2021, 12495, 2102, 23139, 27046, 25057, 14950]
>>>

In [10]:
# %%time
# build vocabulary (only LABEL)
# (TEXT was created with use_vocab=False, so only the labels need a vocabulary)
LABEL.build_vocab(train_data)
print(f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
# stoi maps each label string to the index used as training target
print(LABEL.vocab.stoi)

>>> Unique tokens in LABEL vocabulary: 2
defaultdict(None, {'neg': 0, 'pos': 1})


In [11]:
# create the iterators
BATCH_SIZE = 128
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# one iterator per split; batches are created directly on `device`
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

# len(train_iterator) is the number of batches, i.e. len(train_data)/BATCH_SIZE rounded up
display(device, type(train_iterator), len(train_iterator), len(train_data)/BATCH_SIZE)

device(type='cuda')

torchtext.legacy.data.iterator.BucketIterator

137

136.71875

## Build the Model

In [12]:
# function for model summary
def model_summary(model, print_model=True):
  """Print how many trainable parameters `model` has and, optionally, the model itself.

  Only parameters with `requires_grad` set are counted. Nothing is returned.
  """
  trainable_total = 0
  for param in model.parameters():
    if param.requires_grad:
      trainable_total += param.numel()

  print(f'>>> The model has {trainable_total:,} trainable parameters')
  if not print_model:
    return
  print(model)


# load the pre-trained BERT encoder (config and weights are downloaded on first use)
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [13]:
class BertGRUSentiment(nn.Module):
  """Sentiment classifier: BERT embeddings -> (bi)GRU -> dropout -> linear layer.

  Args:
    bert: module called as `bert(text_ts)`; element 0 of its output is used as the
      token embeddings and `bert.config.hidden_size` as their width.
    hidden_dim: hidden size of the GRU.
    output_dim: number of outputs of the final linear layer.
    n_layers: number of stacked GRU layers.
    dropout: dropout probability used between GRU layers and on the final hidden state.
    bidirectional: whether the GRU reads the sequence in both directions.

  Note: only the token ids are passed to `bert` (no attention mask), so padded
  positions of a batch are processed like ordinary tokens.
  """

  def __init__(self, bert, hidden_dim, output_dim, n_layers, dropout, bidirectional):
    super().__init__()
    embedding_dim = bert.config.hidden_size

    self.bert = bert
    # nn.GRU applies dropout only between stacked layers and warns when a non-zero value
    # is combined with a single layer, so the inter-layer dropout is disabled in that case
    self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers,
                      batch_first=True, dropout=dropout if n_layers > 1 else 0.0,
                      bidirectional=bidirectional)
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, output_dim)

  def forward(self, text_ts):
    """Return one row of `output_dim` raw scores (logits) per input sequence.

    Args:
      text_ts: tensor of token ids, [batch size, sent len].
    """
    # text_ts = [batch size, sent len]

    # BERT is used as a fixed feature extractor: no gradients are recorded for it
    with torch.no_grad():
      embedded = self.bert(text_ts)[0]
    # embedded = [batch size, sent len, emb dim]

    _, hidden = self.rnn(embedded)
    # hidden = [n_layers * n_directions, batch size, hidden dim]

    if self.rnn.bidirectional:
      # last layer: final forward (-2) and backward (-1) hidden states, side by side
      cat = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
    else:
      cat = self.dropout(hidden[-1, :, :])
    # cat = [batch size, hidden dim * n_directions]

    out = self.fc(cat)
    # out = [batch size, output dim]

    return out

In [14]:
# hyper-parameters of the GRU head that sits on top of BERT
HIDDEN_DIM = 256
OUTPUT_DIM = 1        # one raw score per review
N_LAYERS = 2
DROPOUT = 0.25
BIDIRECTIONAL = True

# build the model around the pre-trained `bert` and move it to `device`
model = BertGRUSentiment(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, BIDIRECTIONAL).to(device)
model_summary(model)

>>> The model has 112,241,409 trainable parameters
BertGRUSentiment(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

In [15]:
%%time

# test tensor shape in each step
# (`ts` is rebound after every stage, so it always holds the latest intermediate result)
# random token ids stand in for a batch of 8 sequences of length 256
ts = torch.randint(low=1, high=250, size=(8, 256)).to(device)
print('>>> text_ts =', ts.shape)
ts = model.bert(ts)
print('>>> bert output len =', len(ts))
print('>>> bert last hidden state =', ts.last_hidden_state.shape)
print('>>> bert pooler output =', ts.pooler_output.shape)
ts = ts.last_hidden_state  # ts[0]
print('>>> embedded =', ts.shape)
# keep only the final hidden states of the GRU
_, ts = model.rnn(ts)
print('>>> hidden =', ts.shape)
# mirrors the bidirectional branch of BertGRUSentiment.forward (uses the last two states)
ts = model.dropout(torch.cat((ts[-2, :, :], ts[-1, :, :]), dim=1))
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
# sigmoid turns the raw scores into values between 0 and 1
print(ts.sigmoid())

>>> text_ts = torch.Size([8, 256])
>>> bert output len = 2
>>> bert last hidden state = torch.Size([8, 256, 768])
>>> bert pooler output = torch.Size([8, 768])
>>> embedded = torch.Size([8, 256, 768])
>>> hidden = torch.Size([4, 8, 256])
>>> cat = torch.Size([8, 512])
>>> out = torch.Size([8, 1])
tensor([[0.4648],
        [0.4908],
        [0.4545],
        [0.4893],
        [0.4036],
        [0.4922],
        [0.5260],
        [0.4402]], device='cuda:0', grad_fn=<SigmoidBackward>)
CPU times: user 111 ms, sys: 80.3 ms, total: 192 ms
Wall time: 380 ms


In [16]:
# freeze every parameter that belongs to BERT so that only the GRU and the linear
# layer stay trainable; the name of every parameter (frozen or not) is printed
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [17]:
# re-count the trainable parameters now that BERT has been frozen
model_summary(model, print_model=False)

# list the parameters that will still be updated during training
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

>>> The model has 2,759,169 trainable parameters
rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
fc.weight
fc.bias


## Train the Model

In [18]:
# function for calculating binary accuracy
def binary_accuracy(preds, y):
  """Return the fraction of predictions that match `y`.

  `preds` holds raw scores; they are passed through a sigmoid and rounded to get
  the predicted class before being compared with `y`.
  """
  probabilities = preds.sigmoid()
  hits = probabilities.round().eq(y)
  return hits.float().mean()


# function for training in each epoch
def train(model, iterator, optimizer, criterion):
  """Train `model` for one pass over `iterator`.

  Args:
    model: module called as `model(batch.text)`; its output is squeezed on dim 1.
    iterator: sized iterable of batches exposing `.text` and `.label`.
    optimizer: optimizer updating the model's parameters.
    criterion: loss called as `criterion(pred, batch.label)`.

  Returns:
    (mean loss, mean accuracy) over the batches, as Python floats.
  """
  epoch_loss = 0.0
  epoch_acc = 0.0
  model.train()

  for batch in iterator:
    pred = model(batch.text).squeeze(1)
    loss = criterion(pred, batch.label)
    acc = binary_accuracy(pred, batch.label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() extracts plain numbers; summing the loss tensor itself would keep every
    # batch's autograd history reachable through the running total for the whole epoch
    epoch_loss += loss.item()
    epoch_acc  += acc.item()

  return epoch_loss/len(iterator), epoch_acc/len(iterator)


# function for evaluation (validation or test)
def evaluate(model, iterator, criterion):
  """Evaluate `model` on every batch of `iterator` without updating it.

  Args:
    model: module called as `model(batch.text)`; its output is squeezed on dim 1.
    iterator: sized iterable of batches exposing `.text` and `.label`.
    criterion: loss called as `criterion(pred, batch.label)`.

  Returns:
    (mean loss, mean accuracy) over the batches, as Python floats.
  """
  loss = 0.0
  acc = 0.0
  model.eval()

  with torch.no_grad():
    for batch in iterator:
      pred = model(batch.text).squeeze(1)
      # .item() turns the per-batch results into plain numbers, so the returned
      # metrics are ordinary floats rather than tensors living on the device
      loss += criterion(pred, batch.label).item()
      acc += binary_accuracy(pred, batch.label).item()
  return loss/len(iterator), acc/len(iterator)


# function for calculating min, sec from start & end time
def epoch_time(start_time, end_time):
  """Split the span between two timestamps (in seconds) into whole minutes and seconds.

  Returns:
    (minutes, seconds), both truncated to integers.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [19]:
import gc
gc.collect()

153

In [20]:
%%time

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
# path of the checkpoint file holding the best model's state_dict
outfile_dir = 'tut6-model.pt'

n_epoch = 3
best_val_loss = float('inf')
for epoch in range(n_epoch):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  print(f'Epoch : {epoch+1:02}  |  Epoch time : {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss : {train_loss:.3f}  |  Train Acc : {train_acc*100:.2f}%')
  print(f'\tValid Loss : {valid_loss:.3f}  |  Valid Acc : {valid_acc*100:.2f}%')
  
  if valid_loss < best_val_loss:
    # remember the new best loss; without this update every epoch compares against
    # infinity and overwrites the checkpoint, even when validation got worse
    best_val_loss = float(valid_loss)
    torch.save(model.state_dict(), outfile_dir)
    print('>>> Saved best model in epoch', epoch+1)
  else:
    print('>>> Finished learning without saving')

  gc.collect()

Epoch : 01  |  Epoch time : 15m 54s
	Train Loss : 0.473  |  Train Acc : 76.45%
	Valid Loss : 0.267  |  Valid Acc : 89.32%
>>> Saved best model in epoch 1
Epoch : 02  |  Epoch time : 16m 4s
	Train Loss : 0.274  |  Train Acc : 89.01%
	Valid Loss : 0.253  |  Valid Acc : 89.55%
>>> Saved best model in epoch 2
Epoch : 03  |  Epoch time : 16m 4s
	Train Loss : 0.232  |  Train Acc : 90.65%
	Valid Loss : 0.223  |  Valid Acc : 91.09%
>>> Saved best model in epoch 3
CPU times: user 22min 23s, sys: 25min 51s, total: 48min 15s
Wall time: 48min 7s


## Load and Test saved model

In [21]:
%%time

# function for checking weight equality
def check_weights(model1, model2):
  """Return True when both models hold exactly the same parameters.

  Parameters are compared pairwise in the order returned by `.parameters()`.
  Models with a different number of parameters are never considered equal.
  """
  params1 = list(model1.parameters())
  params2 = list(model2.parameters())
  # zip() alone would silently drop the extra parameters of the longer model
  if len(params1) != len(params2):
    return False
  # all() stops at the first mismatch
  return all(p1.data.equal(p2.data) for p1, p2 in zip(params1, params2))

# load model
loaded_model = BertGRUSentiment(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, BIDIRECTIONAL).to(device)
# map_location lets a checkpoint written on a GPU be restored on whatever `device` is in use
loaded_model.load_state_dict(torch.load(outfile_dir, map_location=device))

# check weight equality
# NOTE: `model` and `loaded_model` are both built around the same `bert` object, so the
# comparison is only informative for the GRU and linear layer weights
if check_weights(model, loaded_model):
  print('>>> All weights are equal')
else:
  print('>>> WARNING!!! Not equal wieghts!')

# test model performance
test_loss, test_acc = evaluate(loaded_model, test_iterator, criterion)
print(f'>>> Test Loss : {test_loss:.3f}  |  Test Acc : {test_acc*100:.2f}%')

>>> All weights are equal
>>> Test Loss : 0.213  |  Test Acc : 91.45%
CPU times: user 4min 13s, sys: 4min 28s, total: 8min 42s
Wall time: 8min 40s


## Predict New Sentences
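- Unlike the previous (CNN) notebook, where **the input sentence had to be at least as long as the largest filter height** used in the CNN model, the `predict_sentiment` function here needs no minimum length: it tokenizes the text with the BERT tokenizer, truncates it, and wraps the ids in `[CLS]` / `[SEP]` before feeding them to the model.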

In [23]:
# function for predicting sentiment
def predict_sentiment(model, tokenizer, text):
  """Return the model's sigmoid score (between 0 and 1) for a single piece of text.

  Args:
    model: module mapping a [1, sent len] tensor of token ids to a single raw score.
    tokenizer: tokenizer providing `convert_tokens_to_ids`, `cls_token_id` and
      `sep_token_id`; it is also handed to `tokenize_and_cut`.
    text: the sentence to score.

  Returns:
    The sigmoid of the model output as a Python float.
  """
  model.eval()

  # preprocess input text: truncate, then wrap in the [CLS] / [SEP] ids of the tokenizer
  # that was passed in (rather than ids stored in notebook globals)
  tokenized = tokenize_and_cut(text, tokenizer=tokenizer)
  indexed = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokenized) + [tokenizer.sep_token_id]
  # build the batch of one on the device the model's parameters live on
  model_device = next(model.parameters()).device
  tensor = torch.LongTensor(indexed).unsqueeze(0).to(model_device)

  # predict sentiment with model; gradients are not needed for inference
  with torch.no_grad():
    prob = model(tensor).sigmoid().item()
  return prob


# predict sentiment
# (pairs of near-identical sentences show how a negation shifts the score)
for sent in ["I'm so surprised by the movie", "I'm not so surprised by the movie", "I'm not surprised by the movie",
             "I'm so surprised by this thing", "I'm not so surprised by this thing", "I'm not surprised by this thing",
             "This film is terrible", "This film is great"]:
  prob = predict_sentiment(loaded_model, tokenizer, sent)
  # left-align the sentence in a 35-character column so the scores line up
  print(f'>>> {sent:<35} : {prob:.3f}')

>>> I'm so surprised by the movie       : 0.786
>>> I'm not so surprised by the movie   : 0.574
>>> I'm not surprised by the movie      : 0.498
>>> I'm so surprised by this thing      : 0.916
>>> I'm not so surprised by this thing  : 0.554
>>> I'm not surprised by this thing     : 0.528
>>> This film is terrible               : 0.024
>>> This film is great                  : 0.967


In [24]:
# CNN benchmark
# >>> I'm so surprised by the movie       : 0.506
# >>> I'm not so surprised by the movie   : 0.493
# >>> I'm not surprised by the movie      : 0.492

# >>> I'm so surprised by this thing      : 0.507
# >>> I'm not so surprised by this thing  : 0.494
# >>> I'm not surprised by this thing     : 0.486

# >>> This film is terrible               : 0.066
# >>> This film is great                  : 0.983