In [0]:
import torch

In [2]:
# Select the compute device. `is_available` has to be *called*: the bare
# function object is always truthy, which would make the CPU branch unreachable
# and crash on machines without a GPU.
if torch.cuda.is_available():
  print ('Number of GPUs is :',torch.cuda.device_count())
  print ('Name of GPU is :', torch.cuda.get_device_name())
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

Number of GPUs is : 1
Name of GPU is : Tesla T4


In [3]:
# install transformers
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 3.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 14.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 20.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=154575b57827d3fb2bbabe

In [4]:
# install wget
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=b49a427c03bf1b878555177d7f6f359c12d585de359202b799baa31246201d68
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [5]:
# download cola dataset
import wget
import os

wget.download('https://nyu-mll.github.io/CoLA/cola_public_1.1.zip')
!unzip cola_public_1.1.zip -d .

Archive:  cola_public_1.1.zip
   creating: ./cola_public/
  inflating: ./cola_public/README    
   creating: ./cola_public/tokenized/
  inflating: ./cola_public/tokenized/in_domain_dev.tsv  
  inflating: ./cola_public/tokenized/in_domain_train.tsv  
  inflating: ./cola_public/tokenized/out_of_domain_dev.tsv  
   creating: ./cola_public/raw/
  inflating: ./cola_public/raw/in_domain_dev.tsv  
  inflating: ./cola_public/raw/in_domain_train.tsv  
  inflating: ./cola_public/raw/out_of_domain_dev.tsv  


In [0]:
import pandas as pd

# Read the raw in-domain training split into a dataframe. No header row is
# parsed (header=None), so the four column names are supplied explicitly.
df = pd.read_csv(
    './cola_public/raw/in_domain_train.tsv',
    sep='\t',
    header=None,
    names=['A', 'label', 'B', 'sentence'],
)

In [7]:
# Preview the first five rows of the dataframe.
df.head(n=5)

Unnamed: 0,A,label,B,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [0]:
# Extract the text column and the target column as NumPy arrays.
sentences = df['sentence'].values
labels = df['label'].values

In [9]:
# Sanity check: the two arrays should have the same number of entries.
sentence_count = len(sentences)
label_count = len(labels)
print('Number of sentences is ', sentence_count)
print('Number of labels is ', label_count)

Number of sentences is  8551
Number of labels is  8551


<img src="https://colab.research.google.com/drive/1Y4o3jh3ZH70tl6mCd76vz_IxX23biCPP#scrollTo=86C9objaKu8f"/>


The sentences in our dataset obviously have varying lengths, so how does BERT handle this?

BERT has two constraints:
1. All sentences must be padded or truncated to a single, fixed length.
2. The maximum sentence length is 512 tokens.

Padding is done with a special `[PAD]` token, which is at index 0 in the BERT vocabulary. The illustration below demonstrates padding out to a "MAX_LEN" of 8 tokens.

<img src="http://www.mccormickml.com/assets/BERT/padding_and_mask.png" width="600">

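The "Attention Mask" is simply an array of 1s and 0s indicating which tokens are real (1) and which are padding (0). It may look redundant, since padding already has token ID 0, but the model does not infer padding from the token IDs: the mask is what tells the self-attention layers to ignore the padded positions.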

I've experimented with running this notebook with two different values of MAX_LEN, and it impacted both the training speed and the test set accuracy.

With a Tesla K80 and:

```
MAX_LEN = 128  -->  Training epochs take ~5:28 each, score is 0.535
MAX_LEN = 64   -->  Training epochs take ~2:57 each, score is 0.566
```
These results suggest to me that the padding tokens aren't simply skipped over--that they are in fact fed through the model and incorporated in the results (thereby impacting both model speed and accuracy). I'll have to dig into the architecture more to understand this.






In [10]:
# Longest sentence measured with len() on the raw string, i.e. in characters
# rather than in tokens.
longest_sentence = max(len(sentence) for sentence in sentences)
print ('max length of input sentence is:', longest_sentence)

max length of input sentence is: 231


In [11]:
# Tokenization for BERT
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Print the vocabulary entries with ids 100-103, i.e. the special tokens
# [UNK], [CLS], [SEP] and [MASK]. No file needs to be opened for this; the
# tokens are only printed.
special_token_ids = (100, 101, 102, 103)
for k, v in tokenizer.vocab.items():
  if v in special_token_ids:
    print (k,v)

# Encode every sentence as a list of token ids: special tokens are added and
# each sequence is truncated/padded to exactly 64 ids.
input_sentences = []

for s in sentences:
  encoded = tokenizer.encode(s,
                             add_special_tokens=True,
                             max_length=64,
                             pad_to_max_length=True)
  input_sentences.append(encoded)

# One encoded sequence per input sentence.
assert len(input_sentences)==len(sentences)

[UNK] 100
[CLS] 101
[SEP] 102
[MASK] 103


In [12]:
# Show the token ids of one encoded sentence.
sample_index = 100
print (input_sentences[sample_index])

[101, 2065, 2017, 4521, 2062, 1010, 2017, 2215, 7978, 2135, 2625, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
# Create the attention masks: one mask per encoded sentence, holding 1 for
# every token id greater than 0 and 0 otherwise.
attention_masks = [
    [1 if token_id > 0 else 0 for token_id in token_ids]
    for token_ids in input_sentences
]

# Show the first mask next to the ids it was derived from.
print (attention_masks[0])
print (input_sentences[0])

# Masks, labels and encoded sentences should all have the same length.
for collection in (attention_masks, labels, input_sentences):
  print (len(collection))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8551
8551
8551


In [0]:
# Split the data into training and validation sets (90% / 10%).
from sklearn.model_selection import train_test_split

# Split token ids, attention masks and labels in a single call so the three
# stay row-aligned by construction, instead of relying on two separate calls
# using the same random_state.
(train_inputs, validation_inputs,
 train_mask, validation_mask,
 train_labels, validation_labels) = train_test_split(input_sentences,
                                                     attention_masks,
                                                     labels,
                                                     test_size=0.1,
                                                     random_state=99)

In [0]:
# Convert data to Pytorch format: every split is wrapped in a torch tensor
# and rebound to its original name.
train_inputs, train_labels, train_mask = map(
    torch.tensor, (train_inputs, train_labels, train_mask))

validation_inputs, validation_labels, validation_mask = map(
    torch.tensor, (validation_inputs, validation_labels, validation_mask))

In [0]:
# Build the DataLoaders that feed the model in batches of 32.
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler

# Training: (ids, mask, label) triples, drawn in random order.
train_data = TensorDataset(train_inputs, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data,
                              batch_size=32,
                              sampler=train_sampler)

# Validation: same triple layout, read sequentially.
validation_data = TensorDataset(validation_inputs, validation_mask, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(dataset=validation_data,
                                   batch_size=32,
                                   sampler=validation_sampler)


In [17]:
# Load the pretrained BERT model with a sequence-classification head.
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=2,
                                                      output_hidden_states=False,
                                                      output_attentions=False
                                                      )
# Move the model to the selected `device` (the same one the batches are sent to
# with `.to(device)`) instead of hard-coding CUDA.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [18]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 4
# The scheduler is stepped once per training batch, so the total number of
# steps is batches-per-epoch times the number of epochs.
total_steps = epochs * len(train_dataloader)
print ('Total number of steps are:', total_steps)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule over all training steps, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Total number of steps are: 964


In [0]:
import time
import datetime

def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second and rendered through
    ``datetime.timedelta``, so durations of a day or more are prefixed with
    the day count (e.g. ``'1 day, 1:00:00'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds:  2-D array of per-class scores, one row per example; the predicted
            class is the argmax along axis 1.
    labels: array of true class indices; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = predicted_classes == true_classes
    return np.sum(hits) / true_classes.size

In [21]:
# Training: fine-tune the model for `epochs` epochs and validate after each one.
import time
import numpy as np
import random

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_values = []                  # average training loss, one entry per epoch
validation_accuracy_values = []   # validation accuracy, one entry per epoch

for epoch_num in range (0, epochs):

  print ('-----------------------')
  print ('-------Training--------')
  print ('-----------------------')
  
  print ('Epoch {:}/{:}'.format(epoch_num+1, epochs))
  
  t0 = time.time()
  total_loss = 0

  # put model in training mode
  model.train()

  # for each batch of training data
  for step, batch in enumerate (train_dataloader):

    # report progress every 50 steps
    if (step % 50==0):
      print ('\tBatch {:}/{:} in progress'.format(step, len(train_dataloader)))

    # each batch is an (input ids, attention mask, labels) triple
    b_input_ids = batch[0].to(device)
    b_attention_ids = batch[1].to(device)
    b_labels = batch[2].to(device)

    # clear all previous gradients
    model.zero_grad()

    # labels are passed in, so the first output element is the loss
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_attention_ids,
                    labels=b_labels)
    
    loss = outputs[0]
    total_loss+=loss.item()

    # this is where backpropagation happens
    loss.backward()

    # clip the gradient norm to 1.0 before the parameter update
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
    optimizer.step()
    scheduler.step()
  
  average_loss = total_loss / len(train_dataloader)
  loss_values.append(average_loss)

  print ('')
  print ('\tAverage training loss {0:.2f}'.format(average_loss))
  print ('\tEpoch training time {:}'.format(format_time(time.time()-t0)))

  print ('\t-----------------------')
  print ('\t-------Validation--------')
  print ('\t-----------------------')

  model.eval()
  correct_predictions = 0   # number of correctly classified validation examples
  evaluated_examples = 0    # number of validation examples seen so far
  tv0 = time.time()

  for v_step, v_batch in enumerate(validation_dataloader):

    b_v_input_id = v_batch[0].to(device)
    b_v_attention_mask = v_batch[1].to(device)
    b_v_label = v_batch[2].to(device)

    # no labels are passed here, so the first output element is the logits
    with torch.no_grad():
      outputs = model(b_v_input_id,
                      token_type_ids=None,
                      attention_mask=b_v_attention_mask)
      
    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_v_label.cpu().numpy()

    # flat_accuracy returns a per-batch fraction. Weight it by the batch size so
    # a smaller final batch does not count as much as a full one in the average.
    examples_in_batch = len(label_ids)
    correct_predictions += flat_accuracy(logits, label_ids) * examples_in_batch
    evaluated_examples += examples_in_batch

  # max(..., 1) avoids a division by zero if the validation loader is empty
  average_eval_accuracy = correct_predictions / max(evaluated_examples, 1)
  validation_accuracy_values.append(average_eval_accuracy)
  print ('\tValidation accuracy {0:.2f}'.format(average_eval_accuracy))
  print ('\tValidation took {:}'.format(format_time(time.time()-tv0)))

print ('Training Complete!!')

-----------------------
-------Training--------
-----------------------
Epoch 1/4
	Batch 0/241 in progress
	Batch 50/241 in progress
	Batch 100/241 in progress
	Batch 150/241 in progress
	Batch 200/241 in progress

	Average training loss 0.49
	Epoch training time 0:01:31
	-----------------------
	-------Validation--------
	-----------------------
	Validation accuracy 0.83
	Validation took 0:00:04
-----------------------
-------Training--------
-----------------------
Epoch 2/4
	Batch 0/241 in progress
	Batch 50/241 in progress
	Batch 100/241 in progress
	Batch 150/241 in progress
	Batch 200/241 in progress

	Average training loss 0.30
	Epoch training time 0:01:40
	-----------------------
	-------Validation--------
	-----------------------
	Validation accuracy 0.85
	Validation took 0:00:04
-----------------------
-------Training--------
-----------------------
Epoch 3/4
	Batch 0/241 in progress
	Batch 50/241 in progress
	Batch 100/241 in progress
	Batch 150/241 in progress
	Batch 200/24

In [22]:
# save trained model and tokenizer on disk
import os

output_dir = './outputs_dir'
# exist_ok=True makes the cell safe to re-run and replaces the separate
# exists-then-mkdir check.
os.makedirs(output_dir, exist_ok=True)

# If the model has been wrapped in an object exposing `.module`, save the
# wrapped model rather than the wrapper.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./outputs_dir/vocab.txt',
 './outputs_dir/special_tokens_map.json',
 './outputs_dir/added_tokens.json')

In [24]:
# List the contents of the directory the model and tokenizer were saved to.
!ls -l ./outputs_dir/

total 427964
-rw-r--r-- 1 root root         2 Jan 26 17:33 added_tokens.json
-rw-r--r-- 1 root root       682 Jan 26 17:33 config.json
-rw-r--r-- 1 root root 437983349 Jan 26 17:33 pytorch_model.bin
-rw-r--r-- 1 root root       112 Jan 26 17:33 special_tokens_map.json
-rw-r--r-- 1 root root        58 Jan 26 17:33 tokenizer_config.json
-rw-r--r-- 1 root root    231508 Jan 26 17:33 vocab.txt


In [26]:
# load saved model and tokenizer from disk
t_model = BertForSequenceClassification.from_pretrained(output_dir)
t_tokenizer = BertTokenizer.from_pretrained(output_dir)
# Move the reloaded model to the selected `device` (the inference inputs are
# sent there with `.to(device)`) instead of hard-coding CUDA.
t_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [77]:
# Sentences to classify with the reloaded model.
test_str = ['english goes to me']

# Encode every test sentence the same way as the training data: special tokens
# added, truncated/padded to 64 ids. All sentences are kept, not just the last.
test_input_encoded = [
    t_tokenizer.encode(s, add_special_tokens=True, max_length=64, pad_to_max_length=True)
    for s in test_str
]

# create attention masks: 1 for every token id greater than 0, 0 otherwise
test_attention_masks = [
    [1 if t > 0 else 0 for t in encoded_ids]
    for encoded_ids in test_input_encoded
]

# One row per sentence, so the list of equal-length lists already provides the
# batch dimension.
test_input_tensor = torch.tensor(test_input_encoded).to(device)
test_attention_masks_tensor = torch.tensor(test_attention_masks).to(device)

t_model.eval()
with torch.no_grad():
  # Pass the attention mask, exactly as in training and validation. Without it
  # the model receives no information about which positions are padding.
  t_output = t_model(test_input_tensor,
                     token_type_ids=None,
                     attention_mask=test_attention_masks_tensor)

logits = t_output[0]
logits = logits.cpu().numpy()

# Print one predicted class index per input sentence.
for predicted_label in np.argmax(logits, axis=1):
  print (predicted_label)

1
