In [33]:
import torch
import pandas as pd

In [34]:
def check_gpu_set_device():
    try:
        if torch.cuda.is_available:
          print ('Number of GPUs is :',torch.cuda.device_count())
          print ('Name of GPU is :', torch.cuda.get_device_name())
          device = torch.device("cuda")
        else:
          device = torch.device("cpu")
    except : 
        print ('Exception occured while checking for GPU support..')
        device = torch.device("cpu")
    
    return device

device = check_gpu_set_device()

Number of GPUs is : 1
Name of GPU is : Tesla T4


In [35]:
!pip install transformers



In [36]:
!pip install wget



In [37]:
# download cola dataset
import wget
import os

wget.download('https://nyu-mll.github.io/CoLA/cola_public_1.1.zip')
#!unzip cola_public_1.1.zip -d .

'cola_public_1.1 (1).zip'

In [38]:
!unzip cola_public_1.1.zip -d .

Archive:  cola_public_1.1.zip
replace ./cola_public/README? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/README    
replace ./cola_public/tokenized/in_domain_dev.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/tokenized/in_domain_dev.tsv  
replace ./cola_public/tokenized/in_domain_train.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/tokenized/in_domain_train.tsv  
replace ./cola_public/tokenized/out_of_domain_dev.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/tokenized/out_of_domain_dev.tsv  
replace ./cola_public/raw/in_domain_dev.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/raw/in_domain_dev.tsv  
replace ./cola_public/raw/in_domain_train.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/raw/in_domain_train.tsv  
replace ./cola_public/raw/out_of_domain_dev.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./cola_public/raw/out_of_dom

In [39]:
!ls

 cola_public		    cola_public_1.1.zip     output_dir
'cola_public_1.1 (1).zip'   distil-bert-vocab.txt   sample_data


In [40]:

#load cola dataset in dataframe
df = pd.read_csv('/content/cola_public/raw/in_domain_train.tsv', delimiter='\t', header=None, names=['A','label','B','sentence'])

In [41]:
df.head()

Unnamed: 0,A,label,B,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [42]:
labels = df.label.values
sentences = df.sentence.values

In [43]:
print ('Number of sentences is ', len(sentences))
print ('Number of labels is ', len(labels))

Number of sentences is  8551
Number of labels is  8551


In [44]:
print ('max length of input sentence is:',max([len(s) for s in sentences]))

max length of input sentence is: 231


In [45]:
# Tokenization for BERT
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# print out the special tokens for [CLS], [SEP] and [MASK]
with open('distil-bert-vocab.txt','w') as f:
    for k,v in tokenizer.vocab.items():
        if v==100 or v==101 or v==102 or v==103:
          print (k,v)

input_sentences = []

for s in sentences:
  encoded = tokenizer.encode(s, 
                             add_special_tokens=True, 
                             max_length=64, 
                             pad_to_max_length=True)
  input_sentences.append(encoded)

assert len(input_sentences)==len(sentences)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[UNK] 100
[CLS] 101
[SEP] 102
[MASK] 103




In [46]:
print (input_sentences[100])

[101, 2065, 2017, 4521, 2062, 1010, 2017, 2215, 7978, 2135, 2625, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [47]:
# create attention mark 
attention_masks = []

for s in input_sentences:
  a_mask = []
  for t in s:
    if t>0:
      a_mask.append(1)
    else:
      a_mask.append(0)      
  attention_masks.append(a_mask)

print (attention_masks[0])
print (input_sentences[0])

print (len(attention_masks))
print (len(labels))
print (len(input_sentences))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8551
8551
8551


In [48]:

#split data in test and trian
from sklearn.model_selection import train_test_split

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_sentences, labels, test_size=0.1, random_state=42)
train_mask, validation_mask, _,_ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

In [49]:
# Convert data to Pytorch format
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_mask = torch.tensor(train_mask)
validation_mask = torch.tensor(validation_mask)

In [50]:

# create dataloaders for loading data in batches
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler

# create training dataloader
train_data = TensorDataset(train_inputs, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# create validation dataloader
validation_data = TensorDataset(validation_inputs, validation_mask, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

In [51]:

# Train our classification model
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                      num_labels=2,
                                                      output_hidden_states=False,
                                                      output_attentions=False
                                                      )
try:
    model.cuda()
except:
    print ('torch not compiled with cuda, ignoring.')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [52]:

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 2
total_steps = len(train_dataloader)*epochs
print ('Total number of steps are:', total_steps)

optimizer = AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-7)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps
                                            )

Total number of steps are: 482


In [53]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ssye
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [54]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [55]:
# Training 
import time
import numpy as np
import random

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_values = []
validation_accuracy_values = []

for epoch_num in range (0, epochs):

  print ('-----------------------')
  print ('-------Training--------')
  print ('-----------------------')
  
  print ('==========Epoch {:}/{:}============='.format(epoch_num+1, epochs))
  
  t0 = time.time()
  total_loss = 0

  #put model in training mode
  model.train()

  # for each batch of training data
  for step, batch in enumerate (train_dataloader):

    # report progress after every 100 steps
    if (step % 50==0):
      elapsedTime = time.time()-t0
      print ('\tBatch {:}/{:} in progress'.format(step, len(train_dataloader)))

    b_input_ids = batch[0].to(device)
    b_attention_ids = batch[1].to(device)
    b_labels = batch[2].to(device)

    #clear all previous gradients
    model.zero_grad()

    #we get loss in outputs
    outputs = model(b_input_ids,
                    attention_mask=b_attention_ids,
                    labels=b_labels)
    
    loss = outputs[0]
    total_loss+=loss.item()

    #this is where backpropogation happens
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
    optimizer.step()
    scheduler.step()
  
  average_loss = total_loss / len(train_dataloader)
  loss_values.append(average_loss)

  print ('')
  print ('\tAverage training loss {0:.2f}'.format(average_loss))
  print ('\tEpoch training time {:}'.format(format_time(time.time()-t0)))

  print ('\t-----------------------')
  print ('\t-------Validation--------')
  print ('\t-----------------------')

  model.eval()
  eval_accuracy = 0
  tv0 = time.time()

  for v_step, v_batch in enumerate(validation_dataloader):

    b_v_input_id = v_batch[0].to(device)
    b_v_attention_mask = v_batch[1].to(device)
    b_v_label = v_batch[2].to(device)

    with torch.no_grad():
      outputs = model(b_v_input_id,
                      attention_mask=b_v_attention_mask)
      
    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_v_label.cpu().numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy

  average_eval_accuracy = eval_accuracy / len(validation_dataloader)
  validation_accuracy_values.append(average_eval_accuracy)
  #print (average_eval_accuracy)
  print ('\tValidation accuracy {0:.2f}'.format(average_eval_accuracy))
  print ('\tValidation took {:}'.format(format_time(time.time()-tv0)))

print ('Training Complete!!')

-----------------------
-------Training--------
-----------------------
	Batch 0/241 in progress
	Batch 50/241 in progress
	Batch 100/241 in progress
	Batch 150/241 in progress
	Batch 200/241 in progress

	Average training loss 0.52
	Epoch training time 0:00:40
	-----------------------
	-------Validation--------
	-----------------------
	Validation accuracy 0.80
	Validation took 0:00:01
-----------------------
-------Training--------
-----------------------
	Batch 0/241 in progress
	Batch 50/241 in progress
	Batch 100/241 in progress
	Batch 150/241 in progress
	Batch 200/241 in progress

	Average training loss 0.29
	Epoch training time 0:00:41
	-----------------------
	-------Validation--------
	-----------------------
	Validation accuracy 0.80
	Validation took 0:00:01
Training Complete!!


In [56]:

#Save model to disk
import os

output_dir = './output_dir'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./output_dir/tokenizer_config.json',
 './output_dir/special_tokens_map.json',
 './output_dir/vocab.txt',
 './output_dir/added_tokens.json')

In [57]:
#load saved model from disk
t_model = DistilBertForSequenceClassification.from_pretrained(output_dir)
t_tokenizer = DistilBertTokenizer.from_pretrained(output_dir)

try:
    t_model.cuda()
except:
    print ('torch not compiled with cuda, ignore')

In [58]:
test_str = ['Home was gone by John']
for s in test_str:
  test_input_encoded = t_tokenizer.encode(s, add_special_tokens=True, max_length=64, pad_to_max_length=True)

# create attention mark 
test_attention_masks = []

for t in test_input_encoded:
  if t>0:
    test_attention_masks.append(1)
  else:
    test_attention_masks.append(0)      

test_input_tensor = torch.tensor(test_input_encoded).unsqueeze(0)
test_attention_masks_tensor = torch.tensor(test_attention_masks).unsqueeze(0)

test_input_tensor = test_input_tensor.to(device)
test_attention_masks_tensor = test_attention_masks_tensor.to(device)

t_model.eval()
with torch.no_grad():
  t_output=t_model(test_input_tensor)  

logits= t_output[0]
logits=logits.cpu().numpy()
print (logits)
print (np.argmax(logits,axis=1).flatten())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[[0.02040055 0.16851176]]
[1]


