<a href="https://colab.research.google.com/github/rajibmondal/BERT-Fine-Tuning-with-PyTorch-for-Sentence-classification/blob/master/Sentence_Classification_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))


Found GPU at: /device:GPU:0


In [0]:
!pip install pytorch-pretrained-bert pytorch-nlp


Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 3.5MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/df/ae/b6d18c3f37da5a78e83701469e6153811f4b0ecb3f9387bb3e9a65ca48ee/pytorch_nlp-0.4.1-py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 29.3MB/s 
[?25hCollecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/60/d9782c56ceefa76033a00e1f84cd8c586c75e6e7fea2cd45ee8b46a386c5/regex-2019.08.19-cp36-cp36m-manylinux1_x86_64.whl (643kB)
[K     |████████████████████████████████| 645kB 39.7MB/s 
Installing collected packages: regex, pytorch-pretrained-bert, pytorch-nlp
Successfully installed pytorch-nlp-0.4.1 pytorch-pretrained-bert-0.6.2 regex-2019.8.19


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline


Using TensorFlow backend.


In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)


'Tesla K80'

In [0]:
# Upload the train file from your local drive
from google.colab import files
uploaded = files.upload()


Saving in_domain_train.tsv to in_domain_train.tsv


In [0]:
df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])


In [0]:
df.shape

(8551, 4)

In [0]:
df.sample(10)

Unnamed: 0,sentence_source,label,label_notes,sentence
103,cj99,0,*,"The most you want, the least you eat."
6336,c_13,1,,John loves himself.
1267,r-67,0,*,What table will he put the chair between and s...
2902,l-93,1,,I detached the handle.
3190,l-93,1,,Brutus murdered Julius Caesar.
5309,b_82,0,*,"Because she's so pleasant, Mary I really like ..."
8424,ad03,1,,I am to eat macaroni.
8274,ad03,0,*,He kicked yourself
3760,ks08,1,,John gave the boys the CDs.
6270,c_13,1,,Susan begged Bill to let her sing in the concert.


In [0]:
#create a sentence and lebel lists
sentences = df.sentence.values
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values


In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sente) for sente in sentences]
print('tokenizer the first sentence:',tokenized_texts[0])



tokenizer the first sentence: ['[CLS]', 'our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


In [0]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

In [0]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids_without_padding = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [0]:
#pad our input tokens
input_ids = pad_sequences(input_ids_without_padding, maxlen = MAX_LEN, dtype='long', truncating='post', padding='post')
input_ids.shape

(8551, 128)

In [0]:
#create attention masks
attention_masks = []

#Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [0]:
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=123, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state = 123, test_size=0.1)

In [0]:
print('train_inputs:',train_inputs.shape)
print('validation_inputs:',validation_inputs.shape)
print('train_labels:',train_labels.shape)
print('validation_labels:',validation_labels.shape)
print('train_masks:',train_masks.shape)
print('validation_masks:',validation_masks.shape)

train_inputs: (7695, 128)
validation_inputs: (856, 128)
train_labels: (7695,)
validation_labels: (856,)


AttributeError: ignored

In [0]:
#convert all our data into torch tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


In [0]:
print('train_inputs:',train_inputs.shape)
print('validation_inputs:',validation_inputs.shape)
print('train_labels:',train_labels.shape)
print('validation_labels:',validation_labels.shape)
print('train_masks:',train_masks.shape)
print('validation_masks:',validation_masks.shape)

train_inputs: torch.Size([7695, 128])
validation_inputs: torch.Size([856, 128])
train_labels: torch.Size([7695])
validation_labels: torch.Size([856])
train_masks: torch.Size([7695, 128])
validation_masks: torch.Size([856, 128])


In [0]:
#select the batch size for training For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memor

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


###Train Model

In [0]:
#Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

100%|██████████| 407873900/407873900 [00:07<00:00, 52708730.47B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
 'weight_decay_rate':0.01},
{'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
 'weight_decay_rate':0.00}]

In [0]:
# This variable contains all of the hyperparemeter information our training loop needs

optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)

t_total value of -1 results in schedule not being applied


In [0]:
#Function to calculate the accuracy of our predictions vs lebel

def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat)/len(labels_flat)

In [0]:
#store our losses an accuracy for plotting
train_loss_set = []
#number of training epochs
epochs = 2

#trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  
  #Training

  #set our model to training mode(as opposesd to evaluation mode)
  model.train()

  #Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0,0


  #train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    #add batch to gpu
    batch = tuple(t.to(device) for t in batch)
    #unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    #clear out the gradients(by default they accumalate)
    optimizer.zero_grad()
    
    #Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask = b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())
    
    #Backward pass
    loss.backward()
    
    #update parametersnand take a step using the computed gradient
    optimizer.step()
    
    #Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    
  print("Train loss:{}".format(tr_loss/nb_tr_steps))
  
  
  
  
  
  #validation

  #put our model to evaluation mode to evaluate loss on the validation
  model.eval()

  #Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0,0


  #train the data for one epoch
  for batch in validation_dataloader:
    #add batch to gpu
    batch = tuple(t.to(device) for t in batch)
    #unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    ##telling the model not to compute or store gradients
    with torch.no_grad():
      
      #Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask = b_input_mask)
    
    #Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    tmp_eval_accuracy = flat_accuracy(logits,label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
  
  print('-' *80)
  print("validation Accuracy:{}".format(eval_accuracy/nb_eval_steps))
    



Epoch:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Train loss:0.2575856793934626




Epoch:  50%|█████     | 1/2 [06:25<06:25, 385.02s/it][A[A

--------------------------------------------------------------------------------
validation Accuracy:0.8541666666666666
Train loss:0.11756880803968897




Epoch: 100%|██████████| 2/2 [12:49<00:00, 384.99s/it][A[A

[A[A

--------------------------------------------------------------------------------
validation Accuracy:0.8530092592592593


In [1]:
print('hello')

hello
