<a href="https://colab.research.google.com/github/shraddha-an/nlp/blob/main/bert_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BERT Model with Hugging Face Transformers Library**

# **1) Installation**

In [None]:
# Installations
!pip install transformers

In [None]:
# Importing libraries
# Data Handling
import pandas as pd, numpy as np

# Visualization
import seaborn as sb, matplotlib.pyplot as plt

# NLP preprocess
from gensim.utils import simple_preprocess

In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

if not gpu_found:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# **2) Data Preprocessing**

In [None]:
# Importing data
def _load_questions(csv_path):
    """Read `csv_path`, keep the 'Body' and 'Y' columns and rename them to 'questions' and 'category'."""
    frame = pd.read_csv(csv_path)
    return frame[['Body', 'Y']].rename(columns = {'Body': 'questions', 'Y': 'category'})

dataset = _load_questions('train.csv')
ds = _load_questions('valid.csv')


In [None]:
# Text clean-up, X/y split and label encoding
from sklearn.preprocessing import LabelEncoder


def _normalize(text):
    """Tokenize `text` with gensim's `simple_preprocess` and re-join the tokens with single spaces."""
    return ' '.join(simple_preprocess(text))


# Column 0 holds the question text in both frames; overwrite it with the cleaned version.
for frame in (dataset, ds):
    frame.iloc[:, 0] = frame.iloc[:, 0].apply(_normalize)

# Column 0 -> features (text), column 1 -> target (category)
X_train, y_train = dataset.iloc[:, 0], dataset.iloc[:, 1]
X_test, y_test = ds.iloc[:, 0], ds.iloc[:, 1]

# Fit the encoder on the training labels only and reuse the same mapping for the test labels.
enc = LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.transform(y_test)

X_train.head(7), X_test.tail(9)

(0    already familiar with repeating tasks every se...
 1    like to understand why java optionals were des...
 2    am attempting to overlay title over an image w...
 3    the question is very simple but just could not...
 4    using custom need to implement scale animation...
 5    have defined integer pointer inside the struct...
 6    want to disable nd saturday th saturday sunday...
 Name: questions, dtype: object,
 14991    have tried googling this but no luck basically...
 14992    accidentally disabled the public checkbox on p...
 14993    had bunch of files called png png png png used...
 14994    this is numerical tic tac toe game try to make...
 14995    have menu and like the div right controls and ...
 14996    try to multiply an integer by double but obtai...
 14997    urls py urls py file from django contrib impor...
 14998    have controller inside which server is connect...
 14999    so was recently helping someone out with some ...
 Name: questions, dtype: object)

# **3) BERT Tokenizer Fast**

In [None]:
# Loading the BERT fast tokenizer for the chosen checkpoint.
# NOTE: only the tokenizer is loaded in this cell; `TFBertModel` is imported but the
# line that would instantiate it is commented out below.
from transformers import BertTokenizerFast, TFBertModel

# Checkpoint name; `model_name` is reused when the classifier is loaded further down.
model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Disabled: TensorFlow variant of the base model (not used by the PyTorch pipeline in this notebook).
#model = TFBertModel.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Setting the max length of our sequences: longer inputs are truncated,
# shorter ones are padded up to this many tokens.
max_len = 100


def _encode(texts):
    """Batch-tokenize a list of strings into fixed-length BERT inputs.

    Uses `padding = 'max_length'`, the replacement for the deprecated
    `pad_to_max_length = True` flag, and calls the tokenizer directly instead of
    going through `batch_encode_plus`.

    Returns the tokenizer output, which exposes 'input_ids' and 'attention_mask'.
    """
    return bert_tokenizer(texts,
                          max_length = max_len,
                          padding = 'max_length',
                          truncation = True)


# Batch tokenizing
train_tokens = _encode(X_train.tolist())
test_tokens = _encode(X_test.tolist())



In [None]:
# Turning the token ids, attention masks and encoded labels into torch tensors
import torch


def _to_tensors(encoded, labels):
    """Return (input ids, attention mask, labels) as three torch tensors."""
    ids = torch.tensor(encoded['input_ids'])
    mask = torch.tensor(encoded['attention_mask'])
    return ids, mask, torch.tensor(labels.tolist())


train_sequence, train_mask, train_y = _to_tensors(train_tokens, y_train)
test_sequence, test_mask, test_y = _to_tensors(test_tokens, y_test)

In [None]:
# Training data loader: feeds the model batches drawn in random order
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Number of examples per batch (also reused by the test loader)
batch_size = 16

# Each dataset item is the triple (input ids, attention mask, label)
train_data = TensorDataset(train_sequence, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train = DataLoader(dataset = train_data, batch_size = batch_size, sampler = train_sampler)

In [None]:
# Looking at 1 example of the TensorDataset: a tuple of
# (input ids, attention mask, label) for the first training question.
train_data[0]

(tensor([  101,  2525,  5220,  2007, 15192,  8518,  2296,  3823,  2011,  2478,
          9262, 21183,  4014, 25309,  1998,  9262, 21183,  4014, 25309, 10230,
          2243,  2021, 11082,  2360,  2215,  2000,  6140,  7592,  2088,  2000,
          1996, 10122,  2296,  6721,  3823,  2013,  6854,  1999,  2978,  1997,
          5481,  1998,  2123,  2031,  2151,  3642,  2000,  2265,  2061,  2521,
          2151,  2393,  2052,  2022, 19804, 24108,  3064,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0

# **4) Model Architecture**

In [None]:
# Loading the BERT model with a sequence-classification head
from transformers import BertForSequenceClassification

# num_labels = 3: one output logit per question category.
# Attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 3,
                                      output_attentions = False, output_hidden_states = False)

# Move the model to the device selected at the top of the notebook.
# `model.cuda()` would raise on a CPU-only machine even though the notebook
# sets up a CPU fallback; `.to(device)` works in both cases.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Collect the model's (name, tensor) parameter pairs and print the shapes of three slices.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# One fixed-width row per parameter: name on the left, shape on the right.
row_template = "{:<55} {:>12}"

# (heading, slice of the parameter list) for each group that gets printed
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, entries in sections:
    print(heading)
    for name, tensor in entries:
        print(row_template.format(name, str(tuple(tensor.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [None]:
# Setting Optimizer & Learning Rate parameters
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation and is
# no longer available in newer releases, so import it from torch instead.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# weight_decay = 0.0 keeps the default of the former `transformers.AdamW`
# (torch's AdamW would otherwise default to 0.01).
optimizer = AdamW(params = model.parameters(), lr = 2e-5, eps = 1e-8, weight_decay = 0.0)

# Epochs
epochs = 2

# Setting total no of training steps = no of batches * epochs
train_steps = len(train) * epochs

# Creating the learning rate scheduler: linear schedule over all training steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps = train_steps, num_warmup_steps = 0)


In [None]:
# Number of parameter tensors held by the model
print(len(list(model.parameters())))

201


In [None]:
# Calculate elapsed time
import time
import datetime
def format_time(elapsed):
    """Render a duration given in seconds as a clock-style string.

    The value is rounded to a whole number of seconds and formatted through
    `datetime.timedelta`, e.g. 3661 -> '1:01:01'.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds = whole_seconds)
    return str(duration)

# **5) Training**

In [None]:
import random

# The training code below is based on the `run_glue.py` script.
# Use one seed for every random number generator involved so runs are reproducible.
seed_val = 0
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Average training loss of each epoch, collected for plotting.
loss_values = []


In [None]:
# Fine-tune for `epochs` full passes over the training loader and record the
# average loss of every pass in `loss_values`.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be misled -- the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train):
        # Progress update every 40 batches (skipping the very first step).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train), elapsed))

        # Copy the batch to the selected device and unpack it (same pattern as
        # the evaluation loop). `batch` holds three tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step; PyTorch accumulates
        # them across backward passes unless they are reset explicitly.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Forward pass. Because `labels` is provided, the first element of the
        # model output is the loss for this batch.
        # Documentation for this `model` call:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # Index 0 works for both tuple-style and structured model outputs.
        loss = outputs[0]

        # Accumulate the loss as a plain Python number so the epoch average can
        # be computed after the last batch.
        total_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        


Training...
  Batch    40  of  2,813.    Elapsed: 0:00:12.
  Batch    80  of  2,813.    Elapsed: 0:00:23.
  Batch   120  of  2,813.    Elapsed: 0:00:35.
  Batch   160  of  2,813.    Elapsed: 0:00:47.
  Batch   200  of  2,813.    Elapsed: 0:00:59.
  Batch   240  of  2,813.    Elapsed: 0:01:12.
  Batch   280  of  2,813.    Elapsed: 0:01:24.
  Batch   320  of  2,813.    Elapsed: 0:01:36.
  Batch   360  of  2,813.    Elapsed: 0:01:49.
  Batch   400  of  2,813.    Elapsed: 0:02:01.
  Batch   440  of  2,813.    Elapsed: 0:02:14.
  Batch   480  of  2,813.    Elapsed: 0:02:26.
  Batch   520  of  2,813.    Elapsed: 0:02:38.
  Batch   560  of  2,813.    Elapsed: 0:02:51.
  Batch   600  of  2,813.    Elapsed: 0:03:03.
  Batch   640  of  2,813.    Elapsed: 0:03:15.
  Batch   680  of  2,813.    Elapsed: 0:03:28.
  Batch   720  of  2,813.    Elapsed: 0:03:40.
  Batch   760  of  2,813.    Elapsed: 0:03:52.
  Batch   800  of  2,813.    Elapsed: 0:04:05.
  Batch   840  of  2,813.    Elapsed: 0:04:17.


In [None]:
# Line chart of the average training loss recorded after each epoch
import plotly.express as px

# One row per epoch; the row index serves as the x axis.
f = pd.DataFrame(loss_values)
f.columns = ['Loss']

layout_options = dict(title = 'Training loss of the Model',
                      xaxis_title = 'Epoch',
                      yaxis_title = 'Loss')

fig = px.line(f, x = f.index, y = f['Loss'])
fig.update_layout(**layout_options)
fig.show()

# **6) Evaluating on Test Set**

In [None]:
# Test data loader: iterates over the test tensors in their original order
from torch.utils.data import SequentialSampler

# Each dataset item is the triple (input ids, attention mask, label)
test_data = TensorDataset(test_sequence, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test = DataLoader(dataset = test_data, batch_size = batch_size, sampler = test_sampler)

In [None]:
# Number of batches in the test loader times the batch size. This is an upper bound on
# the number of test examples, not the exact count, because the last batch may be partial.
print(len(test) * batch_size)

15008


In [None]:
# Run the fine-tuned model over the test loader and collect raw logits and labels
print('Predicting labels for {:,} test sentences...'.format(len(test_sequence)))

# Evaluation mode for inference
model.eval()

# One array of logits / labels is appended per batch
predictions, true_labels = [], []

for batch in test:
    # Move the three tensors of the batch to the selected device, then unpack them
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for prediction; skipping them saves memory and time
    with torch.no_grad():
        # No labels are passed, so the first output element holds the logits
        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask)
        logits = outputs[0]

    # Bring logits and labels back to the CPU as numpy arrays and store them
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions.append(logits)
    true_labels.append(label_ids)

print('DONE.')

Predicting labels for 15,000 test sentences...
DONE.


# **7) Metrics**

In [None]:
from sklearn.metrics import matthews_corrcoef
matthews_set = []

# Per-batch Matthews correlation coefficient.
# `predictions[i]` is a (batch, num_labels) array of logits; the predicted class of
# each example is the index of its largest logit.
# NOTE(review): the saved output of this cell shows an "invalid value encountered"
# warning, presumably from small batches in which only one class occurs (zero
# denominator inside the MCC computation). numpy floating-point warnings are silenced
# for this loop only; the computed values are unchanged.
with np.errstate(divide = 'ignore', invalid = 'ignore'):
    for i in range(len(true_labels)):
        pred_labels_i = np.argmax(predictions[i], axis=1).flatten()

        # Calculate and store the coefficient for this batch.
        matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
        matthews_set.append(matthews)

# Stack the per-batch logits into one (n_examples, num_labels) array and take the
# arg-max per row to get a flat array of predicted class ids.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)

# Stack the per-batch label arrays into one flat array of true class ids.
flat_true_labels = np.concatenate(true_labels, axis=0)

# Calculate the MCC over the whole test set
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('MCC: %.3f' % mcc)


invalid value encountered in double_scalars



MCC: 0.769


In [None]:
# Accuracy Score
from sklearn.metrics import accuracy_score as acc

print(acc(flat_true_labels, flat_predictions))
#len(flat_predictions)


0.8458
