# Transfer Learning with a Transformer-based BERT Model

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from tqdm import tqdm
import re
import numpy as np
import torch.nn as nn
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, roc_curve, auc
import torch.nn.functional as F

In [2]:
# TRAINING HYPERPARAMETERS
# Directory that holds the preprocessed train / validation CSV files
PROCESSED_DATASET = "./processed_dataset"
# Hardcoded after determining the length (presumably that of the longest
# tokenized tweet - TODO confirm), due to memory issues
MAX_LENGTH = 103
# Number of samples per DataLoader batch
BATCH_SIZE = 8
# Number of passes over the training data
EPOCHS = 2
# Directory the per-epoch model checkpoints are written to
MODEL_DIR = "./models"
# Number of CSV rows read per chunk for the training and validation files
TRAIN_CHUNK = 100000
VAL_CHUNK = 20000
# Learning rate handed to the AdamW optimizer
LRN_RATE = 5e-5
# change the argument to "cpu" to run on CPU
DEVICE = torch.device("cuda")

In [3]:
# Clean up the raw tweet text before tokenizing it with BERT

def text_cleanup(tweet):
    """Normalise a raw tweet before it is handed to the BERT tokenizer.

    Three passes are applied in order:
      1. every ``@mention`` is replaced by a single space,
      2. the HTML entity ``&amp;`` is decoded to ``&``,
      3. runs of whitespace are collapsed to one space and the ends stripped.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet as a new string.
    """
    # A mention runs up to the next whitespace OR the end of the string.
    # Requiring trailing whitespace (as before) left a mention that closes
    # the tweet untouched.
    tweet = re.sub(r'@\S*(?:\s|$)', ' ', tweet)
    tweet = re.sub(r'&amp;', '&', tweet)
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

## Bert Tokenizer 

In [4]:
# Tokenizer for the 'bert-base-uncased' checkpoint; do_lower_case=True asks it
# to lower-case the text. Shared by tokenization_function below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [5]:
def tokenization_function(data_frame, col_name):
    """Tokenize a text column with BERT and return model-ready tensors.

    Rows whose text is not a string (e.g. NaN) are skipped. Every remaining
    tweet is cleaned with ``text_cleanup`` and encoded by the tokenizer, which
    truncates and pads each sequence to the module-level ``MAX_LENGTH``.
    Previously a local variable shadowed ``MAX_LENGTH`` and no ``max_length``
    was passed to the tokenizer, so the configured cap was never applied.

    Args:
        data_frame: DataFrame holding the text in ``col_name`` and the label
            in its second column (read positionally).
        col_name: Name of the column that contains the tweet text.

    Returns:
        A tuple ``(ids, masks, labels)`` of torch tensors: token ids and
        attention masks, one row of ``MAX_LENGTH`` entries per kept tweet,
        and the matching labels.
    """
    ids = []
    masks = []
    labels = []

    # Walk text and label in lockstep instead of a per-row iloc lookup.
    for tweet, label in zip(data_frame[col_name], data_frame.iloc[:, 1]):
        if not isinstance(tweet, str):
            continue
        tweet_encoder = bert_tokenizer.encode_plus(
            text=text_cleanup(tweet),
            add_special_tokens=True,
            return_attention_mask=True,
            truncation=True,
            max_length=MAX_LENGTH,
            # Replaces the deprecated pad_to_max_length=True flag.
            padding='max_length',
        )
        ids.append(tweet_encoder['input_ids'])
        masks.append(tweet_encoder['attention_mask'])
        labels.append(label)

    # Convert masks, ids and labels to torch tensors
    return torch.tensor(ids), torch.tensor(masks), torch.tensor(labels)

In [6]:
def create_dataloader(df_train_chunk, df_val_chunk):
    """Build the training and validation DataLoaders for one pair of chunks.

    Both chunks are tokenized from their "Processed_text" column. Training
    batches are drawn in random order. Validation batches are served in
    dataset order so that per-sample outputs (e.g. from ``predictor``) line
    up with the labels of the source chunk.

    Args:
        df_train_chunk: DataFrame chunk used for training.
        df_val_chunk: DataFrame chunk used for validation.

    Returns:
        A tuple ``(tr_dataloader, val_dataloader)``; both use ``BATCH_SIZE``.
    """
    train_input_seq, train_mask, train_labels = tokenization_function(df_train_chunk, "Processed_text")
    val_input_seq, val_mask, val_labels = tokenization_function(df_val_chunk, "Processed_text")

    # Training Set Dataloader (shuffled)
    tr_tensors = TensorDataset(train_input_seq, train_mask, train_labels)
    tr_sampler = RandomSampler(tr_tensors)
    tr_dataloader = DataLoader(tr_tensors, sampler=tr_sampler, batch_size=BATCH_SIZE)

    # Validation Set Dataloader (fixed order; shuffling adds nothing to
    # evaluation and breaks alignment between predictions and labels)
    val_tensors = TensorDataset(val_input_seq, val_mask, val_labels)
    val_sampler = SequentialSampler(val_tensors)
    val_dataloader = DataLoader(val_tensors, sampler=val_sampler, batch_size=BATCH_SIZE)

    return tr_dataloader, val_dataloader

In [7]:
# Sentiment classifier class

class Sentiment_Classifier(nn.Module):
    """Pretrained BERT encoder topped with a small feed-forward head."""

    def __init__(self, freeze_model=False):
        """Load 'bert-base-uncased' and attach the classification head.

        Args:
            freeze_model: When True, gradients are switched off for every
                BERT parameter so only the head is trained.
        """
        super().__init__()

        # Encoder output width, head hidden width, number of classes
        bert_width, head_width, n_classes = 768, 50, 2

        # Pretrained encoder
        self.Bert_model = BertModel.from_pretrained('bert-base-uncased')

        # Head: Linear -> ReLU -> Linear
        self.classifier = nn.Sequential(
            nn.Linear(bert_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, n_classes),
        )

        if not freeze_model:
            return

        # Keep the encoder weights fixed
        for weight in self.Bert_model.parameters():
            weight.requires_grad = False

    def forward(self, ids, masks):
        """Return the classifier logits for a batch.

        Args:
            ids: Token id tensor passed to BERT as ``input_ids``.
            masks: Attention mask tensor passed to BERT as ``attention_mask``.

        Returns:
            The output of the classification head for each sample.
        """
        encoder_out = self.Bert_model(input_ids=ids, attention_mask=masks)

        # First element of the encoder output, taken at sequence position 0
        # (the [CLS] slot when the tokenizer added special tokens).
        first_token_state = encoder_out[0][:, 0, :]

        return self.classifier(first_token_state)

In [8]:
def model_initializer(tr_dataloader):
    """Create the classifier, its optimizer and the learning-rate schedule.

    Args:
        tr_dataloader: Training DataLoader; its length times ``EPOCHS`` gives
            the total number of scheduler steps.

    Returns:
        A tuple ``(classifier, optimizer, lr_scheduler)``.
    """
    # Default construction leaves freeze_model False, so BERT stays trainable
    net = Sentiment_Classifier()
    net.to(DEVICE)

    # AdamW over every model parameter
    adam = AdamW(net.parameters(), lr=LRN_RATE, eps=1e-8)

    # Linear schedule without warm-up, spanning all training steps
    total_steps = EPOCHS * len(tr_dataloader)
    schedule = get_linear_schedule_with_warmup(
        adam,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    return net, adam, schedule

def train(model, tr_loader, val_loader, epochs, optimizer=None, scheduler=None):
    """Fine-tune ``model`` and evaluate it after every epoch.

    For each epoch the model is trained on ``tr_loader`` with cross-entropy
    loss and gradient-norm clipping at 1.0, evaluated on ``val_loader`` via
    ``evaluate``, and saved to ``MODEL_DIR`` as ``model_<epoch>.pth``.

    Args:
        model: The network to train; called as ``model(ids, masks)``.
        tr_loader: DataLoader yielding ``(ids, masks, labels)`` training batches.
        val_loader: DataLoader passed to ``evaluate`` after each epoch.
        epochs: Number of passes over ``tr_loader``.
        optimizer: Optimizer to step. When omitted, the module-level
            ``optimizer`` is used (the behaviour of earlier versions).
        scheduler: Learning-rate scheduler to step after each batch. When
            omitted, the module-level ``scheduler`` is used.

    Returns:
        A tuple ``(train_loss, val_loss, val_acc)``: the mean training loss
        of each reporting window (500 batches, or the tail of an epoch), the
        validation loss per epoch and the validation accuracy per epoch.
    """
    # Backward compatibility: older callers relied on notebook globals.
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if scheduler is None:
        scheduler = globals()["scheduler"]

    print("Start training...\n")
    train_loss = []
    val_loss = []
    val_acc = []

    loss_func = nn.CrossEntropyLoss()
    # Use the loader that was passed in, not a same-named global.
    n_batches = len(tr_loader)
    # torch.save does not create missing directories.
    os.makedirs(MODEL_DIR, exist_ok=True)

    for epoch in tqdm(range(epochs)):

        acc_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        # Iterate over each batch and start training
        for step, batch in enumerate(tr_loader):
            batch_counts += 1
            # Transfer the batch data to the target device
            batch_ids, batch_masks, batch_labels = tuple(t.to(DEVICE) for t in batch)
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(batch_ids, batch_masks)
            # Compute loss and accumulate it for the window and the epoch
            loss = loss_func(logits, batch_labels)
            loss_value = loss.item()
            batch_loss += loss_value
            acc_loss += loss_value

            # Back Propagation
            loss.backward()

            # Prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update training parameters
            optimizer.step()
            scheduler.step()

            # Report after every 500 batches and at the end of the epoch
            if ((step + 1) % 500 == 0) or ((step + 1) == n_batches):
                window_avg = batch_loss / batch_counts
                print(f"Epoch : {epoch + 1} | Step : {step+1} | Train Loss : {window_avg}")

                # Record the same mean that is printed, so windows of
                # different sizes stay comparable, then reset the window.
                train_loss.append(window_avg)
                batch_loss, batch_counts = 0, 0

        # Average training loss (entire training set); skipped for an empty loader
        if n_batches:
            avg_train_loss = acc_loss / n_batches
            print(f"Epoch : {epoch + 1} | Average Train Loss : {avg_train_loss}")

        # Evaluate
        print(f"Evaluation after {epoch + 1} : ")
        avg_val_loss, avg_val_acc = evaluate(model, val_loader)
        val_loss.append(avg_val_loss)
        # Accuracy goes to its own list (it used to be appended to val_loss).
        val_acc.append(avg_val_acc)

        print(f"Saving the model after epoch : {epoch+1}")
        torch.save(model, os.path.join(MODEL_DIR, f"model_{epoch+1}.pth"))

    print("Training complete!")
    return train_loss, val_loss, val_acc
    

def evaluate(model, val_loader):
    """Compute the mean loss and accuracy of ``model`` over ``val_loader``.

    Metrics are accumulated per sample, so a final batch that is smaller than
    the others does not get more weight than it deserves (averaging the
    per-batch means, as done previously, over-weights it).

    Args:
        model: The network to evaluate; called as ``model(ids, masks)``.
            It is switched to eval mode and left in that mode.
        val_loader: DataLoader yielding ``(ids, masks, labels)`` batches.

    Returns:
        A tuple ``(avg_val_loss, avg_val_acc)``: the mean cross-entropy loss
        per sample and the accuracy in percent. Both are NaN when
        ``val_loader`` yields no samples.
    """
    model.eval()

    # Sum (not mean) per batch; divided by the sample count at the end.
    loss_func = nn.CrossEntropyLoss(reduction="sum")
    total_loss, total_correct, total_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            # Load batch to the target device
            batch_ids, batch_masks, batch_labels = tuple(t.to(DEVICE) for t in batch)

            # Compute logits and the summed loss of this batch
            logits = model(batch_ids, batch_masks)
            total_loss += loss_func(logits, batch_labels).item()

            # Count correct predictions
            pred = torch.argmax(logits, dim=1)
            total_correct += (pred == batch_labels).sum().item()
            total_seen += batch_labels.size(0)

    if total_seen:
        avg_val_loss = total_loss / total_seen
        avg_val_acc = 100.0 * total_correct / total_seen
    else:
        avg_val_loss = avg_val_acc = float("nan")

    print(f"The Average validation set val loss : {avg_val_loss}")
    print(f"The Average validation set val acc : {avg_val_acc}")
    return avg_val_loss, avg_val_acc

def predictor(model, data_loader):
    """Return per-class probabilities for every sample in ``data_loader``.

    Args:
        model: The network to run; called as ``model(ids, masks)``.
        data_loader: DataLoader whose batches start with ``(ids, masks)``;
            any further tensors in a batch are ignored.

    Returns:
        A NumPy array with one row of softmax probabilities per sample.
    """
    model.eval()

    collected = []
    with torch.no_grad():
        for batch in data_loader:
            # Only the first two tensors of the batch feed the model
            on_device = [t.to(DEVICE) for t in batch]
            ids, masks = on_device[:2]
            collected.append(model(ids, masks))

    # Stack the batches, then turn the logits into probabilities
    stacked = torch.cat(collected, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()
    

def roc_plot(probs, y_labels):
    """Print the AUC and draw the ROC curve for the class in column 1.

    Args:
        probs: 2-D array of class probabilities; column 1 is used as the score.
        y_labels: True labels passed to ``roc_curve``.
    """
    positive_scores = probs[:, 1]
    fpr, tpr, _ = roc_curve(y_labels, positive_scores)
    area = auc(fpr, tpr)
    print(f"AUC: {area:.4f}")

    # ROC curve with the AUC in the legend
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='upper left')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'r--')

    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

### Train and Evaluate model 

In [9]:
# Open both CSV files as chunked readers
train_csv = os.path.join(PROCESSED_DATASET, "train_set_labels.csv")
val_csv = os.path.join(PROCESSED_DATASET, "val_set_labels.csv")
df_train = pd.read_csv(train_csv, chunksize=TRAIN_CHUNK)
df_val = pd.read_csv(val_csv, chunksize=VAL_CHUNK)
# Drain each reader into a list of DataFrame chunks
df_tr_chunks = list(df_train)
df_val_chunks = list(df_val)


In [10]:
# Create the dataloaders from the first training and validation chunk only
tr_dataloader, val_dataloader = create_dataloader(df_tr_chunks[0], df_val_chunks[0])
# Instantiate the model, optimizer and LR scheduler. Keep the names
# `optimizer` and `scheduler`: train() reads them at module level.
classifier, optimizer, scheduler = model_initializer(tr_dataloader)
# Train the network for EPOCHS epochs and collect the loss / accuracy histories
train_loss_list, val_loss_list, val_acc_list = train(classifier, tr_dataloader, val_dataloader, EPOCHS)



  0%|          | 0/2 [00:00<?, ?it/s]

Start training...

Epoch : 1 | Step : 500 | Train Loss : 0.4340430860966444
Epoch : 1 | Step : 1000 | Train Loss : 0.3549506512284279
Epoch : 1 | Step : 1500 | Train Loss : 0.36018231700360775
Epoch : 1 | Step : 2000 | Train Loss : 0.3439831671416759
Epoch : 1 | Step : 2500 | Train Loss : 0.3558241471201181
Epoch : 1 | Step : 3000 | Train Loss : 0.34639341439306737
Epoch : 1 | Step : 3500 | Train Loss : 0.3266608180105686
Epoch : 1 | Step : 4000 | Train Loss : 0.3480121290385723
Epoch : 1 | Step : 4500 | Train Loss : 0.3250024095848203
Epoch : 1 | Step : 5000 | Train Loss : 0.3068869077116251
Epoch : 1 | Step : 5500 | Train Loss : 0.32014237661659717
Epoch : 1 | Step : 6000 | Train Loss : 0.3419086788147688
Epoch : 1 | Step : 6500 | Train Loss : 0.32640956791490316
Epoch : 1 | Step : 7000 | Train Loss : 0.3222518877983093
Epoch : 1 | Step : 7500 | Train Loss : 0.31973231276124714
Epoch : 1 | Step : 8000 | Train Loss : 0.30662565389275553
Epoch : 1 | Step : 8500 | Train Loss : 0.3083843

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
 50%|█████  

Epoch : 2 | Step : 500 | Train Loss : 0.21251691346615553
Epoch : 2 | Step : 1000 | Train Loss : 0.2118600305840373
Epoch : 2 | Step : 1500 | Train Loss : 0.21215352526307105
Epoch : 2 | Step : 2000 | Train Loss : 0.22038110157102347
Epoch : 2 | Step : 2500 | Train Loss : 0.20431781712919472
Epoch : 2 | Step : 3000 | Train Loss : 0.2154346244931221
Epoch : 2 | Step : 3500 | Train Loss : 0.21865491896867753
Epoch : 2 | Step : 4000 | Train Loss : 0.2173833377957344
Epoch : 2 | Step : 4500 | Train Loss : 0.22271438686549663
Epoch : 2 | Step : 5000 | Train Loss : 0.19130625396221876
Epoch : 2 | Step : 5500 | Train Loss : 0.21073254026472568
Epoch : 2 | Step : 6000 | Train Loss : 0.1931838103532791
Epoch : 2 | Step : 6500 | Train Loss : 0.21253269450366497
Epoch : 2 | Step : 7000 | Train Loss : 0.18592183271795512
Epoch : 2 | Step : 7500 | Train Loss : 0.20686367166787387
Epoch : 2 | Step : 8000 | Train Loss : 0.21419953181594611
Epoch : 2 | Step : 8500 | Train Loss : 0.20509617814421655
Ep

100%|██████████| 2/2 [2:45:03<00:00, 4951.77s/it]  

Training complete!





### The network achieves 92.48% validation accuracy after 2 epochs of fine-tuning.