<a href="https://colab.research.google.com/github/ramkumarr02/Bert-PyTorch/blob/master/BERT_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

## Packages

In [1]:
!pip install transformers



In [0]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import metrics

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel

from tqdm import tqdm

from datetime import datetime
from dateutil.relativedelta import relativedelta

import warnings
warnings.filterwarnings('ignore')

## Mount Drive and Read Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Deep Learning/Language Models/BERT/IMDB Dataset.csv')
df_copy = df.copy()
df.head(2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


# Classes & Functions

## Data Class

In [0]:
class Bert_data_prep:
    def __init__(self,x, y, tokenizer, max_len = 250):
        self.x = x.values
        self.y = y.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return(len(self.x))

    def __getitem__(self, item):
        x = self.x[item]
    
        x_encoded = tokenizer.encode_plus(x,
                                        add_special_tokens=True, 
                                        max_length = max_len, 
                                        pad_to_max_length = True, 
                                        return_tensors = 'pt'
                                        )  
        
        ids = x_encoded["input_ids"]
        mask = x_encoded["attention_mask"]
        token_type_ids = x_encoded["token_type_ids"]

        ids             = torch.squeeze(ids,0)
        mask            = torch.squeeze(mask,0)
        token_type_ids  = torch.squeeze(token_type_ids,0)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.y[item], dtype=torch.long)
        }

## Test Data Class


In [0]:
class test_data_prep:
    def __init__(self, x, tokenizer, max_len = 250):
        self.x = x.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return(len(self.x))

    def __getitem__(self, item):
        x = self.x[item]
    
        x_encoded = tokenizer.encode_plus(x,
                                        add_special_tokens=True, 
                                        max_length = max_len, 
                                        pad_to_max_length = True, 
                                        return_tensors = 'pt'
                                        )  
        
        ids = x_encoded["input_ids"]
        mask = x_encoded["attention_mask"]
        token_type_ids = x_encoded["token_type_ids"]

        ids             = torch.squeeze(ids,0)
        mask            = torch.squeeze(mask,0)
        token_type_ids  = torch.squeeze(token_type_ids,0)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

## Custom BERT Network

In [0]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
    
    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        bo = self.bert_drop(o2)
        output = self.out(bo)
        return(output)

## Accuracy Functions

In [0]:
def accuracy(outs, targs):

    outs = torch.sigmoid(outs).cpu().detach().numpy().tolist()
    targs= targs.cpu().detach().numpy().tolist()
    outs = np.array(outs) >= 0.5
    acc = metrics.accuracy_score(targs, outs)

    return(acc)

In [0]:
def flat_accuracy(preds, labels):
    #preds = preds.detach().cpu().numpy()
    preds = preds.detach().to('cpu').numpy()
    labels = labels.to('cpu').numpy()
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Code Engine


## Data Prep

In [8]:
df.sentiment = df.sentiment.replace(['positive', 'negative'],[1,0])
print(len(df))

50000


In [0]:
df = df.sample(n=2000, random_state=1)

In [0]:
train_x, valid_x, train_y, valid_y = train_test_split(df['review'], df['sentiment'],train_size = 0.8,random_state = 1)

## Find Max Batch size 

In [11]:
doc = [len(train_x), len(valid_x)]

train_len = (len(train_x))
valid_len = (len(valid_x))

train_list = []
valid_list = []

for divisor in range(1,11):
    if train_len % 2**divisor == 0 & train_len % 2 == 0:
        train_list.append(divisor)

for divisor in range(1,11):
    if valid_len % 2**divisor == 0 & valid_len % 2 == 0:
        valid_list.append(divisor)

max_power = max(set(train_list).intersection(valid_list))
max_power

4

## Parameters

In [12]:
max_str = max(df.review, key = len)
max_str_index = list(df.review).index(max_str)
#max_len = len(df.review[max_str_index].split())
max_len = 256

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#power = max_power
power = 3

train_batch_size = 2**power
valid_batch_size = 2**power

print(f'train_batch_size : {train_batch_size}')
print(f'valid_batch_size : {valid_batch_size}')

train_batch_size : 8
valid_batch_size : 8


## Convert to Bert Format

In [0]:
train_df = Bert_data_prep(x = train_x, y = train_y, tokenizer = tokenizer, max_len = max_len)
valid_df = Bert_data_prep(x = valid_x, y = valid_y, tokenizer = tokenizer, max_len = max_len)

## Data *Loader*

In [0]:
train_data_loader = torch.utils.data.DataLoader(train_df, batch_size=train_batch_size)
valid_data_loader = torch.utils.data.DataLoader(valid_df, batch_size=valid_batch_size)

In [0]:
model = BERTBaseUncased()
model.cuda()
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

In [0]:
epochs = 4
total_steps = len(train_data_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer= optimizer, num_warmup_steps=0, num_training_steps= total_steps)

## Model Training

In [0]:
device = torch.device('cuda:0')
model = model.to(device)

In [0]:
nl = '\n'

In [19]:
start = datetime.now()

loss_list = []
accuracy_list = []
for epoch in tqdm(range(epochs)):    
    model.train()
    for i, batch_train in enumerate(train_data_loader):
        ids             = batch_train['ids']
        mask            = batch_train['mask']
        targets         = batch_train['targets'].type(torch.FloatTensor)
        token_type_ids  = batch_train['token_type_ids']           

        ids             = ids.to(device, dtype = torch.long)
        targets         = targets.to(device, dtype = torch.float)
        mask            = mask.to(device, dtype = torch.long)
        token_type_ids  = token_type_ids.to(device, dtype = torch.long) 
        #token_type_ids = None

        optimizer.zero_grad()
        train_outputs = model(ids = ids, mask = mask, token_type_ids = token_type_ids)
        loss = nn.BCEWithLogitsLoss()(train_outputs, targets.view(-1, 1))
        loss.backward()

        optimizer.step()
        scheduler.step()    

    model.eval()
    with torch.no_grad():
        for j, batch_valid in enumerate(valid_data_loader):
            ids             = batch_valid['ids']
            mask            = batch_valid['mask']
            targets         = batch_valid['targets'].type(torch.FloatTensor)
            token_type_ids  = batch_valid['token_type_ids']

            ids             = ids.to(device, dtype = torch.long)
            targets         = targets.to(device, dtype = torch.float)
            mask            = mask.to(device, dtype = torch.long)
            token_type_ids  = token_type_ids.to(device, dtype = torch.long) 
            #token_type_ids = None

            outputs = model(ids = ids, mask = mask, token_type_ids = token_type_ids) 

            
        loss_list.append(loss.data)

        #outs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        #targs= targets.cpu().detach().numpy().tolist()
        print(outputs)
        accuracy_val = accuracy(outputs, targets)        

        #accuracy = flat_accuracy(outputs, targets)
            
        accuracy_list.append(accuracy_val)

        #print('\n',outputs)
        print(f'{nl}Epoch : {epoch}, Loss : {loss.data} , Accuracy : {accuracy_val}') 

end = datetime.now()
time_elapsed = relativedelta(end, start)
print(f'{nl} time_elapsed : {time_elapsed}')

 25%|██▌       | 1/4 [00:52<02:38, 52.86s/it]

tensor([[-3.1795],
        [ 3.3373],
        [-3.4162],
        [ 2.6159],
        [-3.2857],
        [ 1.1427],
        [ 3.3986],
        [ 2.7590]], device='cuda:0')

Epoch : 0, Loss : 0.22424104809761047 , Accuracy : 1.0


 50%|█████     | 2/4 [01:45<01:45, 52.83s/it]

tensor([[-4.1975],
        [ 4.5461],
        [-4.6061],
        [ 5.0110],
        [-4.3377],
        [ 1.9100],
        [ 4.9645],
        [ 4.0220]], device='cuda:0')

Epoch : 1, Loss : 0.0781005322933197 , Accuracy : 1.0


 75%|███████▌  | 3/4 [02:38<00:52, 52.82s/it]

tensor([[-5.0675],
        [ 5.3890],
        [-5.3279],
        [ 5.5838],
        [-5.0936],
        [ 0.0412],
        [ 5.4964],
        [ 4.8824]], device='cuda:0')

Epoch : 2, Loss : 0.026963582262396812 , Accuracy : 1.0


100%|██████████| 4/4 [03:31<00:00, 52.80s/it]

tensor([[-5.0410],
        [ 5.5050],
        [-5.5293],
        [ 5.7374],
        [-5.2366],
        [ 1.6677],
        [ 5.7237],
        [ 5.0632]], device='cuda:0')

Epoch : 3, Loss : 0.034449417144060135 , Accuracy : 1.0

 time_elapsed : relativedelta(minutes=+3, seconds=+31, microseconds=+183155)





## Testing

In [0]:
test_list = ['That is good',
             'It could have been better',
             'Not bad',
             'Not good',
             'The story was confusing but engaging',
             'The story was convoluted and confusing']

In [0]:
test_df = pd.DataFrame(test_list, columns = ['comments'])
batch_size = len(test_df)

In [0]:
test_data = test_data_prep(x = test_df['comments'], tokenizer = tokenizer, max_len = max_len)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

In [0]:
 for j, batch_valid in enumerate(test_data_loader):
            ids             = batch_valid['ids']
            mask            = batch_valid['mask']
            #targets         = batch_valid['targets'].type(torch.FloatTensor)
            token_type_ids  = batch_valid['token_type_ids']

            ids             = ids.to(device, dtype = torch.long)
            #targets         = targets.to(device, dtype = torch.float)
            mask            = mask.to(device, dtype = torch.long)
            token_type_ids  = token_type_ids.to(device, dtype = torch.long)            

            outputs = model(ids = ids, mask = mask, token_type_ids = token_type_ids) 
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()            

In [0]:
results = ['Positive' if x[0] > 0.5 else 'Neg' for x in outputs]

In [111]:
test_df['results'] = results
test_df

Unnamed: 0,comments,results
0,That is good,Positive
1,It could have been better,Neg
2,Not bad,Neg
3,Not good,Neg
4,The story was confusing but engaging,Positive
5,The story was convoluted and confusing,Neg
