<a href="https://colab.research.google.com/github/ramkumarr02/Bert-PyTorch/blob/master/BERT_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

## Packages

In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel

from tqdm import tqdm

from datetime import datetime
from dateutil.relativedelta import relativedelta

import warnings
warnings.filterwarnings('ignore')

## Mount Drive and Read Data

In [3]:
# Attach Google Drive to the Colab runtime so the dataset can be read by path.
from google.colab import drive
drive.mount('/content/drive')

# Load the reviews CSV and keep an untouched copy before any preprocessing happens.
df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/Deep Learning/Language Models/BERT/IMDB Dataset.csv'
)
df_copy = df.copy()

# Preview the first two rows.
df.head(2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


# Classes & Functions

## Data Class

In [0]:
class Bert_data_prep:
    """Map-style dataset that tokenizes one text sample per lookup.

    Parameters
    ----------
    x : pandas.Series
        Raw text samples; only ``x.values`` is stored.
    y : pandas.Series
        Integer labels aligned with ``x``; only ``y.values`` is stored.
    tokenizer : object
        Tokenizer exposing ``encode_plus`` (for example ``BertTokenizer``).
    max_len : int, default 250
        Fixed length every encoded sample is padded or truncated to.
    """

    def __init__(self,x, y, tokenizer, max_len = 250):
        self.x = x.values
        self.y = y.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return(len(self.x))

    def __getitem__(self, item):
        """Encode sample ``item`` and return its tensors.

        Returns
        -------
        dict
            ``'ids'``, ``'mask'`` and ``'token_type_ids'`` are 1-D long tensors
            (the leading batch axis added by ``return_tensors='pt'`` is squeezed
            away); ``'targets'`` is a scalar long tensor holding the label.
        """
        text = self.x[item]

        # Use the tokenizer and length given to this instance, not notebook
        # globals, so the dataset works regardless of cell execution order.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',   # replaces the deprecated pad_to_max_length
            truncation=True,        # make truncation of long reviews explicit
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The encoder already returns tensors: squeeze and cast them instead of
        # re-wrapping with torch.tensor(), which copies and emits a warning.
        return {
            'ids': encoded['input_ids'].squeeze(0).long(),
            'mask': encoded['attention_mask'].squeeze(0).long(),
            'token_type_ids': encoded['token_type_ids'].squeeze(0).long(),
            'targets': torch.tensor(self.y[item], dtype=torch.long)
        }

# Data Prep

In [0]:
# Turn the text labels into integers: 'positive' becomes 1 and 'negative' becomes 0.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

In [0]:
# Keep a seeded random subset of 10 rows (presumably for a quick smoke run - confirm).
# NOTE(review): this overwrites `df`, so everything downstream sees only these rows;
# `df_copy`, taken right after loading, still holds every row.
df = df.sample(n=10, random_state=1)

In [0]:
# Seeded 80/20 split of reviews and labels into training and validation parts.
train_x, valid_x, train_y, valid_y = train_test_split(
    df['review'],
    df['sentiment'],
    train_size=0.8,
    random_state=1,
)

## Parameters

In [8]:
# Longest review by character count, and its position within the review column.
max_str = max(df['review'], key=len)
max_str_index = list(df['review']).index(max_str)

# Sequence length used for padding/truncation. A data-driven alternative was:
#   max_len = len(df.review[max_str_index].split())
max_len = 256

# WordPiece tokenizer matching the lower-cased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Batch sizes for the two loaders.
train_batch_size = 2 ** 4
valid_batch_size = 2 ** 3

print(f'train_batch_size : {train_batch_size}')
print(f'valid_batch_size : {valid_batch_size}')

train_batch_size : 16
valid_batch_size : 8


In [0]:
# Wrap each split in the tokenizing dataset; despite the `_df` suffix these are
# Bert_data_prep instances, not DataFrames.
train_df = Bert_data_prep(
    x=train_x,
    y=train_y,
    tokenizer=tokenizer,
    max_len=max_len,
)
valid_df = Bert_data_prep(
    x=valid_x,
    y=valid_y,
    tokenizer=tokenizer,
    max_len=max_len,
)

## Data Loader

In [0]:
# Batch the two datasets. The training loader reshuffles every epoch so the model
# does not see the samples in the same order (and same batches) each time; the
# validation loader keeps a fixed order so metrics are comparable across epochs.
train_data_loader = DataLoader(train_df, batch_size=train_batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_df, batch_size=valid_batch_size, shuffle=False)

In [0]:
class BERTBaseUncased(nn.Module):
    """Pretrained BERT encoder followed by dropout and a single-logit linear head.

    ``forward`` returns raw scores (no sigmoid applied), one per input row, which
    is the form expected by ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the loaded checkpoint's config rather than
        # hard-coding 768, so the layer always matches the encoder width.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and classify its pooled output.

        Parameters
        ----------
        ids, mask, token_type_ids : torch.Tensor
            Token ids, attention mask and segment ids, passed straight to the
            encoder.

        Returns
        -------
        torch.Tensor
            Output of the linear head applied to the dropped-out pooled vector.
        """
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Index rather than tuple-unpack: position 1 is the pooled output both
        # when the encoder returns a plain tuple and when it returns a dict-like
        # ModelOutput (unpacking the latter yields its string keys, not tensors).
        pooled = encoder_outputs[1]
        dropped = self.bert_drop(pooled)
        return self.out(dropped)

In [0]:
# Instantiate the classifier; its constructor loads the pretrained BERT weights.
model = BERTBaseUncased()
#model.cuda()
# Only the parameters are passed, so every optimizer hyper-parameter is the library default.
# NOTE(review): confirm the default learning rate is appropriate for fine-tuning.
optimizer = AdamW(model.parameters())

In [17]:
# Disabled single-batch smoke test: the code is wrapped in a string literal, so it is
# not executed. It runs one batch through the model and prints the dtypes, targets,
# outputs and BCE-with-logits loss.
# NOTE(review): the saved output under this cell presumably dates from a run where
# the code was still live - confirm or clear it.
'''for i, batch_train in enumerate(train_data_loader):
    ids             = batch_train['ids']
    mask            = batch_train['mask']
    targets         = batch_train['targets'].type(torch.FloatTensor)
    token_type_ids  = batch_train['token_type_ids']           

    outputs = model(ids = ids, mask = mask, token_type_ids = token_type_ids)
    print(outputs.dtype)
    print(targets.dtype)
    loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
    print(targets)
    print(outputs)
    print(loss)    
    break'''

torch.float32
torch.float32
tensor([0., 1., 0., 0., 0., 0., 0., 0.])
tensor([[0.4698],
        [0.7483],
        [0.3540],
        [0.3045],
        [0.4309],
        [0.4076],
        [0.3347],
        [0.5515]], grad_fn=<AddmmBackward>)
tensor(0.8520, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


In [0]:
# Linear learning-rate schedule with no warm-up, spanning one scheduler step per
# training batch across all epochs.
epochs = 2
total_steps = epochs * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match the labels.

    Parameters
    ----------
    preds : array-like
        Raw model scores. Either one logit per sample (shape ``(N,)`` or
        ``(N, 1)``), or one score per class (shape ``(N, C)`` with ``C > 1``).
    labels : array-like
        Ground-truth class ids, any shape with ``N`` elements.

    Returns
    -------
    float
        Share of samples whose predicted class equals the label.
    """
    preds = np.asarray(preds)
    labels_flat = np.asarray(labels).flatten()

    if preds.ndim == 1 or preds.shape[1] == 1:
        # Single-logit binary head: argmax over one column is always 0, so
        # threshold instead. logit > 0 is the same as sigmoid(logit) > 0.5.
        pred_flat = (preds.reshape(-1) > 0).astype(int)
    else:
        # One score per class: pick the highest-scoring class.
        pred_flat = np.argmax(preds, axis=1).flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
# Use the first GPU when one is available and fall back to the CPU otherwise, so
# the notebook still runs on a runtime without an accelerator.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [0]:
# Newline kept in a variable so it can be interpolated as `{nl}` inside the f-string
# progress messages printed by the training cell.
nl = '\n'

In [0]:
start = datetime.now()

# Build the loss module once instead of once per batch.
criterion = nn.BCEWithLogitsLoss()


def _batch_to_device(batch, device):
    """Return (ids, mask, token_type_ids, targets) from a batch dict, moved to `device`.

    Targets are cast to float because BCEWithLogitsLoss needs floating-point targets.
    """
    return (
        batch['ids'].to(device),
        batch['mask'].to(device),
        batch['token_type_ids'].to(device),
        batch['targets'].float().to(device),
    )


loss_list = []       # one Python float per training batch, across all epochs
accuracy_list = []   # one accuracy value per validation batch, across all epochs
for epoch in tqdm(range(epochs)):
    # ---- training pass ----
    model.train()
    epoch_losses = []
    for batch_train in train_data_loader:
        ids, mask, token_type_ids, targets = _batch_to_device(batch_train, device)

        optimizer.zero_grad()
        outputs = model(ids = ids, mask = mask, token_type_ids = token_type_ids)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()

        optimizer.step()
        scheduler.step()

        # .item() stores a plain float, so the list does not keep device tensors alive.
        epoch_losses.append(loss.item())
    loss_list.extend(epoch_losses)

    # ---- validation pass ----
    model.eval()
    epoch_accuracies = []
    # No gradients are needed for evaluation; skipping graph construction saves memory.
    with torch.no_grad():
        for batch_valid in valid_data_loader:
            ids, mask, token_type_ids, targets = _batch_to_device(batch_valid, device)

            outputs = model(ids = ids, mask = mask, token_type_ids = token_type_ids)

            outputs = outputs.cpu().numpy()
            targets = targets.cpu().numpy()

            accuracy = flat_accuracy(outputs, targets)
            epoch_accuracies.append(accuracy)
    accuracy_list.extend(epoch_accuracies)

    # Report epoch averages rather than whatever the last batch happened to give.
    mean_loss = np.mean(epoch_losses) if epoch_losses else float('nan')
    mean_accuracy = np.mean(epoch_accuracies) if epoch_accuracies else float('nan')
    print(f'{nl}Epoch : {epoch}, Loss : {mean_loss} , Accuracy : {mean_accuracy}') 

end = datetime.now()
time_elapsed = relativedelta(end, start)
print(f'{nl} time_elapsed : {time_elapsed}')

 50%|█████     | 1/2 [00:01<00:01,  1.09s/it]


Epoch : 0, Loss : 0.4775759279727936 , Accuracy : 1.0


100%|██████████| 2/2 [00:01<00:00,  1.00it/s]


Epoch : 1, Loss : 0.3826597332954407 , Accuracy : 0.0

 time_elapsed : relativedelta(seconds=+1, microseconds=+877389)





In [0]:
# Walk the training batches, moving each one to the device, and print the tensors
# of the batch at index 3 before stopping. If the loader yields fewer than four
# batches nothing is printed.
for i, batch_train in enumerate(train_data_loader):
    ids = batch_train['ids'].to(device)
    mask = batch_train['mask'].to(device)
    targets = batch_train['targets'].to(device)
    token_type_ids = batch_train['token_type_ids'].to(device)

    if i != 3:
        continue

    for tensor in (ids, mask, targets, token_type_ids):
        print(tensor)
    break

In [0]:
# Display whatever `outputs` currently holds in the kernel.
outputs

array([[-0.25955155,  0.20437098],
       [-0.09783939,  0.04006562]], dtype=float32)

In [0]:
# Index of the largest value in each row of `outputs`, flattened to 1-D, then displayed.
pred_flat = np.argmax(outputs, axis=1).flatten()
pred_flat

array([1, 1])

In [0]:
# Display whatever `targets` currently holds in the kernel.
targets

tensor([0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [0]:
# Flatten `targets` to 1-D and display the result.
labels_flat = targets.flatten()
labels_flat

tensor([0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [0]:
# Same expression flat_accuracy returns: matches between predictions and labels,
# divided by the number of labels.
# NOTE(review): in the saved outputs `pred_flat` has 2 entries while `labels_flat`
# has 8 and lives on cuda:0, so the two appear to come from different batches - confirm.
np.sum(pred_flat == labels_flat) / len(labels_flat)

0.0

In [0]:
# Take the last of the first `val` reviews and print it with its type.
val = 5
text = df['review'].head(val).values
x = text[val - 1]
print(type(x), x)

# Collapse every run of whitespace into a single space and print again.
x = " ".join(x.split())
print(type(x), x)

<class 'str'> Well, if you are one of those Katana's film-nuts (just like me) you sure will appreciate this metaphysical Katana swinging blood spitting samurai action flick.<br /><br />Starring Tadanobu Asano (Vital, Barren Illusion) & Ryu Daisuke (Kagemusha). This samurai war between Heiki's clan versus Genji's clan touch the zenith in the final showdown at Gojo bridge. The body-count is countless.<br /><br />Demons, magic swords, Shinto priests versus Buddhist monks and the beautiful visions provided by maestro Sogo Ishii will do the rest.<br /><br />A good Japanese flick for a rainy summer night.
<class 'str'> Well, if you are one of those Katana's film-nuts (just like me) you sure will appreciate this metaphysical Katana swinging blood spitting samurai action flick.<br /><br />Starring Tadanobu Asano (Vital, Barren Illusion) & Ryu Daisuke (Kagemusha). This samurai war between Heiki's clan versus Genji's clan touch the zenith in the final showdown at Gojo bridge. The body-count is c

In [0]:
# Encode the sample review to a fixed length of `max_len` tokens and inspect the
# attention mask (1 = real token, 0 = padding).
x_encoded = tokenizer.encode_plus(
    text=x,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',   # replaces the deprecated pad_to_max_length=True
    truncation=True,        # explicit, instead of relying on an implicit default
    return_attention_mask=True,
    return_token_type_ids=True,
    return_special_tokens_mask=True,
    return_tensors='pt',
)
ids = x_encoded["input_ids"]
mask = x_encoded["attention_mask"]
print(mask)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [0]:
# Print every field of the encoding next to its name.
for key, value in x_encoded.items():
    print(key, "----", value)

special_tokens_mask ---- [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
input_ids ---- tensor([[  101,  2092,  1010,  2065,  2017,  2024,  2028,  1997,  2216, 29354,
          2532,  1005,  1055,  2143,  1011, 12264,  1006,  2074,  2066,  2033,
          1007,  2017,  2469,  20