<a href="https://colab.research.google.com/github/ramkumarr02/Bert-PyTorch/blob/master/BERT_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

## Packages

In [0]:
!pip install transformers



In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import BertConfig
from transformers import get_linear_schedule_with_warmup

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

## Mount Drive and Read Data

In [0]:
from google.colab import drive

# Attach Google Drive to the Colab runtime so the dataset file is reachable.
drive.mount('/content/drive')

# Load the reviews CSV, keep an untouched copy, and preview the first two rows.
csv_path = '/content/drive/My Drive/Colab Notebooks/Deep Learning/Language Models/BERT/IMDB Dataset.csv'
df = pd.read_csv(csv_path)
df_copy = df.copy()
df.head(2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


# Classes & Functions

## Data Class

In [0]:
class Bert_data_prep:
    """Map-style dataset that tokenizes one text per lookup for a BERT classifier.

    Parameters
    ----------
    x : object exposing ``.values`` (e.g. a pandas Series)
        The raw texts, one per example.
    y : object exposing ``.values`` (e.g. a pandas Series)
        One integer class label per text, aligned with ``x``.
    tokenizer : object exposing ``encode_plus`` (e.g. ``BertTokenizer``)
        Tokenizer used to encode every text.
    max_len : int, default 250
        Every example is padded or truncated to exactly this many tokens.
    """

    def __init__(self,x, y, tokenizer, max_len = 250):
        self.x = x.values
        self.y = y.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.x)

    def __getitem__(self, item):
        """Encode example ``item`` and return its tensors.

        Returns
        -------
        dict
            ``'ids'``, ``'mask'`` and ``'token_type_ids'`` are ``torch.long``
            tensors with the size-1 batch axis removed; ``'targets'`` is a
            scalar ``torch.long`` tensor holding the label.
        """
        # Use the tokenizer and length stored on this instance, not whatever
        # globals happen to exist in the notebook at call time.
        encoded = self.tokenizer.encode_plus(
            self.x[item],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # With return_tensors='pt' each field carries a leading axis of size 1;
        # drop it so a DataLoader collates to (batch, max_len) rather than
        # (batch, 1, max_len).
        return {
            'ids': encoded['input_ids'].squeeze(0).to(torch.long),
            'mask': encoded['attention_mask'].squeeze(0).to(torch.long),
            'token_type_ids': encoded['token_type_ids'].squeeze(0).to(torch.long),
            'targets': torch.tensor(self.y[item], dtype=torch.long),
        }

# Data Prep

In [0]:
# Scratch code disabled by wrapping it in a string literal: nothing inside is
# executed; the cell only evaluates to (and echoes) the string itself.
'''x = train_x.values
x = x[1]'''

'x = train_x.values\nx = x[1]'

In [0]:
# Scratch code disabled by wrapping it in a string literal: nothing inside is
# executed; the cell only evaluates to (and echoes) the string itself.
'''x_encoded = tokenizer.encode_plus(x,
                                    add_special_tokens=True, 
                                    max_length = max_len, 
                                    pad_to_max_length = True, 
                                    return_tensors = 'pt'
                                    )  
    
ids = x_encoded["input_ids"]
mask = x_encoded["attention_mask"]
token_type_ids = x_encoded["token_type_ids"]
targets = torch.tensor(train_y, dtype=torch.float)'''

'x_encoded = tokenizer.encode_plus(x,\n                                    add_special_tokens=True, \n                                    max_length = max_len, \n                                    pad_to_max_length = True, \n                                    return_tensors = \'pt\'\n                                    )  \n    \nids = x_encoded["input_ids"]\nmask = x_encoded["attention_mask"]\ntoken_type_ids = x_encoded["token_type_ids"]\ntargets = torch.tensor(train_y, dtype=torch.float)'

In [0]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

In [0]:
# Hold out 20% of the reviews for validation; the fixed random_state keeps
# the split identical from run to run.
train_x, valid_x, train_y, valid_y = train_test_split(
    df['review'],
    df['sentiment'],
    train_size=0.8,
    random_state=1,
)

## Parameters

In [0]:
# Longest review by character count and its position in the column, kept for
# reference when choosing a sequence length.
max_str = max(df.review, key = len)
max_str_index = list(df.review).index(max_str)
# Word count of that review. It is stored under its own name so it is not
# silently discarded, and it is taken from max_str directly rather than via a
# label lookup with a positional index.
longest_review_words = len(max_str.split())

# Sequence length actually used for tokenization (padding / truncation target).
max_len = 64

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_batch_size = 32
valid_batch_size = 16

In [0]:
# Wrap each split in the tokenizing dataset; both use the same tokenizer and
# the same sequence length.
train_df = Bert_data_prep(
    x=train_x,
    y=train_y,
    tokenizer=tokenizer,
    max_len=max_len,
)
valid_df = Bert_data_prep(
    x=valid_x,
    y=valid_y,
    tokenizer=tokenizer,
    max_len=max_len,
)

## Data Loader

In [0]:
# Reshuffle the training examples every epoch so the model does not see the
# same batch composition each time; validation order is left untouched.
train_data_loader = DataLoader(train_df, batch_size=train_batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_df, batch_size=valid_batch_size)

In [0]:
# Pretrained BERT encoder with a two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)

# Set the learning rate explicitly: the optimizer's default (1e-3) is far too
# large for fine-tuning a pretrained BERT, where values around 2e-5 to 5e-5
# are the usual choice.
optimizer = AdamW(model.parameters(), lr = 2e-5)

In [0]:
epochs = 4

# Total number of training batches across all epochs.
total_steps = epochs * len(train_data_loader)

# Linear learning-rate schedule over total_steps with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose arg-max prediction equals the label.

    `preds` holds one row of class scores per example; `labels` holds the
    expected class ids and is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

In [0]:
# Smoke test of the forward pass: both loops `break` after the very first
# batch, so exactly one batch goes through the model and its loss is printed.
# NOTE(review): there is no loss.backward(), optimizer.step() or
# scheduler.step() here, so no weights are updated, and loss_list /
# accuracy_list are never appended to.
loss_list = []
accuracy_list = []
for epoch in tqdm(range(epochs)):    
    model.train()
    for i, batch in enumerate(train_data_loader):
        # Unpack the tensors produced by the dataset for this batch.
        ids             = batch['ids']
        mask            = batch['mask']
        targets         = batch['targets']
        token_type_ids  = batch['token_type_ids']             

        optimizer.zero_grad()
        outputs = model(input_ids = ids, attention_mask = mask, token_type_ids = token_type_ids, labels = targets)
        # The first element of the model output is taken as the loss.
        loss = outputs[0]
        print(loss)
        break    
    break

  0%|          | 0/4 [00:00<?, ?it/s]

tensor(0.8023, grad_fn=<NllLossBackward>)





In [0]:
# Print the dtype of the second example of each tensor left over from the
# last batch, one dtype per line.
l = [ids, mask, token_type_ids, targets]

print(*(tensor[1].dtype for tensor in l), sep="\n")

torch.int64
torch.int64
torch.int64
torch.float32


In [0]:
# Shape of the `ids` tensor left over from the last batch.
ids.size()

torch.Size([32, 1, 64])

In [0]:
# Shape after squeezing axis 0; squeeze only removes an axis whose length is 1,
# so the shape is unchanged whenever axis 0 is the batch axis.
ids.squeeze(0).shape

torch.Size([32, 1, 64])

In [0]:
# Pull the first training batch only and print the shape of each tensor in it.
for i, batch in enumerate(train_data_loader):
    ids, mask = batch['ids'], batch['mask']
    targets, token_type_ids = batch['targets'], batch['token_type_ids']

    for tensor in (ids, mask, token_type_ids, targets):
        print(tensor.shape)
    break

torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32])


In [0]:
# Collect every (name, tensor) parameter pair of the model.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs a heading with the slice of parameters listed beneath it.
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for heading, section_params in sections:
    print(heading)
    for p in section_params:
        # Parameter name left-aligned, its shape tuple right-aligned.
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))