<a href="https://colab.research.google.com/github/saptarshidatta96/MTech_Sem3/blob/main/BERT_Module2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 34.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [2]:
import glob
import numpy as np
import os
import pandas as pd
import sys
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification

In [3]:
# Mount Google Drive at /content/gdrive; the dataset is read from, and the
# checkpoints are written to, paths under this mount point.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
# Load the tab-separated, header-less review file from the mounted Drive.
data = pd.read_csv("/content/gdrive/MyDrive/1-restaurant-train.csv", delimiter='\t', header=None)
# The file carries no header row, so name the two columns explicitly.
data.columns = ['Rating', 'text']
# Drop rows with a missing rating or review text. Assigning the result (instead of
# mutating in place) keeps this cell idempotent when it is re-run.
data = data.dropna(axis=0)

In [5]:
# Work on a 2% subsample to keep fine-tuning fast. The fixed random_state makes the
# subsample (and everything computed from it) reproducible across kernel restarts.
# NOTE: re-running this cell without reloading `data` samples 2% of the subsample again.
data = (
    data
    .sample(frac=0.02, random_state=1)
    # Force the review column to str so the tokenizer always receives text.
    .assign(text=lambda frame: frame['text'].astype(str))
)

In [6]:
# Show the first five rows of `data` as a quick look at the Rating/text columns.
data.head()

Unnamed: 0,Rating,text
56019,3,R.C. worked well for this small family dinner ...
47951,4,Haus Murphy's is Old World charm in the middle...
74034,5,For what you're going to find in Phoenix or Sc...
41838,5,I love this little place. The gelato (too man...
63193,3,"I would give the food 4-5 stars, the restauran..."


In [7]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Pre-trained BERT encoder with a fresh classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    # Labels are fed to the model as `Rating - 1`, so the head needs one logit per
    # rating value up to the largest one present. Counting unique ratings would
    # undersize the head whenever the subsample happens to miss a rating.
    num_labels=int(data['Rating'].max()),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [8]:
class ReviewsDataset(Dataset):
    """Torch dataset that tokenizes review text and yields 0-indexed rating labels.

    Args:
        df: DataFrame with a 'text' column (the review) and a 'Rating' column
            (a value convertible with ``int``).
        max_length: Length to which every encoded review is padded/truncated.
        tokenizer: Optional tokenizer to reuse across datasets. When omitted,
            the 'bert-base-uncased' tokenizer is loaded.
    """

    def __init__(self, df, max_length=512, tokenizer=None):
        self.df = df
        # Allow callers to share one tokenizer instead of reloading it per split.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded review at position ``idx``.

        Returns:
            dict with 'input_ids', 'attn_mask' and 'label' tensors.
        """
        # Samplers hand out positions 0..len-1, so index positionally (.iloc);
        # label-based .loc would fail on a frame whose index was not reset.
        review = self.df['text'].iloc[idx]
        # Ratings are shifted down by one so that labels are 0-indexed.
        label = int(self.df['Rating'].iloc[idx]) - 1

        encoded = self.tokenizer(
            review,                      # review to encode
            add_special_tokens=True,
            max_length=self.max_length,  # truncate to max_length
            padding='max_length',        # pad every review up to max_length
            return_attention_mask=True,  # also build the attention mask
            truncation=True
        )

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attn_mask': torch.tensor(encoded['attention_mask']),
            'label': torch.tensor(label)
        }

In [9]:
# Hold out 20% of the reviews for testing, then 20% of the remainder for validation.
train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=1)
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.2, random_state=1)

# Give every split a fresh 0..n-1 index.
train_dataset, val_dataset, test_dataset = (
    frame.reset_index(drop=True)
    for frame in (train_dataset, val_dataset, test_dataset)
)

# Wrap each split in a ReviewsDataset that pads/truncates reviews to 256 tokens.
train_set, val_set, test_set = (
    ReviewsDataset(frame, 256)
    for frame in (train_dataset, val_dataset, test_dataset)
)

print("# of samples in train set: {}".format(len(train_set)))
print("# of samples in val set: {}".format(len(val_set)))
print("# of samples in test set: {}".format(len(test_set)))

# of samples in train set: 1049
# of samples in val set: 263
# of samples in test set: 329


In [10]:
# Training batches are reshuffled on every pass over the data.
train_params = {
    'batch_size': 5,
    'shuffle': True,
    'num_workers': 2,
}

# Evaluation splits are read in a fixed order. Build a separate dict rather than
# aliasing train_params, so validation is not shuffled and later edits to one
# dict cannot silently change the other.
val_params = {**train_params, 'shuffle': False}

test_params = {
    'batch_size': 5,
    'shuffle': False,
    'num_workers': 2,
}

train_loader = DataLoader(train_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
test_loader = DataLoader(test_set, **test_params)

In [11]:
# Display the DataLoader's repr as a quick check that it was constructed.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f5b282d8f50>

In [12]:
# Class weights for the weighted cross-entropy loss: mistakes on rarer ratings
# are penalized more heavily than mistakes on frequent ones.
star_groups = data.groupby('Rating')

# Fraction of the reviews carrying each rating 1..k, where k is the number of
# distinct ratings in `data`.
star_distribution = torch.tensor(
    [
        len(star_groups.groups[star]) / len(data)
        for star in range(1, len(data['Rating'].unique()) + 1)
    ],
    dtype=torch.float32,
)

# V3: inverse class frequency, normalized so the weights sum to 1.
# (V4, not used here, was `1.0 - star_distribution`.)
weights = 1.0 / star_distribution
weights = weights / weights.sum()

print('{:<20}: {}'.format('Star distribution', star_distribution.tolist()))
print('{:<20}: {}'.format('Weights', weights.tolist()))

Star distribution   : [0.06764168292284012, 0.09140767902135849, 0.178549662232399, 0.3412553369998932, 0.3211456537246704]
Weights             : [0.3956196904182434, 0.2927585542201996, 0.1498764008283615, 0.07841747999191284, 0.08332785964012146]


In [13]:
# Class-weighted cross-entropy, averaged over the batch; the weights are moved
# to the same device as the model.
loss_function = torch.nn.CrossEntropyLoss(
    weight=weights.to(device),
    reduction='mean',
)

# Adam over every model parameter.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=2e-05,
)

In [14]:
def calculate_accuracy(big_idx, targets):
    """Return the number of positions where ``big_idx`` equals ``targets``.

    Despite the name, this is a raw count of matches (a Python number), not a
    ratio; the caller divides by the number of examples seen.
    """
    hits = big_idx == targets
    return hits.sum().item()

In [15]:
# Sanity check: pull one batch from the training loader and print it.
# A dedicated name is used so the notebook-level `data` DataFrame is not
# overwritten by a batch dict (which would break re-running earlier cells).
sample_batch = next(iter(train_loader))
print(sample_batch)

{'input_ids': tensor([[ 101, 1000, 5632,  ...,    0,    0,    0],
        [ 101, 2026, 2034,  ...,    0,    0,    0],
        [ 101, 1000, 1045,  ...,    0,    0,    0],
        [ 101, 2307, 2173,  ...,    0,    0,    0],
        [ 101, 2027, 2079,  ...,    0,    0,    0]]), 'attn_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'label': tensor([3, 4, 3, 3, 2])}


In [16]:
def train(epoch, output_dir='/content/gdrive/My Drive/BERT/'):
    """Run one training epoch over ``train_loader`` and save the model.

    Uses the notebook-level ``model``, ``train_loader``, ``device``,
    ``loss_function``, ``optimizer`` and ``calculate_accuracy``.

    Args:
        epoch: Zero-based epoch index; it is reported, and used in the saved
            model's directory name, as ``epoch + 1``.
        output_dir: Directory under which ``Checkpoint/checkpoint.dat`` and
            ``Model_V3/<epoch + 1>`` are written.

    Returns:
        None. Progress is printed and the weights are written to disk.
    """
    checkpoint_dir = os.path.join(output_dir, 'Checkpoint')
    checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint.dat')
    model_dir = os.path.join(output_dir, 'Model_V3', str(epoch+1))

    # Create the directory the checkpoint is actually written to (previously a
    # 'Checkpoint' folder was created in the working directory instead).
    os.makedirs(checkpoint_dir, exist_ok=True)

    # number of batches run by model
    nb_tr_steps = 0
    # number of training examples run by model
    nb_tr_examples = 0
    # number of examples classified correctly by model
    n_correct = 0
    # running sum of the per-batch losses
    tr_loss = 0
    model.train()

    for batch, data in enumerate(train_loader):
        input_ids = data['input_ids'].to(device)
        mask = data['attn_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids, attention_mask=mask)
        # No labels are passed to the model, so the first output holds the logits.
        logits = outputs[0]
        loss = loss_function(logits, labels)
        tr_loss += loss.item()

        # Index of the highest-scoring class per example; detached so the
        # bookkeeping stays out of the autograd graph.
        _, big_idx = torch.max(logits.detach(), dim=1)
        n_correct += calculate_accuracy(big_idx, labels)

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if batch % 100 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print("Batch {} of epoch {} complete.".format(batch, epoch+1))
            print(f"Training Loss: {loss_step}   Training Accuracy: {accu_step}")

            # An epoch can take a long time, so the weights are also saved
            # periodically while the epoch is still running.
            torch.save(model.state_dict(), checkpoint_path)
            print("Saving checkpoint at", checkpoint_path)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print('\n*****\n')
    print(f'The Total Accuracy for Epoch {epoch+1}: {epoch_accu}')
    print(f"Training Loss: {epoch_loss}")
    print(f"Training Accuracy: {epoch_accu}\n")

    torch.save(model.state_dict(), checkpoint_path)
    # Save to the same location that is reported below (the original call wrote
    # to a mistyped '/content/drive/My gdrive/BERT/' path).
    model.save_pretrained(model_dir)
    print("Saving checkpoint at ", checkpoint_path)
    print("Saving model at ", model_dir, '\n\n================================================\n')

In [17]:
# Train for four epochs; train() prints its own progress and saves the
# checkpoint/model files itself.
for epoch in range(4):
    train(epoch)

Batch 0 of epoch 1 complete.
Training Loss: 1.6654967069625854   Training Accuracy: 0.0
Saving checkpoint at /content/gdrive/My Drive/BERT/Checkpoint/checkpoint.dat
Batch 100 of epoch 1 complete.
Training Loss: 1.592125134892983   Training Accuracy: 23.762376237623762
Saving checkpoint at /content/gdrive/My Drive/BERT/Checkpoint/checkpoint.dat
Batch 200 of epoch 1 complete.
Training Loss: 1.4995423807433588   Training Accuracy: 29.45273631840796
Saving checkpoint at /content/gdrive/My Drive/BERT/Checkpoint/checkpoint.dat

*****

The Total Accuracy for Epoch 1: 29.647283126787418
Training Loss: 1.4952620741866884
Training Accuracy: 29.647283126787418

Saving checkpoint at  /content/gdrive/My Drive/BERT/Checkpoint/checkpoint.dat
Saving model at  /content/gdrive/My Drive/BERT/Model_V3/1 


Batch 0 of epoch 2 complete.
Training Loss: 1.3993662595748901   Training Accuracy: 60.0
Saving checkpoint at /content/gdrive/My Drive/BERT/Checkpoint/checkpoint.dat
Batch 100 of epoch 2 complete.
Train

In [18]:
# Release cached GPU memory held by PyTorch and report current usage.
# Guarded so the cell is skipped on a runtime without CUDA, and printed so the
# multi-line summary is shown as a table rather than an escaped string repr.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=device, abbreviated=False))

