## IRE ASSIGNMENT 3
Snehal Kumar
2019101003

## Q6. BERT 

In [None]:
# Mount Google Drive at /content/drive so files stored there are visible to this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install transformers -q
! pip install tokenizers -q

In [None]:
# Base folder on the mounted Drive; `config` builds the dataset and output paths from it.
ROOT = "/content/drive/MyDrive/IRE Assgn3/"

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from transformers import AutoModel, BertTokenizerFast, AdamW
import tokenizers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def seed_all(seed = 42):
  """
  Fix seed for reproducibility

  Seeds Python's `random`, NumPy and PyTorch (CPU and, when CUDA is
  available, every CUDA device) and configures cuDNN for repeatable runs.

  Args:
    seed: integer seed applied to every random number generator.
  """
  # Imported locally so the helper works even if the caller's cell order
  # has not brought these modules into scope yet.
  import random

  import numpy as np
  import torch

  # python RNG
  random.seed(seed)

  # numpy RNG
  np.random.seed(seed)

  # pytorch RNGs
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

  # cuDNN: force deterministic kernels and switch off the autotuner, which
  # may otherwise select different algorithms from one run to the next.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

### Model Configs


In [None]:
class config:
  """Central place for the run's seed, file locations, tokenizer and training hyper-parameters."""
  SEED = 42

  # Tab-separated dataset splits and the output folder, all located under ROOT
  TRAIN_FILE = ROOT + 'train.tsv'
  VAL_FILE = ROOT + 'dev.tsv'
  TEST_FILE = ROOT + 'test.tsv'
  SAVE_DIR = ROOT + 'outputs/'

  # Every sequence is padded / truncated to this many tokens by process_data
  MAX_LEN = 25

  # Pretrained checkpoint shared by the tokenizer and the encoder
  MODEL = 'bert-base-uncased'
  # `do_lower_case` is the BertTokenizerFast argument name; `lowercase` is not
  # a parameter of this tokenizer class.
  TOKENIZER = BertTokenizerFast.from_pretrained(MODEL, do_lower_case=True)

  EPOCHS = 10
  TRAIN_BATCH_SIZE = 32
  VALID_BATCH_SIZE = 32

### Data Processing

In [None]:
def process_data(file, ftype):
    """Load one TSV split, binarise its ratings and tokenise its text.

    Args:
        file: path to a tab-separated file with two columns, which are renamed
            to 'text' and 'rating'.
        ftype: "train", "test", or any other value (handled as validation).

    Returns:
        ftype == "test": tuple (input_ids, attention_mask, labels) of tensors.
        otherwise: tuple (labels Series, text Series, DataLoader). The loader
        yields (input_ids, attention_mask, labels) batches, sampled randomly
        for "train" and sequentially for anything else.
    """
    # NOTE(review): read_csv consumes the first line as a header row by
    # default - confirm the TSV files actually start with one.
    data = pd.read_csv(str(file), sep='\t')
    data.columns = ['text', 'rating']
    # Ratings of 0.8 and above form the positive class (1); everything else is 0
    data['rating'] = (data['rating'] >= 0.8).astype(int)
    data_text, data_rats = data['text'], data['rating']

    tokens = config.TOKENIZER.batch_encode_plus(
        data_text.tolist(),
        max_length = config.MAX_LEN,
        # replaces the deprecated pad_to_max_length=True
        padding = 'max_length',
        truncation = True
    )
    seqs = torch.tensor(tokens['input_ids'])
    mask = torch.tensor(tokens['attention_mask'])
    label = torch.tensor(data_rats.tolist())

    # The test split is returned as raw tensors rather than wrapped in a loader
    if ftype == "test":
        return seqs, mask, label

    tensor_data = TensorDataset(seqs, mask, label)
    if ftype == "train":
        data_sampler = RandomSampler(tensor_data)
        batch_size = config.TRAIN_BATCH_SIZE
    else:
        data_sampler = SequentialSampler(tensor_data)
        # non-training splits use the dedicated validation batch size
        batch_size = config.VALID_BATCH_SIZE
    dataloader = DataLoader(tensor_data, sampler=data_sampler, batch_size=batch_size)
    return data_rats, data_text, dataloader

In [None]:
# Train / validation splits come back as (labels, text, DataLoader).
train_labels, train_text, train_dataloader = process_data(config.TRAIN_FILE, "train")
val_labels, val_text, val_dataloader = process_data(config.VAL_FILE, "val")
# The "test" split returns raw tensors instead: (input_ids, attention_mask, labels).
test_seq, test_mask, test_labels = process_data(config.TEST_FILE, "test")



### BERT Model


In [None]:
class BertDWF(nn.Module):
  """Pretrained BERT encoder topped with a two-layer classification head.

  forward() returns log-probabilities over the two classes (LogSoftmax over
  dim 1), so the matching training criterion is nn.NLLLoss.
  """

  def __init__(self):
    super(BertDWF, self).__init__()
    self.bert = transformers.BertModel.from_pretrained(config.MODEL)
    # Reuse the configuration the encoder was loaded with instead of fetching
    # it a second time, and size the head from it rather than hard-coding 768
    # so a different checkpoint in config.MODEL keeps working.
    self.model_config = self.bert.config
    hidden_size = self.model_config.hidden_size

    self.layer1 = nn.Linear(hidden_size, 512)
    self.layer2 = nn.Linear(512, 2)
    self.dropout = nn.Dropout(0.1)
    self.relu =  nn.ReLU()
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, ids, mask):
    """Classify a batch of token-id sequences.

    Args:
      ids: token ids, passed to BERT as `input_ids`.
      mask: attention mask, passed to BERT as `attention_mask`.

    Returns:
      Log-probabilities for the two classes, one row per input sequence.
    """
    # With return_dict=False BERT returns a tuple; only its second element
    # (the pooled output) feeds the classification head.
    _, pooled = self.bert(input_ids = ids, attention_mask=mask, return_dict=False)

    hidden = self.dropout(self.relu(self.layer1(pooled)))
    logits = self.layer2(hidden)

    return self.softmax(logits)

In [None]:
# seed_all / config.SEED are defined above but were never invoked; seed every
# RNG before the model is built so the randomly initialised classification
# head is the same on every fresh run.
seed_all(config.SEED)
model = BertDWF()
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# 'balanced' class weights derived from the training labels, to offset class imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# All model parameters (encoder and head) are optimised with a small learning rate
optimizer = AdamW(model.parameters(), lr = 2e-5)

# The model outputs log-probabilities, hence the (class-weighted) negative log-likelihood loss
cross_entropy = nn.NLLLoss(weight=weights)

### Train model

In [None]:
def train():
  """Run one optimisation pass over `train_dataloader`.

  Returns:
    (avg_loss, total_preds): the mean per-batch loss, and the model outputs
    for every sample in iteration order, stacked into one array of shape
    (number of samples, no. of classes).
  """
  model.train()
  running_loss = 0
  batch_outputs = []
  n_batches = len(train_dataloader)

  for step, batch in enumerate(train_dataloader):
    # progress update every 100 batches
    if step > 0 and step % 100 == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

    sent_id, mask, labels = (t.to(device) for t in batch)

    # clear gradients left over from the previous step
    model.zero_grad()

    log_probs = model(sent_id, mask)
    loss = cross_entropy(log_probs, labels)
    running_loss += loss.item()

    loss.backward()
    # cap the gradient norm at 1.0 before the parameter update
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    batch_outputs.append(log_probs.detach().cpu().numpy())

  avg_loss = running_loss / n_batches
  return avg_loss, np.concatenate(batch_outputs, axis=0)

In [None]:
def evaluate():
  """Score the model on `val_dataloader` without updating any weights.

  Returns:
    (avg_loss, total_preds): the mean per-batch validation loss, and the
    model outputs for every validation sample stacked into one array.
  """
  print("\nEvaluating...")

  # eval mode deactivates the dropout layers
  model.eval()
  running_loss = 0
  batch_outputs = []
  n_batches = len(val_dataloader)

  # no gradients are needed anywhere in the validation pass
  with torch.no_grad():
    for step, batch in enumerate(val_dataloader):
      # progress update every 100 batches
      if step > 0 and step % 100 == 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

      sent_id, mask, labels = (t.to(device) for t in batch)

      log_probs = model(sent_id, mask)
      running_loss += cross_entropy(log_probs, labels).item()
      batch_outputs.append(log_probs.detach().cpu().numpy())

  avg_loss = running_loss / n_batches
  return avg_loss, np.concatenate(batch_outputs, axis=0)

In [None]:
# Lowest validation loss seen so far; +inf guarantees the first epoch is checkpointed
best_valid_loss = float('inf')

# Per-epoch loss history for training and validation
train_losses, valid_losses = [], []

for epoch in range(config.EPOCHS):
    print('\n Epoch {:} / {:}'.format(epoch + 1, config.EPOCHS))

    # one training pass followed by one validation pass; predictions are not needed here
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # keep only the weights of the best-validating epoch
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining Loss: {train_loss:.3f} \n Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch   100  of    547.
  Batch   200  of    547.
  Batch   300  of    547.
  Batch   400  of    547.
  Batch   500  of    547.

Evaluating...
  Batch   100  of    118.

Training Loss: 0.488 
 Validation Loss: 0.419

 Epoch 2 / 10
  Batch   100  of    547.
  Batch   200  of    547.
  Batch   300  of    547.
  Batch   400  of    547.
  Batch   500  of    547.

Evaluating...
  Batch   100  of    118.

Training Loss: 0.365 
 Validation Loss: 0.400

 Epoch 3 / 10
  Batch   100  of    547.
  Batch   200  of    547.
  Batch   300  of    547.
  Batch   400  of    547.
  Batch   500  of    547.

Evaluating...
  Batch   100  of    118.

Training Loss: 0.287 
 Validation Loss: 0.457

 Epoch 4 / 10
  Batch   100  of    547.
  Batch   200  of    547.
  Batch   300  of    547.
  Batch   400  of    547.
  Batch   500  of    547.

Evaluating...
  Batch   100  of    118.

Training Loss: 0.219 
 Validation Loss: 0.478

 Epoch 5 / 10
  Batch   100  of    547.
  Batch   200  of    547.
 

In [None]:
# Load the weights of the best model (lowest validation loss).
# map_location puts the checkpoint tensors on the device in use, so a
# checkpoint written on a GPU can also be restored on a CPU-only runtime.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

### Test Data

In [None]:
# get predictions for test data
# Switch to eval mode explicitly: if the training loop was interrupted in the
# middle of an epoch the model is still in train mode and dropout would
# perturb the test predictions.
model.eval()
with torch.no_grad():
  preds = model(test_seq.to(device), test_mask.to(device))
  preds = preds.detach().cpu().numpy()

In [None]:
# Pick the class with the highest log-probability for each test sample. A new
# name is used instead of overwriting `preds`, so the cell can be re-run.
pred_labels = np.argmax(preds, axis = 1)
# The gold test labels are in `test_labels` (returned by process_data);
# `test_y` is not defined anywhere in this notebook.
print(classification_report(test_labels.numpy(), pred_labels))

              precision    recall  f1-score   support

           0       0.92      0.78      0.84      2369
           1       0.71      0.89      0.79      1480

    accuracy                           0.82      3849
   macro avg       0.81      0.83      0.82      3849
weighted avg       0.84      0.82      0.82      3849

