# BERT

In this notebook, I will use Google's BERT (Bidirectional Encoder Representations from Transformers), which is built on the attention mechanism. I'll try to use PyTorch instead of TensorFlow.

In [None]:
# Mount Google Drive so the notebook can read the project files stored there.
# This cell imports google.colab, so it only runs inside a Colab runtime.
from google.colab import drive
from os.path import join  # NOTE(review): `join` is not used in any visible cell
ROOT = '/content/drive'  # mount point passed to drive.mount below
drive.mount(ROOT)

In [None]:
cd 'drive/My Drive/Colab Notebooks/tweet_sentiment_analysis'

In [None]:
#pip install transformers

In [None]:
#pip install pandas -U

In [None]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import *
from torch.nn.utils import clip_grad_norm_

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
# Check that PyTorch can see a CUDA device. Later cells call model.cuda() and
# build torch.device('cuda'), so this should display True before continuing.
torch.cuda.is_available()

In [None]:
# Load the train/validation features (pickled) and targets (CSV), using paths
# relative to the current working directory.
# NOTE(review): read_pickle deserializes arbitrary Python objects -- only load
# pickle files that come from a trusted source.
X_train = pd.read_pickle('PKL/X_train_fin.pkl')
X_val = pd.read_pickle('PKL/X_val_fin.pkl')
# index_col=0: the first CSV column is used as the index.
y_train = pd.read_csv('DATA/y_train.csv', index_col=0)
y_val = pd.read_csv('DATA/y_val.csv', index_col=0)

In [None]:
# Keep only the 'tweet' entry of each feature set.
# NOTE(review): the names are rebound in place, so this cell cannot be re-run
# without first re-running the loading cell.
X_train = X_train['tweet']
X_val = X_val['tweet']

In [None]:
# Keep only the 'sentiment' entry of each target set.
# NOTE(review): the names are rebound in place, so this cell cannot be re-run
# without first re-running the loading cell.
y_train = y_train['sentiment']
y_val = y_val['sentiment']

## Preprocess
Turning texts into tokens

In [None]:
# Load the WordPiece tokenizer that matches the 'bert-base-uncased' checkpoint
# used for the model below. (`from_pt` is a model-weights option and has no
# meaning for a tokenizer, so it is not passed here.)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Encode the tweets as token ids and pad them to a common length.

In [None]:
# Encode the training tweets as PyTorch tensors ('input_ids' and
# 'attention_mask'; token type ids are not needed for single sentences).
# padding='max_length' together with truncation=True makes every row exactly
# `max_length` tokens long. The previous combination (padding=True plus the
# deprecated pad_to_max_length flag, and no truncation) padded to the longest
# tweet instead, so max_length was not actually enforced.
X_tr_ids = tokenizer.batch_encode_plus(list(X_train),
                                       padding='max_length',
                                       truncation=True,
                                       max_length=30,
                                       return_token_type_ids=False,
                                       return_tensors='pt')

In [None]:
# Encode the validation tweets as PyTorch tensors ('input_ids' and
# 'attention_mask'; token type ids are not needed for single sentences).
# padding='max_length' together with truncation=True makes every row exactly
# `max_length` tokens long. The previous combination (padding=True plus the
# deprecated pad_to_max_length flag, and no truncation) padded to the longest
# tweet instead, so max_length was not actually enforced.
X_val_ids = tokenizer.batch_encode_plus(list(X_val),
                                        padding='max_length',
                                        truncation=True,
                                        max_length=30,
                                        return_token_type_ids=False,
                                        return_tensors='pt')

Converting y_values

In [None]:
# Map each distinct label to an integer class index. The labels are sorted
# first so the mapping is identical on every run: iterating a bare set() has
# no guaranteed order, which made the class indices change between kernels.
# The number of classes is taken from the data instead of being hard-coded.
# NOTE(review): assumes the labels are mutually comparable (e.g. all strings).
classes_ind = {label: idx for idx, label in enumerate(sorted(set(y_train)))}

# Replace the raw labels with tensors of class indices. A validation label
# that never occurs in the training labels raises KeyError here.
y_train = torch.tensor([classes_ind[y] for y in y_train])
y_val = torch.tensor([classes_ind[y] for y in y_val])

### Creating the tensor datasets for PyTorch
Now we have the tensors. Let's create the dataloaders.

In [None]:
# Training data: (input ids, attention mask, label) triples, drawn in random
# order in batches of 32.
X_train_set = TensorDataset(
    X_tr_ids['input_ids'],
    X_tr_ids['attention_mask'],
    y_train,
)
tr_dataloader = DataLoader(
    X_train_set,
    batch_size=32,
    sampler=RandomSampler(X_train_set),
)

# Validation data: same triple layout, read in its original order.
X_val_set = TensorDataset(
    X_val_ids['input_ids'],
    X_val_ids['attention_mask'],
    y_val,
)
val_dataloader = DataLoader(
    X_val_set,
    batch_size=32,
    sampler=SequentialSampler(X_val_set),
)


### Modeling
Now it's time to fine-tune the pretrained model.

In [None]:
# Load the pretrained BERT encoder with a sequence-classification head.
# The size of the head comes from the label mapping built earlier rather than
# from a separate hard-coded number, so the two cannot drift apart.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = len(classes_ind))

The code below is adapted from these two sources: https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03 and https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py


In [None]:
# Move the model's parameters to the GPU. The batches are moved to the same
# device by copy_GPU before each forward pass.
model.cuda()

In [43]:
# Split the parameters into two optimizer groups: biases and LayerNorm weights
# are excluded from weight decay, everything else is decayed at 0.01.
#
# Two fixes relative to the earlier version of this cell:
#   * the per-group key is 'weight_decay' -- AdamW does not read
#     'weight_decay_rate', so the intended decay was never applied;
#   * the LayerNorm parameters are matched by 'LayerNorm.weight' ('bias' already
#     covers 'LayerNorm.bias'); the old 'gamma'/'beta' substrings do not occur
#     in this model's parameter names.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# Training hyper-parameters.
epochs = 2

# AdamW over the two parameter groups, with a learning rate of 5e-5.
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)

# Total number of optimizer updates across all epochs (batches per epoch
# times epochs).
steps = epochs * len(tr_dataloader)

In [None]:
from sklearn.metrics import f1_score, balanced_accuracy_score

In [None]:
# macro f1 score
def macro_f1(preds, labels):
  """Return the macro-averaged F1 score of the arg-max predictions.

  preds  -- 2-D array of per-class scores, one row per sample.
  labels -- array of true class indices.
  """
  predicted = np.argmax(preds, axis=1).flatten()
  truth = labels.flatten()
  return f1_score(truth, predicted, average='macro')

def avg_accuracy(preds, labels):
  """Return the balanced accuracy of the arg-max predictions.

  preds  -- 2-D array of per-class scores, one row per sample.
  labels -- array of true class indices.
  """
  predicted = np.argmax(preds, axis=1).flatten()
  truth = labels.flatten()
  return balanced_accuracy_score(truth, predicted)

In [None]:
# Target device for the batches; copy_GPU moves every tensor of a batch here.
device = torch.device('cuda')

In [None]:
# copying to GPU
def copy_GPU(data):
  """Move one batch to `device`.

  data -- indexable batch whose first three items are the input ids, the
          attention mask and the labels, in that order.

  Returns the tuple (input_ids, input_mask, labels) on `device`.
  """
  return data[0].to(device), data[1].to(device), data[2].to(device)

# Fine-tune the model for `epochs` epochs and evaluate on the validation set
# after each one. One dict of metrics per epoch is appended to `history`.
#
# Changes relative to the earlier version of this loop:
#   * macro-F1 and balanced accuracy are computed once per epoch over the whole
#     validation set. Averaging per-batch scores distorts both metrics, because
#     a batch of 32 often contains no sample of the rarest class.
#   * the model output is indexed (outputs[0], outputs[1]) instead of being
#     tuple-unpacked, which works whether the model returns a plain tuple or an
#     output object.
history = []
# training
for epoch in range(epochs):
  total_loss = 0.0
  model.train()
  for data in tr_dataloader:
    # reset gradient
    optimizer.zero_grad()

    input_ids, input_mask, labels = copy_GPU(data)

    # forward pass; passing `labels` makes the model return the loss first
    outputs = model(input_ids, attention_mask = input_mask, labels=labels)
    loss = outputs[0]

    total_loss += loss.item()

    # backward pass
    loss.backward()

    # gradient clipping is intentionally left disabled, as before
    #clip_grad_norm_(model.parameters(), 1.0)

    # update step
    optimizer.step()

  avg_tr_loss = total_loss / len(tr_dataloader)
  print(f"Epoch {epoch+1} -- avg training loss: {round(avg_tr_loss, 4)}")

  #evaluation
  model.eval()
  total_val_loss = 0.0
  epoch_logits = []  # per-batch logits, concatenated after the loop
  epoch_labels = []  # per-batch true labels, concatenated after the loop
  for data in val_dataloader:
    input_ids, input_mask, labels = copy_GPU(data)

    with torch.no_grad():
      outputs = model(input_ids, attention_mask = input_mask,
                      labels=labels)

    loss, logits = outputs[0], outputs[1]
    total_val_loss += loss.item()
    epoch_logits.append(logits.detach().cpu().numpy())
    epoch_labels.append(labels.to('cpu').numpy())

  # score the whole validation set at once
  val_logits = np.concatenate(epoch_logits)
  val_label_ids = np.concatenate(epoch_labels)
  avg_val_f1 = macro_f1(val_logits, val_label_ids)
  avg_val_acc = avg_accuracy(val_logits, val_label_ids)
  avg_val_loss = total_val_loss / len(val_dataloader)
  print(f"  avg validation loss: {round(avg_val_loss, 4)}")
  print(f"  avg validation accuracy: {round(avg_val_acc, 4)}")
  print(f"  avg validation f1 score: {round(avg_val_f1, 4)}")

  history.append({'epoch': epoch + 1,
                  'training loss': avg_tr_loss,
                  'validation loss': avg_val_loss,
                  'validation accuracy': avg_val_acc,
                  'validation F1': avg_val_f1
                  })
print('training completed')

In [None]:
# One row per epoch, with the columns taken from the keys of the dicts
# appended to `history` by the training loop.
results = pd.DataFrame(history)

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

# Plot the learning curve.
plt.plot(results['training loss'], label="Train")
plt.plot(results['validation loss'], label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([0, 1, 2, 3], [1, 2, 3, 4])

plt.show()

In [None]:
# validation predictions
# Collect the logits and the true class indices for every validation sample,
# in dataloader order. The model is put in eval mode explicitly so this cell
# does not depend on the state left behind by the training cell, and the
# output is indexed rather than tuple-unpacked so it works whether the model
# returns a plain tuple or an output object.
model.eval()
y_pred = []   # one row of logits per sample
y_test0 = []  # matching true class indices
for data in val_dataloader:
    input_ids, input_mask, labels = copy_GPU(data)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask = input_mask,
                        labels=labels)

    # with `labels` supplied, index 0 is the loss and index 1 the logits
    logits = outputs[1].detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    y_pred.extend(logits)
    y_test0.extend(label_ids)


In [88]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, classification_report

In [89]:
# Reverse of classes_ind: class index -> original label, used to turn
# predictions back into readable class names.
inv_classes = {v:k for k, v in classes_ind.items()}

In [90]:
def change_to_classes(target):
    """Turn rows of per-class scores into a list of class labels.

    Takes the arg-max of each row of `target` and looks the resulting index up
    in `inv_classes`.
    """
    return [inv_classes[idx] for idx in np.argmax(target, axis = 1)]

def evaluate(y_pred, target):
    """Print accuracy, Cohen's kappa and a classification report.

    y_pred -- per-class scores, one row per sample (arg-max gives the class).
    target -- iterable of true class indices.

    Both are mapped back to their labels through `inv_classes` before scoring.
    """
    true_names = [inv_classes[idx] for idx in target]
    pred_names = change_to_classes(y_pred)

    accuracy = accuracy_score(true_names, pred_names)
    print('Accuracy', round(accuracy, 4))
    kappa = cohen_kappa_score(true_names, pred_names)
    print('Cohens Kappa', round(kappa, 4))

    print(classification_report(true_names, pred_names))

In [91]:
# Score the collected validation logits (y_pred) against the true class
# indices (y_test0).
evaluate(y_pred, y_test0)

Accuracy 0.6711
Cohens Kappa 0.3069
                                    precision    recall  f1-score   support

                  Negative emotion       0.44      0.10      0.16        71
No emotion toward brand or product       0.69      0.86      0.76       666
                  Positive emotion       0.63      0.45      0.52       385

                          accuracy                           0.67      1122
                         macro avg       0.59      0.47      0.48      1122
                      weighted avg       0.65      0.67      0.64      1122



In [64]:
# Display the validation labels (a tensor of class indices after the
# label-conversion cell).
# NOTE(review): this cell's execution count is out of sequence with the cells
# above -- confirm it is still needed.
y_val

tensor([2, 1, 1,  ..., 2, 1, 1])