# Fine-tuned `bert-base-uncased` on SST

In [1]:
import os
import random

import pandas as pd
import numpy as np
import scipy as sp
import torch
import spacy
from torch.utils.data import \
    TensorDataset, \
    DataLoader
from transformers import \
    BertTokenizer, \
    BertForSequenceClassification, \
    AdamW, \
    BertConfig, \
    get_linear_schedule_with_warmup
import pytreebank
from tqdm import tqdm
import shap
from checklist.perturb import Perturb

In [3]:
# Disabled by default. Presumably meant for local runs, to move two directories up
# (to the repository root) so that `src` is importable -- TODO confirm before enabling.
# os.chdir('../..')

In [None]:
# Show the current working directory; relative paths used later in the notebook
# (e.g. the `models/` directory) resolve against it.
os.getcwd()

In [None]:
# Colab only: `google.colab` is not available in a regular local environment.
# Mounts Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab only: switch into the project checkout on the mounted Drive.
# The path is relative, so it resolves against the current working directory.
os.chdir('drive/My Drive/Colab Notebooks/Github/ucl-nlp-group-project')

In [19]:
from src.data.dataload import load_sst, load_agnews
from src.models.bert_utils import \
    pad_sentence_at_end, \
    create_sentence_input_arrays, \
    SST_MAX_LENGTH, \
    SST_BERT_HYPERPARAMETERS, \
    SST_NUM_LABELS, \
    fine_tune_bert, \
    make_predictions

In [5]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


## SST

In [6]:
# Load the SST dataset through the project helper imported from src.data.dataload.
sst = load_sst()

In [7]:
# Unpack the train / dev / test splits exposed by the loaded dataset,
# then display the shape of each split.
train_sst, dev_sst, test_sst = sst.train_val_test
tuple(split.shape for split in (train_sst, dev_sst, test_sst))

((8544, 2), (1101, 2), (2210, 2))

In [8]:
# Preview the first rows of the training split.
train_sst.head()

Unnamed: 0,sentence,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer/composer Bryan Adams contributes a slew...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3


In [9]:
# Relative frequency of each label in the training split, ordered by label value.
train_sst['label'].value_counts().div(len(train_sst)).sort_index()

0    0.127809
1    0.259597
2    0.190075
3    0.271770
4    0.150749
Name: label, dtype: float64

In [10]:
# Summarise the training split: column dtypes, non-null counts and memory usage.
train_sst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8544 entries, 0 to 8543
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  8544 non-null   object
 1   label     8544 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 133.6+ KB


### Tokenization

In [11]:
# Tokenizer for the `bert-base-uncased` checkpoint; `do_lower_case=True` lower-cases
# the text before tokenization, matching the uncased model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [12]:
# Encode every training sentence into a list of token ids, special tokens included.
train_encoded_sentences = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in train_sst['sentence'].values
]

In [13]:
# Encode every dev sentence into a list of token ids, special tokens included.
dev_encoded_sentences = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in dev_sst['sentence'].values
]

In [14]:
# Turn the encoded sentences into an input array plus a matching attention-mask
# array via the project helper (presumably padded to SST_MAX_LENGTH -- confirm in
# src.models.bert_utils).
train_array, train_attention_mask_array = create_sentence_input_arrays(
    train_encoded_sentences, SST_MAX_LENGTH
)

# Same treatment for the dev split.
dev_array, dev_attention_mask_array = create_sentence_input_arrays(
    dev_encoded_sentences, SST_MAX_LENGTH
)

In [15]:
# Shapes of the input and attention-mask arrays for the train and dev splits.
tuple(
    arr.shape
    for arr in (
        train_array,
        train_attention_mask_array,
        dev_array,
        dev_attention_mask_array,
    )
)

((8544, 70), (8544, 70), (1101, 70), (1101, 70))

Convert the input arrays, attention masks and labels to PyTorch tensors.

In [16]:
# Training split: inputs, attention masks and labels as tensors.
train_tensor, train_attention_mask_tensor, train_labels_tensor = map(
    torch.tensor,
    (train_array, train_attention_mask_array, train_sst['label'].values),
)

# Dev split: same three tensors.
dev_tensor, dev_attention_mask_tensor, dev_labels_tensor = map(
    torch.tensor,
    (dev_array, dev_attention_mask_array, dev_sst['label'].values),
)

In [17]:
# Bundle inputs, attention masks and labels so that each dataset item is one
# (input, attention mask, label) triple.
train_dataset = TensorDataset(
    train_tensor,
    train_attention_mask_tensor,
    train_labels_tensor,
)
dev_dataset = TensorDataset(
    dev_tensor,
    dev_attention_mask_tensor,
    dev_labels_tensor,
)

In [18]:
# Both loaders use the batch size from the SST hyperparameters.
sst_batch_size = SST_BERT_HYPERPARAMETERS['batch_size']

# Training batches are reshuffled on every pass; dev batches keep dataset order.
train_data_loader = DataLoader(train_dataset, batch_size=sst_batch_size, shuffle=True)
dev_data_loader = DataLoader(dev_dataset, batch_size=sst_batch_size)

## Fine-tune BERT

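Run this section on Google Colab (ideally with a GPU runtime); the CPU-only run shown below was interrupted before fine-tuning finished.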

In [20]:
# Fine-tune through the project helper with the SST label count and hyperparameters.
# The result is kept in `bert_sst` and is saved with `save_pretrained` further down.
bert_sst = fine_tune_bert(
    device, train_data_loader, dev_data_loader,
    num_labels=SST_NUM_LABELS,
    hyperparameter_dict=SST_BERT_HYPERPARAMETERS,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model under models/fine-tuned-bert-base-sst
# (relative to the current working directory).
bert_sst.save_pretrained("models/fine-tuned-bert-base-sst")

## Load model

In [None]:
# Load the classifier from the directory the fine-tuned model is saved to
# (relative to the current working directory).
bert_sst = BertForSequenceClassification.from_pretrained("models/fine-tuned-bert-base-sst")

In [None]:
%%capture
# Move the model to the selected device. The %%capture magic above must stay on the
# first line of the cell; it suppresses the cell's output.
bert_sst.to(device)

## Make predictions

In [21]:
# Look at the first training rows again before predicting on them.
train_sst.head()

Unnamed: 0,sentence,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer/composer Bryan Adams contributes a slew...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3


In [None]:
# Run the model over the training split through the project helper.
# 'sentence' presumably names the text column to read -- confirm in src.models.bert_utils.
predictions = make_predictions(
    train_sst, bert_sst, tokenizer,
    'sentence',
    device, SST_MAX_LENGTH, SST_BERT_HYPERPARAMETERS,
)

In [None]:
# Share of training rows whose prediction equals the gold label.
is_correct = train_sst['label'].values == predictions
is_correct.mean()