# Fine-tuned `bert-base-uncased` on AG News

In [1]:
import os
import random

import pandas as pd
import numpy as np
import scipy as sp
import torch
import spacy
from torch.utils.data import \
    TensorDataset, \
    DataLoader
from transformers import \
    BertTokenizer, \
    BertForSequenceClassification, \
    AdamW, \
    BertConfig, \
    get_linear_schedule_with_warmup
import pytreebank
from tqdm import tqdm
import shap
from checklist.perturb import Perturb

In [2]:
# os.chdir('../..')

In [None]:
# Show the current working directory; the relative os.chdir call further down depends on it.
os.getcwd()

In [None]:
# Colab-only cell: the `google.colab` package exists only inside a Colab runtime.
# Mounts Google Drive at /content/drive so files stored on Drive become reachable.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Switch into the project folder on the mounted Drive so the `src.*` imports and
# the relative "models/..." paths used later resolve.
# NOTE(review): the path is relative, so this presumably expects the working
# directory to be /content (the parent of the Drive mount point) — verify.
os.chdir('drive/My Drive/Colab Notebooks/Github/ucl-nlp-group-project')

In [5]:
from src.data.dataload import load_sst, load_agnews
from src.models.bert_utils import \
    pad_sentence_at_end, \
    create_sentence_input_arrays, \
    AGN_MAX_LENGTH, \
    AGN_BERT_HYPERPARAMETERS, \
    AGN_NUM_LABELS, \
    fine_tune_bert, \
    make_predictions

In [6]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


## AG News

In [7]:
# Load AG News through the project helper; the returned object exposes the
# three data splits via its `train_val_test` attribute (unpacked in the next cell).
agnews = load_agnews()

In [17]:
# Unpack the three splits, then display each split's (rows, columns) shape.
train_agn, dev_agn, test_agn = agnews.train_val_test
tuple(split.shape for split in (train_agn, dev_agn, test_agn))

Using custom data configuration default
Reusing dataset ag_news (/Users/stevengeorge/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


((108000, 2), (12000, 2), (7600, 2))

In [9]:
# Peek at the first rows of the training split.
train_agn.head()

Unnamed: 0,label,text
115955,0,14 executed bodies found in Mosul MOSUL: The b...
9140,0,Iraqi ministers escape attacks BAGHDAD (BBC)--...
50651,0,Stocks Are Mixed As Oil Prices Climb NEW YORK ...
66129,0,"The missing Middle East: Bush, Kerry sidestep ..."
82828,1,Harbhajan and Kartik spin India to stunning vi...


In [19]:
# Rename the raw `text` column to `sentence`, the column name passed to
# `make_predictions` later in the notebook.
# Rebind to the frame returned by `rename` instead of using `inplace=True`:
# this leaves the frames handed out by `agnews.train_val_test` untouched and
# avoids SettingWithCopyWarning if those frames are slices of a larger frame.
# The cell is safe to re-run, because renaming a missing column is a no-op.
train_agn = train_agn.rename(columns={'text': 'sentence'})
dev_agn = dev_agn.rename(columns={'text': 'sentence'})

In [11]:
# Fraction of training rows carrying each class label, ordered by label id.
train_agn['label'].value_counts().div(len(train_agn)).sort_index()

0    0.249944
1    0.250620
2    0.250120
3    0.249315
Name: label, dtype: float64

In [13]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train_agn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108000 entries, 115955 to 69977
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   label     108000 non-null  int64 
 1   sentence  108000 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


### Tokenization

In [14]:
# Tokenizer for the `bert-base-uncased` checkpoint; `do_lower_case=True`
# lower-cases the text before it is tokenized.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [15]:
# Encode every training text into a list of token ids, with the tokenizer's
# special tokens included (`add_special_tokens=True`).
train_encoded_sentences = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in train_agn['sentence'].values
]

In [20]:
# Encode every dev text into a list of token ids, with the tokenizer's
# special tokens included (`add_special_tokens=True`).
dev_encoded_sentences = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in dev_agn['sentence'].values
]

In [21]:
# Build the token-id array and matching attention-mask array for each split
# with the project helper, using AGN_MAX_LENGTH as the target length.
# (Presumably sequences are padded/truncated to that length — see bert_utils.)
train_array, train_attention_mask_array = create_sentence_input_arrays(train_encoded_sentences, AGN_MAX_LENGTH)
dev_array, dev_attention_mask_array = create_sentence_input_arrays(dev_encoded_sentences, AGN_MAX_LENGTH)

In [22]:
# Sanity check: each id array and its attention mask should share one shape per split.
tuple(
    arr.shape
    for arr in (train_array, train_attention_mask_array, dev_array, dev_attention_mask_array)
)

((108000, 380), (108000, 380), (12000, 380), (12000, 380))

Convert to tensors

In [24]:
# Copy the input arrays, attention masks and label vectors into torch tensors,
# one (inputs, mask, labels) triple per split.
train_tensor, train_attention_mask_tensor, train_labels_tensor = map(
    torch.tensor,
    (train_array, train_attention_mask_array, train_agn['label'].values),
)

dev_tensor, dev_attention_mask_tensor, dev_labels_tensor = map(
    torch.tensor,
    (dev_array, dev_attention_mask_array, dev_agn['label'].values),
)

In [25]:
# Bundle (input ids, attention mask, label) per example so a DataLoader can batch them together.
train_dataset = TensorDataset(
    train_tensor,
    train_attention_mask_tensor,
    train_labels_tensor,
)
dev_dataset = TensorDataset(
    dev_tensor,
    dev_attention_mask_tensor,
    dev_labels_tensor,
)

In [27]:
# Batch both splits with the configured batch size. Only the training loader
# shuffles; the dev loader keeps the dataset order.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=AGN_BERT_HYPERPARAMETERS['batch_size'],
    shuffle=True,
)
dev_data_loader = DataLoader(
    dev_dataset,
    batch_size=AGN_BERT_HYPERPARAMETERS['batch_size'],
)

## Fine-tune BERT

Run this section on Google Colab (the Drive mount and `os.chdir` cells above assume a Colab runtime).

In [30]:
# Fine-tune BERT on the training loader via the project helper, passing the dev
# loader, the number of AG News classes and the AG News hyperparameter dict.
bert_agn = fine_tune_bert(
    device, train_data_loader, dev_data_loader,
    num_labels=AGN_NUM_LABELS,
    hyperparameter_dict=AGN_BERT_HYPERPARAMETERS,
)

In [None]:
# Persist the fine-tuned model to a directory relative to the current working
# directory, so it can be reloaded without repeating the fine-tuning step.
bert_agn.save_pretrained("models/fine-tuned-bert-base-agn")

## Load model

In [None]:
# Reload the saved fine-tuned model from disk; this rebinds `bert_agn`, so the
# cells below work whether or not fine-tuning was run in this session.
bert_agn = BertForSequenceClassification.from_pretrained("models/fine-tuned-bert-base-agn")

In [None]:
%%capture
# Move the model onto the selected device. `.to()` returns the module, so
# %%capture is used to hide its long printed representation.
bert_agn.to(device)

## Make predictions

In [21]:
# Preview the frame that is about to be scored.
# NOTE(review): the saved output for this cell shows movie-review style rows with
# a label value of 4, which cannot come from the AG News frame (labels 0-3).
# It looks like stale output from a different run — re-execute from a fresh kernel.
train_agn.head()

Unnamed: 0,sentence,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer/composer Bryan Adams contributes a slew...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3


In [None]:
# Predict on the training frame with the project helper, reading the text
# from the 'sentence' column.
predictions = make_predictions(
    train_agn, bert_agn, tokenizer, 'sentence',
    device, AGN_MAX_LENGTH, AGN_BERT_HYPERPARAMETERS,
)

In [None]:
# Fraction of predictions that equal the true label.
# NOTE(review): this is computed on the training split, so it is not a
# held-out estimate; use the dev or test split for generalisation accuracy.
(train_agn['label'].values == predictions).mean()