# **BERT FINETUNE**

## **ENVIRONMENT SETUP**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Download the dataset

!rm arg_quality_rank_30k.csv
!wget "https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip"
!unzip *.zip
!rm *.zip
!rm readme.txt

rm: cannot remove 'arg_quality_rank_30k.csv': No such file or directory
--2023-04-10 12:28:11--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 52.116.220.135
Connecting to www.research.ibm.com (www.research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip [following]
--2023-04-10 12:28:13--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving research.ibm.com (research.ibm.com)... 52.116.220.135
Connecting to research.ibm.com (research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1524714 (1.5M) [application/zip]
Saving to: ‘IBM_Debater_(R)_arg_quality_rank_30k.zip’


2023-04-10 12:28:19 (329 KB/s) - ‘IBM_Debater_(R)_arg_quality_rank_30k.zi

## **IMPORT DATASET**

In [4]:
import pandas as pd

df = pd.read_csv("./arg_quality_rank_30k.csv")
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517


### Convert to Classes

In [5]:
import numpy as np
from sklearn.cluster import KMeans

In [6]:
kmeans = KMeans(n_clusters=2, n_init="auto", max_iter=1000, random_state=431)
X = np.array(df["WA"]).reshape(-1, 1)
X = kmeans.fit_predict(X)
df["class"] = X

In [7]:
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf,class
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0,1
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0,1
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0,1
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0,1
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517,0


### Split into Train and Test Sets

In [8]:
df_train = df[df["set"] != "test"].reset_index(drop=True) # Combine train and dev into train set
df_train = df_train.drop(["set"], axis=1)

df_test = df[df["set"] == "test"].reset_index(drop=True)
df_test = df_test.drop(["set"], axis=1)

### Display Dataset Metrics

In [9]:
print(f"Length of dataset = {len(df)}")
print(f"Number of training data = {len(df_train)}")
print(f"Number of testing data = {len(df_test)}")

Length of dataset = 30497
Number of training data = 24182
Number of testing data = 6315


In [10]:
print(f"Number of Topics = {len(np.unique(df.topic))}")
print(f"Number of Topics in training data = {len(np.unique(df_train.topic))}")
print(f"Number of Topics in testing data = {len(np.unique(df_test.topic))}")

Number of Topics = 71
Number of Topics in training data = 56
Number of Topics in testing data = 15


In [11]:
print(f"Number of Classes = {len(np.unique(df['class']))}")

for label in np.unique(df["class"]):
    print(f"Number of Class {label} in training data = {len(df_train[df_train['class']==label])}")
    print(f"Number of Class {label} in testing data = {len(df_test[df_test['class']==label])}")

Number of Classes = 2
Number of Class 0 in training data = 6987
Number of Class 0 in testing data = 1979
Number of Class 1 in training data = 17195
Number of Class 1 in testing data = 4336


## **TEXT CLEANING**

In [12]:
import re

In [13]:
def clean_text_bert(text):
    text = text.lower() # Convert the text into lowercase
    text = text.replace('</br>', '') # Remove </br>
    text = text.replace('\n', '') # Remove \n
    
    # Remove quotes
    text = re.sub(r"\'", "", text) 
    text = re.sub(r"\"", "", text) 
    
    text = re.sub(r"[^\w]", " ", text) # Remove all symbols

    text = re.sub(r'[ ]{2,}', ' ', text) # Remove extra spaces
    text = re.sub(r'[ \t]+$', '', text) # Remove trailing white spaces
    
    return text

In [14]:
df_train["argument"] = df_train["argument"].apply(clean_text_bert)
df_test["argument"] = df_test["argument"].apply(clean_text_bert)

In [15]:
df_train["argument"]

0        marriage isnt keeping up with the times abando...
1         a multi party system would be too confusing a...
2         ero tolerance policy in schools should not be...
3         people reach their limit when it comes to the...
4        100 agree should they do that it would be a go...
                               ...                        
24177    zoos trap animals into a meaningless life only...
24178    zoos treat animals badly they should be closed...
24179    zoos unfairly imprison animals and cause them ...
24180    zoos work as educational centers and are not t...
24181           zoos work to help breed endangered species
Name: argument, Length: 24182, dtype: object

## **PRE-CONFIG FOR BERT**

In [16]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [17]:
MAX_LEN = np.max([
    np.max([len(x) for x in df_train["argument"]]), 
    np.max([len(x) for x in df_test["argument"]])
])

BATCH_SIZE = 32
LEARNING_RATE = 2e-5

## **BUILD DATASET FOR BERT**

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
from torch.utils.data import Dataset

class ModelDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df["argument"].values
        self.tokenizer = tokenizer
        self.targets = df["class"].values
        
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'targets': torch.tensor(self.targets[index], dtype=torch.float).to(device)
        }

In [20]:
train_data = ModelDataset(df_train, tokenizer, MAX_LEN)
test_data = ModelDataset(df_test, tokenizer, MAX_LEN)

In [21]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

## **BUILD CUSTOM MODEL**

In [22]:
from torch import nn
from transformers import AutoModel

class CustomModel(nn.Module):
    
    def __init__(self):
        super(CustomModel, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.out_layer = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, features = self.bert(
            ids, token_type_ids=token_type_ids,
            attention_mask=mask, return_dict=False
        )
        output = self.out_layer(features)
        return output

In [23]:
model = CustomModel().to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **TRAIN MODEL**

In [24]:
loss_fn = nn.BCEWithLogitsLoss() # Loss function
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE) # Optimizer

In [25]:
# Define function to train the model

from tqdm import tqdm

def train(epoch):
    
    model.train()
    
    for batch in tqdm(train_loader):
        
        optimizer.zero_grad()
        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs.view(-1), targets)
        
        loss.backward()
        optimizer.step()
        
        
    print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')

In [26]:
# Define function for model evaluation

def validation(data_loader):
    
    model.eval()
    targets = []
    outputs = []
    
    with torch.no_grad():
        
        for batch in data_loader:
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            batch_targets = batch['targets'].to(device, dtype = torch.float)
            
            batch_outputs = model(ids, mask, token_type_ids)
            
            targets.extend(batch_targets.cpu().numpy().tolist())
            outputs.extend(batch_outputs.cpu().numpy().tolist())
            
    return outputs, targets

In [27]:
# Perform model training

from copy import deepcopy
from sklearn import metrics

best_score = -np.inf
best_weights = None
history = []

EPOCHS = 3

for epoch in range(EPOCHS):
    train(epoch)

    outputs, targets = validation(test_loader)
    outputs = np.array(outputs) >= 0.5

    score = metrics.f1_score(targets, outputs, average='weighted')

    history.append(score)
    print(f"Validation f1-score: {score:>.4f}")

    if score > best_score:
        best_score = score
        best_weights = deepcopy(model.state_dict())

100%|██████████| 756/756 [15:47<00:00,  1.25s/it]


Epoch: 1, Loss:  0.5989481210708618
Validation f1-score: 0.7201


100%|██████████| 756/756 [15:45<00:00,  1.25s/it]


Epoch: 2, Loss:  0.28935593366622925
Validation f1-score: 0.7106


100%|██████████| 756/756 [15:45<00:00,  1.25s/it]


Epoch: 3, Loss:  0.5452108383178711
Validation f1-score: 0.7035


## **EVALUATE CUSTOM MODEL**

In [28]:
model.load_state_dict(best_weights)
outputs, targets = validation(test_loader)
outputs = np.array(outputs) >= 0.5

In [29]:
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_w_avg = metrics.f1_score(targets, outputs, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(targets, outputs))

Accuracy Score = 0.7266825019794141
F1 Score (Weighted) = 0.7200956986319198
              precision    recall  f1-score   support

         0.0       0.58      0.49      0.53      1979
         1.0       0.78      0.84      0.81      4336

    accuracy                           0.73      6315
   macro avg       0.68      0.66      0.67      6315
weighted avg       0.72      0.73      0.72      6315



## **SAVE MODEL**

In [30]:
torch.save(best_weights, "./bert_cls_finetune.pt")
torch.save(best_weights, "/content/drive/MyDrive/bert_cls_finetune.pt")

In [31]:
saved_model = CustomModel().to(device)
saved_model.load_state_dict(torch.load("./bert_cls_finetune.pt"))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [32]:
saved_model.eval()

targets = []
outputs = []

with torch.no_grad():
    for batch in test_loader:
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        batch_targets = batch['targets'].to(device, dtype = torch.float)
        
        batch_outputs = saved_model(ids, mask, token_type_ids)
        
        targets.extend(batch_targets.cpu().numpy().tolist())
        outputs.extend(batch_outputs.cpu().numpy().tolist())

In [33]:
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_w_avg = metrics.f1_score(targets, outputs, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(targets, outputs))

Accuracy Score = 0.7266825019794141
F1 Score (Weighted) = 0.7200956986319198
              precision    recall  f1-score   support

         0.0       0.58      0.49      0.53      1979
         1.0       0.78      0.84      0.81      4336

    accuracy                           0.73      6315
   macro avg       0.68      0.66      0.67      6315
weighted avg       0.72      0.73      0.72      6315

