# **BERT FINETUNE**

## **ENVIRONMENT SETUP**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q transformers

In [3]:
# Download the dataset

!rm -r data*
!wget http://argumentation.bplaced.net/arguana-data/dagstuhl-15512-argquality-corpus-v2.zip
!unzip dagstuhl-15512-argquality-corpus-v2.zip
!rm *.zip
!rm -r __MACOSX
!mv dagstuhl-15512-argquality-corpus-v2 data

--2023-04-18 13:59:19--  http://argumentation.bplaced.net/arguana-data/dagstuhl-15512-argquality-corpus-v2.zip
Resolving argumentation.bplaced.net (argumentation.bplaced.net)... 162.55.0.134, 2a01:4f8:252:1ee::2
Connecting to argumentation.bplaced.net (argumentation.bplaced.net)|162.55.0.134|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827888 (808K) [application/zip]
Saving to: ‘dagstuhl-15512-argquality-corpus-v2.zip’


2023-04-18 13:59:22 (258 KB/s) - ‘dagstuhl-15512-argquality-corpus-v2.zip’ saved [827888/827888]

Archive:  dagstuhl-15512-argquality-corpus-v2.zip
   creating: dagstuhl-15512-argquality-corpus-v2/
  inflating: dagstuhl-15512-argquality-corpus-v2/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/dagstuhl-15512-argquality-corpus-v2/
  inflating: __MACOSX/dagstuhl-15512-argquality-corpus-v2/._.DS_Store  
   creating: dagstuhl-15512-argquality-corpus-v2/dagstuhl-15512-argquality-corpus-annotated-xmi/
  inflating: dagstuhl-15512-argqualit

## **IMPORT DATASET**

In [4]:
import pandas as pd

df = pd.read_csv("data/dagstuhl-15512-argquality-corpus-annotated.csv", sep='\t', encoding_errors="ignore")
df.head(3)

Unnamed: 0,annotator,argumentative,overall quality,local acceptability,appropriateness,arrangement,clarity,cogency,effectiveness,global acceptability,...,global sufficiency,reasonableness,local relevance,credibility,emotional appeal,sufficiency,argument,#id,issue,stance
0,1,y,1 (Low),1 (Low),1 (Low),1 (Low),2 (Average),1 (Low),1 (Low),1 (Low),...,1 (Low),1 (Low),1 (Low),1 (Low),1 (Low),1 (Low),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
1,2,y,1 (Low),3 (High),2 (Average),2 (Average),3 (High),1 (Low),1 (Low),3 (High),...,1 (Low),2 (Average),2 (Average),2 (Average),2 (Average),1 (Low),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
2,3,y,2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),...,2 (Average),2 (Average),3 (High),2 (Average),1 (Low),2 (Average),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy


In [5]:
import numpy as np

print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators
print(f"Number of unique issue = {len(np.unique(df['issue']))}")  # There are a total of 16 issues
print(f"Number of unique stance = {len(np.unique(df['stance']))}") # Each issue has on an avg 2 stance (positive and negative)

Number of annotations = 960
Number of unique arguements = 320
Number of unique issue = 16
Number of unique stance = 28


### Remove non-arguments

In [6]:
df = df[df["argumentative"] == "y"]

In [7]:
print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators
print(f"Number of unique issue = {len(np.unique(df['issue']))}")  # There are a total of 16 issues
print(f"Number of unique stance = {len(np.unique(df['stance']))}") # Each issue has on an avg 2 stance (positive and negative)

Number of annotations = 935
Number of unique arguements = 316
Number of unique issue = 16
Number of unique stance = 28


### Consolidate annotator scores

In [8]:
argument = np.unique(df["argument"])

attributes = ["annotator", "overall quality", "cogency", "effectiveness", "reasonableness", "argument", "#id"]

cleaned_df = []

for arg in argument:

    new_df = df[df["argument"] == arg][attributes]
    flag = 0
    new_dict = {
        "#id": new_df["#id"].iloc[0],
        "argument": new_df["argument"].iloc[0],
    }

    for ele in ["overall quality", "cogency", "effectiveness", "reasonableness"]:
        if len(pd.value_counts(new_df[ele])) == 3:
            flag = 1
            break
        new_dict[ele] = pd.value_counts(new_df[ele]).index[0]
        
    if flag == 1:
        continue
    cleaned_df.append(new_dict)

In [9]:
df = pd.DataFrame(cleaned_df)

In [10]:
print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators

Number of annotations = 273
Number of unique arguements = 273


### Perform train-test split

In [11]:
df_train = df.sample(frac=0.8, random_state=101)
df_test = df.drop(df_train.index)

### Display Dataset Metrics

In [12]:
print(f"Length of dataset = {len(df)}")
print(f"Number of training data = {len(df_train)}")
print(f"Number of testing data = {len(df_test)}")

Length of dataset = 273
Number of training data = 218
Number of testing data = 55


In [13]:
print(f"Number of Classes = {len(np.unique(df['overall quality']))}")

for label in np.unique(df["overall quality"]):
    print(f"Number of Class {label} in training data = {len(df_train[df_train['overall quality']==label])}")
    print(f"Number of Class {label} in testing data = {len(df_test[df_test['overall quality']==label])}")

Number of Classes = 3
Number of Class 1 (Low) in training data = 128
Number of Class 1 (Low) in testing data = 31
Number of Class 2 (Average) in training data = 77
Number of Class 2 (Average) in testing data = 21
Number of Class 3 (High) in training data = 13
Number of Class 3 (High) in testing data = 3


## **TEXT CLEANING**

In [14]:
import re

In [15]:
def clean_text_bert(text):
    text = text.lower() # Convert the text into lowercase
    text = text.replace('</br>', '') # Remove </br>
    text = text.replace('\n', '') # Remove \n
    
    # Remove quotes
    text = re.sub(r"\'", "", text) 
    text = re.sub(r"\"", "", text) 
    
    text = re.sub(r"[^\w]", " ", text) # Remove all symbols

    text = re.sub(r'[ ]{2,}', ' ', text) # Remove extra spaces
    text = re.sub(r'[ \t]+$', '', text) # Remove trailing white spaces
    
    return text

In [16]:
df_train["argument"] = df_train["argument"].apply(clean_text_bert)
df_test["argument"] = df_test["argument"].apply(clean_text_bert)

In [17]:
df_train["argument"]

33     do we have the potential br yes br the only co...
13     americans spend billions on bottled water ever...
189    thats your best argument because they dont br ...
269    yes because if they fear getting hit than they...
135    kids are fat these days p e helps 1 one br and...
                             ...                        
97     i think that personal pursuit is more importan...
207    u s alone grew by over 13 according to researc...
203    theres no reason at all why anyone could say i...
17     as an ambitious young person wanting to become...
140    murder under any circumstance is not right a p...
Name: argument, Length: 218, dtype: object

## **PRE-CONFIG FOR BERT**

In [18]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [19]:
MAX_LEN = np.max([
    np.max([len(x) for x in df_train["argument"]]), 
    np.max([len(x) for x in df_test["argument"]])
])

if MAX_LEN > 510:  # Limit of the BERT model
    MAX_LEN = 510

BATCH_SIZE = 16
LEARNING_RATE = 1e-5

## **BUILD DATASET FOR BERT**

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [21]:
from torch.utils.data import Dataset

class ModelDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = X
        self.tokenizer = tokenizer
        self.targets = y
        
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'targets': torch.tensor(self.targets[index], dtype=torch.float).to(device)
        }

In [22]:
X_train = np.array(df_train["argument"])
X_test = np.array(df_test["argument"])

In [23]:
y_train = df_train["overall quality"]
y_test = df_test["overall quality"]

encoder = {
    "1 (Low)": [1, 0, 0],
    "2 (Average)": [0, 1, 0],
    "3 (High)": [0, 0, 1],
}

y_train = np.array([encoder[ele] for ele in y_train])
y_test = np.array([encoder[ele] for ele in y_test])

In [24]:
train_data = ModelDataset(X_train, y_train, tokenizer, MAX_LEN)
test_data = ModelDataset(X_test, y_test, tokenizer, MAX_LEN)

In [25]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

## **BUILD CUSTOM MODEL**

In [26]:
from torch import nn
from transformers import AutoModel

class CustomModel(nn.Module):
    
    def __init__(self):
        super(CustomModel, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.out_layer = nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        _, features = self.bert(
            ids, token_type_ids=token_type_ids,
            attention_mask=mask, return_dict=False
        )
        output = self.out_layer(features)
        return output

In [27]:
model = CustomModel().to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **TRAIN MODEL**

In [28]:
loss_fn = nn.BCEWithLogitsLoss() # Loss function
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE) # Optimizer

In [29]:
# Define function to train the model

from tqdm import tqdm

def train(epoch):
    
    model.train()
    
    # for batch in tqdm(train_loader):
    for batch in train_loader:
        
        optimizer.zero_grad()
        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        
        loss.backward()
        optimizer.step()
        
        
    # print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')

In [30]:
# Define function for model evaluation

def validation(data_loader):
    
    model.eval()
    targets = []
    outputs = []
    
    with torch.no_grad():
        
        for batch in data_loader:
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            batch_targets = batch['targets'].to(device, dtype = torch.float)
            
            batch_outputs = model(ids, mask, token_type_ids)
            
            targets.extend(batch_targets.cpu().numpy().tolist())
            outputs.extend(batch_outputs.cpu().numpy().tolist())
            
    return outputs, targets

In [31]:
# Perform model training

from copy import deepcopy
from sklearn import metrics

best_score = -np.inf
best_weights = None
history = []

EPOCHS = 100

for epoch in tqdm(range(EPOCHS)):
    train(epoch)

    outputs, targets = validation(test_loader)

    targets = [np.argmax(x) for x in targets]
    outputs = [np.argmax(x) for x in outputs]
    score = metrics.f1_score(targets, outputs, average='weighted')

    history.append(score)
    # print(f"Validation f1-score: {score:>.4f}")

    if score > best_score:
        best_score = score
        best_weights = deepcopy(model.state_dict())
        torch.save(best_weights, "/content/drive/MyDrive/bert_argqual_finetune.pt")

100%|██████████| 100/100 [36:07<00:00, 21.67s/it]


## **EVALUATE CUSTOM MODEL**

In [32]:
model.load_state_dict(best_weights)
outputs, targets = validation(test_loader)
targets = [np.argmax(x) for x in targets]
outputs = [np.argmax(x) for x in outputs]

In [33]:
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_w_avg = metrics.f1_score(targets, outputs, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(targets, outputs))

Accuracy Score = 0.6181818181818182
F1 Score (Weighted) = 0.6103557312252964
              precision    recall  f1-score   support

           0       0.72      0.68      0.70        31
           1       0.52      0.62      0.57        21
           2       0.00      0.00      0.00         3

    accuracy                           0.62        55
   macro avg       0.41      0.43      0.42        55
weighted avg       0.61      0.62      0.61        55



## **SAVE MODEL**

In [34]:
torch.save(best_weights, "./bert_argqual_finetune.pt")
torch.save(best_weights, "/content/drive/MyDrive/bert_argqual_finetune.pt")

In [35]:
saved_model = CustomModel().to(device)
saved_model.load_state_dict(torch.load("./bert_argqual_finetune.pt"))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [36]:
saved_model.eval()

targets = []
outputs = []

with torch.no_grad():
    for batch in test_loader:
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        batch_targets = batch['targets'].to(device, dtype = torch.float)
        
        batch_outputs = saved_model(ids, mask, token_type_ids)
        
        targets.extend(batch_targets.cpu().numpy().tolist())
        outputs.extend(batch_outputs.cpu().numpy().tolist())

In [37]:
targets = [np.argmax(x) for x in targets]
outputs = [np.argmax(x) for x in outputs]
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_w_avg = metrics.f1_score(targets, outputs, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(targets, outputs))

Accuracy Score = 0.6181818181818182
F1 Score (Weighted) = 0.6103557312252964
              precision    recall  f1-score   support

           0       0.72      0.68      0.70        31
           1       0.52      0.62      0.57        21
           2       0.00      0.00      0.00         3

    accuracy                           0.62        55
   macro avg       0.41      0.43      0.42        55
weighted avg       0.61      0.62      0.61        55

