# Fine-tuning of the pre-trained seBERT model for the sentiment classification task
This notebook can be used to fine-tune a pre-trained seBERT model for any sequence classification task.
We use the example of sentiment mining, which is a multi-class sequence classification task: each text receives exactly one of the labels negative, neutral, or positive.

We use a reduced batch size and sample size so that this can run on consumer hardware. We tested this on an NVIDIA GTX 1080.

In [1]:
import os

import torch
import pandas as pd
import numpy as np

from sklearn.metrics import recall_score, precision_score, f1_score, matthews_corrcoef, accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [2]:
# All locations are relative paths, so they resolve against the notebook's
# current working directory (printed below).
DATA_PATH = './data/'
SEBERT_MODEL_PATH = './models/seBERT/'  # path of the pre-trained sebert model
SENTIMENT_MODEL_PATH = './models/sentiment/'  # path to store the final fine-tuned sentiment classification model
CHECKPOINTS_PATH = './models/checkpoints/'  # path to store checkpoints of the model for each epoch

# Sanity check: show the working directory and list the pre-trained model
# directory so a wrong path is noticed before any model loading is attempted.
print(os.getcwd())
%pwd
%ls -l {SEBERT_MODEL_PATH}

/home/mamo/Research/seBERT/notebooks
total 1313764
-rw------- 1 mamo mamo        314 Jun 16  2021 config.json
-rw------- 1 mamo mamo 1345068138 Jun 16  2021 pytorch_model.bin
-rwx------ 1 mamo mamo     214692 Jan 10  2021 [0m[01;32mvocab.txt[0m*


# Datasets

In [14]:
# GitHub gold-standard data; this file is semicolon-separated.
gh = pd.read_csv(f"{DATA_PATH}github_gold.csv", sep=';')
gh.head()

Unnamed: 0,ID,Polarity,Text
0,4063186,neutral,No. I still see the wrong twins. * https://gi...
1,3894703,neutral,"Reverted."""
2,1971084,neutral,You can leave a queue while in queue ? (before...
3,1827828,positive,"Didn't look at SpellTargetRestrictions XD"""
4,232603,neutral,Not sure about what kind of line lengths the p...


In [15]:
# Keep only the text and its polarity and rename both to the ('text', 'label')
# column names used for every dataset in this notebook.
# rename() is chained so that a new frame is produced; renaming a column slice
# in place triggers pandas' SettingWithCopyWarning.
gh = gh[['Text', 'Polarity']].rename(columns={'Text': 'text', 'Polarity': 'label'})

gh.head()

Unnamed: 0,text,label
0,No. I still see the wrong twins. * https://gi...,neutral
1,"Reverted.""",neutral
2,You can leave a queue while in queue ? (before...,neutral
3,"Didn't look at SpellTargetRestrictions XD""",positive
4,Not sure about what kind of line lengths the p...,neutral


In [16]:
# JIRA sentences with a numeric sentiment oracle.
jira = pd.read_csv(f"{DATA_PATH}JIRA.csv")
jira.head()

Unnamed: 0,sentence,oracle
0,guys... this is so stupid...,-1
1,I lost the whole morning cause HBase's RegionS...,-1
2,{quote}You are messing down deep below hbase i...,-1
3,And I think if we're going to do a sweep up of...,-1
4,"@idiot Yeah, I was on that idiot-path for a go...",-1


In [17]:
# Numeric oracle value -> textual sentiment label.
jira_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive'
}

# Select, rename and relabel in one chain. Every step returns a new frame, so
# nothing is mutated on a slice of the loaded data (no SettingWithCopyWarning).
jira = (
    jira[['sentence', 'oracle']]
    .rename(columns={'sentence': 'text', 'oracle': 'label'})
    .assign(label=lambda df: df['label'].replace(jira_labels))
)

jira.head()

Unnamed: 0,text,label
0,guys... this is so stupid...,negative
1,I lost the whole morning cause HBase's RegionS...,negative
2,{quote}You are messing down deep below hbase i...,negative
3,And I think if we're going to do a sweep up of...,negative
4,"@idiot Yeah, I was on that idiot-path for a go...",negative


In [18]:
# Stack Overflow sentences with a numeric sentiment oracle.
so = pd.read_csv(f"{DATA_PATH}StackOverflow.csv")
so.head()

Unnamed: 0,id,text,oracle
0,6,But sadly this is not working.,-1
1,78,"So, everything builds fine, but when we try to...",-1
2,90,That is what is causing your null pointer exce...,-1
3,139,"All attempts I've made were, in a shortcut, un...",-1
4,162,Don't use.,-1


In [19]:
# Numeric oracle value -> textual sentiment label.
so_labels = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive'
}

# Select, rename and relabel in one chain. Every step returns a new frame, so
# nothing is mutated on a slice of the loaded data (no SettingWithCopyWarning).
so = (
    so[['text', 'oracle']]
    .rename(columns={'oracle': 'label'})
    .assign(label=lambda df: df['label'].replace(so_labels))
)

so.head()

Unnamed: 0,text,label
0,But sadly this is not working.,negative
1,"So, everything builds fine, but when we try to...",negative
2,That is what is causing your null pointer exce...,negative
3,"All attempts I've made were, in a shortcut, un...",negative
4,Don't use.,negative


In [20]:
# Second Stack Overflow data set; its oracle column already holds textual labels.
so2 = pd.read_csv(f"{DATA_PATH}NewData.csv")
so2.head()

Unnamed: 0,id,text,oracle
0,,"After some research, I found that this was pos...",Negative
1,,Below is the small code snippet written in swi...,Negative
2,,Fatal error: Index out of rangeIllegal instruc...,Negative
3,,Cannot import python module using env and shou...,Negative
4,,"After long hours of research into the problem,...",Negative


In [21]:
# Select and rename in one chain (returns a new frame instead of mutating a
# slice in place), then lower-case the capitalised labels with the vectorised
# string accessor. Unlike apply(str.lower), .str.lower() passes missing values
# through as NaN rather than raising here.
so2 = (
    so2[['text', 'oracle']]
    .rename(columns={'oracle': 'label'})
    .assign(label=lambda df: df['label'].str.lower())
)

so2.head()

Unnamed: 0,text,label
0,"After some research, I found that this was pos...",negative
1,Below is the small code snippet written in swi...,negative
2,Fatal error: Index out of rangeIllegal instruc...,negative
3,Cannot import python module using env and shou...,negative
4,"After long hours of research into the problem,...",negative


In [22]:
# API review benchmark, shipped as an Excel workbook.
api = pd.read_excel(f"{DATA_PATH}BenchmarkUddinSO-ConsoliatedAspectSentiment.xls")
api.head()

Unnamed: 0,thread,tid,sent,ManualLabel,codes,stakeholder,signal,intent,action
0,15936368,15936368:1,"""JAXB Bindings File Sets @XmlElement type to S...",o,"[""'Usability'""]",,used,,
1,15936368,15936368:2,"""I'm trying to create an CODETERM1 that takes ...",o,"[""'Others'""]",,used,,
2,15936368,15936368:3,"""The purpose is simlply to remove timezone dat...",o,"[""'Others'""]",,used,,
3,15936368,15936368:4,"""It looks like this: CODESNIPPET_JAVA1 .""",o,"[""'Others'""]",,used,,
4,15936368,15936368:5,"""This works fine for the following code: CODES...",p,"[""'OnlySentiment'""]",,used,,


In [23]:
# Single-letter manual label -> textual sentiment label.
api_labels = {
    'n': 'negative',
    'o': 'neutral',
    'p': 'positive'
}

# Select, rename and relabel in one chain. Every step returns a new frame, so
# nothing is mutated on a slice of the loaded data (no SettingWithCopyWarning).
api = (
    api[['sent', 'ManualLabel']]
    .rename(columns={'sent': 'text', 'ManualLabel': 'label'})
    .assign(label=lambda df: df['label'].replace(api_labels))
)

api.head()

Unnamed: 0,text,label
0,"""JAXB Bindings File Sets @XmlElement type to S...",neutral
1,"""I'm trying to create an CODETERM1 that takes ...",neutral
2,"""The purpose is simlply to remove timezone dat...",neutral
3,"""It looks like this: CODESNIPPET_JAVA1 .""",neutral
4,"""This works fine for the following code: CODES...",positive


In [24]:
# Stack all five sources into one frame with a fresh 0..n-1 index.
dataset = pd.concat([gh, jira, so, so2, api], ignore_index=True)

# Flatten multi-line texts. regex=False makes the literal replacement explicit
# and independent of the pandas version's default for `regex`.
dataset['text'] = dataset['text'].str.replace('\n', ' ', regex=False)
# apply(str.lower) raises on any non-string label, which doubles as a check
# that every source was mapped to textual labels.
dataset['label'] = dataset['label'].apply(str.lower)

# duplicates are kept deliberately 
# dataset[dataset.duplicated(subset='text', keep=False)].sort_values(by='text')

# Create the target directory first; to_csv does not create missing parents.
output_path = f"{DATA_PATH}processed/finetuning.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
dataset.to_csv(output_path, index=False)

# Class distribution of the combined data.
dataset['label'].value_counts()

label
neutral     9153
negative    4859
positive    4058
Name: count, dtype: int64

In [25]:
# Textual sentiment label -> integer class id used by the classifier.
labelnum = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Series.map yields NaN for every value missing from the mapping. That happens
# for unexpected labels and also when this cell is run a second time (the
# column then already holds integers). Fail loudly instead of passing NaN on.
label_ids = dataset['label'].map(labelnum)
unmapped = dataset.loc[label_ids.isna(), 'label'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Cannot encode labels, unexpected values: {unmapped.tolist()}")

dataset['label'] = label_ids.astype('int64')

In [26]:
# Preview of the combined data after label encoding.
# (Length of the longest text: len(dataset.loc[dataset['text'].str.len().idxmax(), 'text']))
dataset.head(n=5)

Unnamed: 0,text,label
0,No. I still see the wrong twins. * https://gi...,1
1,"Reverted.""",1
2,You can leave a queue while in queue ? (before...,1
3,"Didn't look at SpellTargetRestrictions XD""",2
4,Not sure about what kind of line lengths the p...,1


# Fine-tuning

In [27]:
# Plain Python lists: texts coerced to str, labels coerced to int.
X = list(map(str, dataset['text'].tolist()))
y = list(map(int, dataset['label'].tolist()))

In [28]:
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset that pairs tokenizer encodings with labels.

    ``encodings`` maps a field name to a per-sample indexable sequence and must
    contain an ``'input_ids'`` entry, whose length defines the dataset size.
    ``labels`` is indexed with the same sample positions.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One sample per tokenized sequence.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... and attach the label as a long tensor under the key 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [29]:
def compute_metrics_multi_label(p):
    """Metrics callback handed to the huggingface Trainer.

    ``p`` unpacks into ``(scores, labels)``. The predicted class of each sample
    is the argmax of its scores along axis 1. Returns a dict with accuracy,
    MCC, and micro- and macro-averaged precision, recall and F1.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    def _prf(average):
        # precision, recall and F1 under one averaging scheme
        common = dict(y_true=labels, y_pred=predicted, average=average)
        return precision_score(**common), recall_score(**common), f1_score(**common)

    precision_mi, recall_mi, f1_mi = _prf('micro')
    precision_ma, recall_ma, f1_ma = _prf('macro')

    return {
        'accuracy': accuracy_score(y_true=labels, y_pred=predicted),
        'precision_micro': precision_mi,
        'recall_micro': recall_mi,
        'f1_micro': f1_mi,
        'mcc': matthews_corrcoef(y_true=labels, y_pred=predicted),
        'precision_macro': precision_ma,
        'recall_macro': recall_ma,
        'f1_macro': f1_ma,
    }

In [30]:
class seBERT(BaseEstimator, ClassifierMixin):
    """
    Wraps the high-level Trainer and TrainingArguments classes from the Huggingface library into a
    scikit-learn style classifier with fit / predict / predict_proba.

    NOTE(review): fit() takes an explicit validation split and the training arguments, so its
    signature differs from scikit-learn's fit(X, y); confirm before using this class inside
    pipelines or grid search.

    Parameters
    ----------
    checkpoints_dir : str
        Directory that fit() makes sure exists. It is not passed to the Trainer; the Trainer is
        configured solely through the ``training_args`` given to fit().
    batch_size : int
        Stored on the instance; not read anywhere inside this class.
    """
    def __init__(self, checkpoints_dir='../checkpoints/', batch_size=8):
        self.trainer = None
        self.checkpoints_dir = checkpoints_dir
        # Pre-trained encoder with a 3-class classification head.
        self.model = BertForSequenceClassification.from_pretrained(SEBERT_MODEL_PATH, config=SEBERT_MODEL_PATH + 'config.json', num_labels=3)
        self.tokenizer = BertTokenizer.from_pretrained(SEBERT_MODEL_PATH, do_lower_case=True)
        self.batch_size = batch_size
        self.max_length = 128  # texts are truncated to this many tokens

    def _fitted_model(self):
        """Return the model held by the trainer, or raise if fit() has not been called."""
        if self.trainer is None:
            # NotFittedError derives from both ValueError and AttributeError, so callers that
            # caught the former AttributeError on `None.model` keep working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This seBERT instance is not fitted yet; call fit() first.")
        return self.trainer.model

    def fit(self, X_train, X_val, y_train, y_val, training_args):
        """fit implements simple fine-tuning from the pre-trained model.

        X_train / X_val are the texts and y_train / y_val the matching labels of the training
        and validation split; training_args is forwarded to the Trainer unchanged.
        Returns self.
        """
        X_train_tokens = self.tokenizer(X_train, padding=True, truncation=True, max_length=self.max_length)
        X_val_tokens = self.tokenizer(X_val, padding=True, truncation=True, max_length=self.max_length)

        train_dataset = Dataset(X_train_tokens, y_train)
        eval_dataset = Dataset(X_val_tokens, y_val)

        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(self.checkpoints_dir, exist_ok=True)

        self.trainer = Trainer(
            model           = self.model,
            args            = training_args,
            train_dataset   = train_dataset,
            eval_dataset    = eval_dataset,
            compute_metrics = compute_metrics_multi_label
        )
        print(self.trainer.train())
        return self

    def predict_proba(self, X, y=None):
        """Return one softmax probability array per input text.

        This is kept simple intentionally, for larger Datasets this would be too inefficient,
        because we would effectively force a batch size of 1.

        Returns a list with one numpy array of shape (1, n_classes) per element of X.
        Raises NotFittedError if fit() has not been called.
        """
        model = self._fitted_model()
        # Run on whatever device the model lives on instead of assuming CUDA, so inference
        # also works on CPU-only machines.
        device = next(model.parameters()).device
        model.eval()

        y_probs = []
        with torch.no_grad():
            for X_row in X:
                inputs = self.tokenizer(X_row, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt").to(device)
                outputs = model(**inputs)
                # outputs[0] holds the logits because no labels are passed to the model.
                probs = outputs[0].softmax(1).cpu().detach().numpy()
                y_probs.append(probs)
        return y_probs

    def predict(self, X, y=None):
        """Return the most probable class index for every input text."""
        return [y_prob.argmax() for y_prob in self.predict_proba(X, y)]

    def save_model(self, path):
        """Save the fine-tuned model, together with its tokenizer files, to ``path``.

        Raises NotFittedError if fit() has not been called.
        """
        model = self._fitted_model()
        os.makedirs(path, exist_ok=True)
        model.save_pretrained(path)
        # Also store the tokenizer so the directory can be loaded on its own.
        self.tokenizer.save_pretrained(path)

In [31]:
# We split the data into 80/20 training and validation sets, train for 10 epochs and choose the
# model that performs best on the validation data in the end.
# NOTE(review): no metric_for_best_model is set, so "best" falls back to the Trainer's default
# criterion (the evaluation loss, per the TrainingArguments documentation) - confirm this is intended.
training_args = TrainingArguments(
    output_dir                  = CHECKPOINTS_PATH,
    num_train_epochs            = 10,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    gradient_accumulation_steps = 4,   # 8 x 4 = 32 samples per optimizer step and device
    eval_accumulation_steps     = 10,
    evaluation_strategy         = 'epoch',
    save_strategy               = 'epoch',
    load_best_model_at_end      = True
)

# random_state makes the split reproducible across kernel restarts. stratify keeps the
# imbalanced class distribution identical in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y
)

In [32]:
# Build the wrapper; this loads the pre-trained model and tokenizer.
clf = seBERT()

# Fine-tune on the training split, evaluating on the validation split.
clf.fit(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    training_args=training_args,
)

# Persist the fine-tuned model.
clf.save_model(path=SENTIMENT_MODEL_PATH)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/seBERT/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 