# Fine-tuning of the pre-trained seBERT model for the sentiment classification task
This notebook can be used to fine-tune a pre-trained seBERT model for any sequence classification task.
We use the example of sentiment mining which is a multi-label sequence classification task.

We use a reduced batch size and sample size so that this can run on consumer hardware. We tested this on a Nvidia GTX 1080.

In [1]:
import os

import torch
import pandas as pd
import numpy as np

from sklearn.metrics import recall_score, precision_score, f1_score, matthews_corrcoef, accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [2]:
DATA_PATH = './data/'
SEBERT_MODEL_PATH = './models/seBERT/'  # path of the pre-trained sebert model
SENTIMENT_MODEL_PATH = './models/sentiment/'  # path to store the final fine-tuned sentiment classification model
CHECKPOINTS_PATH = './models/checkpoints/'  # path to store checkpoints of the model for each epoch

# print current directory
print(os.getcwd())
%pwd
%ls -l {SEBERT_MODEL_PATH}

/home/mamo/Research/seBERT/notebooks
total 1313764
-rw------- 1 mamo mamo        314 Jun 16  2021 config.json
-rw------- 1 mamo mamo 1345068138 Jun 16  2021 pytorch_model.bin
-rwx------ 1 mamo mamo     214692 Jan 10  2021 [0m[01;32mvocab.txt[0m*


In [3]:
class Dataset(torch.utils.data.Dataset):
    """Just a standard torch Dataset for BERT-style data."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [4]:
def compute_metrics_multi_label(p):
    """This metrics computation is used by the huggingface trainer."""
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='micro')
    mcc = matthews_corrcoef(y_true=labels, y_pred=pred)

    recall_ma = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision_ma = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1_ma = f1_score(y_true=labels, y_pred=pred, average='macro')

    return {'accuracy': accuracy, 'precision_micro': precision, 'recall_micro': recall, 'f1_micro': f1, 'mcc': mcc, 'precision_macro': precision_ma, 'recall_macro': recall_ma, 'f1_macro': f1_ma}

In [22]:
class seBERT(BaseEstimator, ClassifierMixin):
    """
    We are effectively wrapping the high-level Trainer and TrainingArguments classes from the Huggingface library into a
    scikit-learn classifier.
    This allows us to use all of scikit-learn in a more natural way, e.g., pipelines or grid search.
    """
    def __init__(self, checkpoints_dir='../checkpoints/', batch_size=8):
        self.trainer = None
        self.checkpoints_dir = checkpoints_dir
        self.model = BertForSequenceClassification.from_pretrained(SEBERT_MODEL_PATH, config=SEBERT_MODEL_PATH + 'config.json', num_labels=3)
        self.tokenizer = BertTokenizer.from_pretrained(SEBERT_MODEL_PATH, do_lower_case=True)
        self.batch_size = batch_size
        self.max_length = 128
            
    def fit(self, X, y, training_args):
        """fit implements simple fine-tuning from the pre-trained model.

        We split the training data into 80/20 training and validation sets, train for 5 epochs and chose the model
        that performs best on the validation data in the end.
        """
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
        
        X_train_tokens = self.tokenizer(X_train, padding=True, truncation=True, max_length=self.max_length)
        X_val_tokens = self.tokenizer(X_val, padding=True, truncation=True, max_length=self.max_length)

        train_dataset = Dataset(X_train_tokens, y_train)
        eval_dataset = Dataset(X_val_tokens, y_val)

        if not os.path.exists(self.checkpoints_dir):
            os.makedirs(self.checkpoints_dir)
            
        self.trainer = Trainer(
            model           = self.model,
            args            = training_args,
            train_dataset   = train_dataset,
            eval_dataset    = eval_dataset,
            compute_metrics = compute_metrics_multi_label
        )
        print(self.trainer.train())
        return self

    def predict_proba(self, X, y=None):
        """This is kept simple intentionally, for larger Datasets this would be too ineficient,
        because we would effectively force a batch size of 1."""
        y_probs = []
        self.trainer.model.eval()
        with torch.no_grad():
            for _, X_row in enumerate(X):
                inputs = self.tokenizer(X_row, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt").to('cuda')
                outputs = self.trainer.model(**inputs)
                probs = outputs[0].softmax(1).cpu().detach().numpy()
                y_probs.append(probs)
        return y_probs

    def predict(self, X, y=None):
        """Predict is evaluation."""
        y_probs = self.predict_proba(X, y)
        y_pred = []
        for y_prob in y_probs:
            y_pred.append(y_prob.argmax())
        return y_pred

    def save_model(self, path):
        if not os.path.exists(path):
            os.makedirs(path)
        self.trainer.model.save_pretrained(path)

In [23]:
# process github_gold.csv dataset
df1 = pd.read_csv(DATA_PATH + 'github_gold.csv', sep=';', quotechar='"')
df1['text_no_newlines'] = df1['Text'].str.replace('\n', ' ')

# process JIRA.csv dataset
df2 = pd.read_csv(DATA_PATH + 'JIRA.csv', sep=',', quotechar='"')
df2['text_no_newlines'] = df2['sentence'].str.replace('\n', ' ')
df2['Polarity'] = df2['oracle'].replace({-1: 'negative', 1: 'positive'})

# process NewData.csv dataset
df3 = pd.read_csv(DATA_PATH + 'NewData.csv', sep=',', quotechar='"')
df3['text_no_newlines'] = df3['text'].str.replace('\n', ' ')
df3['Polarity'] = df3['oracle'].str.lower()

# # concatenate all datasets
df = pd.concat([df1, df2, df3], ignore_index=True)
df.map(str)

# print percentages of each class in Polarity column
print(df['Polarity'].value_counts(normalize=True) * 100)

Polarity
neutral     44.214807
negative    31.889110
positive    23.896082
Name: proportion, dtype: float64


In [24]:
# make labels numeric
def labelnum(row):
    if row['Polarity'] == 'negative':
        return 0
    elif row['Polarity'] == 'neutral':
        return 1
    elif row['Polarity'] == 'positive':
        return 2
    else:
        raise Exception('no such type!')

df['text_no_newlines'] = df['Text'].str.replace('\n', ' ')
df['label'] = df.apply(labelnum, axis=1)

# X = df['text_no_newlines'].values
# y = df['label'].astype(int).values

X = [str(i) for i in df['text_no_newlines'].tolist()]
y = [int(i) for i in df['label'].tolist()]

In [29]:
training_args = TrainingArguments(
    output_dir                  = CHECKPOINTS_PATH,
    num_train_epochs            = 10,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    gradient_accumulation_steps = 2,
    eval_accumulation_steps     = 10,
    evaluation_strategy         = 'epoch',
    save_strategy               = 'epoch',
    load_best_model_at_end      = True
)

# # use adam optimizer
# training_args.optimizer_type = 'adam'
# training_args.learning_rate = 5e-5
# training_args.adam_epsilon = 1e-8
# training_args.warmup_steps = 0
# training_args.weight_decay = 0.01

In [30]:
clf = seBERT()

# fit all the data
clf.fit(X, y, training_args)

# save the fine-tuned model
clf.save_model(SENTIMENT_MODEL_PATH)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/seBERT/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Micro,Recall Micro,F1 Micro,Mcc,Precision Macro,Recall Macro,F1 Macro
0,0.73,0.809939,0.585477,0.585477,0.585477,0.585477,0.396915,0.732527,0.506334,0.485434
2,0.5344,0.605283,0.730705,0.730705,0.730705,0.730705,0.612267,0.836816,0.695245,0.717993
4,0.4652,0.670382,0.725311,0.725311,0.725311,0.725311,0.599526,0.818758,0.691702,0.711775
6,0.4432,0.704892,0.728216,0.728216,0.728216,0.728216,0.599077,0.814042,0.697769,0.719838
8,0.4344,0.728709,0.736929,0.736929,0.736929,0.736929,0.621023,0.841115,0.703133,0.726115
9,0.4215,0.729964,0.735685,0.735685,0.735685,0.735685,0.618262,0.837573,0.702173,0.724604


TrainOutput(global_step=6020, training_loss=0.49889788215738595, metrics={'train_runtime': 4864.9872, 'train_samples_per_second': 19.811, 'train_steps_per_second': 1.237, 'total_flos': 2.2436789346737664e+16, 'train_loss': 0.49889788215738595, 'epoch': 9.991701244813278})
