# Fine-tuning of the pre-trained seBERT model for the sentiment classification task
This notebook can be used to fine-tune a pre-trained seBERT model for any sequence classification task.
We use the example of sentiment mining, which is a multi-class (single-label) sequence classification task: every text is assigned exactly one of the classes negative, neutral, or positive.

We use a reduced batch size and sample size so that this can run on consumer hardware. We tested this on an Nvidia GTX 1080.

In [1]:
import os
from pathlib import Path

import torch
import pandas as pd
import numpy as np
# from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import recall_score, precision_score, f1_score, matthews_corrcoef, accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [2]:
# Directory that holds the datasets.
DATA_PATH = './data/'

# Model locations:
#   - the pre-trained seBERT model that fine-tuning starts from
#   - the checkpoints of the model stored for each epoch
#   - the final fine-tuned sentiment classification model
SEBERT_MODEL_PATH = './models/seBERT/'
CHECKPOINTS_PATH = './models/checkpoints/'
SENTIMENT_MODEL_PATH = './models/sentiment/'

In [3]:
# Read the fine-tuning data from the processed-data folder below DATA_PATH
dataset = pd.read_csv(DATA_PATH + "processed/finetuning_train.csv")

# Show how many samples each label has
dataset['label'].value_counts()

label
neutral     8220
negative    4360
positive    3683
Name: count, dtype: int64

In [4]:
from utils import labelnum

# Replace the label column by the values produced by `labelnum`.
# The column is overwritten in place, so only map while it is still non-numeric;
# otherwise re-running this cell would push the already mapped values through
# `labelnum` a second time.
if not pd.api.types.is_numeric_dtype(dataset['label']):
    dataset['label'] = dataset['label'].map(labelnum)

# Every label must have been mapped: a missing value here would only surface later
# as a conversion error when the labels are cast to int.
assert dataset['label'].notna().all(), "found labels that could not be mapped by labelnum"

dataset.head()

Unnamed: 0,text,label
0,"""ForGUI-intensive work, AWT feels very primiti...",2
1,"""Anyway, what is the content of the objects in...",1
2,"""Have a look at WPF.""",0
3,"Re-add the newline at the end of the file.""",1
4,"cool, no worries.",2


In [5]:
# Convert the columns to plain Python lists: texts as str, labels as int
X = list(map(str, dataset['text'].tolist()))
y = list(map(int, dataset['label'].tolist()))

# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Torch Dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name (it must contain 'input_ids') to an
    indexable collection with one entry per sample; `labels` is indexable in the
    same order.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is taken from the 'input_ids' field
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested sample ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... plus the label as a long tensor under the key 'labels'
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [7]:
def compute_metrics_multi_label(p):
    """Metrics callback handed to the huggingface trainer.

    `p` unpacks into (predictions, labels). The predicted class is the argmax of
    the predictions along axis 1. Returns a dict with accuracy, MCC and
    micro-/macro-averaged precision, recall and F1.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    def averaged(average):
        # precision, recall and F1 for one averaging mode
        return (
            precision_score(y_true=labels, y_pred=predicted, average=average),
            recall_score(y_true=labels, y_pred=predicted, average=average),
            f1_score(y_true=labels, y_pred=predicted, average=average),
        )

    precision_mi, recall_mi, f1_mi = averaged('micro')
    precision_ma, recall_ma, f1_ma = averaged('macro')

    # Key order is kept: accuracy, micro scores, mcc, macro scores
    return {
        'accuracy': accuracy_score(y_true=labels, y_pred=predicted),
        'precision_micro': precision_mi,
        'recall_micro': recall_mi,
        'f1_micro': f1_mi,
        'mcc': matthews_corrcoef(y_true=labels, y_pred=predicted),
        'precision_macro': precision_ma,
        'recall_macro': recall_ma,
        'f1_macro': f1_ma,
    }

In [8]:
class seBERT(BaseEstimator, ClassifierMixin):
    """
    Scikit-learn style classifier wrapping the high-level Trainer class from the Huggingface library.

    The pre-trained model and its tokenizer are loaded from SEBERT_MODEL_PATH with
    num_labels=3. Wrapping them in a scikit-learn classifier allows us to use
    scikit-learn in a more natural way, e.g., pipelines or grid search.

    Parameters
    ----------
    checkpoints_dir : str
        Directory that `fit` creates if it does not exist. It is not passed to the
        Trainer; the Trainer is configured only through the `training_args` given to `fit`.
    batch_size : int
        Stored on the instance; it is not read anywhere else in this class.
    max_length : int
        Maximum sequence length handed to the tokenizer (with truncation enabled).
    """
    def __init__(self, checkpoints_dir='../checkpoints/', batch_size=8, max_length=128):
        self.trainer = None  # created in fit()
        self.checkpoints_dir = checkpoints_dir
        self.model = BertForSequenceClassification.from_pretrained(SEBERT_MODEL_PATH, config=SEBERT_MODEL_PATH + 'config.json', num_labels=3)
        self.tokenizer = BertTokenizer.from_pretrained(SEBERT_MODEL_PATH, do_lower_case=True)
        self.batch_size = batch_size
        self.max_length = max_length

    def fit(self, X_train, X_val, y_train, y_val, training_args):
        """Fine-tune the pre-trained model.

        X_train / X_val are the texts and y_train / y_val the corresponding labels of
        the training and validation split. `training_args` is passed to the Trainer
        unchanged. Prints the result of `Trainer.train()` and returns self.
        """
        X_train_tokens = self.tokenizer(X_train, padding=True, truncation=True, max_length=self.max_length)
        X_val_tokens = self.tokenizer(X_val, padding=True, truncation=True, max_length=self.max_length)

        train_dataset = Dataset(X_train_tokens, y_train)
        eval_dataset = Dataset(X_val_tokens, y_val)

        # exist_ok avoids the check-then-create race of a separate os.path.exists test
        os.makedirs(self.checkpoints_dir, exist_ok=True)

        self.trainer = Trainer(
            model           = self.model,
            args            = training_args,
            train_dataset   = train_dataset,
            eval_dataset    = eval_dataset,
            compute_metrics = compute_metrics_multi_label
        )
        print(self.trainer.train())
        return self

    def predict_proba(self, X, y=None):
        """Return class probabilities for every element of X.

        This is kept simple intentionally, for larger Datasets this would be too inefficient,
        because we would effectively force a batch size of 1.

        Requires a prior call to `fit`. The result is a list with one NumPy array per
        element of X, holding the softmax (over dimension 1) of the model's first output.
        `y` is accepted for API compatibility and ignored.
        """
        model = self.trainer.model
        model.eval()
        # Use the device the model actually lives on instead of assuming CUDA, so
        # prediction also works on CPU-only machines.
        device = next(model.parameters()).device

        y_probs = []
        with torch.no_grad():
            for X_row in X:
                inputs = self.tokenizer(X_row, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt").to(device)
                outputs = model(**inputs)
                probs = outputs[0].softmax(1).detach().cpu().numpy()
                y_probs.append(probs)
        return y_probs

    def predict(self, X, y=None):
        """Return, for every element of X, the index of the most probable class."""
        return [y_prob.argmax() for y_prob in self.predict_proba(X, y)]

    def save_model(self, path):
        """Store the trainer's model in `path`, creating the directory if necessary."""
        os.makedirs(path, exist_ok=True)
        self.trainer.model.save_pretrained(path)

In [9]:
# The training data is split 80/20 into training and validation sets. We train for
# 5 epochs and choose the model that performs best on the validation data in the end.
training_args = TrainingArguments(
    output_dir=CHECKPOINTS_PATH,
    num_train_epochs=5,
    # batch sizes and accumulation steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=10,
    # evaluate and save once per epoch, then load the best model at the end
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [10]:
# Point the wrapper at the same directory as TrainingArguments.output_dir; the
# default ('../checkpoints/') would create an extra, unused directory outside the project.
clf = seBERT(checkpoints_dir=CHECKPOINTS_PATH)

# fine-tune on the training split; the validation split is used for evaluation
clf.fit(X_train, X_val, y_train, y_val, training_args)

# save the fine-tuned model
clf.save_model(SENTIMENT_MODEL_PATH)

# save the fine-tuning history next to the model
# (os.path.join avoids the doubled '/' that plain string formatting produced)
log_history = pd.DataFrame(clf.trainer.state.log_history)
log_history.to_csv(os.path.join(SENTIMENT_MODEL_PATH, "log_history.csv"), index=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/seBERT/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Micro,Recall Micro,F1 Micro,Mcc,Precision Macro,Recall Macro,F1 Macro
0,No log,0.37993,0.846296,0.846296,0.846296,0.846296,0.752994,0.834358,0.843052,0.838312
1,0.443500,0.370838,0.853059,0.853059,0.853059,0.853059,0.761256,0.84584,0.841852,0.843693
2,0.235700,0.555268,0.84107,0.84107,0.84107,0.84107,0.745529,0.835879,0.835571,0.832557
4,0.030700,0.84046,0.848755,0.848755,0.848755,0.848755,0.753887,0.841267,0.835577,0.838338


TrainOutput(global_step=2030, training_loss=0.20025856257072222, metrics={'train_runtime': 3091.3121, 'train_samples_per_second': 21.043, 'train_steps_per_second': 0.657, 'total_flos': 1.5129024869885952e+16, 'train_loss': 0.20025856257072222, 'epoch': 4.990780577750461})
