# Fine-tuning BERT-based model

In [1]:
model_name = 'distilbert-base-uncased'
responses_file = "responses/04091703_parsed_turbo_2000train_clean_eval.jsonl"
balance_dataset = False
use_latest_solution = False # run on single gpu

## Set up

In [2]:
!export "CUDA_VISIBLE_DEVICES"=1 jupyter notebook
!export "CUDA_VISIBLE_DEVICES"=1

In [3]:
import transformers
transformers.__version__

'4.29.2'

In [29]:
import os
import json
import pandas as pd
import numpy as np
import pprint as pp
import logging

import torch
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [5]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
#device = torch.device('mps') # use mac m1 gpu
device

device(type='cuda', index=1)

In [50]:
label_mapping = {'False': 0, 'True': 1}

model_config = AutoConfig.from_pretrained(model_name,
                                          num_labels=len(label_mapping),
                                          hidden_dropout_prob=0.3,
                                          attention_probs_dropout_prob=0.3)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config)

model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Data

In [7]:
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses')
MODELS_DIR = os.path.join(PROJECT_DIR, 'classification/models')
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'classification/preds')
LOGS_DIR = os.path.join(PROJECT_DIR, 'logs')

In [8]:
if os.path.isfile(responses_file):
    responses_path = responses_file
else:
    responses_path = os.path.join(RESPONSES_DIR, responses_file)
    if not os.path.isfile(responses_path):
        responses_path = os.path.join(PROJECT_DIR, responses_file)

# create df. do not add an index column
data_df = pd.read_json(responses_path, lines=True).drop(columns=['idx', 'uuid', 'parsed'])
data_df['eval'] = data_df['eval'].apply(lambda x: x['outcome'])
data_df

Unnamed: 0,text,eval
0,"The sanctions were a punishing blow, which mea...",False
1,Sammy wants to go where the people are. This m...,True
2,"First, if the choker is not in a jewelry box o...",False
3,Google Maps and other highway and street GPS s...,True
4,"The fox walked from the city into the forest, ...",True
...,...,...
1995,"First, we need to find a place with a dome. Th...",True
1996,"The sentence mentions a ""baby bottle"" and ""pac...",True
1997,"First, we need to determine what type of BBQ i...",False
1998,"First, we need to understand what a disease is...",True


#### Balance dataset

The original dataset is slightly unbalanced (60/40) with more True instance. Balance the dataset so that we have equal number of instances.

In [9]:
if balance_dataset:

    true_instances_count = len(data_df[data_df['outcome'] == True])
    false_instances_count = len(data_df[data_df['outcome'] == False])
    print(f'True instances: {true_instances_count}, False instances: {false_instances_count}')

    # create balanced dataset
    # a subset of data_df, where we select only false_instances_count number of true instances, and shuffle the data instances
    balanced_data_df = pd.concat([data_df[data_df['outcome'] == True].sample(false_instances_count, random_state=1), data_df[data_df['outcome'] == False]]).sample(frac=1, random_state=1)

    print(f'Balanced dataset: {len(balanced_data_df)} instances.\n\tLength match? {len(balanced_data_df) == false_instances_count * 2}')
    
    data_df = balanced_data_df

In [10]:
train, tmp = train_test_split(data_df, test_size=0.2, random_state=42)
val, test = train_test_split(tmp, test_size=0.5, random_state=42)
train.shape, val.shape, test.shape

((1600, 2), (200, 2), (200, 2))

In [11]:
from datasets import Dataset, DatasetDict

raw_datasets = DatasetDict({'train': Dataset.from_pandas(train),
                            'validation': Dataset.from_pandas(val), 
                            'test': Dataset.from_pandas(test)})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'eval', '__index_level_0__'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['text', 'eval', '__index_level_0__'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'eval', '__index_level_0__'],
        num_rows: 200
    })
})

In [12]:
for key in raw_datasets.keys():
    raw_datasets[key] = raw_datasets[key].rename_column("eval", "label")
    raw_datasets[key] = raw_datasets[key].rename_column("__index_level_0__", "pandas_idx")

In [13]:
raw_datasets['train'][0]

{'text': 'The student left his writing instrument at his last place of study. This means that the instrument is not with him currently. \n\nOption A: Classroom - This could be a possibility if the student left the instrument on a desk or table in the classroom. \n\nOption B: Desk drawer - This could also be a possibility if the student had a desk or drawer in the classroom where he left the instrument. \n\nOption C: Bathroom - It is unlikely that the student left his writing instrument in the bathroom unless he was using it to write something while in there. \n\nOption D: Pocket - If the student had the instrument in his pocket, he would have realized that he still had it with him. \n\nOption E: Stationery store - This is not a possibility as the student left the instrument at his last place of study, not at a store. \n\nBased on the above analysis, options A and B seem to be the most likely possibilities. However, without more information, it is impossible to determine the exact locat

In [14]:
def tokenize_and_mask(raw_data):

    '''Tokenize
    Normal padding: set padding='max_length' and max_length=int (default is 512)
    Dynamic padding: set padding=False and (later in the Trainer) pass `data_collator=DataCollatorWithPadding(tokenizer)
    result will be a dict with keys 'input_ids', 'attention_mask'
    '''
    result = tokenizer(raw_data["text"],
                       max_length=512,
                       truncation=True,
                       #padding='max_length'
                       )

    '''Add labels'''
    if label_mapping is not None:
        if "label" in raw_data:
            result['labels'] = [label_mapping[str(label)] for label in raw_data["label"]]
    
    return result

In [15]:
'''Sanity check'''

processed_test = tokenize_and_mask(raw_datasets['train'][:5])

print(len(processed_test), # keys
      len(processed_test['input_ids']), 
      len(processed_test['attention_mask']), 
      len(processed_test['labels']))
print(processed_test['labels'][:5])
print(raw_datasets['train']['pandas_idx'][:5])
print(data_df.loc[545])

3 5 5 5
[0, 0, 0, 1, 1]
[968, 240, 819, 692, 420]
text    First, we need to consider the size of the not...
eval                                                 True
Name: 545, dtype: object


In [16]:
'''Prepare inputs: tokenize and mask'''
datasets = raw_datasets.map(tokenize_and_mask, batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [17]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [18]:
for split in ['train', 'validation', 'test']:
    datasets[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

### Transformers Setup

In [51]:
from transformers import default_data_collator, DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator_dynamic_padding = DataCollatorWithPadding(tokenizer,
                                                        pad_to_multiple_of=8
                                                        )

train_dataloader = DataLoader(datasets['train'],
                              batch_size=16,
                              shuffle=True,
                              collate_fn=data_collator_dynamic_padding # default_data_collator or data_collator_dynamic_padding
                              )

for idx, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if idx == 2:
        break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 280])
torch.Size([16, 216])
torch.Size([16, 280])


#### Run on single GPU 

Discussion and possible solutions here:
https://github.com/huggingface/transformers/issues/12570

with latest version (transformers-4.34.0.dev0): 
AttributeError: 'customTrainingArguments' object has no attribute 'distributed_state' (for both solutions)

downgrade to:
pip install --upgrade transformers==4.29.2

In [20]:
if use_latest_solution:

    from transformers import TrainingArguments
    from accelerate.state import AcceleratorState
    from accelerate.utils import DistributedType

    class cached_property(property):
        def get(self, obj, objtype=None):
            if obj is None:
                return self

            if self.fget is None:
                raise AttributeError("unreadable attribute")

            attr = "_cached" + self.fget.name
            cached = getattr(obj, attr, None)
            if cached is None:
                cached = self.fget(obj)
                setattr(obj, attr, cached)
                return cached
                

    class customTrainingArguments(TrainingArguments):
        def __init__(self,*args, **kwargs):
            super(customTrainingArguments, self).__init__(*args, **kwargs)

        @property
        def device(self) -> "torch.device":
            return torch.device("cuda:1")

        @property
        def n_gpu(self):
            self._n_gpu = 1
            return self._n_gpu

        @property
        def parallel_mode(self):
            return "not_parallel"

        @cached_property
        def _setup_devices(self) -> "torch.device":
            self.distributed_state = AcceleratorState(backend=self.ddp_backend)
            self._n_gpu = 1
            device = self.distributed_state.device
            self.local_rank = self.distributed_state.local_process_index
            self.distributed_state.distributed_type = DistributedType.NO
            device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
            torch.cuda.set_device(device)
            return device

else:
    class customTrainingArguments(TrainingArguments):
        def __init__(self,*args, **kwargs):
            super(customTrainingArguments, self).__init__(*args, **kwargs)

        @property
        #@torch_required
        def device(self) -> "torch.device":
            """
            The device used by this process.
            Name the device the number you use.
            """
            return torch.device("cuda:1")

        @property
        #@torch_required
        def n_gpu(self):
            """
            The number of GPUs used by this process.
            Note:
                This will only be greater than one when you have multiple GPUs available but are not using distributed
                training. For distributed training, it will always be 1.
            """
            # Make sure `self._n_gpu` is properly setup.
            # _ = self._setup_devices
            # I set to one manullay
            self._n_gpu = 1
            return self._n_gpu

#### Training Parameters

In [52]:
transformers.set_seed(1)

In [53]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [56]:
from transformers import Trainer, AdamW, get_cosine_schedule_with_warmup, EarlyStoppingCallback

#training_args = TrainingArguments(
training_args = customTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=0.0001,
    warmup_steps=200,
    weight_decay=0.05,
    do_train=True,
    do_eval=True,
    logging_dir=LOGS_DIR,
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    load_best_model_at_end=True, # needed for early stopping
)

optimizer = AdamW(
    model.parameters(), 
    lr=training_args.learning_rate,
    betas=(0.9, 0.98),
    eps=1e-6, # numerical stability
)

total_steps = len(datasets['train']) // training_args.per_device_train_batch_size * training_args.num_train_epochs
print(f"Number of training steps: {total_steps}")

scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=training_args.warmup_steps, 
    num_training_steps=total_steps,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    data_collator=data_collator_dynamic_padding, # default_data_collator or data_collator_dynamic_padding
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

Number of training steps: 500


## Finetune + evaluate

In [57]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.6683,0.665066,0.61,0.757764,0.61,1.0
100,0.538,0.568779,0.76,0.819549,0.756944,0.893443
150,0.4091,0.638101,0.73,0.796992,0.736111,0.868852
200,0.3309,0.686583,0.725,0.773663,0.77686,0.770492
250,0.2406,0.619478,0.775,0.836364,0.751634,0.942623
300,0.1795,0.683914,0.755,0.812261,0.76259,0.868852
350,0.0771,1.14181,0.78,0.837037,0.763514,0.92623
400,0.0338,1.385006,0.77,0.824427,0.771429,0.885246
450,0.0085,1.60301,0.745,0.801556,0.762963,0.844262
500,0.0034,1.653175,0.745,0.8,0.766917,0.836066


TrainOutput(global_step=500, training_loss=0.24889229640364646, metrics={'train_runtime': 156.9334, 'train_samples_per_second': 101.954, 'train_steps_per_second': 3.186, 'total_flos': 1132066388914176.0, 'train_loss': 0.24889229640364646, 'epoch': 10.0})

In [60]:
# evaluate on test set
preds_test = trainer.predict(datasets['test'])
print(compute_metrics(preds_test))
preds_test.label_ids

{'accuracy': 0.78, 'f1': 0.8333333333333333, 'precision': 0.7857142857142857, 'recall': 0.8870967741935484}


array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1])

In [62]:
'''Save model'''
import datetime
model_id = datetime.datetime.now().strftime("%d%m%H%M")
#trainer.save_model(os.path.join(MODELS_DIR, f"distilbert-base-uncased_{model_id}"))