# Fine-tuning

In [2]:
!export "CUDA_VISIBLE_DEVICES"=1 jupyter notebook

In [3]:
import os
import pandas as pd
import numpy as np
import pprint as pp

import torch
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import set_seed

In [4]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
#device = torch.device('mps')
device

device(type='cuda', index=1)

In [5]:
set_seed(1)
label_mapping = {'False': 0, 'True': 1}

model_name = 'distilbert-base-uncased'
model_config = AutoConfig.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config)

model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.we

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Data

In [6]:
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses')
MODELS_DIR = os.path.join(PROJECT_DIR, 'classification/models')
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'classification/preds')
LOGS_DIR = os.path.join(PROJECT_DIR, 'logs')

responses_path = os.path.join(RESPONSES_DIR, 'formatted_turbo14081857_turbo1508_eval.json')

In [7]:
data_df = pd.read_json(responses_path, orient='index').drop(columns=['answer_letter', 'answer_text', 'ERROR'])
data_df

Unnamed: 0,full_text,outcome
0,Revolving doors are convenient for two-directi...,True
1,A) Completing the job is one aim that people h...,False
2,"First, we need to identify what type of printe...",True
3,- A fast food restaurant is a common place to ...,True
4,"First, James is looking for farmland, which su...",False
...,...,...
695,"First, we can eliminate options A, C, and D as...",False
696,"First, we need to identify what kind of lawyer...",True
697,James bought a new set of tire chains. Tire ch...,True
698,The question states that the food item needs t...,False


In [8]:
train, tmp = train_test_split(data_df, test_size=0.2, random_state=42)
val, test = train_test_split(tmp, test_size=0.5, random_state=42)
train.shape, val.shape, test.shape

((560, 2), (70, 2), (70, 2))

In [9]:
from datasets import Dataset, DatasetDict

raw_datasets = DatasetDict({'train': Dataset.from_pandas(train),
                            'validation': Dataset.from_pandas(val), 
                            'test': Dataset.from_pandas(test)})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['full_text', 'outcome', '__index_level_0__'],
        num_rows: 560
    })
    validation: Dataset({
        features: ['full_text', 'outcome', '__index_level_0__'],
        num_rows: 70
    })
    test: Dataset({
        features: ['full_text', 'outcome', '__index_level_0__'],
        num_rows: 70
    })
})

In [10]:
for key in raw_datasets.keys():
    raw_datasets[key] = raw_datasets[key].rename_column("outcome", "label")
    raw_datasets[key] = raw_datasets[key].rename_column("__index_level_0__", "pandas_idx")

In [11]:
raw_datasets['train'][0]

{'full_text': 'Option A is a possible answer because the Amazon basin is known to have a diverse range of primates, including monkeys. However, we cannot be sure that it is home to the most monkeys without further information.\n\nOption B, C, and D are unlikely answers because they are not large land masses that can support a significant population of monkeys.\n\nOption E is also a possible answer because Africa is home to several species of monkeys, including baboons, vervet monkeys, and colobus monkeys. However, we still cannot be certain that it is home to the most monkeys without more information.\n\nTherefore, the answer is inconclusive without more specific data.',
 'label': False,
 'pandas_idx': 82}

In [12]:
def tokenize_and_mask(raw_data):

    '''Tokenize
    Normal padding: set padding='max_length' and max_length=int (default is 512)
    Dynamic padding: set padding=False and (later in the Trainer) pass `data_collator=DataCollatorWithPadding(tokenizer)
    result will be a dict with keys 'input_ids', 'attention_mask'
    '''
    result = tokenizer(raw_data["full_text"],
                       max_length=512,
                       truncation=True,
                       #padding='max_length'
                       )

    '''Add labels'''
    if label_mapping is not None:
        if "label" in raw_data:
            result['labels'] = [label_mapping[str(label)] for label in raw_data["label"]]
    
    return result

In [13]:
'''Sanity check'''

processed_test = tokenize_and_mask(raw_datasets['train'][:5])

print(len(processed_test), # keys
      len(processed_test['input_ids']), 
      len(processed_test['attention_mask']), 
      len(processed_test['labels']))
print(processed_test['labels'][:5])
print(raw_datasets['train']['pandas_idx'][:5])
print(data_df.loc[545])

3 5 5 5
[0, 0, 1, 0, 0]
[82, 51, 220, 669, 545]
full_text    Injecting water into oneself can lead to dilut...
outcome                                                  False
Name: 545, dtype: object


In [14]:
'''Prepare inputs: tokenize and mask'''
datasets = raw_datasets.map(tokenize_and_mask, batched=True)

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

In [15]:
datasets

DatasetDict({
    train: Dataset({
        features: ['full_text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 560
    })
    validation: Dataset({
        features: ['full_text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 70
    })
    test: Dataset({
        features: ['full_text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 70
    })
})

In [16]:
'''Checking what is the longest sequence in the dataset'''

import pprint as pp

maxlen = 0
maxlen_info = None

for split in ['train', 'validation', 'test']:
    for tokenized_sequence in datasets[split]['input_ids']:
        if len(tokenized_sequence) > maxlen:
            maxlen = len(tokenized_sequence)
            tmp_idx = datasets[split]['input_ids'].index(tokenized_sequence)
            maxlen_info = {
                'tokenized_sequence': tokenized_sequence,
                'length': maxlen,
                'from_split': split,
                'orig_idx': datasets[split]['pandas_idx'][tmp_idx],
                'orig_text': f"{datasets[split]['full_text'][tmp_idx][:200]}..."
                }

pp.pprint(maxlen_info, compact=True)

{'from_split': 'train',
 'length': 343,
 'orig_idx': 349,
 'orig_text': 'First, we need to consider what type of lizard we want as a '
              'pet. Different types of lizards have different requirements for '
              "their care and living conditions. Once we've decided on the "
              'type of lizard w...',
 'tokenized_sequence': [101, 2034, 1010, 2057, 2342, 2000, 5136, 2054, 2828,
                        1997, 15450, 2057, 2215, 2004, 1037, 9004, 1012, 2367,
                        4127, 1997, 23898, 2031, 2367, 5918, 2005, 2037, 2729,
                        1998, 2542, 3785, 1012, 2320, 2057, 1005, 2310, 2787,
                        2006, 1996, 2828, 1997, 15450, 2057, 2215, 1010, 2057,
                        2064, 2059, 2298, 2005, 3182, 2000, 6855, 2009, 1012,
                        5724, 1037, 1007, 5532, 2406, 1024, 2023, 2071, 2022,
                        1037, 2825, 3295, 2000, 6855, 1037, 15450, 1010, 5834,
                        2006, 1996, 2828, 1

In [17]:
for split in ['train', 'validation', 'test']:
    datasets[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [18]:
from transformers import default_data_collator, DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator_dynamic_padding = DataCollatorWithPadding(tokenizer,
                                                        pad_to_multiple_of=8
                                                        )

train_dataloader = DataLoader(datasets['train'],
                              batch_size=16,
                              shuffle=True,
                              collate_fn=data_collator_dynamic_padding # default_data_collator or data_collator_dynamic_padding
                              )
                              

In [19]:
for idx, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if idx == 5:
        break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 208])
torch.Size([16, 264])
torch.Size([16, 232])
torch.Size([16, 200])
torch.Size([16, 272])
torch.Size([16, 216])


In [23]:
from transformers import Trainer, TrainingArguments, AdamW, get_cosine_schedule_with_warmup

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=0.0001,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGS_DIR,
    logging_steps=10,
    group_by_length=True,
)

optimizer = AdamW(
    model.parameters(), 
    lr=training_args.learning_rate
)

total_steps = len(datasets['train']) // training_args.per_device_train_batch_size * training_args.num_train_epochs
print(f"Number of training steps: {total_steps}")
scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=training_args.warmup_steps, 
    num_training_steps=total_steps
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    data_collator=data_collator_dynamic_padding, # default_data_collator or data_collator_dynamic_padding
    optimizers=(optimizer, scheduler)
)

Number of training steps: 560


In [24]:
trainer.train()



Step,Training Loss
10,0.6856
20,0.6678
30,0.6695
40,0.6577
50,0.6727
60,0.6357


TrainOutput(global_step=64, training_loss=0.6620850786566734, metrics={'train_runtime': 26.8848, 'train_samples_per_second': 83.318, 'train_steps_per_second': 2.381, 'total_flos': 106122944748288.0, 'train_loss': 0.6620850786566734, 'epoch': 4.0})

In [25]:
trainer.evaluate(datasets['validation'])

{'eval_loss': 0.6525583267211914,
 'eval_runtime': 0.1094,
 'eval_samples_per_second': 639.575,
 'eval_steps_per_second': 9.137,
 'epoch': 4.0}

In [26]:
from datasets import load_metric

acc_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

  acc_metric = load_metric("accuracy")


In [27]:
preds_val = trainer.predict(datasets['validation'])
# preds_val.predictions is an array of arrays of logits



In [28]:
def compute_metrics(model_preds):
    gold_labels = model_preds.label_ids
    logits = model_preds.predictions
    preds = logits.argmax(-1)
    print(preds)
    
    acc = acc_metric.compute(predictions=preds, references=gold_labels)
    f1 = f1_metric.compute(predictions=preds, references=gold_labels)
    
    return {
        'accuracy': acc,
        'f1': f1
    }

compute_metrics(preds_val)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


{'accuracy': {'accuracy': 0.6142857142857143},
 'f1': {'f1': 0.7610619469026549}}

In [None]:
# always predicts 1!

In [None]:
'''Save model'''
import datetime
model_id = datetime.datetime.now().strftime("%d%m%H%M")
#trainer.save_model(os.path.join(MODELS_DIR, f"distilbert-base-uncased_{model_id}"))