# Fine-tuning v2

Original attempt predicts all 1s. Improve.

- balance dataset

In [1]:
!export "CUDA_VISIBLE_DEVICES"=1 jupyter notebook

In [1]:
import os
import pandas as pd
import numpy as np
import pprint as pp

import torch
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import set_seed

In [3]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
#device = torch.device('mps')
device

device(type='cuda', index=1)

In [4]:
set_seed(1)
label_mapping = {'False': 0, 'True': 1}

model_name = 'distilbert-base-uncased'
model_config = AutoConfig.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config)

model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Data

In [5]:
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses')
MODELS_DIR = os.path.join(PROJECT_DIR, 'classification/models')
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'classification/preds')
LOGS_DIR = os.path.join(PROJECT_DIR, 'logs')

responses_path = os.path.join(RESPONSES_DIR, 'formatted_turbo14081857_turbo1508_eval.json')

In [6]:
data_df = pd.read_json(responses_path, orient='index').drop(columns=['answer_letter', 'answer_text', 'ERROR'])
data_df

Unnamed: 0,full_text,outcome
0,Revolving doors are convenient for two-directi...,True
1,A) Completing the job is one aim that people h...,False
2,"First, we need to identify what type of printe...",True
3,- A fast food restaurant is a common place to ...,True
4,"First, James is looking for farmland, which su...",False
...,...,...
695,"First, we can eliminate options A, C, and D as...",False
696,"First, we need to identify what kind of lawyer...",True
697,James bought a new set of tire chains. Tire ch...,True
698,The question states that the food item needs t...,False


### Balance dataset

The original dataset is unbalanced (60/40) with more True instance. Balance the dataset so that we have equal number of instances. It will need more data to work well, but at least now it does not always predict 1.

In [7]:
true_instances_count = len(data_df[data_df['outcome'] == True])
false_instances_count = len(data_df[data_df['outcome'] == False])
print(f'True instances: {true_instances_count}, False instances: {false_instances_count}')

# create balanced dataset
# a subset of data_df, where we select only false_instances_count number of true instances, and shuffle the data instances
balanced_data_df = pd.concat([data_df[data_df['outcome'] == True].sample(false_instances_count, random_state=1), 
                              data_df[data_df['outcome'] == False]]).sample(frac=1, random_state=1)

print(f'Balanced dataset: {len(balanced_data_df)} instances.\n\tLength match? {len(balanced_data_df) == false_instances_count * 2}')

balanced_data_df

True instances: 444, False instances: 256
Balanced dataset: 512 instances.
	Length match? True


Unnamed: 0,full_text,outcome
399,"First, we need to identify what kind of saw is...",True
273,"If fish are not in a stream, it means they are...",False
80,"First, we need to identify what ""pans"" are. Pa...",False
386,"First, it says ""most items in retail stores."" ...",True
679,"First, we need to understand what calorie requ...",False
...,...,...
651,"John and Judy were parents, and they had two k...",True
258,"First, we need to identify what a container is...",True
395,The Empire State Building is a landmark skyscr...,False
238,"First, we need to understand what minerals are...",True


In [8]:
data_df = balanced_data_df

train, tmp = train_test_split(data_df, test_size=0.2, random_state=42)
val, test = train_test_split(tmp, test_size=0.5, random_state=42)
train.shape, val.shape, test.shape

((409, 2), (51, 2), (52, 2))

In [9]:
from datasets import Dataset, DatasetDict

raw_datasets = DatasetDict({'train': Dataset.from_pandas(train),
                            'validation': Dataset.from_pandas(val), 
                            'test': Dataset.from_pandas(test)})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['full_text', 'outcome', '__index_level_0__'],
        num_rows: 409
    })
    validation: Dataset({
        features: ['full_text', 'outcome', '__index_level_0__'],
        num_rows: 51
    })
    test: Dataset({
        features: ['full_text', 'outcome', '__index_level_0__'],
        num_rows: 52
    })
})

In [10]:
for key in raw_datasets.keys():
    raw_datasets[key] = raw_datasets[key].rename_column("outcome", "label")
    raw_datasets[key] = raw_datasets[key].rename_column("__index_level_0__", "pandas_idx")

In [11]:
raw_datasets['train'][0]

{'full_text': 'If chewing food is difficult for you, it could be because of a broken jaw or a sore mouth, which would make it painful to chew. It could also be because of difficulty with eating, such as missing teeth or a misaligned bite. However, good digestion would not affect the act of chewing itself, and avoiding choking would not necessarily make chewing difficult. Therefore, the possible reasons for difficulty in chewing food are A) broken jaw or B) sore mouth, making both choices correct.',
 'label': False,
 'pandas_idx': 166}

In [12]:
def tokenize_and_mask(raw_data):

    '''Tokenize
    Normal padding: set padding='max_length' and max_length=int (default is 512)
    Dynamic padding: set padding=False and (later in the Trainer) pass `data_collator=DataCollatorWithPadding(tokenizer)
    result will be a dict with keys 'input_ids', 'attention_mask'
    '''
    result = tokenizer(raw_data["full_text"],
                       max_length=512,
                       truncation=True,
                       #padding='max_length'
                       )

    '''Add labels'''
    if label_mapping is not None:
        if "label" in raw_data:
            result['labels'] = [label_mapping[str(label)] for label in raw_data["label"]]
    
    return result

In [13]:
'''Sanity check'''

processed_test = tokenize_and_mask(raw_datasets['train'][:5])

print(len(processed_test), # keys
      len(processed_test['input_ids']), 
      len(processed_test['attention_mask']), 
      len(processed_test['labels']))
print(processed_test['labels'][:5])
print(raw_datasets['train']['pandas_idx'][:5])
print(data_df.loc[545])

3 5 5 5
[0, 1, 1, 0, 0]
[166, 377, 448, 600, 305]
full_text    Injecting water into oneself can lead to dilut...
outcome                                                  False
Name: 545, dtype: object


In [14]:
'''Prepare inputs: tokenize and mask'''
datasets = raw_datasets.map(tokenize_and_mask, batched=True)

Map:   0%|          | 0/409 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [15]:
datasets

DatasetDict({
    train: Dataset({
        features: ['full_text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 409
    })
    validation: Dataset({
        features: ['full_text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 51
    })
    test: Dataset({
        features: ['full_text', 'label', 'pandas_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 52
    })
})

In [16]:
for split in ['train', 'validation', 'test']:
    datasets[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [17]:
from transformers import default_data_collator, DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator_dynamic_padding = DataCollatorWithPadding(tokenizer,
                                                        pad_to_multiple_of=8
                                                        )

train_dataloader = DataLoader(datasets['train'],
                              batch_size=16,
                              shuffle=True,
                              collate_fn=data_collator_dynamic_padding # default_data_collator or data_collator_dynamic_padding
                              )

for idx, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if idx == 5:
        break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 272])
torch.Size([16, 248])
torch.Size([16, 232])
torch.Size([16, 208])
torch.Size([16, 280])
torch.Size([16, 312])


In [30]:
from transformers import TrainingArguments
from accelerate.state import AcceleratorState
from accelerate.utils import DistributedType

class cached_property(property):
    def get(self, obj, objtype=None):
        if obj is None:
            return self

        if self.fget is None:
            raise AttributeError("unreadable attribute")

        attr = "_cached" + self.fget.name
        cached = getattr(obj, attr, None)
        if cached is None:
            cached = self.fget(obj)
            setattr(obj, attr, cached)
            return cached
            

class customTrainingArguments(TrainingArguments):
    def __init__(self,*args, **kwargs):
        super(customTrainingArguments, self).__init__(*args, **kwargs)

    @property
    def device(self) -> "torch.device":
        return torch.device("cuda:1")

    @property
    def n_gpu(self):
        self._n_gpu = 1
        return self._n_gpu

    @property
    def parallel_mode(self):
        return "not_parallel"

    @cached_property
    def _setup_devices(self) -> "torch.device":
        self.distributed_state = AcceleratorState(backend=self.ddp_backend)
        self._n_gpu = 1
        device = self.distributed_state.device
        self.local_rank = self.distributed_state.local_process_index
        self.distributed_state.distributed_type = DistributedType.NO
        device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
        torch.cuda.set_device(device)
        return device

In [38]:
from transformers import Trainer, TrainingArguments, AdamW, get_cosine_schedule_with_warmup

#training_args = TrainingArguments(
training_args = customTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=0.0001,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGS_DIR,
    logging_steps=10,
    group_by_length=True,
)

optimizer = AdamW(
    model.parameters(), 
    lr=training_args.learning_rate
)

total_steps = len(datasets['train']) // training_args.per_device_train_batch_size * training_args.num_train_epochs
print(f"Number of training steps: {total_steps}")
scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=training_args.warmup_steps, 
    num_training_steps=total_steps
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    data_collator=data_collator_dynamic_padding, # default_data_collator or data_collator_dynamic_padding
    optimizers=(optimizer, scheduler)
)

Number of training steps: 204




In [39]:
trainer.train()

Step,Training Loss
10,0.3917
20,0.3613
30,0.2649
40,0.3516
50,0.2891
60,0.3712
70,0.2154
80,0.133
90,0.3347
100,0.1709


TrainOutput(global_step=208, training_loss=0.21534533519297838, metrics={'train_runtime': 9.4518, 'train_samples_per_second': 173.088, 'train_steps_per_second': 22.006, 'total_flos': 58072465688928.0, 'train_loss': 0.21534533519297838, 'epoch': 4.0})

In [40]:
trainer.evaluate(datasets['validation'])

{'eval_loss': 1.5325576066970825,
 'eval_runtime': 0.1576,
 'eval_samples_per_second': 323.697,
 'eval_steps_per_second': 25.388,
 'epoch': 4.0}

In [48]:
from datasets import load_metric

acc_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

preds_val = trainer.predict(datasets['validation'])
# preds_val.predictions is an array of arrays of logits

def compute_metrics(model_preds):
    gold_labels = model_preds.label_ids
    logits = model_preds.predictions
    preds = logits.argmax(-1)
    print(preds)
    
    acc = acc_metric.compute(predictions=preds, references=gold_labels)
    f1 = f1_metric.compute(predictions=preds, references=gold_labels)
    
    return acc, f1

compute_metrics(preds_val)

[1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 0 1 1
 1 0 1 1 1 1 0 0 1 0 1 1 0 1]


({'accuracy': 0.7058823529411765}, {'f1': 0.7540983606557377})

In [49]:
# evaluate on test set
preds_test = trainer.predict(datasets['test'])
print(compute_metrics(preds_test))
preds_test.metrics

[1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1
 1 1 0 1 1 1 0 0 1 1 1 0 0 1 0]
({'accuracy': 0.7115384615384616}, {'f1': 0.7692307692307692})


{'test_loss': 1.4525049924850464,
 'test_runtime': 0.166,
 'test_samples_per_second': 313.168,
 'test_steps_per_second': 24.09}

In [None]:
'''Save model'''
import datetime
model_id = datetime.datetime.now().strftime("%d%m%H%M")
#trainer.save_model(os.path.join(MODELS_DIR, f"distilbert-base-uncased_{model_id}"))