# Fine-tuning

## Set up

In [133]:
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (AutoConfig, 
                          AutoModelForSequenceClassification, 
                          AutoTokenizer, AdamW, 
                          get_linear_schedule_with_warmup,
                          set_seed,
                          )

import pandas as pd
import numpy as np
import os

In [134]:
!export CUDA_VISIBLE_DEVICES=1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [135]:
set_seed(1)
model_name_or_path = 'distilbert-base-uncased'

In [136]:
labels_ids = {'False': 0, 'True': 1}
num_labels = len(labels_ids)

epochs = 4
batches = 8

In [137]:
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, 
                                          num_labels=num_labels)

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)

model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, 
                                                           config=model_config)

model.to(device)
print('Model loaded to `%s`'%device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.we

Model loaded to `cuda`


## Data

In [138]:
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses')
LOGS_DIR = os.path.join(PROJECT_DIR, 'logs')

responses_path = os.path.join(RESPONSES_DIR, 'formatted_turbo14081857_turbo1508_eval.json')

In [139]:
data_df = pd.read_json(responses_path, orient='index').drop(columns=['answer_letter', 'answer_text', 'ERROR'])
data_df

Unnamed: 0,full_text,outcome
0,Revolving doors are convenient for two-directi...,True
1,A) Completing the job is one aim that people h...,False
2,"First, we need to identify what type of printe...",True
3,- A fast food restaurant is a common place to ...,True
4,"First, James is looking for farmland, which su...",False
...,...,...
695,"First, we can eliminate options A, C, and D as...",False
696,"First, we need to identify what kind of lawyer...",True
697,James bought a new set of tire chains. Tire ch...,True
698,The question states that the food item needs t...,False


In [140]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_df.drop(columns=['outcome']), data_df['outcome'], test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [141]:
X_train.shape, X_val.shape, X_test.shape

((505, 1), (90, 1), (105, 1))

In [142]:
len(X_train)+len(X_test)+len(X_val) == len(data_df)

True

In [150]:
longest_tokenized = 0 # length of longest sequence
from_split = '' # name of df where the longest sequence is from
longest_idx = 0 # idx of longest sequence
num_sequences = 0 # total number of sequences for sanity check

X_train.name, X_val.name, X_test.name = 'X_train', 'X_val', 'X_test' # set df names

for split in [X_train, X_val, X_test]:
    split['tmp'] = split['full_text'].apply(lambda x: tokenizer.encode(x))
    
    for seq in split['tmp'].values:
        num_sequences += 1

        if len(seq) > longest_tokenized:
            longest_tokenized = len(seq)
            from_split = split.name
            longest_idx = split[split['tmp'].apply(lambda x: len(x)) == longest_tokenized].index[0] 
    
    split.drop(columns=['tmp'], inplace=True)

print(f"Longest sequence\n\tFrom split: {from_split}\n\tIdx of longest seq: {longest_idx}\n\tLength longest seq: {longest_tokenized}")

num_sequences == len(X_train)+len(X_test)+len(X_val)

Longest sequence
	From split: X_train
	Idx of longest seq: 349
	Length longest seq: 343


True

In [202]:
for split in [X_train, X_val, X_test]:
    split['tokenized'] = split['full_text'].apply(lambda x: tokenizer.encode(x, padding='max_length'))

In [203]:
X_train.head()

Unnamed: 0,full_text,tokenized
563,The question states that John loved to paint h...,"[101, 1996, 3160, 2163, 2008, 2198, 3866, 2000..."
7,"When an enemy is approaching, animals usually ...","[101, 2043, 2019, 4099, 2003, 8455, 1010, 4176..."
639,"First, we know that a weasel is an animal know...","[101, 2034, 1010, 2057, 2113, 2008, 1037, 2926..."
349,"First, we need to consider what type of lizard...","[101, 2034, 1010, 2057, 2342, 2000, 5136, 2054..."
75,A) Abaft: This refers to the rear part of a sh...,"[101, 1037, 1007, 19557, 6199, 1024, 2023, 521..."


In [204]:
# Pytorch dataset
class Responses(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [205]:
# converting to dictionary of list because that's what pytorch expects
# converting bool to int because of deprecated warning

train_dataset = Responses(X_train['tokenized'].to_dict(), y_train.values.astype(int))
val_dataset = Responses(X_val['tokenized'].to_dict(), y_val.values.astype(int))
test_dataset = Responses(X_test['tokenized'].to_dict(), y_test.values.astype(int))

In [206]:
train_dataset.__len__(), val_dataset.__len__(), test_dataset.__len__()

(505, 90, 105)

In [207]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=f"{PROJECT_DIR}/classification/preds/",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGS_DIR,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()



TypeError: sequence item 0: expected str instance, int found

In [208]:
idx = 0
dummy = 0
for key, val in train_dataset.encodings.items():
    try:
        dummy = val[idx]
        idx+=1
    except:
        print(f"{key}\n\n{val}\n\n{len(val)}\n\n{idx}")
        break