## Hyperparameter tuning with HF Trainer

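This notebook shows how to tune the training hyperparameters (learning rate, number of epochs, batch size, weight decay and seed) of a Hugging Face Transformers model using the Hugging Face `Trainer`.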

### Please upvote if you find this helpful :) 

In [None]:
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
%env WANDB_DISABLED=True

## Config

In [None]:
# Path of the pretrained checkpoint directory; loaded by both the tokenizer and `model_init`.
model_checkpoint = '../input/distilbertbaseuncased'
# Per-device batch size for training and evaluation (the training value is overridden per trial during the search).
batch_size = 16
# Every excerpt is padded or truncated to exactly this many tokens by `tokenize`.
max_length = 256

## Loading and preprocessing training data with HF datasets

In [None]:
# Fold assignments come from https://www.kaggle.com/abhishek/step-1-create-folds
# The target column is renamed to 'label' because that is the column name the HF Trainer picks up as the target.
df = (
    pd.read_csv('../input/step-1-create-folds/train_folds.csv')
    .rename(columns={'target': 'label'})
)

# Fold 0 is held out for validation; all remaining folds are used for training.
train_dataset = Dataset.from_pandas(df.loc[df.kfold.ne(0)].reset_index(drop=True))
valid_dataset = Dataset.from_pandas(df.loc[df.kfold.eq(0)].reset_index(drop=True))

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize(batch):
    """Tokenize the 'excerpt' field of a batch.

    Each excerpt is truncated and padded to exactly `max_length` tokens,
    so every encoded example has the same length.
    """
    return tokenizer(
        batch['excerpt'],
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )

# Tokenize both splits; each split is processed as one single batch (batch_size == number of rows).
train_dataset, valid_dataset = [
    split.map(tokenize, batched=True, batch_size=len(split))
    for split in (train_dataset, valid_dataset)
]

## Model and Training with HF transformers

In [None]:
def model_init():
    """Load a new model instance from `model_checkpoint`.

    `num_labels=1` gives the sequence-classification head a single output,
    so the model is actually used as a regressor here.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=1,
    )
    return model

def compute_metrics(pred):
    """Compute the root-mean-squared error for an evaluation pass.

    Args:
        pred: object exposing `label_ids` (true targets) and `predictions`
            (model outputs), as handed to `compute_metrics` by the Trainer.

    Returns:
        dict with a single entry, 'rmse', holding the RMSE as a Python float.
    """
    targs = pred.label_ids
    preds = pred.predictions
    # Take the square root ourselves instead of passing `squared=False`: that keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it raises a TypeError on current
    # releases. For the single regression output used here (num_labels=1) the value is identical.
    rmse = float(mean_squared_error(targs, preds) ** 0.5)
    return {
        'rmse': rmse,
    }

# Baseline training configuration; the values in `hp_space` override their counterparts during the search.
args = TrainingArguments(
    output_dir="outputs_dir",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # --- evaluation / checkpointing ---
    evaluation_strategy="epoch",
    save_strategy='no',  # remove this after hyperparameter tuning
    load_best_model_at_end=False,  # change this to True after hyperparameter tuning
)

# The model is supplied through `model_init` (a factory) rather than as an instance,
# so the Trainer can build a new model whenever it needs one.
trainer = Trainer(
    args=args,
    model_init=model_init,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

## Hyperparameter tuning

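Hugging Face makes this easy for us: we describe the search space in a function and pass it to `Trainer.hyperparameter_search`, which runs the trials.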

In [None]:
def hp_space(trial):
    """Sample one set of hyperparameters for a trial.

    Args:
        trial: object providing `suggest_float`, `suggest_int` and
            `suggest_categorical` (presumably an Optuna trial - confirm
            against the search backend in use).

    Returns:
        dict mapping training-argument names to the sampled values.
    """
    # Learning rate and weight decay span two orders of magnitude each, so both are sampled on a log scale.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    seed = trial.suggest_int("seed", 1, 40)
    train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    weight_decay = trial.suggest_float("weight_decay", 1e-3, 1e-1, log=True)

    return dict(
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        seed=seed,
        per_device_train_batch_size=train_batch_size,
        weight_decay=weight_decay,
    )

In [None]:
def rmse_objective(metrics):
    """Return the validation RMSE, the single value the search minimises.

    `metrics` is the evaluation-metrics dict; the 'rmse' entry produced by
    `compute_metrics` appears there under the 'eval_' prefix.
    """
    return metrics["eval_rmse"]


# Spell out the objective and the backend instead of relying on defaults:
# - the default objective sums every custom metric, so it would silently change if
#   `compute_metrics` ever reported more than the RMSE;
# - `hp_space` uses the Optuna trial API, so the Optuna backend is the one it works with.
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=rmse_objective,
    n_trials=20,
    direction="minimize",
    backend="optuna",
)

Let's see the parameters of the best run. 

In [None]:
# Display the object returned by `trainer.hyperparameter_search`; its `.hyperparameters` are read in the following cells.
best_run

In [None]:
# One "name: value" line per hyperparameter of the best run.
for name, value in best_run.hyperparameters.items():
    print(name, value, sep=': ')

We can now reproduce the best run by copying its hyperparameters onto the trainer's arguments and training again.

In [None]:
# Overwrite the matching training arguments with the best trial's values, then train once more.
best_hparams = best_run.hyperparameters
for name in best_hparams:
    setattr(trainer.args, name, best_hparams[name])

trainer.train()

### Please upvote if you find this helpful :) 