## 25 DistilBERT models

I'd like to see how the CV score is distributed when the same model is trained on 5 folds × 5 seeds (25 runs in total). Let's see :)

### Please upvote if you find this helpful :) 

In [None]:
# Remove the installed fsspec quietly and without a confirmation prompt
# (presumably to avoid a version conflict with `datasets` — TODO confirm).
!pip uninstall fsspec -qq -y
# Install `datasets` offline: --no-index skips the package index and
# --find-links points pip at the local wheels directory.
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# disable W&B logging as we don't have access to the internet
%env WANDB_DISABLED=True

## Config

In [None]:
# Local path of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
model_checkpoint = '../input/distilbertbaseuncased'
# Per-device batch size, used for training and for evaluation.
batch_size = 16
# Every excerpt is padded or truncated to this many tokens.
max_length = 256

## Loading and preprocessing training data with HF datasets

In [None]:
# Training data with precomputed fold assignments
# (https://www.kaggle.com/abhishek/step-1-create-folds).
# `target` is renamed to `label`, the column name HF expects in order to
# pick up the target column in the trainer.
df = pd.read_csv('../input/step-1-create-folds/train_folds.csv').rename(columns={'target': 'label'})

# Fast tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize(batch):
    """Tokenize the ``excerpt`` entries of ``batch``.

    Sequences are truncated and padded to exactly ``max_length`` tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        batch['excerpt'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Test data, run through the same column rename as the training frame.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv').rename(columns={'target': 'label'})

# Convert to an HF dataset and tokenize all rows in one batch.
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=test_dataset.num_rows)

## Model and Training with HF transformers

In [None]:
def model_init():
    """Load a fresh model from ``model_checkpoint`` with a single output.

    Passed to the Trainer as ``model_init`` so that it can build the model
    itself.
    """
    # note: with num_labels=1 this is actually a regression model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=1,
    )
    return model

def compute_metrics(pred):
    """Compute the root-mean-squared error of a prediction output.

    RMSE is computed directly with numpy rather than through
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    argument has been deprecated and later removed from scikit-learn.

    Args:
        pred: object exposing ``label_ids`` (true targets) and
            ``predictions`` (model outputs), as handed to
            ``compute_metrics`` by the Trainer.

    Returns:
        dict with a single key ``'rmse'`` mapping to a Python float.

    Raises:
        ValueError: if there are no samples, or if labels and predictions
            do not contain the same number of values.
    """
    # Flatten both arrays: with a single-output model the predictions carry a
    # trailing singleton axis, and subtracting (N,) from (N, 1) without
    # flattening would silently broadcast to an (N, N) matrix.
    labels = np.asarray(pred.label_ids, dtype=np.float64).reshape(-1)
    predictions = np.asarray(pred.predictions, dtype=np.float64).reshape(-1)
    if labels.size == 0:
        raise ValueError('Cannot compute RMSE on zero samples.')
    if labels.shape != predictions.shape:
        raise ValueError(
            f'Found {labels.size} labels but {predictions.size} predictions.'
        )
    squared_errors = (labels - predictions) ** 2
    return {
        'rmse': float(np.sqrt(squared_errors.mean())),
    }

def init_trainer(fold, seed):
    """Build a (not yet trained) Trainer for one fold/seed combination.

    Rows of ``df`` whose ``kfold`` equals ``fold`` form the validation set;
    all remaining rows form the training set. Both sets are tokenized in a
    single batch each.

    Args:
        fold: value of ``df.kfold`` to hold out for validation.
        seed: random seed forwarded to ``TrainingArguments``.

    Returns:
        A ``Trainer`` wired up with the model factory, datasets, tokenizer
        and RMSE metric.
    """
    train_dataset = Dataset.from_pandas(df[df.kfold != fold].reset_index(drop=True))
    valid_dataset = Dataset.from_pandas(df[df.kfold == fold].reset_index(drop=True))
    train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
    valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
    args = TrainingArguments(
        "./tmp",
        evaluation_strategy="epoch",
        # load_best_model_at_end can only restore a checkpoint that was
        # actually saved, and Transformers requires the save strategy to
        # match the evaluation strategy; without this the step-based default
        # applies and the setting is either rejected or has nothing to load.
        save_strategy="epoch",
        # "./tmp" is shared by every fold/seed run, so cap how many
        # checkpoints are kept on disk.
        save_total_limit=1,
        learning_rate=3e-5,
        fp16=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        seed=seed,
        weight_decay=0.001,
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model_init=model_init,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    return trainer

In [None]:
# preds:   one array of test-set predictions per trained model.
# results: one dict per trained model holding its fold, seed and validation RMSE.
preds, results = [], []

In [None]:
# Train one model per (fold, seed) pair: 5 folds * 5 seeds.
for fold in range(5):
    for seed in range(5):
        trainer = init_trainer(fold, seed)
        trainer.train()

        # Record the validation score of this run.
        metrics = trainer.evaluate()
        results.append({'fold': fold, 'seed': seed, 'rmse': metrics['eval_rmse']})

        # Keep this run's test-set predictions for the final average
        # (the predictions are the first field of the prediction output).
        test_preds = trainer.predict(test_dataset)
        preds.append(test_preds.predictions)

## Analysis

In [None]:
# One row per trained model, with columns fold / seed / rmse.
res = pd.DataFrame(data=results)
res

In [None]:
# Mean and standard deviation of the validation RMSE across all runs.
res['rmse'].mean(), res['rmse'].std()

In [None]:
# Per-fold mean and standard deviation of the validation RMSE (spread over seeds).
folds = res.groupby('fold')['rmse'].agg(['mean', 'std'])
folds

In [None]:
# Per-seed mean and standard deviation of the validation RMSE (spread over folds).
seeds = res.groupby('seed')['rmse'].agg(['mean', 'std'])
seeds

In [None]:
# Histogram of the validation RMSE over all runs (trailing `;` hides the return value).
res['rmse'].hist(bins=5);

## Submission

In [None]:
# Stack the per-model test predictions along a new leading axis and average
# over the models to get one prediction per test row.
preds = np.stack(preds)
# Flatten to 1-D: each model's predictions presumably have shape (n_test, 1)
# because the model has a single output, and a DataFrame column should be
# assigned a 1-D array.
mean_preds = preds.mean(axis=0).reshape(-1)
sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
# Item assignment always writes the column; attribute assignment would only
# set a plain Python attribute if the column did not already exist.
sub['target'] = mean_preds
sub.to_csv('submission.csv', index=False)
sub.head()

### Please upvote if you find this helpful :) 