In [None]:
!pip install -q --no-deps ../input/fasthugs

In [None]:
from fastai.text.all import *
from fasthugs.data import TransformersTextBlock, TextGetter
from fasthugs.learner import TransLearner

from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import StratifiedKFold
import gc

## Data preprocessing

In [None]:
# Directory handed to the Learner as its `path`; its `models` sub-folder is listed after training.
output_path = Path('./')

# Directory holding the competition CSV files (train / test / sample submission).
path = Path('../input/commonlitreadabilityprize')
path.ls()

In [None]:
# Load the training table and preview its first rows.
train_df = pd.read_csv(path.joinpath('train.csv'))
train_df.head()

In [None]:
# Summary statistics of the training table.
train_df.describe()

In [None]:
# StratifiedKFold needs discrete labels, so the continuous target is binarised at 0
# and the folds are stratified on that sign indicator.
# `np.float` was removed in NumPy 1.24; the builtin `float` is the exact equivalent.
cv_lbls = (train_df.target.to_numpy() > 0).astype(float)

# Fixed random_state keeps the fold assignment reproducible across runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)

# One array of validation row positions per fold; the training halves are discarded
# because IndexSplitter only needs the validation indices.
valid_idxs = [valid_idx for _, valid_idx in cv.split(np.arange(len(train_df)), cv_lbls)]

## Training on first fold

In [None]:
# Location of the pretrained checkpoint; used both for the text block (tokenisation)
# and for loading the sequence-classification model.
model_name = '../input/roberta-transformers-pytorch/distilroberta-base'

In [None]:
# Data pipeline for the first fold: the `excerpt` column is the model input,
# `target` is the regression label, and the rows in valid_idxs[0] are held out for validation.
dblock = DataBlock(
    blocks=[
        TransformersTextBlock(pretrained_model_name=model_name),
        RegressionBlock(),
    ],
    get_x=TextGetter('excerpt'),
    get_y=ItemGetter('target'),
    splitter=IndexSplitter(valid_idxs[0]),
)

In [None]:
# Training batch size; the validation loader uses twice as many items per batch.
bs = 16
dls = dblock.dataloaders(
    train_df,
    bs=bs,
    val_bs=2 * bs,
    num_workers=2,
)

In [None]:
# Sanity check: display up to four samples from a batch.
dls.show_batch(max_n=4)

In [None]:
# Settings shared by every fold: dropout override, optimiser and reported metrics.
p_hdrop = 0.1
opt_func = Adam
metrics = [rmse, R2Score(), PearsonCorrCoef(), SpearmanCorrCoef()]

# num_labels=1 gives a single-output head, i.e. a regression model.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    hidden_dropout_prob=p_hdrop,
)

learn = TransLearner(
    dls,
    model,
    metrics=metrics,
    path=output_path,
    opt_func=opt_func,
)

In [None]:
# Peak learning rate passed to fit_one_cycle.
lr = 2e-5
# Three identical weight-decay values; presumably one per parameter group of the
# learner — TODO confirm against TransLearner's splitter.
wd = [0.05] * 3

In [None]:
# Callbacks for the first fold:
#  * SaveModelCallback tracks the `_rmse` metric (lower is better, hence np.less)
#    and writes the best weights under the name `model_0`.
#  * GradientAccumulation(32) delays the optimiser step until the accumulation
#    threshold of 32 is reached.
cbs = [
    SaveModelCallback(monitor='_rmse', fname='model_0', comp=np.less, reset_on_fit=False),
    GradientAccumulation(32),
]

# Four epochs of one-cycle training.
learn.fit_one_cycle(4, lr, wd=wd, cbs=cbs)

`SaveModelCallback` saves the best-performing model during training and loads it back into the learner when training finishes:

In [None]:
# List the files in the `models` directory under output_path.
(output_path/'models').ls()

In [None]:
# Evaluate the learner on the validation set (loss followed by the configured metrics).
learn.validate()

In [None]:
# Accumulates one tensor of test-set predictions per fold.
# NOTE(review): later cells append to this list, so re-run this cell first when
# re-executing them, otherwise predictions are duplicated.
all_preds = []

In [None]:
# Load the test table and wrap it in a dataloader that reuses the training
# pipeline's preprocessing; this loader is reused for every fold's predictions.
test_df = pd.read_csv(path.joinpath('test.csv'))
test_dl = dls.test_dl(test_df)
test_dl.show_batch()

In [None]:
# Test-set predictions of the first-fold model; targets are not needed and are discarded.
preds, _ = learn.get_preds(dl=test_dl)
all_preds.append(preds)

## Cross validation

Let's fit models on the remaining folds and save all the predictions.

In [None]:
# Train one fresh model per remaining fold (fold 0 was trained above) and collect
# its test-set predictions in `all_preds`.
# NOTE(review): unlike the first fold, this loop trains without GradientAccumulation
# and with explicit div/div_final values — confirm the difference is intentional.
for i in range(1, len(valid_idxs)):
    # Same pipeline as for the first fold, but validating on fold i.
    dblock = DataBlock(
        blocks=[TransformersTextBlock(pretrained_model_name=model_name), RegressionBlock()],
        get_x=TextGetter('excerpt'),
        get_y=ItemGetter('target'),
        splitter=IndexSplitter(valid_idxs[i]),
    )
    dls = dblock.dataloaders(train_df, bs=bs, val_bs=bs*2, num_workers=2)

    # Start again from the pretrained weights for every fold.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, hidden_dropout_prob=p_hdrop)
    # path=output_path matches the first-fold learner so all checkpoints share one directory.
    learn = TransLearner(dls, model, metrics=metrics, path=output_path, opt_func=opt_func)

    # Keep the weights of the epoch with the lowest validation `_rmse` as model_{i}.
    cbs = [SaveModelCallback(monitor='_rmse', fname=f'model_{i}', comp=np.less)]
    learn.fit_one_cycle(4, lr, wd=wd, cbs=cbs, div=10, div_final=1000)

    # `test_dl` is the loader built earlier from the test table.
    preds, _ = learn.get_preds(dl=test_dl)
    all_preds.append(preds)

    # Drop every reference to this fold's model before clearing the CUDA cache:
    # deleting `learn` alone leaves `model` (and `dls`) alive, so the cache could
    # not actually release the model's memory.
    del learn, model, dls
    gc.collect()
    torch.cuda.empty_cache()

## Submission
Finally, we can average the predictions from all models and submit:

In [None]:
# Join the per-fold prediction tensors along dim 1, giving one column per fold
# (assumes each entry of all_preds has shape (n_test, 1) — TODO confirm).
# NOTE: this rebinds `preds`, which previously held a single fold's predictions.
preds = torch.cat(all_preds, dim=1)
preds

In [None]:
# Use the sample submission (indexed by `id`) as a template and set its `target`
# column to the per-row mean over the last dim of `preds`.
# NOTE(review): values are assigned positionally, which presumably relies on the
# sample submission rows being in the same order as the test table — verify.
submission = (
    pd.read_csv(path/'sample_submission.csv', index_col='id')
    .assign(target=preds.mean(dim=-1).numpy())
)
submission.to_csv('submission.csv')