In [None]:
!pip install -q --no-deps ../input/fasthugs

In [None]:
from fastai.text.all import *
from fastai.callback.wandb import *
from fasthugs.data import TransformersTextBlock, TextGetter
from fasthugs.learner import TransLearner

from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import StratifiedKFold, KFold
import gc
import wandb

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the W&B API key from the notebook's secret store so it is never hard-coded here.
secrets_client = UserSecretsClient()
wandb_key = secrets_client.get_secret("wandb_api_key")

In [None]:
# Configure Weights & Biases through environment variables.
# NOTE(review): WANDB_ENTITY is a specific account name — change it when running under another account.
%env WANDB_ENTITY=arampacha
%env WANDB_PROJECT=commonlit
%env WANDB_SILENT=true

In [None]:
# Authenticate with Weights & Biases using the key fetched from the secret store.
wandb.login(key=wandb_key)

## Data preprocessing

In [None]:
# `output_path` is where the learner writes its artifacts; `path` is the competition data directory.
output_path = Path('./')
path = Path('../input/commonlitreadabilityprize')
# Show what the data directory contains.
path.ls()

In [None]:
# Load the training table and preview its first two rows.
train_csv = path/'train.csv'
train_df = pd.read_csv(train_csv)
train_df.head(2)

In [None]:
# Summary statistics of the numeric columns of the training table.
train_df.describe()

In [None]:
# 5-fold shuffled split with a fixed seed; keep only the validation indices of each fold.
cv = KFold(n_splits=5, shuffle=True, random_state=8)
row_positions = np.arange(len(train_df))
valid_idxs = [fold_valid for _, fold_valid in cv.split(row_positions)]

## Training on first fold

For demonstration purposes I'm using `distilroberta-base`. It is a lightweight distilled version of RoBERTa, which performs considerably worse than the full-size model. You can easily switch to other models by changing `model_name`. The `TransformersTextBlock` uses a pretrained Hugging Face tokenizer internally and is set up by providing the path to a pretrained model.

In [None]:
# Local path to the pretrained model; it is passed both to the text block (tokenizer)
# and to `from_pretrained` (weights), and is also used in the W&B run names below.
model_name = '../input/roberta-transformers-pytorch/distilroberta-base'

In [None]:
# Inputs come from the `excerpt` column (processed by the transformers text block),
# targets from the `target` column as a regression value; fold 0 is held out for validation.
dblock = DataBlock(
    blocks=[
        TransformersTextBlock(pretrained_model_name=model_name),
        RegressionBlock(),
    ],
    get_x=TextGetter('excerpt'),
    get_y=ItemGetter('target'),
    splitter=IndexSplitter(valid_idxs[0]),
)

In [None]:
# Build the dataloaders: batch size 16 for training, 32 for validation, 2 worker processes.
dls = dblock.dataloaders(train_df, bs=16, val_bs=32, num_workers=2)

In [None]:
# Sanity check: display up to four samples from a batch.
dls.show_batch(max_n=4)

In [None]:
# A single output label matches the scalar regression target declared in the DataBlock.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)
# Report RMSE on the validation fold; learner artifacts go under `output_path`.
learn = TransLearner(dls, model, path=output_path, metrics=rmse)

In [None]:
# Run the learning-rate finder to help pick `lr` below.
learn.lr_find()

In [None]:
# Hyperparameters shared by all folds: maximum learning rate and weight decay.
lr = 2e-5
wd = 0.05

I'm going to log runs to Weights & Biases. This may be of great use for further analysis of the results. As you shall see, with fastai it doesn't require much extra work at all.

In [None]:
# All folds of one configuration share a W&B group; the run name adds the fold index.
group = f"{model_name}_lr{lr:.0e}"
name = f"{model_name}_lr{lr:.0e}-fold{0}"
run = wandb.init(
    name=name,
    group=group,
)

In [None]:
# Callbacks: log training to W&B (no prediction or model upload) and keep the
# checkpoint with the lowest monitored `_rmse` value, saved under the name `model_0`.
cbs = [
    WandbCallback(log_preds=False, log_model=False),
    SaveModelCallback(monitor='_rmse', fname='model_0', comp=np.less),
]
# Four epochs of one-cycle training with the learning rate and weight decay set above.
learn.fit_one_cycle(4, lr, wd=wd, cbs=cbs)

The best performing model is stored and loaded at the end of the training by `SaveModelCallback`:

In [None]:
# List the contents of the `models` directory under `output_path`.
(output_path/'models').ls()

In [None]:
# Evaluate the current learner on the validation fold.
learn.validate()

In [None]:
# Accumulates one tensor of test predictions per fold; averaged at submission time.
all_preds = []

In [None]:
# Load the test table and wrap it in a test dataloader derived from the training dataloaders.
test_csv = path/'test.csv'
test_df = pd.read_csv(test_csv)
test_dl = dls.test_dl(test_df)
# Preview what the model will receive at inference time.
test_dl.show_batch()

In [None]:
# Predict on the test set with the fold-0 model; the second returned value is discarded.
preds, _ = learn.get_preds(dl=test_dl)
all_preds.append(preds)

## Cross validation

Let's fit models on the remaining folds and save all the predictions.

In [None]:
# Train one model per remaining fold (fold 0 was trained above) and collect its test predictions.
# The W&B group is the same for every fold, so it is built once outside the loop.
group = f"{model_name}_lr{lr:.0e}"
for i in range(1, len(valid_idxs)):
    name = f"{group}-fold{i}"
    # Using the run as a context manager finishes it when the fold is done.
    with wandb.init(name=name, group=group) as run:
        dblock = DataBlock(blocks=[TransformersTextBlock(pretrained_model_name=model_name), RegressionBlock()],
                           get_x=TextGetter('excerpt'),
                           get_y=ItemGetter('target'),
                           splitter=IndexSplitter(valid_idxs[i]))
        dls = dblock.dataloaders(train_df, bs=16, val_bs=32, num_workers=2)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
        # Same learner configuration as fold 0, including the output path for checkpoints.
        learn = TransLearner(dls, model, metrics=rmse, path=output_path)
        cbs = [WandbCallback(log_preds=False, log_model=False),
               SaveModelCallback(monitor='_rmse', fname=f'model_{i}', comp=np.less)]
        # Use the shared `lr` so the rate actually trained with always matches
        # the value embedded in the run name and group.
        learn.fit_one_cycle(4, lr, wd=wd, cbs=cbs)
        # `test_dl` was built from the fold-0 dataloaders and is reused for every fold.
        preds, _ = learn.get_preds(dl=test_dl)
        all_preds += [preds]
        # Drop every reference to the network (the learner AND the `model` name)
        # before collecting, otherwise the weights stay reachable and cannot be freed.
        del learn, model
        gc.collect()
        torch.cuda.empty_cache()

## Submission
Finally, we can average the predictions from all models and submit:

In [None]:
# Put the per-fold predictions side by side and average across folds.
# assumes each entry of `all_preds` has shape (n_test, 1) — TODO confirm
fold_preds = torch.cat(all_preds, dim=1)
preds = fold_preds.mean(dim=-1)

In [None]:
# Fill the sample submission's `target` column with the averaged predictions and write it out.
# NOTE(review): relies on the sample submission rows being in the same order as test.csv — confirm.
submission = (
    pd.read_csv(path/'sample_submission.csv', index_col='id')
    .assign(target=preds.numpy())
)
submission.to_csv('submission.csv')