# About this notebook

* This notebook is based on the fastai text [tutorial](https://docs.fast.ai/tutorial.text.html). Additional background can be found in the fastai book, in [Chapter 10](https://github.com/fastai/fastbook/blob/master/10_nlp.ipynb) and [Chapter 12](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb). 

* This notebook also creates the submission file. A separate inference-only notebook, intended for faster submissions, is available [here](https://www.kaggle.com/snnclsr/commonlit-fastai-inference).



In [None]:
from pathlib import Path

from fastai.text.all import *

# Data Loading

In [None]:
# Directory holding the competition CSV files.
BASE_DATA_PATH = Path("../input/commonlitreadabilityprize/")

# Read the train and test tables, in that order.
df_train, df_test = (
    pd.read_csv(BASE_DATA_PATH / csv_name)
    for csv_name in ("train.csv", "test.csv")
)

# Only the passage text and its score are used from the training table.
df_train = df_train.loc[:, ["excerpt", "target"]]

# Language Model Dataloaders

In [None]:
# DataBlock for language-model fine-tuning on the training excerpts.
# - is_lm=True puts the text block in language-model mode, so no get_y is given.
# - get_x reads the "text" column; per the fastai text tutorial this is where
#   TextBlock.from_df stores the tokenized "excerpt" column.
# - RandomSplitter is seeded so the train/validation split is repeatable.
common_lit = DataBlock(
    blocks=TextBlock.from_df("excerpt", is_lm=True),
    get_x=ColReader("text"),
    splitter=RandomSplitter(seed=42),
)

# Batches of 64 sequences, each 256 tokens long.
dls_lm = common_lit.dataloaders(
    df_train,
    bs=64,
    seq_len=256,
)

In [None]:
# Preview up to four language-model samples as a sanity check on the pipeline.
dls_lm.show_batch(max_n=4)

# Language Model Learner

In [None]:
# AWD-LSTM language-model learner on the excerpt dataloaders, reporting
# accuracy and perplexity, with weight decay 0.1.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    metrics=[accuracy, Perplexity()],
    wd=0.1,
)
# Switch the learner to mixed-precision (fp16) training.
learn = learn.to_fp16()

In [None]:
# Learning-rate finder run; presumably used to choose the rate for the fit below.
learn.lr_find()

In [None]:
# One epoch with the one-cycle schedule at lr 1e-2, before the explicit
# unfreeze() in the fine-tuning section.
learn.fit_one_cycle(1, 1e-2)

In [None]:
# Checkpoint the learner under the name '1epoch'.
learn.save('1epoch')

In [None]:
# Reload the '1epoch' checkpoint written by the previous cell
# (presumably so the notebook can be resumed from this point).
learn = learn.load('1epoch')

# Fine-tune the LM

In [None]:
# Make every layer group trainable, then fine-tune the whole language model
# for ten epochs with a peak learning rate of 1e-3.
learn.unfreeze()
learn.fit_one_cycle(n_epoch=10, lr_max=1e-3)

In [None]:
# Save the fine-tuned encoder as 'finetuned'; the regressor section loads it
# back with load_encoder("finetuned").
learn.save_encoder('finetuned')

# Dataloaders for the Text Regression

In [None]:
# DataBlock for the regression task: tokenized excerpt -> float target.
# - vocab=dls_lm.vocab reuses the language-model vocabulary so token ids line
#   up with the encoder that was fine-tuned above.
# - RegressionBlock makes "target" a regression target.
# - The splitter is seeded (same seed as the language-model split) so the
#   validation set, and therefore the reported RMSE, is the same on every run.
#   The original unseeded RandomSplitter() gave a different split each time.
dls_class = DataBlock(
    blocks=(
        TextBlock.from_df('excerpt', seq_len=256, vocab=dls_lm.vocab),
        RegressionBlock,
    ),
    get_x=ColReader('text'),
    get_y=ColReader('target'),
    splitter=RandomSplitter(seed=42),
)

dls = dls_class.dataloaders(df_train, bs=64)
# Preview up to four (text, target) samples.
dls.show_batch(max_n=4)

# Load & Fine-tune the Regressor Model

In [None]:
# text_classifier_learner is used here for regression: the dataloaders were
# built with a RegressionBlock target and the reported metric is RMSE.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=rmse)

In [None]:
# Load the encoder saved as "finetuned" at the end of the language-model stage.
learn = learn.load_encoder("finetuned")

In [None]:
# Learning-rate finder run for the regressor; presumably informs the rates used below.
learn.lr_find()

In [None]:
# First regressor pass: one epoch at lr 2e-2, before the freeze_to/unfreeze
# stages in the following cells.
learn.fit_one_cycle(1, 2e-2)

In [None]:
# Gradual unfreezing, stage 1: freeze_to(-2), then one epoch.
# The learning-rate range runs from lr_last2 / 2.6**4 up to lr_last2.
learn.freeze_to(-2)
lr_last2 = 1e-2
learn.fit_one_cycle(n_epoch=1, lr_max=slice(lr_last2 / 2.6**4, lr_last2))

In [None]:
# Gradual unfreezing, stage 2: freeze_to(-3), then one epoch.
# The learning-rate range runs from lr_last3 / 2.6**4 up to lr_last3.
learn.freeze_to(-3)
lr_last3 = 5e-3
learn.fit_one_cycle(n_epoch=1, lr_max=slice(lr_last3 / 2.6**4, lr_last3))

In [None]:
# Gradual unfreezing, final stage: unfreeze everything and train two epochs.
# The learning-rate range runs from lr_full / 2.6**4 up to lr_full.
learn.unfreeze()
lr_full = 1e-3
learn.fit_one_cycle(n_epoch=2, lr_max=slice(lr_full / 2.6**4, lr_full))

In [None]:
# learn.save("final")
# Export the trained learner to "final.pkl"; the save() call above is left
# disabled. Presumably the separate inference notebook loads this file.
learn.export("final.pkl")

# Inference

In [None]:
# Build a test dataloader from the raw test excerpts, reusing the learner's
# existing dataloaders.
test_dl = learn.dls.test_dl(df_test.excerpt)

In [None]:
# get_preds returns a pair; only the predictions are kept and the second
# element is discarded.
preds, _ = learn.get_preds(dl=test_dl)

In [None]:
# Start from the sample submission and overwrite its "target" column.
# NOTE(review): assumes its rows are in the same order as test.csv - confirm.
df_sub = pd.read_csv(BASE_DATA_PATH / "sample_submission.csv")

# Flatten explicitly. A single-output regressor presumably yields predictions
# of shape (n, 1); ravel() turns them into a 1-D array of length n instead of
# relying on pandas to accept a 2-D array as a single column. It is a no-op if
# the predictions are already 1-D.
df_sub['target'] = preds.numpy().ravel()

df_sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame for a final visual check.
df_sub