# What to do?
* Predict if a text is easier to read or not.
* The more negative the score, the harder the text is to read.
* At first glance this looks like tabular data.
* This is also likely a regression problem, since the targets are continuous values.

look into the data

In [None]:
from shutil import copyfile
import random
import math
from typing import List, TypeVar, Dict, Any
import wandb
from pandas.core.frame import DataFrame
from fastai.basics import *
from fastai.callback.all import *
from fastai.callback.wandb import *
from fastai.text.all import *
from kaggle_secrets import UserSecretsClient

## Setup wandb

In [None]:
# Fetch the Weights & Biases API key stored under the "wandb_key" secret name.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("wandb_key")

In [None]:
# Authenticate through the Python API instead of a shell command, so the
# secret is never interpolated into a shell command line.
wandb.login(key=api_key)

In [None]:
# `path` is the root of the input datasets; `op_path` is the working
# directory that the cells below copy data into and hand to the learner.
path, op_path = Path('../input'), Path('/kaggle/working')

## moving data to pwd

In [None]:
def file_copy(file_paths: List, dest: Path):
    """Copy every file in ``file_paths`` into the directory ``dest``.

    Each file keeps its own name; an existing file with the same name in
    ``dest`` is overwritten. ``dest`` (and any missing parents) is created
    first.

    Args:
        file_paths: source files, given as ``Path`` objects or path strings.
        dest: target directory.

    Raises:
        FileNotFoundError: if a source file does not exist. Nothing is
            created in ``dest`` for that file.
    """
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    for src in file_paths:
        src = Path(src)
        # copyfile creates the target itself; pre-creating it with touch()
        # would leave an empty stray file behind when the source is missing.
        copyfile(src, dest/src.name)

In [None]:
# Copy the test set and the CBT-infused training set into the working directory.
file_copy(
    [
        Path('../input/commonlitreadabilityprize/test.csv'),
        Path('../input/commonlit-infusing-data-with-same-domain-data/vanilla_comlit_train_infused_with_cbt.csv'),
    ],
    op_path/'common-lit-datset',
)

## Initial cleaning of dataframe

In [None]:
def drop_cols(df: DataFrame, cols: List) -> DataFrame:
    """Return a new dataframe without the columns named in ``cols``.

    ``cols`` is handed straight to ``DataFrame.drop``, so a single column
    label works as well as a list of labels. ``df`` itself is not modified.
    """
    trimmed = df.drop(columns=cols)
    return trimmed

In [None]:
# Load the training CSV copied above and remove its 'Unnamed: 0' column.
df_train = drop_cols(
    pd.read_csv(op_path/'common-lit-datset/vanilla_comlit_train_infused_with_cbt.csv'),
    'Unnamed: 0',
)
df_train.head(2)

In [None]:
# Load the test CSV copied into the working directory and preview two rows.
df_test=pd.read_csv(op_path/'common-lit-datset/test.csv')
df_test.head(2)

In [None]:
# (rows, columns) of the training and test frames.
df_train.shape, df_test.shape

## Re-arranging and pre-processing the data for the language model
* dropping all columns except excerpts
* concatenating both dataframes (optional)

In [None]:
def concat_dfs(dfs: List) -> DataFrame:
    """Stack the dataframes in ``dfs`` row-wise into a single dataframe.

    Any number of frames may be passed. Each frame keeps its original index
    labels, so the result can contain repeated index values.
    """
    return pd.concat(objs=dfs, axis=0)

In [None]:
# The training frame is used as-is (same object as df_train); the test frame
# is copied and stripped of its 'id', 'url_legal' and 'license' columns.
df_train_excerpts = df_train
df_test_excerpts = drop_cols(df_test.copy(), ['id', 'url_legal', 'license'])

In [None]:
# Preview the first row of the training frame.
df_train_excerpts.head(1)

In [None]:
# Preview the first row of the trimmed test frame.
df_test_excerpts.head(1)

In [None]:
# Combine train and test rows into the single frame the language model is built from.
df = concat_dfs([df_train_excerpts, df_test_excerpts])
df.head(1)

In [None]:
# (rows, columns) of the combined frame.
df.shape

## Experiment 1
* Training with the following-->
  * Data augmented with random synonym replacement.
  * Infused with CBT excerpts.
  * Batch size of 128 for language model.
  * Batch size of 8 for text learner.
  * Regressor split: 80% train, 10% validation.
  * RAdam as optimizer.
  * Trained the text learner for 4 epochs.

In [None]:
# Start the W&B run for this experiment; the tags mirror the setup listed above.
wandb.init(
    project='commonlit',
    entity='sapal6',
    name='vanilla-dataset-infused-with-cbt-lmbs128-textlearnerbs8-radam-4-epochs',
    tags=[
        'datainfusion',
        'augmentedDataset',
        'testSetCombined',
        'bs8',
        'discriminativeTraining',
    ],
)

## The language model learner

In [None]:
def get_lm_learner(df: DataFrame, path: Path,
                   valid_pct: float = 0.3, bs: int = 64,
                   seq_len:int = 72, drop_mult: float = 1.0,
                   opt_func = None, metrics = None,
                   wd: float = 0.1):
    """Build the language-model dataloaders and an AWD_LSTM learner on top of them.

    Args:
        df: dataframe holding the text; no text column is named here, so
            ``TextDataLoaders.from_df`` falls back to its own default.
        path: working directory handed to both the dataloaders and the learner.
        valid_pct: fraction of rows held out for validation.
        bs: batch size.
        seq_len: sequence length of the language-model batches.
        drop_mult: dropout multiplier passed to ``language_model_learner``.
        opt_func: optimizer factory; ``None`` keeps fastai's default optimizer.
        metrics: metrics passed to the learner.
        wd: weight decay passed to the learner.

    Returns:
        Tuple ``(dls_lm, learn)``; the learner is switched to fp16.
    """
    dls_lm = TextDataLoaders.from_df(df, path, is_lm=True,
                                     valid_pct=valid_pct, bs=bs,
                                     seq_len=seq_len)

    learner_kwargs = dict(drop_mult=drop_mult, metrics=metrics, path=path, wd=wd)
    # Forwarding opt_func=None would overwrite the learner's default optimizer
    # with None and break training, so only pass it when the caller set one.
    if opt_func is not None:
        learner_kwargs['opt_func'] = opt_func

    learn = language_model_learner(dls_lm, AWD_LSTM, **learner_kwargs).to_fp16()

    return dls_lm, learn
    

In [None]:
# Language-model dataloaders and learner: RAdam, batch size 128, sequence length 80.
dls_lm, learn = get_lm_learner(df, op_path, opt_func=RAdam, bs=128,
                               seq_len=80,
                               metrics=[accuracy, Perplexity()])

In [None]:
# Display a sample batch from the language-model dataloaders.
dls_lm.show_batch()

## Training the LM

In [None]:
# First pass: one epoch at 1e-2, before the explicit unfreeze in the next training cell.
learn.fit_one_cycle(1, 1e-2)

In [None]:
# Checkpoint the learner after the first epoch; reloaded below as './1epoch'.
learn.save('./1epoch')

unfreeze all the layers and train more

In [None]:
# Resume from the saved checkpoint, unfreeze the model and train two more
# epochs at a lower learning rate (1e-3).
learn.load('./1epoch')
learn.unfreeze()
learn.fit_one_cycle(2, 1e-3)

Good enough accuracy, let's save it.

In [None]:
# Point the learner at the current directory before saving the encoder;
# get_text_regressor sets the same learn.path before loading './final_encoder'.
learn.path = Path(".")
learn.save_encoder('./final_encoder')

## Using fastai text learner classifier head for regression

## Building the learner

In [None]:
def get_text_regressor(df: DataFrame, text_col: str, target_col:str, cols2Drop: List,
                       bs: int = 8, drop_mult: float = 1.0,
                       cbs=None, seq_len: int=80, opt_func=RAdam, wd=None):
    """Pre-process the data, build the dataloaders and the text regressor learner.

    Relies on two things created earlier in the notebook: the module-level
    ``dls_lm`` (its vocab is reused) and the encoder saved as
    ``'./final_encoder'``.

    Args:
        df: source dataframe.
        text_col: name of the column holding the raw text.
        target_col: name of the regression target column.
        cols2Drop: columns removed from ``df`` before building the dataloaders.
        bs: batch size.
        drop_mult: dropout multiplier passed to ``text_classifier_learner``.
        cbs: callbacks passed to the learner.
        seq_len: sequence length passed to ``TextBlock.from_df``.
        opt_func: optimizer factory passed to the learner.
        wd: weight decay; ``None`` falls back to 0.5, the value this function
            previously hard-coded.

    Returns:
        Tuple ``(df_train_for_regressor, dls, learn)``.
    """
    df_train_for_regressor = df.drop(cols2Drop, axis=1)

    data = DataBlock(
        blocks=(TextBlock.from_df(text_col, vocab=dls_lm.vocab, seq_len=seq_len), RegressionBlock),
        # NOTE(review): get_x reads 'text' rather than text_col; presumably
        # TextBlock.from_df writes the tokenized text to a 'text' column - confirm.
        get_x=ColReader('text'), get_y=ColReader(target_col),
        # Random subsets sized 0.8 (train) and 0.1 (valid), fixed seed.
        splitter=RandomSubsetSplitter(0.8, 0.1, seed=2))

    dls = data.dataloaders(df_train_for_regressor, bs=bs)
    learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=drop_mult,
                                    cbs=cbs,
                                    opt_func=opt_func,
                                    metrics=rmse,
                                    # Honour the caller's wd instead of silently ignoring it.
                                    wd=0.5 if wd is None else wd)

    learn.path = Path(".")
    learn = learn.load_encoder('./final_encoder')

    return df_train_for_regressor, dls, learn

In [None]:
# NOTE(review): aug_df.csv is not among the files copied by the file_copy call
# above - confirm it is placed in this folder some other way.
df_for_regressor = pd.read_csv(op_path/'common-lit-datset/aug_df.csv')
df_for_regressor, dls, learn = get_text_regressor(
    df_for_regressor,
    'excerpt',
    'target',
    ['id', 'url_legal', 'license', 'standard_error'],
    cbs=[WandbCallback(log_dataset=op_path/'common-lit-datset', log_model=False)],
    opt_func=RAdam,
)

In [None]:
# Show up to three samples from the training split.
dls.train.show_batch(max_n=3)

In [None]:
# Show up to three samples from the validation split.
dls.valid.show_batch(max_n =3)

In [None]:
# First regressor pass: one epoch at 2e-2, before any layer group is explicitly unfrozen.
learn.fit_one_cycle(1, 2e-2)

In [None]:
#lr_min,lr_steep = learn.lr_find()

In [None]:
# Gradual unfreezing, step 1: freeze_to(-2), then one epoch with learning
# rates spread over the range 1e-2/2.6**4 .. 1e-2.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

In [None]:
#lr_min, lr_steep = learn.lr_find()

In [None]:
# Gradual unfreezing, step 2: freeze_to(-3), then one more epoch over the
# same learning-rate range.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

In [None]:
#lr_min, lr_steep = learn.lr_find()

In [None]:
# Final stage: unfreeze everything and train four epochs over the same
# learning-rate range.
learn.unfreeze()
learn.fit_one_cycle(4, slice(1e-2/(2.6**4),1e-2))

In [None]:
# Close the W&B run. wandb.finish() is used instead of wandb.run.finish()
# because wandb.run is None when no run is active, which would raise
# AttributeError.
wandb.finish()