# Training RoBERTa-Large with cross entropy

## Construct training environment
We trained the model on Google Colab Pro inside a Miniconda environment. To reproduce our experiments in a Kaggle kernel, you need to recreate the same environment there as well.

In [None]:
%%bash
# Install Miniconda under /usr/local/miniconda and make `conda` usable in this cell.
# NOTE: every %%bash cell runs in its own shell process, so the PATH export and the
# activation below do not carry over to later cells; those cells re-run the conda
# hook themselves.

# Skip download and installation when Miniconda is already present: in batch mode
# the installer aborts if the target prefix exists, which made this cell fail on
# a second run.
if [ ! -x /usr/local/miniconda/bin/conda ]; then
    # -nc: do not download a second copy of the installer if it is already here.
    wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
    # The installer only needs to be executable; it does not need to be world-writable.
    chmod +x Miniconda3-latest-Linux-x86_64.sh
    ./Miniconda3-latest-Linux-x86_64.sh -b -p /usr/local/miniconda
fi
export PATH=$PATH":/usr/local/miniconda/bin"
conda update -y -n base -c defaults conda
conda init bash
source ~/.bashrc
# >>> conda initialize >>>
# Load the conda shell functions; fall back to conda.sh or a plain PATH entry
# when the hook cannot be generated.
__conda_setup="$('/usr/local/miniconda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/usr/local/miniconda/etc/profile.d/conda.sh" ]; then
        . "/usr/local/miniconda/etc/profile.d/conda.sh"
    else
        export PATH="/usr/local/miniconda/bin:$PATH"
    fi
fi
unset __conda_setup
# <<< conda initialize <<<
conda activate base

Several of the packages installed below are not required for training. Feel free to remove them from the list.

In [None]:
# Load the conda shell hook, activate the base environment, then install the
# Python packages into it with pip. The steps are chained with && so that a
# failing step stops the rest.
!__conda_setup="$('/usr/local/miniconda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" && \
    eval "$__conda_setup" && \
    unset __conda_setup && \
    conda activate base && \
    conda install -y pip && \
    pip3 install \
        jupyterlab mlflow kaggle slackweb optuna \
        numpy pandas scipy scikit-learn matplotlib seaborn \
        psutil logzero japanize_matplotlib pylint yapf pyarrow \
        pytorch-tabnet google-cloud-bigquery cython ipywidgets numba \
        transformers datasets accelerate pytorch_lightning && \
    pip3 install -U scikit-learn sentencepiece

In [None]:
script="""def run():
    import os
    import math
    import random
    import time

    import numpy as np
    import pandas as pd

    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset
    from torch.utils.data import DataLoader

    from transformers import AdamW
    from transformers import AutoTokenizer
    from transformers import AutoModel
    from transformers import AutoConfig
    from transformers import get_cosine_schedule_with_warmup
    #from adabelief_pytorch import AdaBelief

    from sklearn.model_selection import KFold

    import gc
    gc.enable()

    SEED = 9000#int(time.time()) #random
    #SEED = 11111 #manual
    print("SEED=",SEED) 
    NUM_GPUS=1
    NUM_FOLDS = 15
    NUM_EPOCHS = 4
    BATCH_SIZE = 8
    MAX_LEN = 250
    LR_MIDDLE = 5e-5
    LR_INPUT = 1e-4
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    WEIGHT_DECAY=0.01#0.00075
    LR_RATE=250
    LR_HEAD = 1e-5
    LR_MIDDLE = 5e-5
    LR_INPUT = 1e-4

    # ROBERTA_PATH = "input/deberta-xlarge"
    # TOKENIZER_PATH = "input/deberta-xlarge"
    # ROBERTA_PATH = "../input/debertalarge"
    # TOKENIZER_PATH = "../input/debertalarge"
    INPUT="../input/"
    OUTPUT="./"
    ROBERTA_PATH = f"{INPUT}/robertalarge"
    TOKENIZER_PATH = f"{INPUT}/robertalarge"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    train_df = pd.read_csv(f"{INPUT}/commonlitreadabilityprize/train.csv")

    # Remove incomplete entries if any.
    train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
                  inplace=True)
    train_df.reset_index(drop=True, inplace=True)

    test_df = pd.read_csv(f"{INPUT}/commonlitreadabilityprize/test.csv")
    submission_df = pd.read_csv(f"{INPUT}/commonlitreadabilityprize/sample_submission.csv")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)


    class LitDataset(Dataset):
        '''Excerpt dataset that tokenizes every text once, at construction time.

        Each item is (input_ids, attention_mask) when inference_only is set and
        (input_ids, attention_mask, target) otherwise.
        '''

        def __init__(self, df, inference_only=False):
            super().__init__()

            self.df = df
            self.inference_only = inference_only
            self.text = df.excerpt.tolist()

            # Targets are only read for labelled data.
            if not inference_only:
                self.target = torch.tensor(df.target.values, dtype=torch.float32)

            # Every excerpt is padded or truncated to exactly MAX_LEN tokens.
            self.encoded = tokenizer.batch_encode_plus(
                self.text,
                padding="max_length",
                max_length=MAX_LEN,
                truncation=True,
                return_attention_mask=True,
            )

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            ids = torch.tensor(self.encoded["input_ids"][index])
            mask = torch.tensor(self.encoded["attention_mask"][index])

            if self.inference_only:
                return (ids, mask)
            return (ids, mask, self.target[index])

    import pytorch_lightning as pl

    # Median of the training targets. The model is trained on targets shifted
    # by this value, and the value is added back to its outputs afterwards.
    TARGET_CENTER = train_df.target.median()
    print(TARGET_CENTER)

    class LitModel(pl.LightningModule):
        '''Pretrained encoder with an attention-pooling head and a linear regressor.

        The network outputs one logit per excerpt. Training minimises binary
        cross entropy between that logit and sigmoid(target - TARGET_CENTER), so
        the logit itself estimates target - TARGET_CENTER. Validation and
        prediction add TARGET_CENTER back to obtain the final score.
        '''

        def __init__(self):
            super().__init__()

            config = AutoConfig.from_pretrained(ROBERTA_PATH)
            config.update({"output_hidden_states":True,
                           "hidden_dropout_prob": 0.0,
                           "layer_norm_eps": 1e-7})

            self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

            # Scores every token position. Softmax(dim=1) normalises the scores
            # over the sequence axis so they can be used as pooling weights.
            # The layer sizes assume an encoder with 1024 hidden features.
            self.attention = nn.Sequential(
                nn.Linear(1024, 512),
                nn.Tanh(),
                nn.Linear(512, 1),
                nn.Softmax(dim=1)
            )

            self.regressor = nn.Sequential(
                nn.Linear(1024, 1)
            )

        def forward(self, input_ids, attention_mask):
            '''Return one logit per excerpt, shaped batch x 1.'''
            roberta_output = self.roberta(input_ids=input_ids,
                                          attention_mask=attention_mask)

            # hidden_states is available because output_hidden_states was set in
            # the config. Only its last entry is used.
            hid = roberta_output.hidden_states[-1]

            # hid is batch x MAX_LEN x 1024; weights is batch x MAX_LEN x 1.
            # NOTE(review): attention_mask is not applied to the pooling, so
            # padded positions also receive weight.
            weights = self.attention(hid)

            # Weighted sum over the sequence axis gives a batch x 1024 context vector.
            context_vector = torch.sum(weights * hid, dim=1)

            # Reduce the context vector to the prediction logit.
            return self.regressor(context_vector)

        def configure_optimizers(self):
            '''Build AdamW with per-tensor learning rates and a cosine schedule.'''
            named_parameters = list(self.named_parameters())
            total = len(named_parameters)

            # The slices rely on registration order: encoder tensors first, then
            # the 4 attention tensors, then the 2 regressor tensors.
            # The 2 tensors at [total-8:total-6] are not given to the optimizer.
            # Presumably they belong to the encoder pooler, which forward() never
            # uses; TODO confirm.
            roberta_parameters = named_parameters[:total-8]
            attention_parameters = named_parameters[total-6:total-2]
            regressor_parameters = named_parameters[total-2:]

            attention_group = [params for (name, params) in attention_parameters]
            regressor_group = [params for (name, params) in regressor_parameters]

            # The head groups set no lr of their own, so they use the optimizer default.
            parameters = [{"params": attention_group},
                          {"params": regressor_group}]

            for layer_num, (name, params) in enumerate(roberta_parameters):
                weight_decay = 0.0 if "bias" in name else WEIGHT_DECAY

                # The learning rate depends on the position of the tensor in the
                # parameter list: first third, middle third or last third.
                lr = LR_HEAD*LR_RATE
                if layer_num >= total/3:
                    lr = LR_MIDDLE*LR_RATE
                if layer_num >= total*2/3:
                    lr = LR_INPUT*LR_RATE

                parameters.append({"params": params,
                                   "weight_decay": weight_decay,
                                   "lr": lr})

            optimizer = AdamW(parameters)
            # train_loader is the loader of the fold being trained; it is read
            # from the enclosing run() scope when the trainer calls this method.
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_training_steps=NUM_EPOCHS * len(train_loader),
                num_warmup_steps=50)
            return {
                "optimizer":optimizer,
                "lr_scheduler":{
                    'scheduler': scheduler,
                    'interval': 'step', # or 'epoch'
                    'frequency': 1
                }}

        def training_step(self,train_batch,batch_idx):
            '''Binary cross entropy between the logit and the squashed, centred target.'''
            input_ids,attention_mask,target=train_batch
            pred = self.forward(input_ids, attention_mask)
            soft_target = torch.sigmoid(target-TARGET_CENTER)
            loss = nn.BCEWithLogitsLoss(reduction="mean")(pred.flatten(), soft_target)
            return {"loss":loss}

        def validation_step(self,val_batch,batch_idx):
            '''Return the raw logits and the targets of one batch, both on the CPU.'''
            input_ids,attention_mask,target=val_batch
            preds = self.forward(input_ids, attention_mask).flatten()
            targets=target
            return preds.to("cpu").detach(),targets.to("cpu").detach()

        def validation_epoch_end(self,outputs):
            '''Log the rmse over all validation batches, after adding TARGET_CENTER back.'''
            preds=[]
            targets=[]
            for pred,target in outputs:
                preds.append(pred)
                targets.append(target)
            preds=torch.cat(preds)
            targets=torch.cat(targets)
            mse = nn.MSELoss(reduction="mean")(preds+TARGET_CENTER, targets)
            rmse=torch.sqrt(mse)
            self.log("rmse",rmse,prog_bar=True)
            return {"rmse":rmse}

        def predict_step(self,val_batch,batch_idx,dataloader_idx=None):
            '''Return the predicted scores of one batch.'''
            # Only the first two items are needed. Slicing accepts labelled
            # batches (3 items) as well as inference_only batches (2 items),
            # which a three-way unpacking would reject.
            input_ids,attention_mask=val_batch[:2]
            pred = self.forward(input_ids, attention_mask)
            return pred.flatten()+TARGET_CENTER

    from pytorch_lightning.callbacks import ModelCheckpoint
    from pytorch_lightning.callbacks import Callback
    class ChangeValPeriodCallback(Callback):
        '''Adjusts how often validation runs, based on the latest logged rmse.'''

        def on_validation_end(self,trainer,mod):
            rmse = trainer.logged_metrics["rmse"]
            # EVAL_SCHEDULE pairs an rmse threshold with a validation interval.
            # The first pair whose threshold is below the current rmse is used;
            # if none qualifies, the interval of the last pair is used.
            for threshold, interval in EVAL_SCHEDULE:
                if rmse > threshold:
                    break
            trainer.val_check_batch = interval

    # Remove outliers
    drop_list=['590467bd6','c2013b87b','cd19e2350','0684bb254','d2556a097']
    is_ok=((train_df.target != 0) | (train_df.standard_error != 0)) & (~train_df["id"].isin(drop_list))

    kf=KFold(NUM_FOLDS,random_state=SEED,shuffle=True)
    val_scores=[]
    #is_ok.iloc[100:]=False #for debug
    # Folds are processed from the last one to the first one.
    for fold,(train_indices,valid_indices) in reversed(list(enumerate(kf.split(train_df)))):
        ckpt_path = f"{OUTPUT}/model{fold}.ckpt"
        score_path = f"{OUTPUT}/fold_{fold}.csv"
        # The score file is written only after a fold has been trained and
        # validated, so it marks a finished fold. The checkpoint alone does not:
        # it is saved during training, so it also exists for interrupted folds.
        if os.path.exists(score_path) and os.path.exists(ckpt_path):
            continue
        if os.path.exists(ckpt_path):
            # Checkpoint without a score file: the fold did not finish. Move the
            # stale file aside so that the retrained fold can use the name again.
            # NOTE(review): confirm how ModelCheckpoint treats a pre-existing file.
            print("incomplete checkpoint found, retraining fold", fold)
            os.replace(ckpt_path, ckpt_path + ".incomplete")
        print(fold)
        # Rows of this fold, minus the outliers and incomplete entries.
        train_flags=train_df.index.isin(train_indices)
        valid_flags=train_df.index.isin(valid_indices)
        train_dataset = LitDataset(train_df[train_flags&is_ok], inference_only=False)
        valid_dataset=LitDataset(train_df[valid_flags&is_ok],inference_only=False)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                 drop_last=True, shuffle=True, num_workers=0)
        val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE,
                                 drop_last=False, shuffle=False, num_workers=0)
        pl.seed_everything(SEED)
        # Keep only the single best checkpoint of the fold, judged by the logged rmse.
        checkpoint_callback = ModelCheckpoint(monitor='rmse',save_top_k=1,save_weights_only=True,dirpath=f"{OUTPUT}",filename=f"model{fold}")
        vpcallback=ChangeValPeriodCallback()
        trainer=pl.Trainer(
            precision=16,
            gpus=NUM_GPUS,
            deterministic=True,
            val_check_interval=16,
            callbacks=[checkpoint_callback,vpcallback],
            max_epochs=NUM_EPOCHS,
            min_epochs=NUM_EPOCHS,
            gradient_clip_val=0.1
        )
        print("model creation")
        model=LitModel()
        print("fit")
        trainer.fit(model,train_loader,val_loader)
        del model
        # Score the fold with its best checkpoint and persist the result.
        vals=trainer.validate(ckpt_path="best",val_dataloaders=val_loader)
        val_scores.append(vals)
        pd.DataFrame({"val_score":[vals]}).to_csv(score_path,index=False)
        # Drop the references of this fold and release cached GPU memory before
        # the next fold builds a new model.
        del trainer,train_loader,val_loader,train_dataset,valid_dataset
        torch.cuda.empty_cache()
        pl.utilities.memory.garbage_collection_cuda()
        gc.collect()
    # Report the scores of the folds trained in this run. Folds that were
    # skipped because they had already finished are not included.
    print("val_scores=",val_scores)
run()"""
with open("run.py","w") as f:
    f.write(script)

In [None]:
# Load the conda shell hook, activate the base environment and run the
# training script with that environment's Python.
!__conda_setup="$('/usr/local/miniconda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" && \
    eval "$__conda_setup" && \
    unset __conda_setup && \
    conda activate base && \
    python run.py