# Notes

This kernel is only for inference; the training kernel is on its way.  
The experimental model is **RoBERTa**, but since we're using **Hugging Face**'s **AutoModel** interface, you can easily swap in whichever model you want.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
from torch import nn, optim
from  torch.utils.data import Dataset, DataLoader, Sampler
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from torch.cuda.amp import autocast, GradScaler
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor
import os, random, gc
import re, time, json, pickle
from catboost import CatBoostRegressor,CatBoostClassifier
import spacy
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor,BayesianRidge
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVR,SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from tqdm.notebook import tqdm

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
from torch import nn
from  torch.utils.data import Dataset, DataLoader
import math
import pickle
import multiprocessing
import more_itertools

from tqdm.notebook import tqdm
from transformers import AdamW,get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification, AutoModelForSequenceClassification,AutoModel

In [None]:
def create_folds(data, num_splits, seed):
    """Assign every row of ``data`` to one of ``num_splits`` shuffled K-fold splits.

    Args:
        data: DataFrame to annotate. A ``fold`` column is added (or
            overwritten) in place.
        num_splits: Number of folds, forwarded to ``KFold(n_splits=...)``.
        seed: ``random_state`` used by ``KFold`` for shuffling.

    Returns:
        The same ``data`` object, whose ``fold`` column holds the id of the
        fold in which the row is part of the validation split.
    """
    kf = KFold(n_splits=num_splits, random_state=seed, shuffle=True)

    # KFold yields *positional* indices. Collect them in a positional array
    # instead of using them as ``.loc`` labels, which would mis-assign rows
    # whenever the frame's index is not the default 0..n-1 range.
    fold_ids = np.full(len(data), -1, dtype=np.int64)
    for fold, (_, val_positions) in enumerate(kf.split(np.arange(len(data)))):
        fold_ids[val_positions] = fold

    data["fold"] = fold_ids
    return data


def create_folds2(data, num_splits, seed):
    """Return a shuffled copy of ``data`` with a target-stratified ``fold`` column.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule, floored) and ``StratifiedKFold`` is run on the bin
    labels, so every fold sees a similar target distribution.

    Args:
        data: DataFrame with a ``target`` column. It is not modified.
        num_splits: Number of folds, forwarded to ``StratifiedKFold``.
        seed: Seed used both for the row shuffle and for ``StratifiedKFold``.

    Returns:
        A new DataFrame with shuffled rows, a fresh 0..n-1 index and a
        ``fold`` column holding the validation-fold id of each row.
    """
    # Shuffle with the supplied seed so the result depends only on the
    # arguments rather than on whatever state the global NumPy RNG is in.
    shuffled = data.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Sturges' rule for the number of bins (floored).
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    bins = pd.cut(shuffled["target"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # Stratify on the bin labels, not on the raw targets. The split indices
    # are positional, so they are written into a positional array.
    fold_ids = np.full(len(shuffled), -1, dtype=np.int64)
    for fold, (_, val_positions) in enumerate(kf.split(X=shuffled, y=bins.values)):
        fold_ids[val_positions] = fold

    shuffled["fold"] = fold_ids
    return shuffled

In [None]:
# ---- Global configuration -------------------------------------------------
CV = 5                      # number of cross-validation folds
MODEL_ROOT = Path(".")
MODE = 'my'                 # the inference cell runs when this is 'all' or 'my'

# Single definition of the compute device. It is kept as a plain string,
# which is the value the rest of the notebook ended up using before.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TRAIN_NUM_WORKERS = 2
VAL_NUM_WORKERS = 2
print("Device:", DEVICE)
NUM_EPOCHS = 4
BATCH_SIZE = 6
MAX_LEN = 256               # token length every excerpt is padded/truncated to
# (val_rmse threshold, steps between evaluations) pairs, read by train().
EVAL_SCHEDULE = [(0.52, 256),(0.50, 16), (0.48, 8), (0.46, 4), (0.44, 2), (-1., 1)]
# NOTE: despite the name, this path currently points at a DeBERTa-large checkpoint.
ROBERTA_PATH = "../input/huggingface-deberta-variants/deberta-large/deberta-large"#480
TOKENIZER_PATH = "../input/huggingface-deberta-variants/deberta-large/deberta-large"

TOKENIZER = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [None]:
def set_random_seed(random_seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Only affects hash randomisation of Python processes started afterwards.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different (possibly non-deterministic) kernel
    # on every run, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [None]:
# ---- Load the competition data and the auxiliary wiki corpus --------------
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")

# Rows whose target and standard_error are both exactly 0 are treated as
# incomplete entries and removed; the index is renumbered afterwards.
train_df = train_df[
    ~((train_df.target == 0) & (train_df.standard_error == 0))
].reset_index(drop=True)

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

# Only the first 10000 wiki rows are used; they get a constant standard_error
# so the frame offers the same columns LitDataset reads from train_df.
wiki = (
    pd.read_csv("../input/commonlit/wiki2.csv")
    .iloc[:10000]
    .assign(standard_error=1)
)

# Notebook display of the cleaned training frame.
train_df

In [None]:
class LitDataset(Dataset):
    """Tokenised excerpts, optionally paired with their regression targets.

    All excerpts are tokenised once at construction time, padded/truncated to
    ``MAX_LEN`` tokens. Items are ``(input_ids, attention_mask)`` when
    ``inference_only`` is true and ``(input_ids, attention_mask, target)``
    otherwise.

    Args:
        df: DataFrame with an ``excerpt`` column and, unless
            ``inference_only`` is set, ``target`` and ``standard_error``
            columns.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        inference_only: When true, no targets are read or returned.
    """

    def __init__(self, df, tokenizer = TOKENIZER, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)
            # Kept for optional target-noise experiments; __getitem__ does
            # not read it.
            self.std = torch.tensor(df.standard_error.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding="max_length",
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Tensors are built per item from the pre-computed encodings.
        sample = tuple(
            torch.tensor(self.encoded[key][index])
            for key in ("input_ids", "attention_mask")
        )
        if self.inference_only:
            return sample
        return sample + (self.target[index],)


In [None]:
class LitModel(nn.Module):
    """Pretrained transformer encoder with an attention-pooling regression head.

    The encoder is loaded from ``ROBERTA_PATH`` through ``AutoModel``. The
    last hidden layer is pooled with learned attention weights and the pooled
    vector is mapped to a single score by a linear regressor.

    The head widths are taken from ``config.hidden_size`` so any ``AutoModel``
    backbone can be used without editing the layer sizes.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Width of the encoder's hidden states (e.g. 768 for base-sized and
        # 1024 for large-sized checkpoints).
        hidden_size = config.hidden_size

        # Produces one weight per position; Softmax(dim=1) normalises the
        # weights along dimension 1 of the hidden states.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def freeze(self):
        """Stop gradient updates for every encoder parameter."""
        for param in self.roberta.parameters():
            param.requires_grad = False

    def unfreeze(self):
        """Re-enable gradient updates for every encoder parameter."""
        for param in self.roberta.parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return ``(pooled_vector, prediction)``.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Padding mask, forwarded to the encoder.

        Returns:
            A tuple of the attention-pooled context vector and the regressor
            output computed from it.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # NOTE: attention_mask is only passed to the encoder; the pooling
        # softmax below runs over every position, padded ones included.
        weights = self.attention(last_layer_hidden_states)
        y = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Reduce the context vector to the prediction score.
        return (y, self.regressor(y))

In [None]:
def eval_mse(model, data_loader):
    """Return the mean squared error of ``model`` over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Squared errors are summed batch by batch and divided by the number of
    samples in ``data_loader.dataset``.
    """
    model.eval()
    squared_error = nn.MSELoss(reduction="sum")
    mse_sum = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            # The model returns (pooled_vector, prediction); only the
            # prediction is scored.
            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))[1]
            mse_sum += squared_error(pred.flatten(), target.to(DEVICE)).item()

    return mse_sum / len(data_loader.dataset)

In [None]:
def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect both of its outputs.

    Returns:
        A tuple ``(pooled, scores)``: ``pooled`` is the concatenation of the
        first model output of every batch, ``scores`` is a 1-D float array
        with one flattened second-output value per dataset sample.
    """
    model.eval()

    scores = np.zeros(len(data_loader.dataset))
    pooled_batches = []
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            outputs = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            pooled_batches.append(outputs[0].cpu().numpy())

            # Write this batch's scores into its slice of the result array.
            batch_len = outputs[1].shape[0]
            scores[offset : offset + batch_len] = outputs[1].flatten().to("cpu")
            offset += batch_len

    return np.concatenate(pooled_batches), scores

In [None]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS):
    """Train ``model`` with MSE loss and checkpoint the best validation RMSE.

    Validation runs every ``eval_period`` optimisation steps. The period
    starts at ``EVAL_SCHEDULE[0][1]``; after the first epoch it is re-selected
    after each evaluation as the period of the first ``EVAL_SCHEDULE`` entry
    whose threshold is <= the current validation RMSE. Whenever the RMSE
    improves, the model's ``state_dict`` is saved to ``model_path``.

    Args:
        model: Module returning ``(pooled_vector, prediction)``.
        model_path: Destination of the best checkpoint.
        train_loader: Yields ``(input_ids, attention_mask, target)`` batches.
        val_loader: Loader passed to ``eval_mse``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler stepped once per batch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The best validation RMSE, or ``None`` if no evaluation ever ran.
    """
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]
    criterion = nn.MSELoss(reduction="mean")
    start = time.time()

    for epoch in range(num_epochs):
        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() leaves the model in eval mode, so train mode is
            # re-enabled before every step.
            model.train()
            pred = model(input_ids, attention_mask)[1]
            mse = criterion(pred.flatten(), target)

            mse.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                val_rmse = math.sqrt(eval_mse(model, val_loader))

                # Read the LR from the optimizer: this also works when no
                # scheduler is given and avoids the deprecated get_lr().
                current_lr = optimizer.param_groups[0]["lr"]
                print(f"Epoch: {epoch} batch_num: {batch_num}",
                      f"val_rmse: {val_rmse:0.4}", f"lr: {current_lr}")

                if epoch > 0:
                    # Evaluate more often as the validation score improves.
                    for rmse, period in EVAL_SCHEDULE:
                        if val_rmse >= rmse:
                            eval_period = period
                            break

                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                    if epoch >= 20:
                        print('Model Frozen -> Train Classifier Only')
                        model.freeze()
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Restart the timer so evaluation time is not counted.
                start = time.time()

            step += 1

    return best_val_rmse

In [None]:
def create_optimizer(model):
    """Build an ``AdamW`` optimizer with per-parameter learning rates.

    Parameters are grouped by name:

    * ``attention.*`` and ``regressor.*`` (the heads) form the first two
      groups and use the optimizer's default learning rate (5e-4).
    * Every remaining (encoder) parameter gets its own group. Its learning
      rate grows with its position in ``model.named_parameters()`` and
      weight decay is 0.01, or 0.0 for parameters whose name contains
      ``"bias"``.

    Selecting the heads by name rather than by counting from the end of the
    parameter list keeps the grouping right however many parameters each
    head owns.

    Args:
        model: Module exposing ``attention`` and ``regressor`` sub-modules
            next to the encoder.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    base_lr = 1e-4
    named_parameters = list(model.named_parameters())
    print(len(named_parameters))

    attention_group = []
    regressor_group = []
    encoder_parameters = []
    for name, params in named_parameters:
        if name.startswith("attention."):
            attention_group.append(params)
        elif name.startswith("regressor."):
            regressor_group.append(params)
        else:
            encoder_parameters.append((name, params))

    parameters = [{"params": attention_group},
                  {"params": regressor_group}]

    # ``layer_num`` is the index into the flat parameter list, not a
    # transformer layer number; later parameters train faster.
    for layer_num, (name, params) in enumerate(encoder_parameters):
        weight_decay = 0.0 if "bias" in name else 0.01

        if layer_num >= 300:
            lr = base_lr / 2
        elif layer_num >= 200:
            lr = base_lr / 5
        elif layer_num >= 100:
            lr = base_lr / 10
        else:
            lr = base_lr / 20

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters, lr=5e-4)

In [None]:
# ---- Pre-training: fit on the wiki corpus, validate on the full train set --
if len(test_df) > -7:  # always true; kept as the cell's on/off guard
    gc.collect()
    set_random_seed(0)
    list_val_rmse = []

    print("model_pretrain")
    model_path = "model_pretrain.pt"

    train_dataset = LitDataset(wiki, tokenizer=TOKENIZER)
    val_dataset = LitDataset(train_df, tokenizer=TOKENIZER)

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        num_workers=8,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE * 6,
        shuffle=False,
        drop_last=False,
        num_workers=8,
    )

    model = LitModel().to(DEVICE)
    optimizer = create_optimizer(model)

    # Single epoch: warm up over the first third of the steps, then follow a
    # cosine decay over the remainder.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=len(train_loader) // 3,
        num_training_steps=len(train_loader),
    )

    train(
        model,
        model_path,
        train_loader,
        val_loader,
        optimizer,
        scheduler=scheduler,
        num_epochs=1,
    )

    # Drop the model before the next cell builds a new one.
    del model
    gc.collect()

In [None]:
# Evaluation schedule for the runs that follow: (val_rmse threshold, steps
# between evaluations) pairs. train() looks EVAL_SCHEDULE up when it is
# called, so this assignment replaces the earlier schedule for later calls.
EVAL_SCHEDULE = [(0.51, 32),(0.48, 16), (0.45, 8), (0.43, 4), (0.4, 2), (-1., 1)]

In [None]:
# ---- Fine-tuning: one model per (seed, selected fold) ----------------------
if len(test_df) > -7:  # always true; kept as the cell's on/off guard
    gc.collect()
    folds = [0]  # folds to train for each seed; an empty list trains all
    list_val_rmse = []
    for seed in [0, 1, 2, 3, 4]:
        set_random_seed(seed)
        train_df = create_folds2(train_df, num_splits=CV, seed=seed)

        # Maps each fold id to the list of row positions in that fold.
        fold_bar = tqdm(
            train_df.reset_index(drop=True)
            .reset_index()
            .groupby("fold")["index"]
            .apply(list)
            .items(),
            total=train_df.fold.max() + 1,
        )

        for fold, val_set in fold_bar:
            if folds and fold not in folds:
                continue
            print(f"\n_seed{seed} Fold {fold + 1}/{len(folds)}")
            model_path = f"model_{fold + 1}_seed{seed}.pth"

            train_set = np.setdiff1d(train_df.index, val_set)

            train_dataset = LitDataset(train_df.iloc[train_set], tokenizer=TOKENIZER)
            val_dataset = LitDataset(train_df.iloc[val_set], tokenizer=TOKENIZER)

            train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                      drop_last=True, shuffle=True, num_workers=2)
            val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 6,
                                    drop_last=False, shuffle=False, num_workers=2)

            model = LitModel().to(DEVICE)

            # Start from the pre-trained weights. map_location makes the load
            # independent of the device the checkpoint was saved from.
            model.load_state_dict(
                torch.load("./model_pretrain.pt", map_location=DEVICE)
            )

            optimizer = create_optimizer(model)

            # Warm up for half an epoch, then cosine-decay over all epochs.
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_training_steps=NUM_EPOCHS * len(train_loader),
                num_warmup_steps=len(train_loader) // 2)

            list_val_rmse.append(train(model, model_path, train_loader,
                                       val_loader, optimizer, scheduler=scheduler))

            del model
            gc.collect()

            print("\nPerformance estimates:")
            print(list_val_rmse)
            print("Mean:", np.array(list_val_rmse).mean())

In [None]:
# ---- Inference: average the test predictions of every saved checkpoint -----
preds9 = 0  # fallback value when the inference branch below is skipped
if MODE in ('all', "my") and len(test_df) > -7:
    #checkpoint_paths = list(Path(f"../input/vlomme-deberta-large").glob("*.pth"))
    checkpoint_paths = list(Path(f"./").glob("*.pth"))
    print(checkpoint_paths)
    preds9 = []

    test_dataset = LitDataset(test_df, tokenizer=TOKENIZER, inference_only=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 3,
                             drop_last=False, shuffle=False, num_workers=2)

    for model_path in checkpoint_paths:
        print(f"\nUsing {model_path}")

        model = LitModel()
        # map_location lets checkpoints saved from a GPU load on any device
        # instead of being restored onto the device they were saved from.
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)

        # predict() returns (pooled_vectors, scores); only scores are used.
        _, preds = predict(model, test_loader)
        preds9.append(preds)

        del model
        gc.collect()

    # Stack the per-checkpoint scores and average them across checkpoints.
    preds9 = np.array(preds9)
    preds9 = np.mean(preds9, axis=0)
print(preds9)

In [None]:
# Pair every test id with its averaged prediction and write the submission.
submission = test_df[["id"]].assign(target=preds9)
submission.to_csv("submission.csv", index=False)