## A simple average of my two previous kernels

1. RoBERTa-Base -> [CommonLit Readability Prize - RoBERTa Torch|Infer](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer)

2. RoBERTa-Large -> [CommonLit Readability Prize-RoBERTa Torch|Infer 2](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-2)

### P.S. What's Coming Soon -   
1. [CommonLit Readability Prize - RoBERTa Torch|FIT](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-fit) fixed version.  
2. **CommonLit Readability Prize - RoBERTa Torch|FIT 2** Dynamic Padding + Sequence Bucketing, Stochastic Weight Averaging, MIXOUT, Utilizing Different Transformer Layers (LSTM, Attention) **all in one kernel maybe**
3. **Multi-Task Learning**  

### Please upvote before forking ;)

## SVM

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Per-fold blending weights for the RoBERTa-large and RoBERTa-base predictions.
RoB_L_W = [.2]*5
RoB_B_W = [.2]*5
# Per-fold blending weights for the SVM predictions.
SVM_W = [.2]*5

# Model-level weights, keyed by model family.
weight_M_dict = {'RoBL':.3, 'RoBB':.3, 'SVM':.4}

# Every weight set must sum to one. The comparison uses a tolerance because an
# exact `== 1` test can reject valid weights through floating-point rounding.
assert np.isclose(np.sum(SVM_W), 1), 'SVM_W must sum to 1'
assert np.isclose(np.sum(RoB_L_W), 1), 'RoB_L_W must sum to 1'
assert np.isclose(np.sum(RoB_B_W), 1), 'RoB_B_W must sum to 1'
assert np.isclose(sum(weight_M_dict.values()), 1), 'weight_M_dict must sum to 1'

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool, CatBoost

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours; `sr_` resets all styling.
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_, c_ = Fore.BLUE, Fore.MAGENTA, Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Competition inputs: training excerpts, test excerpts and the sample submission.
train_data, test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

# Regression targets as a NumPy array.
target = train_data['target'].to_numpy()

# Discretise the continuous target into equal-width bins so it can be used as
# the stratification label for StratifiedKFold. Bin count: floor(1 + log2(n)).
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data['bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Settings shared by the embedding-extraction code.
config = {
    'batch_size':128,
    'max_len':256,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN to be deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original wrote 'PYTHONASSEED', which
    # nothing reads). It is consulted at interpreter start-up, so setting it
    # here only influences child processes.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run, which defeats the
    # deterministic flag above, so benchmarking is switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per index.

    This is a data container, not a network layer, so it derives from
    `torch.utils.data.Dataset` (the original subclassed `nn.Module` without
    ever calling its initialiser).

    Args:
        df: DataFrame with an 'excerpt' column.
        tokenizer: Callable invoked as
            `tokenizer(text, return_tensors='pt', max_length=max_len,
            padding='max_length', truncation=True)`.
        max_len: Length every excerpt is padded or truncated to.
    """
    def __init__(self,df,tokenizer,max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the excerpt at position `idx`."""
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt of `df` with the transformer stored at `path`.

    Args:
        df: DataFrame with an 'excerpt' column (consumed by `CLRPDataset`).
        path: Location handed to `AutoModel.from_pretrained` and
            `AutoTokenizer.from_pretrained`.
        plot_losses: Unused; kept so existing calls keep working.
        verbose: Unused; kept so existing calls keep working.

    Returns:
        NumPy array with one row per excerpt, taken from position 0 of the
        model's first output (`outputs[0][:, 0]`).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = []
    with torch.no_grad():
        # Wrap the loader itself rather than enumerate(dl): tqdm can then read
        # len(dl) and show a total; the batch index was never used.
        for inputs in tqdm(dl):
            # Flatten everything after the batch axis (presumably removing the
            # extra singleton axis added by return_tensors='pt' - TODO confirm).
            inputs = {key: val.reshape(val.shape[0], -1).to(device)
                      for key, val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:, 0].detach().cpu().numpy())
    return np.array(embeddings)

In [None]:
# Install frozendict and neural_tangents from wheel files stored under ../input.
!pip3 install ../input/frozendict/frozendict-2.0.2-py3-none-any.whl
!pip3 install ../input/neuraltangent/neural_tangents-0.3.6-py2.py3-none-any.whl

In [None]:
from jax import random
from neural_tangents import stax
import neural_tangents as nt

def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Cross-validated kernel regression on precomputed embeddings.

    Despite the name, no SVM is fitted: a `neural_tangents` kernel is built
    from a small residual `stax` network and predictions are requested with
    `get='nngp'`.

    Args:
        X: Training features, indexable by integer index arrays.
        y: Training targets, one value per row of `X`.
        X_test: Test features.
        bins: Stratification labels passed to `StratifiedKFold.split`.
        nfolds: Number of cross-validation folds.
        C: Unused; kept so existing calls keep working.
        kernel: Unused; kept so existing calls keep working.

    Returns:
        Array of shape (n_test, 1) with the test predictions averaged over the
        folds. It is deliberately two-dimensional because callers reduce it
        with `.mean(axis=1)`.
    """
    # The architecture does not depend on the fold, so it is built once.
    res_block = stax.serial(
        stax.FanOut(2),
        stax.parallel(
            stax.serial(
                stax.Erf(),
                stax.Dense(1, W_std=1.25, b_std=0.0),
                stax.Erf(),
                stax.Dense(1, W_std=1.25, b_std=0.0),
                stax.Erf(),
                stax.Dense(1, W_std=1.25, b_std=0.0),
            ),
            stax.Identity(),
        ),
        stax.FanInSum()
    )
    # Only the kernel function is needed; the original also initialised
    # parameters through init_fn but never used them.
    _, _, kernel_fn = stax.serial(
        stax.Dense(1, W_std=1.0, b_std=0),
        res_block, res_block, stax.Erf(),
        stax.Dense(1, W_std=2.5, b_std=0.1)
    )

    kfold = StratifiedKFold(n_splits=nfolds)
    scores = []
    n_test = X_test.shape[0]
    # Explicit column accumulator. The original added a column of predictions
    # to a 1-D zeros array, which only worked through accidental broadcasting.
    preds = np.zeros((n_test, 1))
    for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        predict_fn = nt.predict.gradient_descent_mse_ensemble(kernel_fn,
                                                              X_train,
                                                              y_train[:,np.newaxis],
                                                              diag_reg=1e-1,
                                                              lr=1)
        valid_pred = np.asarray(predict_fn(x_test=X_valid, get='nngp', t=None))
        score = rmse_score(y_valid, valid_pred.reshape(-1))
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)

        test_pred = np.asarray(predict_fn(x_test=X_test, get='nngp', t=None))
        preds += test_pred.reshape(n_test, 1)

    print("mean rmse",np.mean(scores))
    return preds/nfolds

In [None]:
def _svm_preds_for(model_dir):
    """Embed train and test excerpts with the model in `model_dir`, then return
    the test predictions of `get_preds_svm` averaged over axis 1."""
    train_embeddings = get_embeddings(train_data, model_dir)
    test_embeddings = get_embeddings(test_data, model_dir)
    # The embeddings are locals, so they are released when this call returns.
    return get_preds_svm(train_embeddings, target, test_embeddings).mean(axis=1)


# One set of test predictions per checkpoint directory.
svm_p1 = _svm_preds_for('../input/modelf1')
svm_p2 = _svm_preds_for('../input/modelf2')
svm_p3 = _svm_preds_for('../input/modelf3')
svm_p4 = _svm_preds_for('../input/modelf4')
svm_p5 = _svm_preds_for('../input/modelf5')

# The raw frames are no longer needed by this section.
del train_data, test_data

In [None]:
# Weighted sum of the five per-checkpoint predictions, using the SVM_W weights.
svm_preds = svm_p1*SVM_W[0]
for fold_idx, fold_pred in enumerate((svm_p2, svm_p3, svm_p4, svm_p5), start=1):
    svm_preds = svm_preds + fold_pred*SVM_W[fold_idx]

In [None]:
# Single-column frame holding the blended SVM predictions; show the first rows.
svm_pred_df = pd.DataFrame({'svm': svm_preds})
svm_pred_df.head(10)

## RoBERTa large and base

### Load Test Dataset

In [None]:
import pandas as pd
import numpy as np

# Reload the competition train and test tables for the RoBERTa section.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)

### Import Dependencies

In [None]:
%matplotlib inline
from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import gc
gc.enable()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output
from tqdm import tqdm, trange

### Convert Examples `(Excerpt)` to Features

In [None]:
def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    """Tokenize one excerpt and right-pad each feature list with zeros.

    Args:
        data: Excerpt text; newline characters are removed before tokenizing.
        tokenizer: Object exposing `encode_plus`.
        max_len: Maximum token count passed to the tokenizer and the length
            every returned list is padded to.
        is_test: Unused.

    Returns:
        Dict with 'input_ids', 'token_type_ids' and 'attention_mask' lists.
    """
    encoded = tokenizer.encode_plus(
        data.replace('\n', ''),
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )
    # The same zero filler is appended to all three feature lists.
    filler = [0] * (max_len - len(encoded['input_ids']))
    return {
        field: encoded[field] + filler
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

### Dataset Retriever

In [None]:
class DatasetRetriever(Dataset):
    """Dataset that yields tokenized excerpts as tensors, with labels in train mode.

    Args:
        data: DataFrame with an 'excerpt' column and, unless `is_test` is
            true, a 'target' column.
        tokenizer: Tokenizer forwarded to `convert_examples_to_features`.
        max_len: Sequence length forwarded to `convert_examples_to_features`.
        is_test: When true, items carry no 'label' entry.
    """
    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()
        # Labels exist only for training data. The original never set this
        # attribute, so __getitem__ raised AttributeError when is_test=False.
        self.targets = None if is_test else self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the feature tensors (and 'label' in train mode) for row `item`."""
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            name: torch.tensor(features[name], dtype=torch.long)
            for name in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample

### Model

In [None]:
class CommonLitModel(nn.Module):
    """RoBERTa encoder followed by LayerNorm, dropout and a one-unit linear head.

    Args:
        model_name: Location passed to `RobertaModel.from_pretrained`.
        config: Configuration object; `hidden_size` and `initializer_range`
            are read from it.
        multisample_dropout: If true, five Dropout(0.5) modules are averaged;
            otherwise a single Dropout(0.3) is used.
        output_hidden_states: Forwarded to `RobertaModel.from_pretrained`.
    """
    def __init__(
        self,
        model_name,
        config,
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super().__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        drop_prob, n_drops = (0.5, 5) if multisample_dropout else (0.3, 1)
        self.dropouts = nn.ModuleList(
            [nn.Dropout(drop_prob) for _ in range(n_drops)]
        )
        self.regressor = nn.Linear(config.hidden_size, 1)
        for head_module in (self.layer_norm, self.regressor):
            self._init_weights(head_module)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Run the encoder and the regression head.

        Returns:
            `(logits, *encoder_outputs[1:])` when `labels` is None, otherwise
            `(loss, logits, *encoder_outputs[1:])` where `loss` is the square
            root of the MSE between the flattened logits and labels.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The head consumes the encoder's second output (presumably the pooled
        # representation - TODO confirm for the transformers version in use).
        head_input = self.layer_norm(outputs[1])

        # Multi-sample dropout: average the regressor output over every dropout.
        logits = None
        for dropout in self.dropouts:
            sample_logits = self.regressor(dropout(head_input))
            logits = sample_logits if logits is None else logits + sample_logits
        logits = logits / len(self.dropouts)

        if labels is None:
            return (logits,) + outputs[1:]

        logits = logits.view(-1).to(labels.dtype)
        loss = torch.sqrt(torch.nn.MSELoss()(logits, labels.view(-1)))
        return (loss, logits) + outputs[1:]

### Utils

In [None]:
def make_model(model_name='roberta-large', num_labels=1):
    """Create a `CommonLitModel` and its tokenizer from `model_name`.

    Args:
        model_name: Location passed to the tokenizer, config and model loaders.
        num_labels: Value stored under 'num_labels' in the RoBERTa config.

    Returns:
        Tuple `(model, tokenizer)`.
    """
    roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)
    roberta_config = RobertaConfig.from_pretrained(model_name)
    roberta_config.update({'num_labels':num_labels})
    return CommonLitModel(model_name, config=roberta_config), roberta_tokenizer

def make_loader(
    data,
    tokenizer,
    max_len,
    batch_size,
):
    """Build a sequential DataLoader over `data` in test mode.

    Note that the loader is created with `batch_size // 2`, not `batch_size`.

    Args:
        data: Frame handed to `DatasetRetriever`.
        tokenizer: Tokenizer handed to `DatasetRetriever`.
        max_len: Sequence length handed to `DatasetRetriever`.
        batch_size: Nominal batch size; half of it is actually used.

    Returns:
        The DataLoader.
    """
    dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)
    return DataLoader(
        dataset,
        batch_size=batch_size // 2,
        sampler=SequentialSampler(dataset),
        pin_memory=False,
        drop_last=False,
        num_workers=0
    )

### Evaluator

In [None]:
class Evaluator:
    """Runs a model over a data loader and collects its first output as floats.

    Args:
        model: Module called with `input_ids`, `attention_mask` and
            `token_type_ids` keyword arguments.
        scalar: When not None, the forward pass runs inside
            `torch.cuda.amp.autocast()`.
    """
    def __init__(self, model, scalar=None):
        self.model = model
        self.scalar = scalar

    def evaluate(self, data_loader, tokenizer):
        """Predict every batch of `data_loader`.

        Args:
            data_loader: Iterable of dicts holding 'input_ids',
                'attention_mask' and 'token_type_ids' tensors.
            tokenizer: Unused; kept so existing calls keep working.

        Returns:
            Flat list with one float per sample, in loader order.
        """
        preds = []
        self.model.eval()
        # Send inputs to wherever the model lives instead of a hard-coded
        # .cuda(); a model without parameters falls back to CUDA as before.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cuda')
        with torch.no_grad():
            for batch_data in data_loader:
                inputs = {
                    name: batch_data[name].to(device)
                    for name in ('input_ids', 'attention_mask', 'token_type_ids')
                }
                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**inputs)
                else:
                    outputs = self.model(**inputs)

                # reshape(-1) always yields a 1-D array. The former squeeze()
                # turned a single-sample batch into a scalar, so tolist()
                # returned a float and `preds += float` raised TypeError.
                preds += outputs[0].detach().cpu().numpy().reshape(-1).tolist()
        return preds

### Config

In [None]:
 def config(fold, model_name, load_model_path):
     """Build the model, tokenizer and test loader for one fold's checkpoint.

     NOTE: this definition rebinds the module-level name `config`, which
     earlier in the file refers to the settings dict read by `get_embeddings`.

     Args:
         fold: Fold number; the weights are read from
             `{load_model_path}/model{fold}.bin`.
         model_name: Location passed to `make_model`.
         load_model_path: Directory holding the fold checkpoints.

     Returns:
         Tuple `(model, tokenizer, test_loader, scaler)`; `scaler` is None.

     Raises:
         ValueError: If no CUDA device is available.
     """
     torch.manual_seed(2021)
     torch.cuda.manual_seed(2021)
     torch.cuda.manual_seed_all(2021)

     max_len = 250
     batch_size = 8

     model, tokenizer = make_model(
         model_name=model_name,
         num_labels=1
     )
     # Deserialise onto the CPU: the model is still on the CPU here and moves
     # to the GPU below, so this avoids depending on the device the checkpoint
     # was saved from.
     model.load_state_dict(
         torch.load(f'{load_model_path}/model{fold}.bin', map_location='cpu')
     )
     # `test` is the module-level test DataFrame.
     test_loader = make_loader(
         test, tokenizer, max_len=max_len,
         batch_size=batch_size
     )

     if torch.cuda.device_count() >= 1:
         print('Model pushed to {} GPU(s), type {}.'.format(
             torch.cuda.device_count(),
             torch.cuda.get_device_name(0))
         )
         model = model.cuda()
     else:
         raise ValueError('CPU training is not supported')

     # Mixed precision is disabled: no GradScaler is created.
     scaler = None
     return (
         model, tokenizer,
         test_loader, scaler
     )

### Run

In [None]:
def run(fold=0, model_name=None, load_model_path=None):
    """Predict the test set with the checkpoint of one fold.

    Args:
        fold: Fold number of the checkpoint to load.
        model_name: Location of the base model, forwarded to `config`.
        load_model_path: Directory with the fold checkpoints, forwarded to
            `config`.

    Returns:
        The predictions returned by `Evaluator.evaluate`.
    """
    model, tokenizer, test_loader, scaler = config(fold, model_name, load_model_path)

    evaluator = Evaluator(model, scaler)
    preds = evaluator.evaluate(test_loader, tokenizer)

    # The evaluator holds a reference to the model, so it has to be dropped as
    # well; otherwise the weights are still alive when empty_cache() runs and
    # nothing is released. (The elapsed-time list the original built here was
    # never read and has been removed.)
    del evaluator, model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()

    return preds

In [None]:
# Test predictions of both RoBERTa variants, one 'fold{k}' column per fold.
RBWRTbase_pred_df = pd.DataFrame()
RBWRTlarge_pred_df = pd.DataFrame()
for fold in tqdm(range(5)):
    fold_col = f'fold{fold}'
    RBWRTbase_pred_df[fold_col] = run(
        fold,
        model_name='../input/roberta-base/',
        load_model_path='../input/commonlit-roberta-base-i/',
    )
    RBWRTlarge_pred_df[fold_col] = run(
        fold,
        model_name='../input/robertalarge/',
        load_model_path='../input/roberta-large-itptfit/',
    )

In [None]:
# Number of folds to blend: one weight per fold column. `FOLDS` was used below
# without ever being defined, which raised NameError.
FOLDS = len(RoB_L_W)

# Start from zero-valued Series with the right index, then add each fold's
# predictions scaled by its weight.
RoBL_preds = RBWRTlarge_pred_df['fold0']*0
RoBB_preds = RBWRTbase_pred_df['fold0']*0
for i in range(FOLDS):
    RoBL_preds += RBWRTlarge_pred_df[f'fold{i}']*RoB_L_W[i]
    RoBB_preds += RBWRTbase_pred_df[f'fold{i}']*RoB_B_W[i]
    


In [None]:
# Show the first seven weighted RoBERTa-large predictions.
RoBL_preds[:7]

In [None]:
# Show the first seven weighted RoBERTa-base predictions.
RoBB_preds[:7]

In [None]:
# Show the first seven weighted SVM predictions.
svm_preds[:7]

### Make Submission

In [None]:
# Short aliases for the model weights and the three prediction vectors.
W_M, pds1, pds2, pds3 = weight_M_dict, RoBL_preds, RoBB_preds, svm_preds

In [None]:
sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
# Submit the plain average of the fold means of the RoBERTa-base and
# RoBERTa-large predictions.
base_fold_mean = RBWRTbase_pred_df.mean(axis=1)
large_fold_mean = RBWRTlarge_pred_df.mean(axis=1)
sub['target'] = ((base_fold_mean + large_fold_mean)/2).values.tolist()
# sub['target'] = pds1*W_M['RoBL'] + pds2*W_M['RoBB'] + pds3*W_M['SVM']
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was just written to submission.csv.
sub