### reference
- https://www.kaggle.com/chamecall/train-val-split
- https://www.kaggle.com/chamecall/clrp-pretrain
- https://www.kaggle.com/chamecall/clrp-finetune-single-roberta-base?scriptVersionId=68893027
- https://www.kaggle.com/chamecall/clrp-inference

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler, Sampler
from torch.nn.functional import mse_loss
from transformers import AutoModel,AutoTokenizer,get_cosine_schedule_with_warmup, AutoConfig, AdamW


In [None]:
class Config:
    """Notebook-wide constants, read as plain class attributes (never instantiated here)."""

    # --- runtime -----------------------------------------------------------
    seed = 42
    device = 'cuda'

    # --- batching / tokenisation (used when building the DataLoaders) ------
    batch_size = 8
    max_len = 256

    # --- backbone ----------------------------------------------------------
    pretrained_model_path = '../input/clrp-03-01-roberta-large-epoch2-pretrain/clrp_roberta_large'
    output_hidden_states = True

    # --- values not read by the code shown here; presumably carried over
    # --- from the fine-tuning notebook -- TODO confirm before relying on them
    epochs = 3
    lr = 2e-5
    wd = 0.01
    evaluate_interval = 10
    # presumably (score threshold, evaluation interval) pairs -- TODO confirm
    eval_schedule = [
        (float('inf'), 16),
        (0.5, 8),
        (0.49, 4),
        (0.48, 2),
        (0.47, 1),
        (0, 0),
    ]

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer applied to Python's ``random``, NumPy and PyTorch
            (CPU and every CUDA device). It is also exported as the
            ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed)
    # The variable Python actually reads is PYTHONHASHSEED. It only affects
    # interpreters started after this point (e.g. spawned worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernel selection per input shape, which can
    # pick different algorithms between runs; it must be off for determinism.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, up front, with the notebook-wide seed from Config.
seed_everything(seed=Config.seed)

In [None]:
# Training data with fold assignments; later cells read its `fold`, `target`
# and `id` columns and feed it to the dataset, which reads `excerpt`.
kfold_df = pd.read_csv('../input/train-val-split/kfold.csv')

# Competition test set and the submission template whose `target` column is
# overwritten at the end of the notebook.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
from torch.utils.data import Dataset
import torch

def convert_examples_to_features(text, tokenizer, max_len):
    """Encode a single text with ``tokenizer.encode_plus``.

    The tokenizer is asked to truncate to ``max_len`` and to pad with the
    ``'max_length'`` strategy.

    Args:
        text: The raw string to encode.
        tokenizer: Any object exposing ``encode_plus(text, **kwargs)``.
        max_len: Value forwarded as ``max_length``.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns, unmodified.
    """
    encode_options = {
        'max_length': max_len,
        'truncation': True,
        'padding': 'max_length',
    }
    return tokenizer.encode_plus(text, **encode_options)


class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per access.

    Args:
        data: Object exposing an ``excerpt`` column (and ``target`` unless
            ``is_test``) with a ``tolist()`` method, e.g. a pandas DataFrame.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
        max_len: Length forwarded to the tokenizer call.
        is_test: When true, targets are neither read nor returned.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

        # Plain Python lists so item access does not go through pandas.
        self.excerpts = self.data.excerpt.tolist()
        if not is_test:
            self.targets = self.data.target.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        """Return the tensors for row ``item``; ``label`` is added only when not ``is_test``."""
        excerpt = self.excerpts[item]
        label = None if self.is_test else self.targets[item]

        features = convert_examples_to_features(excerpt, self.tokenizer, self.max_len)
        sample = {
            'input_ids': torch.tensor(features['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(features['attention_mask'], dtype=torch.long),
        }
        if not self.is_test:
            sample['label'] = torch.tensor(label, dtype=torch.float)
        return sample

In [None]:
def make_dataloader(data, tokenizer):
    """Build an inference DataLoader over ``data``.

    The dataset is always created with ``is_test=True``, so batches carry
    only ``input_ids`` and ``attention_mask``. Rows are served in their
    original order and no trailing partial batch is dropped.

    Args:
        data: Frame passed to ``CLRPDataset``.
        tokenizer: Tokenizer passed to ``CLRPDataset``.

    Returns:
        A ``DataLoader`` using ``Config.batch_size`` and ``Config.max_len``.
    """
    inference_dataset = CLRPDataset(
        data,
        tokenizer=tokenizer,
        max_len=Config.max_len,
        is_test=True,
    )
    return DataLoader(
        inference_dataset,
        batch_size=Config.batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
        pin_memory=True,
    )

In [None]:
import torch
import torch.nn as nn


class AttentionHead(nn.Module):
    """Additive-attention pooling over dimension 1 of the input.

    Each position is scored with ``V(tanh(W(x)))``; the scores are
    soft-maxed along dim 1 and used as weights for a sum over that axis.

    Args:
        h_size: Size of the last dimension of the incoming features.
        hidden_dim: Width of the intermediate projection.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        # Attribute names W / V are part of saved checkpoints; do not rename.
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        # assumes features is (batch, positions, h_size) -- TODO confirm with callers
        raw_scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(raw_scores, dim=1)
        return torch.sum(weights * features, dim=1)

class CLRPModel(nn.Module):
    """Transformer backbone + attention pooling + a single-output linear layer.

    Args:
        transformer: Callable backbone invoked as ``transformer(input_ids,
            attention_mask)`` whose result exposes ``last_hidden_state``.
        config: Object providing ``hidden_size``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        self.head = AttentionHead(self.h_size)
        self.linear = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output for the attention-pooled last hidden state."""
        backbone_out = self.transformer(input_ids, attention_mask)
        pooled = self.head(backbone_out.last_hidden_state)
        return self.linear(pooled)

In [None]:
# https://www.kaggle.com/rhtsingh/utilizing-transformer-representations-efficiently

class CLRPModel_CP(nn.Module):
    """Backbone with concatenate-pooling and two independent linear outputs.

    The last four entries of the backbone's hidden-state tuple (element 2 of
    its output) are concatenated along the feature axis, position 0 is taken,
    and two separate linear layers are applied to that vector.

    Args:
        transformer: Callable backbone invoked as ``transformer(input_ids,
            attention_mask)``; its output is indexed with ``[2]``.
        config: Object providing ``hidden_size``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer

        # Four hidden-state tensors are concatenated feature-wise.
        concat_width = config.hidden_size * 4
        self.linear1 = nn.Linear(concat_width, 1)
        self.linear2 = nn.Linear(concat_width, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``(x1, x2)``: the target and standard_error heads' outputs."""
        backbone_out = self.transformer(input_ids, attention_mask)
        layers = torch.stack(backbone_out[2])

        last_four = [layers[-depth] for depth in (1, 2, 3, 4)]
        first_position = torch.cat(last_four, -1)[:, 0]

        target_out = self.linear1(first_position)    # target
        stderr_out = self.linear2(first_position)    # standard_error
        return target_out, stderr_out

In [None]:
# https://www.kaggle.com/rhtsingh/utilizing-transformer-representations-efficiently

class WeightedLayerPooling(nn.Module):
    """Weighted average of stacked hidden states from ``layer_start`` onward.

    Args:
        num_hidden_layers: Number of backbone layers; the stack is expected
            to hold ``num_hidden_layers + 1`` entries.
        layer_start: Index of the first stacked entry included in the average.
        layer_weights: Optional 1-D tensor of per-entry weights. When omitted,
            a learnable parameter of ones is created, one per pooled entry.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers

        if layer_weights is None:
            pooled_count = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(
                torch.tensor([1] * pooled_count, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Return the weight-normalised sum over dim 0 of the selected entries."""
        # The explicit 4-axis index keeps the requirement that the input is 4-D.
        selected = all_hidden_states[self.layer_start:, :, :, :]
        weights = self.layer_weights[:, None, None, None].expand_as(selected)
        weighted_sum = (weights * selected).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()
    

class CLRPModel_WLP(nn.Module):
    """Backbone with weighted-layer pooling and two independent linear outputs.

    The backbone's hidden-state tuple (element 2 of its output) is stacked,
    averaged by ``WeightedLayerPooling`` starting at stack index 9, and
    position 0 of the result feeds two separate linear layers.

    Args:
        transformer: Callable backbone invoked as ``transformer(input_ids,
            attention_mask)``; its output is indexed with ``[2]``.
        config: Object providing ``hidden_size`` and ``num_hidden_layers``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer

        # Learnable weights over stacked entries 9..num_hidden_layers.
        self.pooler = WeightedLayerPooling(
            config.num_hidden_layers,
            layer_start=9,
            layer_weights=None,
        )

        self.linear = nn.Linear(self.h_size, 1)
        self.linear2 = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``(x1, x2)``: the target and standard_error heads' outputs."""
        backbone_out = self.transformer(input_ids, attention_mask)
        stacked_layers = torch.stack(backbone_out[2])
        first_position = self.pooler(stacked_layers)[:, 0]

        target_out = self.linear(first_position)     # target
        stderr_out = self.linear2(first_position)    # standard_error
        return target_out, stderr_out

In [None]:
def predict(model, data_loader, device):
    """Run ``model`` over every batch of ``data_loader`` and gather its outputs.

    The model is switched to eval mode and called without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the call.

    Returns:
        A numpy array with one row per example in loader order. Size-1
        non-batch axes are dropped, so a ``(batch, 1)`` output yields a 1-D
        array. An empty loader yields an empty array.
    """
    model.eval()
    chunks = []

    with torch.no_grad():
        for batch in data_loader:
            sent_id = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            values = model(sent_id, mask).detach().cpu().numpy()
            # Drop singleton axes but always keep the batch axis. A bare
            # squeeze() collapses a final batch of one example to a 0-d
            # array, which cannot be iterated or extended.
            kept_shape = (values.shape[0],) + tuple(d for d in values.shape[1:] if d != 1)
            chunks.append(values.reshape(kept_shape))

    if not chunks:
        return np.array([])
    return np.concatenate(chunks)

In [None]:
def predict2(model, data_loader, device):
    """Run a two-output ``model`` over ``data_loader`` and gather both outputs.

    The model is switched to eval mode and called without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns a pair of tensors.
        data_loader: Iterable of dict batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the call.

    Returns:
        A tuple of two numpy arrays (first output, second output), each with
        one row per example in loader order. Size-1 non-batch axes are
        dropped. An empty loader yields two empty arrays.
    """
    model.eval()
    chunks1 = []
    chunks2 = []

    with torch.no_grad():
        for batch in data_loader:
            sent_id = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            outputs1, outputs2 = model(sent_id, mask)
            for raw, bucket in ((outputs1, chunks1), (outputs2, chunks2)):
                values = raw.detach().cpu().numpy()
                # Drop singleton axes but always keep the batch axis. A bare
                # squeeze() collapses a final batch of one example to a 0-d
                # array, which cannot be iterated or extended.
                kept_shape = (values.shape[0],) + tuple(d for d in values.shape[1:] if d != 1)
                bucket.append(values.reshape(kept_shape))

    if not chunks1:
        return np.array([]), np.array([])
    return np.concatenate(chunks1), np.concatenate(chunks2)

In [None]:
def rmse_score(y_true,y_pred):
    """Return the square root of scikit-learn's mean squared error of the two inputs."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# https://www.kaggle.com/pichenguang/commonlit-two-models-with-new-model

def get_preds_svm(X,y,X_test,RidgeReg,bins,nfolds=10,C=4,kernel='rbf'):
    """Fit one regressor per stratified fold and average their test predictions.

    Args:
        X: Feature matrix, indexable by integer index arrays.
        y: Target vector aligned with ``X``.
        X_test: Feature matrix to predict; ``X_test.shape[0]`` sets the
            output length.
        RidgeReg: Truthy selects ``Ridge(alpha=1.0)``; falsy selects ``SVR``.
        bins: Labels used to stratify the folds.
        nfolds: Number of folds, and therefore of models averaged.
        C: ``SVR`` regularisation parameter (unused for ridge).
        kernel: ``SVR`` kernel (unused for ridge).

    Returns:
        Array of length ``X_test.shape[0]``: the mean of the per-fold models'
        predictions on ``X_test``.
    """
    print("ridge..." if RidgeReg else "SVR...")

    scores = []
    preds = np.zeros(X_test.shape[0])

    # The split count must follow `nfolds`, because the accumulated test
    # predictions are divided by `nfolds` below. A hard-coded 10 here made
    # the average wrong for any other value.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=12345)

    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):

        if RidgeReg:
            model = Ridge(alpha=1.0)
        else:
            model = SVR(C=C,kernel=kernel,gamma='auto')

        model.fit(X[train_idx], y[train_idx])

        # Out-of-fold score, reported for monitoring only.
        score = rmse_score(model.predict(X[valid_idx]), y[valid_idx])
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)

        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))

    return np.array(preds)/nfolds

In [None]:
# Single-output ensemble members. Each row is (name, directory given to
# AutoTokenizer/AutoConfig/AutoModel, directory holding best_model_{fold}.pt).
model_info = [
    {
        'name': name,
        'pretrained_model_path': pretrained_dir,
        'weight_path': weight_dir,
    }
    for name, pretrained_dir, weight_dir in (
        (
            'roberta-large',
            '../input/clrp-03-01-roberta-large-epoch5-pretrain/clrp_roberta_large',
            '../input/clrp-03-02-roberta-large-epoch5-finetune/models',
        ),
        (
            'roberta-base',
            '../input/clrp-03-01-roberta-base-epoch5-pretrain/clrp_roberta_base',
            '../input/clrp-03-02-roberta-base-epoch5-finetune/models',
        ),
        (
            'albert-base-v2',
            '../input/clrp-03-01-albert-base-v2-epoch5-pretrain/clrp',
            '../input/clrp-03-02-albert-base-v2-epoch5-finetune/models',
        ),
        (
            'distilroberta-base',
            '../input/clrp-03-01-distilroberta-base-epoch5-pretrain/clrp',
            '../input/clrp-03-02-distilroberta-base-epoch5-finetune/models',
        ),
    )
]

In [None]:
# Inference for the single-output models.
# For every entry of `model_info`, the NUM_MODELS fold checkpoints are run on
# the test set (averaged into df_test['pred_{i}']) and on their own
# validation fold (written into kfold_df['pred_{i}']).
df_test = pd.DataFrame()
score_list = []
NUM_MODELS = 5

for i, info in enumerate(model_info):

    test_preds = np.zeros(( NUM_MODELS, len(test_df) ))
    # Float placeholder so the column has a float dtype before the fold
    # predictions are written into it.
    kfold_df[f'pred_{i}'] = 0.0

    # The tokenizer and the test loader depend only on the backbone, not on
    # the fold, so they are built once per entry.
    tokenizer = AutoTokenizer.from_pretrained(info['pretrained_model_path'])
    test_dl = make_dataloader(test_df, tokenizer)

    for model_index in range(NUM_MODELS):
        print(f'Model#{model_index+1}')

        model_path = info['weight_path'] + f"/best_model_{model_index}.pt"

        # The checkpoint is a complete pickled model (its result is used
        # directly as a module), so no backbone is instantiated beforehand;
        # doing so only loaded pretrained weights that were thrown away.
        # NOTE(review): torch.load unpickles arbitrary objects, so only load
        # checkpoints from a trusted source.
        model = torch.load(model_path)
        model = model.to(Config.device)

        # test
        test_preds[model_index] = predict(model, test_dl, Config.device)

        # val: score each checkpoint on the fold sharing its index
        fold_mask = kfold_df.fold == model_index
        x_val = kfold_df[fold_mask].reset_index(drop=True)
        val_dl = make_dataloader(x_val, tokenizer)

        preds = predict(model, val_dl, Config.device)
        kfold_df.loc[fold_mask, f'pred_{i}'] = preds

        del model
        del val_dl, x_val
        gc.collect()

    del tokenizer, test_dl

    score = rmse_score(kfold_df[f'pred_{i}'], kfold_df.target)
    score_list.append( score )
    print( info['name'], score )

    df_test[f'pred_{i}'] = test_preds.mean(axis=0)

In [None]:
# Validation RMSE per single-output model, one row per `model_info` entry.
df_score = pd.DataFrame({
    'name': [info['name'] for info in model_info],
    'rmse': score_list,
})

print( df_score )

In [None]:
# Preview the first rows, now including the pred_* columns added above.
kfold_df.head()

## WeightedLayerPooling, ConcatenatePooling
- Each model in this section produces two outputs: the predicted `target` and the predicted `standard_error`.

In [None]:
# Two-output ensemble members (target + standard_error).
# 'type' names the pooling head and is used for the score-table label;
# 'weight_path' is the directory holding best_model_{fold}.pt.
# In these paths "03-31 ... wlp" denotes WeightedLayerPooling checkpoints and
# "03-32 ... cp" denotes ConcatenatePooling checkpoints.
model_info2 = [
    { 
        'name': 'roberta-large',
        'type': 'WeightedLayerPooling',
        'pretrained_model_path': '../input/clrp-03-01-roberta-large-epoch5-pretrain/clrp_roberta_large',
        'weight_path' : '../input/clrp-03-31-roberta-large-wlp-out2-finetune'
    },
     { 
        'name': 'roberta-large',
        'type': 'ConcatenatePooling',
        'pretrained_model_path': '../input/clrp-03-01-roberta-large-epoch5-pretrain/clrp_roberta_large',
        'weight_path' : '../input/clrp-03-32-roberta-large-cp-out2-finetune'
    },   
    { 
        'name': 'roberta-base',
        # This was labelled WeightedLayerPooling although the path is the
        # "03-32 ... cp" checkpoint, which gave two identical labels in the
        # score table. NOTE(review): confirm against the dataset contents.
        'type': 'ConcatenatePooling',
        'pretrained_model_path': '../input/clrp-03-01-roberta-base-epoch5-pretrain/clrp_roberta_base',
        'weight_path' : '../input/clrp-03-32-roberta-base-cp-out2-finetun'
    },  
    { 
        'name': 'roberta-base',
        'type': 'WeightedLayerPooling',
        'pretrained_model_path': '../input/clrp-03-01-roberta-base-epoch5-pretrain/clrp_roberta_base',
        'weight_path' : '../input/clrp-03-31-roberta-bas-v2-wlp-out2-finetun'
    },    
    { 
        'name': 'albert-base-v2',
        'type': 'ConcatenatePooling',
        'pretrained_model_path': '../input/clrp-03-01-albert-base-v2-epoch5-pretrain/clrp',
        'weight_path' : '../input/clrp-03-32-albert-base-v2-cp-out2-finetun'
    },  
    { 
        'name': 'albert-base-v2',
        'type': 'WeightedLayerPooling',
        'pretrained_model_path': '../input/clrp-03-01-albert-base-v2-epoch5-pretrain/clrp',
        'weight_path' : '../input/clrp-03-31-albert-base-v2-wlp-out2-finetune'
    },      
]

In [None]:
# Fold assignments used for the two-output models (a separate split file).
kfold_df2 = pd.read_csv('../input/train-val-split-2/kfold.csv')

In [None]:
# Inference for the two-output models (target + standard_error).
# df_test is deliberately NOT re-created here: the columns below are appended
# to the frame already filled by the single-output models, and numbering
# continues at pred_{len(model_info)}.
score_list = []
NUM_MODELS = 5
k = len(model_info)

for info in model_info2:

    test_preds1 = np.zeros(( NUM_MODELS, len(test_df) ))
    test_preds2 = np.zeros(( NUM_MODELS, len(test_df) ))
    # Float placeholders so the columns have a float dtype before the fold
    # predictions are written into them.
    kfold_df2[f'pred_{k}'] = 0.0
    kfold_df2[f'pred_{k+1}'] = 0.0

    # The tokenizer and the test loader depend only on the backbone, not on
    # the fold, so they are built once per entry.
    tokenizer = AutoTokenizer.from_pretrained(info['pretrained_model_path'])
    test_dl = make_dataloader(test_df, tokenizer)

    for model_index in range(NUM_MODELS):
        print(f'Model#{model_index+1}')

        model_path = info['weight_path'] + f"/best_model_{model_index}.pt"

        # The checkpoint is a complete pickled model (its result is used
        # directly as a module), so no backbone or pooling head is
        # instantiated beforehand; doing so only loaded pretrained weights
        # that were thrown away.
        # NOTE(review): torch.load unpickles arbitrary objects, so only load
        # checkpoints from a trusted source.
        model = torch.load(model_path)
        model = model.to(Config.device)

        # test
        t_preds1, t_preds2 = predict2(model, test_dl, Config.device)
        test_preds1[model_index] = t_preds1
        test_preds2[model_index] = t_preds2

        # val: score each checkpoint on the fold sharing its index
        fold_mask = kfold_df2.fold == model_index
        x_val = kfold_df2[fold_mask].reset_index(drop=True)
        val_dl = make_dataloader(x_val, tokenizer)

        preds1, preds2 = predict2(model, val_dl, Config.device)
        kfold_df2.loc[fold_mask, f'pred_{k}']   = preds1
        kfold_df2.loc[fold_mask, f'pred_{k+1}'] = preds2

        del model
        del val_dl, x_val
        gc.collect()

    del tokenizer, test_dl

    # Only the first output (target) is scored against the ground truth.
    score = rmse_score(kfold_df2[f'pred_{k}'], kfold_df2.target)
    score_list.append( score )
    print( info['name'], score )

    df_test[f'pred_{k}'] = test_preds1.mean(axis=0)
    df_test[f'pred_{k+1}'] = test_preds2.mean(axis=0)

    # Each entry contributes two feature columns.
    k = k + 2

In [None]:
# Validation RMSE of the target output, one row per `model_info2` entry.
df_score = pd.DataFrame({
    'name': [f"{info['name']}: {info['type']}" for info in model_info2],
    'rmse': score_list,
})

print( df_score )

In [None]:
# Preview the first rows, now including the pred_* columns added above.
kfold_df2.head()

In [None]:
# Strip everything that would duplicate kfold_df2's columns in the merge
# below (text, metadata, target, bins, fold); presumably only `id` and the
# pred_* columns remain -- verify against the CSV schema.
kfold_df.drop(columns=['url_legal', 'license', 'excerpt', 'target', 'standard_error', 'bins', 'fold'], inplace=True)
len(kfold_df)

In [None]:
# `target` is not dropped here because the stacking cells read it from the
# merged frame; they also read `bins`, which must therefore be a column of
# this frame.
kfold_df2.drop(columns=['url_legal', 'license', 'excerpt', 'standard_error', 'fold'], inplace=True)
len(kfold_df2)

In [None]:
# Left join on `id`: every kfold_df2 row is kept, and rows with no match in
# kfold_df get NaN in the columns coming from kfold_df.
df = pd.merge(kfold_df2, kfold_df, on=['id'], how='left' )
df.head()

In [None]:
# One feature per single-output model plus two per two-output model.
n_features = len(model_info) + len(model_info2)*2
features_columns = [f'pred_{i}' for i in range(n_features)]

# Keep only the rows that have a value in every feature column.
df = df.dropna(subset=features_columns).reset_index(drop=True)

In [None]:
#features_columns

In [None]:
# Both stackers are fed the same arrays: out-of-fold predictions as features,
# the true target as label, and `bins` for fold stratification.
stack_features = df[features_columns].values
stack_target = df['target'].values
stack_bins = df['bins'].values
stack_test_features = df_test[features_columns].values

# SVR stacker
predictions1 = get_preds_svm(
    stack_features,
    stack_target,
    stack_test_features,
    RidgeReg=0,
    bins=stack_bins
)

# Ridge stacker
predictions2 = get_preds_svm(
    stack_features,
    stack_target,
    stack_test_features,
    RidgeReg=1,
    bins=stack_bins
)

In [None]:
# Equal-weight blend of the SVR (predictions1) and Ridge (predictions2) stackers.
predictions = predictions1 * 0.5 + predictions2 * 0.5

### submission

In [None]:
# Overwrite the template's target column with the blended predictions,
# show the result, and write the submission file.
submission_df['target'] = predictions
print(submission_df)
submission_df.to_csv("submission.csv", index=False)