In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import gc
import sys
import time
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer


from keras.layers import Lambda, Input, Dense, Reshape, RepeatVector, Dropout
from keras.models import Model
from keras.losses import mse, binary_crossentropy
from keras import backend as K
from keras.constraints import unit_norm, max_norm

from scipy import stats
import argparse
import os
from sklearn.manifold import MDS
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Competition inputs: labelled training excerpts, unlabelled test excerpts and
# the submission template, all read from the same directory.
data_dir = '../input/commonlitreadabilityprize/'
train, test, sample_submission = (
    pd.read_csv(data_dir + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Regression target as a plain NumPy vector so it can be indexed by fold indices.
target = train['target'].to_numpy()

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU and CUDA generators).
    """
    random.seed(seed)
    # Only takes effect for Python processes started after this point; the hash
    # seed of the already-running interpreter cannot be changed at runtime.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per index.

    Args:
        df: DataFrame with an ``excerpt`` column holding the raw text.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # Tokenization happens lazily, one excerpt at a time; the result is
        # returned exactly as the tokenizer produced it.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Return the number of excerpts in the dataset."""
        return len(self.excerpt)
    

In [None]:
def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Embed every excerpt in ``df`` with a pretrained transformer encoder.

    Reads ``config['max_len']`` and ``config['batch_size']`` from the
    notebook-level ``config`` dict at call time.

    Args:
        df: DataFrame with an ``excerpt`` column (consumed by ``CLRPDataset``).
        path: Directory or model identifier passed to
            ``AutoModel.from_pretrained`` / ``AutoTokenizer.from_pretrained``.
        plot_losses: Unused; kept so existing callers keep working.
        verbose: When true, print the selected device and show a progress bar.

    Returns:
        ``np.ndarray`` with one row per excerpt, in the same order as ``df``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, config['max_len'])
    # shuffle=False and drop_last=False keep the output aligned row-for-row with df.
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm report progress against a total.
        for inputs in tqdm(dl, total=len(dl), disable=not verbose):
            # Collapse any extra axes after the batch axis before the forward pass.
            inputs = {key: val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            # First model output, first sequence position of every sample
            # (presumably the last hidden state at the CLS-style token - confirm
            # for the model family in use).
            embeddings.extend(outputs[0][:, 0].cpu().numpy())
    return np.array(embeddings)

In [None]:
# Notebook-wide settings: batch size and sequence length are read by
# get_embeddings, the seed by seed_everything and the KFold split.
config = dict(batch_size=128, max_len=256, seed=42)
seed_everything(seed=config['seed'])

# RoBERTa embeddings, train split first and then test split.
train_embeddings_roberta, test_embeddings_roberta = (
    get_embeddings(frame, '../input/roberta-base') for frame in (train, test)
)
# MiniLM embeddings, same order.
train_embeddings_minilm, test_embeddings_minilm = (
    get_embeddings(frame, '../input/minilm-base') for frame in (train, test)
)

In [None]:
# MPNet embeddings, train split first and then test split.
train_embeddings_mpnet, test_embeddings_mpnet = (
    get_embeddings(frame, '../input/mpnet-base') for frame in (train, test)
)

In [None]:
# "abc" features: RoBERTa (a), MiniLM (b) and MPNet (c) embeddings stacked
# horizontally, built the same way for the train and test splits.
train_embeddings_abc, test_embeddings_abc = (
    np.hstack(parts)
    for parts in (
        (train_embeddings_roberta, train_embeddings_minilm, train_embeddings_mpnet),
        (test_embeddings_roberta, test_embeddings_minilm, test_embeddings_mpnet),
    )
)

In [None]:
# Pairwise feature sets, each stacking two encoders' embeddings horizontally.
# a = RoBERTa, b = MiniLM, c = MPNet.

# a + b
train_embeddings_ab = np.hstack([train_embeddings_roberta, train_embeddings_minilm])
test_embeddings_ab = np.hstack([test_embeddings_roberta, test_embeddings_minilm])

# a + c
train_embeddings_ac = np.hstack([train_embeddings_roberta, train_embeddings_mpnet])
test_embeddings_ac = np.hstack([test_embeddings_roberta, test_embeddings_mpnet])

# b + c
train_embeddings_bc = np.hstack([train_embeddings_minilm, train_embeddings_mpnet])
test_embeddings_bc = np.hstack([test_embeddings_minilm, test_embeddings_mpnet])

In [None]:
# Booster settings handed to xgb.train by the cross-validation cells.
params = dict(
    # Learning task and the metric reported on the evaluation sets.
    objective='reg:squarederror',
    eval_metric='rmse',
    # Tree growth.
    eta=0.05,
    max_depth=3,
    # Regularisation / row subsampling.
    gamma=1,
    subsample=0.8,
    # Threads used by XGBoost.
    nthread=2,
)

# Shuffled K-fold split, seeded from the notebook config.
nfolds = 5
kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])

In [None]:
# K-fold cross-validation of XGBoost on the `*_embeddings_ab` features.
# Each fold trains with early stopping on its validation split and adds its
# test prediction, divided by the number of folds, to `preds`.
best_iterations = []
oof_rmses = []
preds = np.zeros(test.shape[0])

# The test matrix is the same for every fold, so build it once.
dtest = xgb.DMatrix(test_embeddings_ab)

for train_idx, valid_idx in kf.split(train):

    dtrain = xgb.DMatrix(train_embeddings_ab[train_idx], target[train_idx])
    dvalid = xgb.DMatrix(train_embeddings_ab[valid_idx], target[valid_idx])
    evals_result = dict()
    booster = xgb.train(params,
                        dtrain,
                        evals=[(dtrain, 'train'), (dvalid, 'valid')],
                        num_boost_round=300,
                        early_stopping_rounds=20,
                        evals_result=evals_result,
                        verbose_eval=False)

    # Boosting round with the lowest validation RMSE.
    valid_rmse = evals_result['valid']['rmse']
    best_iteration = int(np.argmin(valid_rmse))
    best_iterations.append(best_iteration)
    oof_rmse = valid_rmse[best_iteration]
    oof_rmses.append(oof_rmse)
    # `ntree_limit` was removed from Booster.predict in XGBoost 2.0;
    # `iteration_range` is its replacement (end index is exclusive).
    preds += booster.predict(dtest, iteration_range=(0, best_iteration + 1)) / nfolds

evals_df = pd.DataFrame({
    'fold': range(1, nfolds + 1),
    'best_iteration': best_iterations,
    'oof_rmse': oof_rmses,
})

display(evals_df)
print('mean oof rmse = {}'.format(np.mean(oof_rmses)))

In [None]:
# Keep a handle on the fold-averaged test predictions from the `*_ab` feature
# run; the next cross-validation cell rebinds `preds` to a fresh array.
preds1=preds

In [None]:
# K-fold cross-validation of XGBoost on the `*_embeddings_ac` features.
# Each fold trains with early stopping on its validation split and adds its
# test prediction, divided by the number of folds, to `preds`.
best_iterations = []
oof_rmses = []
preds = np.zeros(test.shape[0])

# The test matrix is the same for every fold, so build it once.
dtest = xgb.DMatrix(test_embeddings_ac)

for train_idx, valid_idx in kf.split(train):

    dtrain = xgb.DMatrix(train_embeddings_ac[train_idx], target[train_idx])
    dvalid = xgb.DMatrix(train_embeddings_ac[valid_idx], target[valid_idx])
    evals_result = dict()
    booster = xgb.train(params,
                        dtrain,
                        evals=[(dtrain, 'train'), (dvalid, 'valid')],
                        num_boost_round=300,
                        early_stopping_rounds=20,
                        evals_result=evals_result,
                        verbose_eval=False)

    # Boosting round with the lowest validation RMSE.
    valid_rmse = evals_result['valid']['rmse']
    best_iteration = int(np.argmin(valid_rmse))
    best_iterations.append(best_iteration)
    oof_rmse = valid_rmse[best_iteration]
    oof_rmses.append(oof_rmse)
    # `ntree_limit` was removed from Booster.predict in XGBoost 2.0;
    # `iteration_range` is its replacement (end index is exclusive).
    preds += booster.predict(dtest, iteration_range=(0, best_iteration + 1)) / nfolds

evals_df = pd.DataFrame({
    'fold': range(1, nfolds + 1),
    'best_iteration': best_iterations,
    'oof_rmse': oof_rmses,
})

display(evals_df)
print('mean oof rmse = {}'.format(np.mean(oof_rmses)))

In [None]:
# Keep a handle on the fold-averaged test predictions from the `*_ac` feature
# run; the next cross-validation cell rebinds `preds` to a fresh array.
preds2=preds

In [None]:
# K-fold cross-validation of XGBoost on the `*_embeddings_bc` features.
# Each fold trains with early stopping on its validation split and adds its
# test prediction, divided by the number of folds, to `preds`.
best_iterations = []
oof_rmses = []
preds = np.zeros(test.shape[0])

# The test matrix is the same for every fold, so build it once.
dtest = xgb.DMatrix(test_embeddings_bc)

for train_idx, valid_idx in kf.split(train):

    dtrain = xgb.DMatrix(train_embeddings_bc[train_idx], target[train_idx])
    dvalid = xgb.DMatrix(train_embeddings_bc[valid_idx], target[valid_idx])
    evals_result = dict()
    booster = xgb.train(params,
                        dtrain,
                        evals=[(dtrain, 'train'), (dvalid, 'valid')],
                        num_boost_round=300,
                        early_stopping_rounds=20,
                        evals_result=evals_result,
                        verbose_eval=False)

    # Boosting round with the lowest validation RMSE.
    valid_rmse = evals_result['valid']['rmse']
    best_iteration = int(np.argmin(valid_rmse))
    best_iterations.append(best_iteration)
    oof_rmse = valid_rmse[best_iteration]
    oof_rmses.append(oof_rmse)
    # `ntree_limit` was removed from Booster.predict in XGBoost 2.0;
    # `iteration_range` is its replacement (end index is exclusive).
    preds += booster.predict(dtest, iteration_range=(0, best_iteration + 1)) / nfolds

evals_df = pd.DataFrame({
    'fold': range(1, nfolds + 1),
    'best_iteration': best_iterations,
    'oof_rmse': oof_rmses,
})

display(evals_df)
print('mean oof rmse = {}'.format(np.mean(oof_rmses)))

In [None]:
# Keep a handle on the fold-averaged test predictions from the `*_bc` feature
# run; the next cross-validation cell rebinds `preds` to a fresh array.
preds3=preds

In [None]:
# K-fold cross-validation of XGBoost on the `*_embeddings_abc` features.
# Each fold trains with early stopping on its validation split and adds its
# test prediction, divided by the number of folds, to `preds`.
best_iterations = []
oof_rmses = []
preds = np.zeros(test.shape[0])

# The test matrix is the same for every fold, so build it once.
dtest = xgb.DMatrix(test_embeddings_abc)

for train_idx, valid_idx in kf.split(train):

    dtrain = xgb.DMatrix(train_embeddings_abc[train_idx], target[train_idx])
    dvalid = xgb.DMatrix(train_embeddings_abc[valid_idx], target[valid_idx])
    evals_result = dict()
    booster = xgb.train(params,
                        dtrain,
                        evals=[(dtrain, 'train'), (dvalid, 'valid')],
                        num_boost_round=300,
                        early_stopping_rounds=20,
                        evals_result=evals_result,
                        verbose_eval=False)

    # Boosting round with the lowest validation RMSE.
    valid_rmse = evals_result['valid']['rmse']
    best_iteration = int(np.argmin(valid_rmse))
    best_iterations.append(best_iteration)
    oof_rmse = valid_rmse[best_iteration]
    oof_rmses.append(oof_rmse)
    # `ntree_limit` was removed from Booster.predict in XGBoost 2.0;
    # `iteration_range` is its replacement (end index is exclusive).
    preds += booster.predict(dtest, iteration_range=(0, best_iteration + 1)) / nfolds

evals_df = pd.DataFrame({
    'fold': range(1, nfolds + 1),
    'best_iteration': best_iterations,
    'oof_rmse': oof_rmses,
})

display(evals_df)
print('mean oof rmse = {}'.format(np.mean(oof_rmses)))

In [None]:
# Keep a handle on the fold-averaged test predictions from the `*_abc`
# (all three encoders) feature run.
preds4 = preds

In [None]:
# Equal-weight blend of the four feature-set models. `preds1` is the start
# value so the additions run left to right: ((p1 + p2) + p3) + p4.
pred = sum([preds2, preds3, preds4], preds1) / 4

In [None]:
# Attach the blended predictions to the test frame, then write the two-column
# file (id, target) to submission.csv.
test['prediction'] = pred
submission = pd.DataFrame({
    'id': test['id'].copy(),
    'target': test['prediction'].copy(),
})
submission.to_csv('submission.csv', index=False)
submission.head()