In [None]:
import gc
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
from transformers import TFRobertaModel, RobertaConfig, RobertaModel, RobertaTokenizer
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)


import sys
import time
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import json
from tensorflow.keras.models import load_model
import string
import keras
from sklearn.svm import SVR

import pickle

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression, Ridge

import tensorflow as tf 
from tensorflow.keras.layers import Input,LSTM,Bidirectional,Embedding,Dense, Conv1D, Dropout , MaxPool1D , MaxPooling1D, GlobalAveragePooling2D , GlobalAveragePooling1D , GlobalMaxPooling1D , concatenate , Flatten
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model,load_model,save_model , model_from_json
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K

from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification, TFBertModel, BertTokenizerFast, BertTokenizer, RobertaTokenizerFast, TFRobertaModel, TFAutoModel

In [None]:
# 108, gpt2, 0.48
# 115, roberta-base, 0.478
# 121, bart-large, 0.474

In [None]:
class Dataset:
    """Map-style dataset that tokenizes one excerpt per item.

    On construction the tokenizer is given the literal string '0' as its
    pad token (the tokenizer object passed in is modified in place).
    """

    def __init__(self, excerpt, tokenizer, max_len):
        # Register the pad token before storing the tokenizer.
        tokenizer.add_special_tokens({'pad_token': '0'})
        self.excerpt, self.tokenizer, self.max_len = excerpt, tokenizer, max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        """Return long tensors `input_ids` and `attention_mask` for one excerpt."""
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    `num_targets` is accepted but not used. `out_features` is set to
    `hidden_dim`, while the pooled vector itself keeps the last dimension
    of the input features.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        # One scalar score per position, normalised with softmax over dim=1.
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        # Weighted sum of the input features along dim=1.
        return (weights * features).sum(dim=1)
    
    
class CLRPModel(nn.Module):
    """Transformer backbone + attention pooling + two linear heads.

    The backbone is built from a configuration object only (no pretrained
    weights are loaded here); the pooled width is fixed at 1280.
    """

    def __init__(self, configuration):
        super().__init__()
        self.in_features = 1280
        self.auto_model = AutoModel.from_config(configuration)
        self.head = AttentionHead(self.in_features, self.in_features, 1)
        self.dropout = nn.Dropout(0.1)
        self.l0 = nn.Linear(self.in_features, 1)   # main regression output
        self.l1 = nn.Linear(self.in_features, 7)   # auxiliary outputs

    def forward(self, ids, mask):
        """Return `(logits, aux_logits)`; `logits` has its last dim squeezed."""
        hidden_states = self.auto_model(ids, attention_mask=mask)[0]
        pooled = self.head(hidden_states)  # bs, in_features

        logits = self.l0(self.dropout(pooled)).squeeze(-1)
        aux_logits = torch.sigmoid(self.l1(self.dropout(pooled)))
        return logits, aux_logits

In [None]:
def generate_predictions(pretrain_path, max_len):
    """Predict targets for the competition test set with one CLRPModel checkpoint.

    Args:
        pretrain_path: path to a state dict saved for `CLRPModel`.
        max_len: tokenizer max length (excerpts are padded/truncated to it).

    Returns:
        1-D numpy array with one prediction per row of test.csv (empty if the
        test file has no rows).
    """
    # Fall back to CPU instead of failing outright when no GPU is visible.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    configuration = AutoConfig.from_pretrained('../input/gpt2-pytorch/gpt2-large-config.json')
    model = CLRPModel(configuration)
    # map_location lets a checkpoint saved from a GPU load on whichever device is in use.
    model.load_state_dict(torch.load(pretrain_path, map_location=device))
    model.to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained('../input/gpt2large345m/gpt2-345M/')

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    with torch.no_grad():
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            output, _ = model(inputs, masks)
            final_output.extend(output.detach().cpu().numpy().tolist())
            # Drop per-batch tensors here so the cleanup below never touches
            # names that would be unbound when the loader yields no batches.
            del inputs, masks, output

    del model, tokenizer, df, dataset, data_loader
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Run every fold checkpoint of the GPT-2 model and average the predictions.
gpt2_fold_weights = [
    "../input/kaerururu-commonlit-108/fold-0.bin",
    "../input/kaerururu-commonlit-108/fold-1.bin",
    "../input/kaerururu-commonlit-108/fold-2.bin",
    "../input/kaerururu-commonlit-108/fold-3.bin",
    "../input/kaerururu-commonlit-108/fold-4.bin",
]
gpt2_fold_preds = [
    generate_predictions(weights_file, max_len=256) for weights_file in gpt2_fold_weights
]

# Left-to-right sum of the five arrays, then divide by the fold count.
preds_gpt2 = sum(gpt2_fold_preds[1:], gpt2_fold_preds[0]) / 5

del gpt2_fold_weights, gpt2_fold_preds
gc.collect()

In [None]:
# model 0

In [None]:
# Enable on-demand GPU memory growth for every GPU TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

max_len = 250
batch_size = 32
AUTOTUNE = tf.data.AUTOTUNE

MODEL = ['bert-base-uncased', 'roberta-base']
model_name = MODEL[1]

# Order: sample submission, test, train.
path = [
    "../input/commonlitreadabilityprize/sample_submission.csv",
    "../input/commonlitreadabilityprize/test.csv",
    "../input/commonlitreadabilityprize/train.csv"
]
df_ss, df_test, df_train = (pd.read_csv(csv_file) for csv_file in path)

# Drop columns that are not used below.
df_train = df_train.drop(columns=['url_legal', 'license', 'standard_error'])
df_test = df_test.drop(columns=['url_legal', 'license'])

X = df_train['excerpt']
y = df_train['target'].values
X_test = df_test['excerpt']

tokenizer1 = AutoTokenizer.from_pretrained("../input/huggingface-roberta-variants/roberta-base/roberta-base")

print('tokenization')
tokenizer_kwargs = dict(truncation=True, padding='max_length', max_length=max_len)
train_embeddings = tokenizer1(X.to_list(), **tokenizer_kwargs)
test_embeddings = tokenizer1(X_test.to_list(), **tokenizer_kwargs)
                         
@tf.function
def map_function(encodings):
    """Keep only the token ids, exposed under the key `input_word_ids`."""
    return {'input_word_ids': encodings['input_ids']}

print("generating train and test")


def _to_feature_batches(encodings):
    """Slice tokenizer output per example, keep the ids, batch by 16 and prefetch."""
    dataset = tf.data.Dataset.from_tensor_slices(encodings)
    dataset = dataset.map(map_function, num_parallel_calls=AUTOTUNE)
    return dataset.batch(16).prefetch(AUTOTUNE)


train = _to_feature_batches(train_embeddings)
test = _to_feature_batches(test_embeddings)
                         
                         
def build_roberta_base_model(max_len=max_len ):
    """Build a Keras regressor: pretrained TF RoBERTa encoder + one linear unit.

    Args:
        max_len: length of the `input_word_ids` input. The default is the
            value of the notebook-level `max_len` at definition time.

    Returns:
        A `tf.keras` model mapping `input_word_ids` to a single float32 output.
        Only token ids are fed to the transformer; no attention mask is passed.
    """
    
    transformer = TFAutoModel.from_pretrained("../input/huggingface-roberta-variants/roberta-base/roberta-base")
    
    input_word_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    # Element 0 of the transformer output is taken as the per-token sequence output.
    sequence_output = transformer(input_word_ids)[0]
    
    # We only need the cls_token, resulting in a 2d array
    cls_token = sequence_output[:, 0, :]
    output = tf.keras.layers.Dense(1, activation = 'linear', dtype = 'float32')(cls_token)
    
    model = tf.keras.models.Model(inputs = [input_word_ids], outputs = output)
    
    return model
                         
# One shared model instance; each call to feature_extractor reloads its weights.
ragnar_model = build_roberta_base_model()


def feature_extractor(path):
    """Load the weights at `path` into `ragnar_model` and expose an inner layer.

    The returned Keras model shares layers with `ragnar_model` and outputs
    whatever the third-from-last layer produces.
    """
    print("loading weights")
    ragnar_model.load_weights(path)
    return Model(inputs = ragnar_model.inputs , outputs = ragnar_model.layers[-3].output)
                         
def get_preds(model,train,test):
    """Extract first-token features for the train and test datasets.

    Args:
        model: object whose `predict(data, verbose=1)` returns something with a
            `last_hidden_state` attribute indexable as `[:, 0, :]`.
        train: input passed to `model.predict` for the training data.
        test: input passed to `model.predict` for the test data.

    Returns:
        `(train_features, test_features)` as float16 numpy arrays.
    """
    def _first_token_features(data):
        # Keep only position 0 of every sequence and store it as float16.
        hidden = model.predict(data, verbose=1).last_hidden_state
        return np.array(hidden[:, 0, :], dtype=np.float16)

    print("Extracting Features from train data")
    train_features = _first_token_features(train)
    # The original message said "train" here as well, which made the log misleading.
    print("Extracting Features from test data")
    test_features = _first_token_features(test)

    return train_features, test_features
                         
# Weight files of the five fine-tuned roberta-base checkpoints.
paths = [
    "../input/commonlit-readability-roberta-base/Roberta_Base_123_1.h5",
    "../input/commonlit-readability-roberta-base/Roberta_Base_123_2.h5",
    "../input/commonlit-readability-roberta-base/Roberta_Base_123_3.h5",
    "../input/commonlit-readability-roberta-base/Roberta_Base_123_4.h5",
    "../input/commonlit-readability-roberta-base/Roberta_Base_123_5.h5",
]

# Checkpoints are processed strictly in order: every call to feature_extractor
# reloads the shared model's weights before features are extracted.
fold_features = []
for weight_path in paths:
    extraction_model = feature_extractor(weight_path)
    fold_features.append(get_preds(extraction_model, train, test))

(
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
) = fold_features

# Drop the temporary container so the arrays are owned only by the names above.
del fold_features, weight_path

In [None]:
# Persist the test features of each TF roberta-base checkpoint.
for file_name, features in [
    ('test_embeddings1_Roberta_Base_123_1_h5.npy', test_embeddings1),
    ('test_embeddings2_Roberta_Base_123_2_h5.npy', test_embeddings2),
    ('test_embeddings3_Roberta_Base_123_3_h5.npy', test_embeddings3),
    ('test_embeddings4_Roberta_Base_123_4_h5.npy', test_embeddings4),
    ('test_embeddings5_Roberta_Base_123_5_h5.npy', test_embeddings5),
]:
    np.save(file_name, features)
# Do not leave loop variables behind that would keep an array alive.
del file_name, features

In [None]:
# Release the tokenizer and the tokenized inputs.
del tokenizer1, train_embeddings, test_embeddings
gc.collect()
# Release the extraction model and every per-fold feature array.
del (
    extraction_model,
    train_embeddings1, test_embeddings1,
    train_embeddings2, test_embeddings2,
    train_embeddings3, test_embeddings3,
    train_embeddings4, test_embeddings4,
    train_embeddings5, test_embeddings5,
)
gc.collect()
# del preds,preds1,preds2,preds3,preds4,preds5;gc.collect()

In [None]:
# make the bundled sentence-transformers source importable (offline mode)
import sys
sys.path.append('../input/sentence-transformers/sentence-transformers-master')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from scipy import stats
from scipy.stats import norm

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator


import warnings
warnings.filterwarnings("ignore")

# loading train and test data

train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# keep only the columns listed below

train_df=train_df[['id','excerpt','target','standard_error']]

# NLTK helpers; note that `stopwords` is rebound from the corpus reader to a set of English stop words

from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

import sentence_transformers
from sentence_transformers import SentenceTransformer, models

# Assemble a SentenceTransformer from a local checkpoint: transformer module followed by a pooling module.
model_path = '../input/finetuned-model1/checkpoint-568'
word_embedding_model = models.Transformer(model_path, max_seq_length=275)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Encode every excerpt; the results are indexed as feature matrices by the code that follows.
X_train = model.encode(train_df.excerpt, device='cuda')
X_test = model.encode(test_df.excerpt, device='cuda')

from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor

# Character length of each excerpt; used below only as the stratification label.
train_df['Character Count'] = train_df['excerpt'].apply(lambda x: len(str(x)))
preds = []


df_oof=train_df.copy()
# Float column so out-of-fold predictions are stored without an int -> float upcast.
df_oof['oof'] = 0.0
oof_col = df_oof.columns.get_loc('oof')


skf = StratifiedKFold(10, shuffle=True, random_state=42)

splits = list(skf.split(X=X_train, y=train_df['Character Count']))
for i, (train_idx, val_idx) in enumerate(splits):
    print(f'\n------------- Training Fold {i + 1} / {10}')
    print("Current Time =", datetime.now().strftime("%H:%M:%S"))
    # Alternative estimators kept from earlier experiments; only `clf` is fitted.
    r1 = LinearRegression()
    r2 = RandomForestRegressor(n_estimators=30, random_state=43)
    ridge = Ridge(alpha=50.0)
    br = BayesianRidge(n_iter=30, verbose=True)

    clf =   BayesianRidge(n_iter=30, verbose=True) #VotingRegressor([('r2', r2), ('br', br)])
    # train_idx / val_idx are positions, so index the target positionally as well.
    clf.fit(X_train[train_idx], train_df.target.iloc[train_idx])

    preds.append(clf.predict(X_test))
    # Single positional write: the previous chained `df_oof['oof'].iloc[...] +=`
    # may operate on a temporary copy and silently leave df_oof unchanged.
    # Every row belongs to exactly one validation fold, so plain assignment is enough.
    df_oof.iloc[val_idx, oof_col] = clf.predict(X_train[val_idx])

# NOTE: the training score below is computed with the model of the last fold only.
print(f'Training score: {mean_squared_error(train_df.target, clf.predict(X_train), squared=False)}')
print(f'OOF score across folds: {mean_squared_error(df_oof.target, df_oof.oof, squared=False)}')

# mean test prediction across the 10 folds
# y_pred = np.mean(preds,0)
preds_sentence_transformer = np.mean(preds,0)

# creating submission csv

# mysub = test_df[["id"]].copy()
# mysub["target"] = y_pred

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

In [None]:
# Competition files.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Number of equal-width target bins: floor(1 + log2(n_rows)).
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data['bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)

target = train_data['target'].to_numpy()
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Root mean squared error between `y_true` and `y_pred`."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)


# Inference settings shared by the PyTorch roberta embedding code below.
config = {
    'batch_size':128,
    'max_len':256,
    'nfolds':5,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Was 'PYTHONASSEED' (typo), which Python ignores. The variable is read at
    # interpreter start-up, so this only affects processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # defeats the deterministic setting above; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])


class CLRPDataset(Dataset):
    """Dataset over the `excerpt` column that returns the tokenizer output as-is.

    The maximum length is read from the notebook-level `config['max_len']`.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        # Tokenizer is asked for PyTorch tensors, padded/truncated to max_len.
        return self.tokenizer(
            self.excerpt[idx],
            return_tensors='pt',
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
        )
    
    
class Model(nn.Module):
    """RoBERTa encoder followed by attention pooling; returns the pooled vector.

    `dropout` and `linear` are created but not applied in `forward`.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768, 768, 1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features, 1)

    def forward(self, **xb):
        """Forward the keyword inputs to the encoder and pool its first output."""
        token_states = self.roberta(**xb)[0]
        return self.head(token_states)
    
    
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Compute pooled `Model` embeddings for every excerpt in `df`.

    Args:
        df: DataFrame with an `excerpt` column.
        path: checkpoint (state dict) to load into `Model`.
        plot_losses: unused; kept so existing calls keep working.
        verbose: unused; kept so existing calls keep working.

    Returns:
        numpy array with one embedding per row of `df`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # Without map_location a checkpoint saved from a GPU cannot be loaded when
    # this function falls back to CPU.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm show the total batch count.
        for inputs in tqdm(dl):
            # Collapse each collated tensor to (batch, -1) before moving it to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs.detach().cpu().numpy())
    return np.array(embeddings)

In [None]:
# One embedding matrix per clr-roberta checkpoint, computed in checkpoint order.
(
    test_embeddings1,
    test_embeddings2,
    test_embeddings3,
    test_embeddings4,
    test_embeddings5,
) = [
    get_embeddings(test_data, checkpoint)
    for checkpoint in (
        '../input/clr-roberta/model0/model0.bin',
        '../input/clr-roberta/model1/model1.bin',
        '../input/clr-roberta/model2/model2.bin',
        '../input/clr-roberta/model3/model3.bin',
        '../input/clr-roberta/model4/model4.bin',
    )
]

In [None]:
# Persist the test embeddings of each clr-roberta checkpoint.
for file_name, features in [
    ('test_embeddings1_clr_roberta_model0.npy', test_embeddings1),
    ('test_embeddings2_clr_roberta_model1.npy', test_embeddings2),
    ('test_embeddings3_clr_roberta_model2.npy', test_embeddings3),
    ('test_embeddings4_clr_roberta_model3.npy', test_embeddings4),
    ('test_embeddings5_clr_roberta_model4.npy', test_embeddings5),
]:
    np.save(file_name, features)
# Do not leave loop variables behind that would keep an array alive.
del file_name, features

In [None]:
# The embeddings are on disk now; release them from memory.
del (
    test_embeddings1,
    test_embeddings2,
    test_embeddings3,
    test_embeddings4,
    test_embeddings5,
)
gc.collect()

In [None]:
class Dataset:
    """Map-style dataset pairing tokenized text with two extra feature vectors.

    Item `i` combines the tokenized `excerpt[i]` with `numerical_features[i]`
    and row `i` of the `tfidf` frame (looked up by position via `.values`).
    """

    def __init__(self, excerpt, tokenizer, max_len, numerical_features, tfidf):
        self.excerpt, self.tokenizer, self.max_len = excerpt, tokenizer, max_len
        self.numerical_features = numerical_features
        self.tfidf_df = tfidf

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        sample = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }
        sample["numerical_features"] = torch.tensor(
            self.numerical_features[item], dtype=torch.float32
        )
        sample["tfidf"] = torch.tensor(self.tfidf_df.values[item], dtype=torch.float32)
        return sample

In [None]:
class RoBERTaLarge(nn.Module):
    """RoBERTa encoder with attention pooling, fused with two side networks.

    Despite the class name, the hidden width configured here is 768. The
    pooled text vector is concatenated with an 8-dim projection of 10
    numerical features and a 32-dim projection of 100 TF-IDF features.
    """

    def __init__(self, model_path):
        super().__init__()
        self.in_features = 768 # 1024
        self.roberta = RobertaModel.from_pretrained(model_path)
        self.head = AttentionHead(self.in_features, self.in_features, 1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._side_network(10, 8)
        self.process_tfidf = self._side_network(100, 32)
        fused_width = self.in_features + 8 + 32
        self.l0 = nn.Linear(fused_width, 1)    # main regression output
        self.l1 = nn.Linear(fused_width, 12)   # auxiliary outputs

    @staticmethod
    def _side_network(n_inputs, n_outputs):
        """Linear -> BatchNorm1d -> PReLU -> Dropout(0.1)."""
        return nn.Sequential(
            nn.Linear(n_inputs, n_outputs),
            nn.BatchNorm1d(n_outputs),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return `(logits, aux_logits)`; `logits` has its last dim squeezed."""
        token_states = self.roberta(ids, attention_mask=mask)[0]

        fused = torch.cat(
            [
                self.head(token_states),               # bs, in_features
                self.process_num(numerical_features),  # bs, 8
                self.process_tfidf(tfidf),             # bs, 32
            ],
            1,
        )  # bs, in_features + 8 + 32

        logits = self.l0(self.dropout(fused)).squeeze(-1)
        aux_logits = torch.sigmoid(self.l1(self.dropout(fused)))
        return logits, aux_logits

In [None]:
import nltk
import re
import scipy as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


class BM25Transformer(BaseEstimator, TransformerMixin):
    """Re-weight a term matrix with a BM25-style formula.

    Args:
        use_idf: multiply the result by an idf diagonal learned in `fit`.
        k1: saturation parameter used in the numerator and denominator.
        b: document-length normalisation strength.
    """

    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X, y=None):
        """Learn the idf diagonal (if enabled) and the average row sum of `X`.

        `y` is ignored; it is accepted so the transformer can also be fitted
        with targets inside a scikit-learn pipeline.
        """
        if not sp.sparse.issparse(X):
            X = sp.sparse.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.sparse.spdiags(idf, diags=0, m=n_features, n=n_features)

        doc_len = X.sum(axis=1)
        self._average_document_len = np.average(doc_len)

        return self

    def transform(self, X, copy=True):
        """Return the BM25-weighted CSR matrix for `X`.

        Raises:
            ValueError: if `use_idf` is set and `X` has a different number of
                columns than the matrix seen in `fit`.
        """
        # `np.float` (an alias of the builtin float, i.e. float64) was removed
        # in NumPy 1.24; np.float64 keeps the same behaviour on every version.
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float64):
            X = sp.sparse.csr_matrix(X, copy=copy)
        else:
            X = sp.sparse.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape
        doc_len = X.sum(axis=1)
        # Number of stored entries per row, used to repeat each row's length
        # once for every stored value of that row.
        sz = X.indptr[1:] - X.indptr[0:-1]
        rep = np.repeat(np.asarray(doc_len), sz)

        nom = self.k1 + 1
        denom = X.data + self.k1 * (1 - self.b + self.b * rep / self._average_document_len)
        data = X.data * nom / denom

        X = sp.sparse.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            # `msg` must be passed by keyword: it is keyword-only in current scikit-learn.
            check_is_fitted(self, '_idf_diag', msg='idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            X = X * self._idf_diag

        return X 


class TextPreprocessor(object):
    """Lower-case text, strip listed punctuation, drop stop words, remove digits."""

    def __init__(self):
        # Characters removed by clean_puncts.
        self.puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
                       '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
                       '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
                       '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
                       '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '（', '）', '～',
                       '➡', '％', '⇒', '▶', '「', '➄', '➆',  '➊', '➋', '➌', '➍', '⓪', '①', '②', '③', '④', '⑤', '⑰', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽',  
                       '＝', '※', '㈱', '､', '△', '℮', 'ⅼ', '‐', '｣', '┝', '↳', '◉', '／', '＋', '○',
                       '【', '】', '✅', '☑', '➤', 'ﾞ', '↳', '〶', '☛', '｢', '⁺', '『', '≫',
                       ]

        # ASCII and full-width digits removed by rm_num.
        self.numbers = ["0","1","2","3","4","5","6","7","8","9","０","１","２","３","４","５","６","７","８","９"]
        self.stopwords = nltk.corpus.stopwords.words('english')

    def _pre_preprocess(self, x):
        """Stringify and lower-case one value."""
        return str(x).lower() 

    def rm_num(self, x, use_num=True):
        """Remove digit runs and then every remaining listed digit (`use_num` is unused)."""
        for pattern in ('[0-9]{5,}', '[0-9]{4}', '[0-9]{3}', '[0-9]{2}'):
            x = re.sub(pattern, '', x)
        for digit in self.numbers:
            x = x.replace(str(digit), '')
        return x

    def clean_puncts(self, x):
        """Delete every character listed in `self.puncts`."""
        cleaned = x
        for symbol in self.puncts:
            cleaned = cleaned.replace(symbol, '')
        return cleaned
    
    def clean_stopwords(self, x):
        """Drop whitespace-separated tokens found in `self.stopwords`; rejoin with single spaces."""
        return ' '.join(word for word in x.split() if word not in self.stopwords)

    def preprocess(self, sentence):
        """Apply all cleaning steps, in order, to every element of a Series."""
        sentence = sentence.fillna(" ")
        steps = (self._pre_preprocess, self.clean_puncts, self.clean_stopwords, self.rm_num)
        for step in steps:
            sentence = sentence.map(step)
        return sentence


def get_sentence_features(train, col):
    """Add ten surface-statistics columns for text column `col` (in place).

    New columns are named `<col>_num_chars`, `<col>_num_capitals`,
    `<col>_caps_vs_length`, `<col>_num_exclamation_marks`,
    `<col>_num_question_marks`, `<col>_num_punctuation`, `<col>_num_symbols`,
    `<col>_num_words`, `<col>_num_unique_words` and `<col>_words_vs_unique`.

    Args:
        train: DataFrame holding the text column; it is modified and returned.
        col: name of the text column.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    text = train[col]
    train[col + '_num_chars'] = text.apply(len)
    train[col + '_num_capitals'] = text.apply(lambda x: sum(1 for c in x if c.isupper()))
    # Note the direction: characters per capital letter (epsilon avoids division by zero).
    # Vectorised column arithmetic replaces the former row-by-row DataFrame.apply.
    train[col + '_caps_vs_length'] = train[col + '_num_chars'] / (train[col + '_num_capitals'] + 1e-5)
    train[col + '_num_exclamation_marks'] = text.apply(lambda x: x.count('!'))
    train[col + '_num_question_marks'] = text.apply(lambda x: x.count('?'))
    train[col + '_num_punctuation'] = text.apply(lambda x: sum(x.count(w) for w in '.,;:'))
    train[col + '_num_symbols'] = text.apply(lambda x: sum(x.count(w) for w in '*&$%'))
    train[col + '_num_words'] = text.apply(lambda x: len(x.split()))
    train[col + '_num_unique_words'] = text.apply(lambda comment: len(set(comment.split())))
    # Unique-word ratio: unique words divided by total words.
    train[col + '_words_vs_unique'] = train[col + '_num_unique_words'] / train[col + '_num_words'] 
    return train


# Feature columns for the 'excerpt' text column, in the order they are fed to the model.
numerical_cols = [
    'excerpt_num_chars',
    'excerpt_num_capitals',
    'excerpt_caps_vs_length',
    'excerpt_num_exclamation_marks',
    'excerpt_num_question_marks',
    'excerpt_num_punctuation',
    'excerpt_num_symbols',
    'excerpt_num_words',
    'excerpt_num_unique_words',
    'excerpt_words_vs_unique',
]

In [None]:
def generate_predictions(pretrain_path, max_len):
    """Predict test targets with one roberta-base checkpoint plus hand-made features.

    Args:
        pretrain_path: path to a state dict saved for `RoBERTaLarge`.
        max_len: tokenizer max length (excerpts are padded/truncated to it).

    Returns:
        1-D numpy array with one prediction per row of test.csv (empty if the
        test file has no rows).
    """
    # Fall back to CPU instead of failing outright when no GPU is visible.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = '../input/roberta-base/'
    model = RoBERTaLarge(model_path)
    # map_location lets a checkpoint saved from a GPU load on whichever device is in use.
    model.load_state_dict(torch.load(pretrain_path, map_location=device))
    model.to(device)
    model.eval()
    tokenizer = RobertaTokenizer.from_pretrained(model_path)
    
    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')
    
    # The text pipeline is fitted on train and test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)
    
    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
                TfidfVectorizer(max_features=100000),
                make_union(
                    TruncatedSVD(n_components=50, random_state=42),
                    make_pipeline(
                        BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                        TruncatedSVD(n_components=50, random_state=42)
                    ),
                    n_jobs=1,
                ),
             )

    z = pipeline.fit_transform(preprocessed_text)
    # Rows of `z` follow the concat order (train first, test last) while the
    # Dataset looks rows up by test position. Keep only the trailing test rows
    # so test excerpt i is paired with its own features rather than with the
    # features of train row i.
    test_start = len(train) - len(df)
    tfidf_df = pd.DataFrame(z[test_start:], columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])
    
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    with torch.no_grad():
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, _ = model(inputs, masks, numerical_features, tfidf)
            final_output.extend(output.detach().cpu().numpy().tolist())
            # Drop per-batch tensors here so the cleanup below never touches
            # names that would be unbound when the loader yields no batches.
            del inputs, masks, numerical_features, tfidf, output

    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Run every fold checkpoint of the roberta-base model and average the predictions.
roberta_base_fold_weights = [
    "../input/kaerururu-commonlit-115/fold-0.bin",
    "../input/kaerururu-commonlit-115/fold-1.bin",
    "../input/kaerururu-commonlit-115/fold-2.bin",
    "../input/kaerururu-commonlit-115/fold-3.bin",
    "../input/kaerururu-commonlit-115/fold-4.bin",
]
roberta_base_fold_preds = [
    generate_predictions(weights_file, max_len=256)
    for weights_file in roberta_base_fold_weights
]

# Left-to-right sum of the five arrays, then divide by the fold count.
preds_roberta_base = sum(roberta_base_fold_preds[1:], roberta_base_fold_preds[0]) / 5

del roberta_base_fold_weights, roberta_base_fold_preds
gc.collect()

In [None]:
class RoBERTaLarge(nn.Module):
    """RoBERTa encoder (hidden width 1024) with attention pooling and side networks.

    The pooled text vector is concatenated with an 8-dim projection of 10
    numerical features and a 32-dim projection of 100 TF-IDF features.
    """

    def __init__(self, model_path):
        super().__init__()
        self.in_features = 1024
        self.roberta = RobertaModel.from_pretrained(model_path)
        self.head = AttentionHead(self.in_features, self.in_features, 1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._side_network(10, 8)
        self.process_tfidf = self._side_network(100, 32)
        fused_width = self.in_features + 8 + 32
        self.l0 = nn.Linear(fused_width, 1)   # main regression output
        self.l1 = nn.Linear(fused_width, 7)   # auxiliary outputs

    @staticmethod
    def _side_network(n_inputs, n_outputs):
        """Linear -> BatchNorm1d -> PReLU -> Dropout(0.1)."""
        return nn.Sequential(
            nn.Linear(n_inputs, n_outputs),
            nn.BatchNorm1d(n_outputs),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return `(logits, aux_logits)`; `logits` has its last dim squeezed."""
        token_states = self.roberta(ids, attention_mask=mask)[0]

        fused = torch.cat(
            [
                self.head(token_states),               # bs, 1024
                self.process_num(numerical_features),  # bs, 8
                self.process_tfidf(tfidf),             # bs, 32
            ],
            1,
        )  # bs, 1024 + 8 + 32

        logits = self.l0(self.dropout(fused)).squeeze(-1)
        aux_logits = torch.sigmoid(self.l1(self.dropout(fused)))
        return logits, aux_logits

In [None]:
def generate_predictions(pretrain_path, max_len):
    """Predict on ``test.csv`` with one fold checkpoint of the RoBERTa-large model.

    Builds ``RoBERTaLarge`` from ``../input/robertalarge/``, loads the fold weights, recomputes
    the sentence features and the TF-IDF/BM25 SVD features (fitted on train + test excerpts),
    and runs batched inference on the GPU.

    Args:
        pretrain_path: Path to the fold's saved ``state_dict``.
        max_len: Maximum sequence length forwarded to ``Dataset``.

    Returns:
        np.ndarray with the main-head outputs, in loader order (``shuffle=False``).
    """
    device = "cuda"
    model_path = '../input/robertalarge/'
    model = RoBERTaLarge(model_path)
    model.to(device)
    # Deserialize on the CPU: load_state_dict copies the values into the GPU-resident
    # parameters, so the checkpoint never takes up GPU memory a second time.
    model.load_state_dict(torch.load(pretrain_path, map_location="cpu"))
    model.eval()
    tokenizer = RobertaTokenizer.from_pretrained(model_path)

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')

    # The text pipeline is fitted on train + test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)

    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
                TfidfVectorizer(max_features=100000),
                make_union(
                    TruncatedSVD(n_components=50, random_state=42),
                    make_pipeline(
                        BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                        TruncatedSVD(n_components=50, random_state=42)
                    ),
                    n_jobs=1,
                ),
             )

    z = pipeline.fit_transform(preprocessed_text)
    tfidf_df = pd.DataFrame(z, columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])

    # NOTE(review): tfidf_df holds the train rows followed by the test rows, while `excerpt` and
    # `numerical_features` cover the test rows only. If Dataset looks rows up by position, test
    # row i is paired with train row i's TF-IDF vector -- confirm against Dataset before changing.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    # Pre-bind the per-batch names so the `del` below cannot raise UnboundLocalError on an empty loader.
    inputs = masks = numerical_features = tfidf = output = None
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show the number of batches.
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, _ = model(inputs, masks, numerical_features, tfidf)
            output = output.detach().cpu().numpy().tolist()
            final_output.extend(output)

    # Drop every reference before emptying the CUDA cache so the memory can actually be released.
    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    del inputs, masks, numerical_features, tfidf, output
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# https://huggingface.co/deepset/roberta-large-squad2
# Experiment 084: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-084/fold-0.bin",
    "../input/kaerururu-commonlit-084/fold-1.bin",
    "../input/kaerururu-commonlit-084/fold-2.bin",
    "../input/kaerururu-commonlit-084/fold-3.bin",
    "../input/kaerururu-commonlit-084/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds0462 = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# https://huggingface.co/phiyodr/roberta-large-finetuned-squad2
# Experiment 086: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-086/fold-0.bin",
    "../input/kaerururu-commonlit-086/fold-1.bin",
    "../input/kaerururu-commonlit-086/fold-2.bin",
    "../input/kaerururu-commonlit-086/fold-3.bin",
    "../input/kaerururu-commonlit-086/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds0461 = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# https://huggingface.co/tli8hf/unqover-roberta-large-newsqa
# Experiment 088: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-088/fold-0.bin",
    "../input/kaerururu-commonlit-088/fold-1.bin",
    "../input/kaerururu-commonlit-088/fold-2.bin",
    "../input/kaerururu-commonlit-088/fold-3.bin",
    "../input/kaerururu-commonlit-088/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds0463 = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# https://huggingface.co/phiyodr/roberta-large-finetuned-squad2, MSE ver
# Experiment 095: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-095/fold-0.bin",
    "../input/kaerururu-commonlit-095/fold-1.bin",
    "../input/kaerururu-commonlit-095/fold-2.bin",
    "../input/kaerururu-commonlit-095/fold-3.bin",
    "../input/kaerururu-commonlit-095/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds0459 = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# max_len 512, cv score 0.493933596032441 <-- 0.499383855892775 (076)

# Experiment 098, run with max_len=512: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-098/fold-0.bin",
    "../input/kaerururu-commonlit-098/fold-1.bin",
    "../input/kaerururu-commonlit-098/fold-2.bin",
    "../input/kaerururu-commonlit-098/fold-3.bin",
    "../input/kaerururu-commonlit-098/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=512) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds_max_len_512 = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# 0457

In [None]:
class RoBERTaLarge(nn.Module):
    """RoBERTa backbone fused with numerical and TF-IDF side features (12-unit auxiliary head).

    The backbone's first output is reduced by ``AttentionHead`` (defined elsewhere in the
    notebook; expected to yield one 1024-d vector per example). Ten numerical features and a
    100-d TF-IDF vector each go through a Linear/BatchNorm1d/PReLU/Dropout stack (8 and 32
    outputs). The concatenation feeds a 1-unit head ``l0`` and a 12-unit sigmoid head ``l1``.
    """

    @staticmethod
    def _feature_branch(n_in, n_out):
        """Build the Linear -> BatchNorm1d -> PReLU -> Dropout(0.1) stack for one side input."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the heads (attribute names are state_dict keys)."""
        super().__init__()
        hidden = 1024
        self.in_features = hidden
        self.roberta = RobertaModel.from_pretrained(model_path)
        self.head = AttentionHead(hidden, hidden, 1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._feature_branch(10, 8)
        self.process_tfidf = self._feature_branch(100, 32)

        fused_dim = hidden + 8 + 32
        self.l0 = nn.Linear(fused_dim, 1)
        self.l1 = nn.Linear(fused_dim, 12)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(main_output, aux_output)``.

        ``main_output`` is the ``l0`` result with its trailing unit axis squeezed away;
        ``aux_output`` is ``sigmoid(l1(...))`` with 12 columns.
        """
        token_states = self.roberta(ids, attention_mask=mask)[0]

        fused = torch.cat(
            (
                self.head(token_states),                 # text branch
                self.process_num(numerical_features),    # bs, 8
                self.process_tfidf(tfidf),               # bs, 32
            ),
            dim=1,
        )

        # Dropout is applied separately for each head, main head first.
        main_out = self.l0(self.dropout(fused))
        aux_out = torch.sigmoid(self.l1(self.dropout(fused)))
        return main_out.squeeze(-1), aux_out

In [None]:
# Experiment 110: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-110/fold-0.bin",
    "../input/kaerururu-commonlit-110/fold-1.bin",
    "../input/kaerururu-commonlit-110/fold-2.bin",
    "../input/kaerururu-commonlit-110/fold-3.bin",
    "../input/kaerururu-commonlit-110/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds0457 = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# add conv

In [None]:
class RoBERTaLarge(nn.Module):
    """RoBERTa backbone with an additional LSTM -> GRU -> Conv1d branch.

    Besides the ``AttentionHead`` text vector and the numerical / TF-IDF side branches, the
    backbone's first output is passed through spatial dropout, a bidirectional LSTM, a
    bidirectional GRU and a Conv1d; the convolution output is average- and max-pooled and both
    pooled vectors (128 values each) are appended to the fused feature vector.
    """

    @staticmethod
    def _feature_branch(n_in, n_out):
        """Build the Linear -> BatchNorm1d -> PReLU -> Dropout(0.1) stack for one side input."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create every layer (attribute names are state_dict keys)."""
        super().__init__()
        self.in_features = 1024
        self.roberta = RobertaModel.from_pretrained(model_path)

        rnn_hidden = 128 * 2       # shared by the LSTM and the GRU
        conv_channels = 64 * 2
        self.embedding_dropout = nn.Dropout2d(0.2)
        # input_size=256: forward() hands the LSTM the *transposed* backbone output, so the axis
        # that is 256 long in the original shape comment ("bs, 1024, 256") is the feature axis.
        self.lstm = nn.LSTM(256, rnn_hidden, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(rnn_hidden * 2, rnn_hidden, bidirectional=True, batch_first=True)
        self.conv = nn.Conv1d(rnn_hidden * 2, conv_channels, 3, padding=2)
        nn.init.xavier_uniform_(self.conv.weight)

        self.head = AttentionHead(self.in_features, self.in_features, 1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._feature_branch(10, 8)
        self.process_tfidf = self._feature_branch(100, 32)

        fused_dim = self.in_features + 8 + 32 + 2 * conv_channels
        self.l0 = nn.Linear(fused_dim, 1)
        self.l1 = nn.Linear(fused_dim, 7)

    def apply_spatial_dropout(self, h_embedding):
        """Run ``Dropout2d`` over a 3-D tensor by swapping axes 1/2 and adding a dummy axis, then undo both."""
        channels_first = h_embedding.transpose(1, 2).unsqueeze(2)
        dropped = self.embedding_dropout(channels_first)
        return dropped.squeeze(2).transpose(1, 2)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(main_output, aux_output)``; the auxiliary output has 7 sigmoid columns."""
        token_states = self.roberta(ids, attention_mask=mask)[0]

        # Recurrent / convolutional branch (evaluated before the attention head).
        rnn_input = self.apply_spatial_dropout(token_states).transpose(2, 1)
        lstm_out, _ = self.lstm(rnn_input)
        gru_out, _ = self.gru(lstm_out)
        # kernel 3 with padding 2 makes the conv output two steps longer than its input.
        conv_out = self.conv(gru_out.transpose(2, 1))
        avg_pooled = conv_out.mean(2)        # bs, 128
        max_pooled = conv_out.max(2)[0]      # bs, 128

        text_vec = self.head(token_states)
        num_vec = self.process_num(numerical_features)    # bs, 8
        tfidf_vec = self.process_tfidf(tfidf)             # bs, 32

        fused = torch.cat((text_vec, num_vec, tfidf_vec, avg_pooled, max_pooled), dim=1)

        # Dropout is applied separately for each head, main head first.
        main_out = self.l0(self.dropout(fused))
        aux_out = torch.sigmoid(self.l1(self.dropout(fused)))
        return main_out.squeeze(-1), aux_out

In [None]:
# Experiment 074 (conv variant): one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-074/fold-0.bin",
    "../input/kaerururu-commonlit-074/fold-1.bin",
    "../input/kaerururu-commonlit-074/fold-2.bin",
    "../input/kaerururu-commonlit-074/fold-3.bin",
    "../input/kaerururu-commonlit-074/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds_conv_roberta = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# electra-large

In [None]:
class RoBERTaLarge(nn.Module):
    """``AutoModel`` backbone (electra-large section) fused with numerical and TF-IDF features.

    Same layout as the RoBERTa variants, but ``forward`` returns the ``AttentionHead`` text
    vector as its second value instead of the auxiliary head output.
    """

    @staticmethod
    def _feature_branch(n_in, n_out):
        """Build the Linear -> BatchNorm1d -> PReLU -> Dropout(0.1) stack for one side input."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the heads (attribute names are state_dict keys)."""
        super().__init__()
        hidden = 1024
        self.in_features = hidden
        self.roberta = AutoModel.from_pretrained(model_path)
        self.head = AttentionHead(hidden, hidden, 1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._feature_branch(10, 8)
        self.process_tfidf = self._feature_branch(100, 32)

        fused_dim = hidden + 8 + 32
        self.l0 = nn.Linear(fused_dim, 1)
        self.l1 = nn.Linear(fused_dim, 12)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(main_output, text_vector)``.

        ``main_output`` is the ``l0`` result with its trailing unit axis squeezed away;
        ``text_vector`` is whatever ``AttentionHead`` produced for the batch.
        """
        token_states = self.roberta(ids, attention_mask=mask)[0]

        text_vec = self.head(token_states)
        num_vec = self.process_num(numerical_features)    # bs, 8
        tfidf_vec = self.process_tfidf(tfidf)             # bs, 32
        fused = torch.cat((text_vec, num_vec, tfidf_vec), dim=1)

        main_out = self.l0(self.dropout(fused))
        # The auxiliary head is still evaluated (as in training) but its output is not returned.
        aux_out = torch.sigmoid(self.l1(self.dropout(fused)))
        return main_out.squeeze(-1), text_vec

In [None]:
def generate_predictions(pretrain_path, max_len):
    """Predict on ``test.csv`` with one fold checkpoint of the ELECTRA-large model.

    Builds ``RoBERTaLarge`` from ``../input/electra/large-discriminator/``, loads the fold
    weights, recomputes the sentence features and the TF-IDF/BM25 SVD features (fitted on
    train + test excerpts), and runs batched inference on the GPU.

    Args:
        pretrain_path: Path to the fold's saved ``state_dict``.
        max_len: Maximum sequence length forwarded to ``Dataset``.

    Returns:
        Tuple ``(predictions, embeddings)``: the main-head outputs as a 1-D np.ndarray and the
        model's second output for every batch concatenated along axis 0, both in loader order.
    """
    device = "cuda"
    model_path = '../input/electra/large-discriminator/'
    model = RoBERTaLarge(model_path)
    model.to(device)
    # Deserialize on the CPU: load_state_dict copies the values into the GPU-resident
    # parameters, so the checkpoint never takes up GPU memory a second time.
    model.load_state_dict(torch.load(pretrain_path, map_location="cpu"))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')

    # The text pipeline is fitted on train + test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)

    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
                TfidfVectorizer(max_features=100000),
                make_union(
                    TruncatedSVD(n_components=50, random_state=42),
                    make_pipeline(
                        BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                        TruncatedSVD(n_components=50, random_state=42)
                    ),
                    n_jobs=1,
                ),
             )

    z = pipeline.fit_transform(preprocessed_text)
    tfidf_df = pd.DataFrame(z, columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])

    # NOTE(review): tfidf_df holds the train rows followed by the test rows, while `excerpt` and
    # `numerical_features` cover the test rows only. If Dataset looks rows up by position, test
    # row i is paired with train row i's TF-IDF vector -- confirm against Dataset before changing.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    emb = []
    # Pre-bind the per-batch names so the `del` below cannot raise UnboundLocalError on an empty loader.
    inputs = masks = numerical_features = tfidf = output = emb_out = None
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show the number of batches.
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, emb_out = model(inputs, masks, numerical_features, tfidf)
            output = output.detach().cpu().numpy().tolist()
            emb_out = emb_out.detach().cpu().numpy()
            final_output.extend(output)
            emb.append(emb_out)

    # Drop every reference before emptying the CUDA cache so the memory can actually be released.
    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    del inputs, masks, numerical_features, tfidf, output, emb_out
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output), np.concatenate(emb)

In [None]:
# Experiment 129 (electra-large): one checkpoint per fold, each yielding (predictions, embeddings).
fold_checkpoints = (
    "../input/kaeruru-commonlit-129/fold-0.bin",
    "../input/kaeruru-commonlit-129/fold-1.bin",
    "../input/kaeruru-commonlit-129/fold-2.bin",
    "../input/kaeruru-commonlit-129/fold-3.bin",
    "../input/kaeruru-commonlit-129/fold-4.bin",
)
fold_results = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate both outputs left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
pred_total, emb_total = fold_results[0]
for fold_pred, fold_emb in fold_results[1:]:
    pred_total = pred_total + fold_pred
    emb_total = emb_total + fold_emb
preds_electra_large = pred_total / 5
emb_preds_electra_large = emb_total / 5

del fold_checkpoints, fold_results, pred_total, fold_pred; gc.collect()
del emb_total, fold_emb; gc.collect()

In [None]:
# distil-roberta-base

In [None]:
class RoBERTaLarge(nn.Module):
    """``AutoModel`` backbone (distil-roberta-base section) with a 768-d text branch.

    The backbone's first output is reduced by ``AttentionHead`` (defined elsewhere in the
    notebook). Ten numerical features and a 100-d TF-IDF vector each go through a
    Linear/BatchNorm1d/PReLU/Dropout stack. ``l0`` has two outputs, of which only column 0 is
    returned as the prediction; ``l1`` is a 12-unit auxiliary head.
    """

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the heads (attribute names are state_dict keys)."""
        super(RoBERTaLarge, self).__init__()
        self.in_features = 768 # 1024
        self.roberta = AutoModel.from_pretrained(model_path)
        self.head = AttentionHead(self.in_features,self.in_features,1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = nn.Sequential(
            nn.Linear(10, 8),
            nn.BatchNorm1d(8),
            nn.PReLU(),
            nn.Dropout(0.1),
        )
        self.process_tfidf = nn.Sequential(
            nn.Linear(100, 32),
            nn.BatchNorm1d(32),
            nn.PReLU(),
            nn.Dropout(0.1),
        )
        self.l0 = nn.Linear(self.in_features + 8 + 32, 2)
        self.l1 = nn.Linear(self.in_features + 8 + 32, 12)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(prediction, text_vector)``.

        ``prediction`` is column 0 of ``l0`` and always has shape ``(bs,)``, including for a
        single-row batch. ``text_vector`` is the ``AttentionHead`` output.
        """
        roberta_outputs = self.roberta(
            ids,
            attention_mask=mask
        )

        x1 = self.head(roberta_outputs[0]) # text branch (in_features = 768)

        x2 = self.process_num(numerical_features) # bs, 8

        x3 = self.process_tfidf(tfidf) # bs, 32

        x = torch.cat([x1, x2, x3], 1) # bs, 768 + 8 + 32

        logits = self.l0(self.dropout(x)) # bs, 2
        # The auxiliary head is still evaluated (as in training) but its output is not returned.
        aux_logits = torch.sigmoid(self.l1(self.dropout(x)))

        # logits[:, 0] is already 1-D. A further squeeze(-1) would turn a single-row batch into
        # a 0-d tensor, which the caller cannot extend a list with.
        return logits[:, 0], x1
    
    
def generate_predictions(pretrain_path, max_len):
    """Predict on ``test.csv`` with one fold checkpoint of the distil-roberta-base model.

    Builds ``RoBERTaLarge`` from ``../input/distil-roberta-base/``, loads the fold weights,
    recomputes the sentence features and the TF-IDF/BM25 SVD features (fitted on train + test
    excerpts), and runs batched inference on the GPU.

    Args:
        pretrain_path: Path to the fold's saved ``state_dict``.
        max_len: Maximum sequence length forwarded to ``Dataset``.

    Returns:
        Tuple ``(predictions, embeddings)``: the main-head outputs as a 1-D np.ndarray and the
        model's second output for every batch concatenated along axis 0, both in loader order.
    """
    device = "cuda"
    model_path = '../input/distil-roberta-base/'
    model = RoBERTaLarge(model_path)
    model.to(device)
    # Deserialize on the CPU: load_state_dict copies the values into the GPU-resident
    # parameters, so the checkpoint never takes up GPU memory a second time.
    model.load_state_dict(torch.load(pretrain_path, map_location="cpu"))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')

    # The text pipeline is fitted on train + test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)

    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
                TfidfVectorizer(max_features=100000),
                make_union(
                    TruncatedSVD(n_components=50, random_state=42),
                    make_pipeline(
                        BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                        TruncatedSVD(n_components=50, random_state=42)
                    ),
                    n_jobs=1,
                ),
             )

    z = pipeline.fit_transform(preprocessed_text)
    tfidf_df = pd.DataFrame(z, columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])

    # NOTE(review): tfidf_df holds the train rows followed by the test rows, while `excerpt` and
    # `numerical_features` cover the test rows only. If Dataset looks rows up by position, test
    # row i is paired with train row i's TF-IDF vector -- confirm against Dataset before changing.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    emb = []
    # Pre-bind the per-batch names so the `del` below cannot raise UnboundLocalError on an empty loader.
    inputs = masks = numerical_features = tfidf = output = emb_out = None
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show the number of batches.
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, emb_out = model(inputs, masks, numerical_features, tfidf)
            # np.atleast_1d guards against a 0-d result for a single-row final batch, which
            # would otherwise become a bare float that list.extend() cannot iterate.
            output = np.atleast_1d(output.detach().cpu().numpy()).tolist()
            emb_out = emb_out.detach().cpu().numpy()
            final_output.extend(output)
            emb.append(emb_out)

    # Drop every reference before emptying the CUDA cache so the memory can actually be released.
    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    del inputs, masks, numerical_features, tfidf, output, emb_out
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output), np.concatenate(emb)

In [None]:
# Experiment 125 (distil-roberta-base): one checkpoint per fold, each yielding (predictions, embeddings).
fold_checkpoints = (
    "../input/kaerururu-commonlit-125/fold-0.bin",
    "../input/kaerururu-commonlit-125/fold-1.bin",
    "../input/kaerururu-commonlit-125/fold-2.bin",
    "../input/kaerururu-commonlit-125/fold-3.bin",
    "../input/kaerururu-commonlit-125/fold-4.bin",
)
fold_results = [generate_predictions(path, max_len=248) for path in fold_checkpoints]

# Accumulate both outputs left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
pred_total, emb_total = fold_results[0]
for fold_pred, fold_emb in fold_results[1:]:
    pred_total = pred_total + fold_pred
    emb_total = emb_total + fold_emb
preds_distil_roberta_base = pred_total / 5
emb_preds_distil_roberta_base = emb_total / 5

del fold_checkpoints, fold_results, pred_total, fold_pred; gc.collect()
del emb_total, fold_emb; gc.collect()

print(preds_distil_roberta_base.shape, emb_preds_distil_roberta_base.shape)

In [None]:
# meanpooling


class RoBERTaLarge(nn.Module):
    """RoBERTa backbone with mask-weighted mean pooling instead of an attention head.

    The backbone's first output is averaged over axis 1 using the attention mask as weights,
    passed through ``LayerNorm``, and concatenated with the numerical (8-d) and TF-IDF (32-d)
    branches. ``l0`` is the 1-unit main head, ``l1`` a 12-unit sigmoid auxiliary head.
    """

    @staticmethod
    def _feature_branch(n_in, n_out):
        """Build the Linear -> BatchNorm1d -> PReLU -> Dropout(0.1) stack for one side input."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the heads (attribute names are state_dict keys)."""
        super().__init__()
        hidden = 1024
        self.in_features = hidden
        self.roberta = RobertaModel.from_pretrained(model_path)
        self.layer_norm = nn.LayerNorm(hidden)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._feature_branch(10, 8)
        self.process_tfidf = self._feature_branch(100, 32)

        fused_dim = hidden + 8 + 32
        self.l0 = nn.Linear(fused_dim, 1)
        self.l1 = nn.Linear(fused_dim, 12)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(main_output, aux_output)``; the auxiliary output has 12 sigmoid columns."""
        token_states = self.roberta(ids, attention_mask=mask)[0]

        # Broadcast the mask over the hidden axis so masked-out positions contribute zero.
        weights = mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_total = (token_states * weights).sum(1)
        # Clamp the divisor so an all-zero mask cannot divide by zero.
        divisor = weights.sum(1).clamp(min=1e-9)
        pooled = self.layer_norm(masked_total / divisor)  # bs, 1024

        num_vec = self.process_num(numerical_features)    # bs, 8
        tfidf_vec = self.process_tfidf(tfidf)             # bs, 32
        fused = torch.cat((pooled, num_vec, tfidf_vec), dim=1)

        # Dropout is applied separately for each head, main head first.
        main_out = self.l0(self.dropout(fused))
        aux_out = torch.sigmoid(self.l1(self.dropout(fused)))
        return main_out.squeeze(-1), aux_out

    
def generate_predictions(pretrain_path, max_len):
    """Predict on ``test.csv`` with one fold checkpoint of the mean-pooling RoBERTa-large model.

    Builds ``RoBERTaLarge`` from ``../input/robertalarge/``, loads the fold weights, recomputes
    the sentence features and the TF-IDF/BM25 SVD features (fitted on train + test excerpts),
    and runs batched inference on the GPU.

    Args:
        pretrain_path: Path to the fold's saved ``state_dict``.
        max_len: Maximum sequence length forwarded to ``Dataset``.

    Returns:
        np.ndarray with the main-head outputs, in loader order (``shuffle=False``).
    """
    device = "cuda"
    model_path = '../input/robertalarge/'
    model = RoBERTaLarge(model_path)
    model.to(device)
    # Deserialize on the CPU: load_state_dict copies the values into the GPU-resident
    # parameters, so the checkpoint never takes up GPU memory a second time.
    model.load_state_dict(torch.load(pretrain_path, map_location="cpu"))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')

    # The text pipeline is fitted on train + test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)

    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
                TfidfVectorizer(max_features=100000),
                make_union(
                    TruncatedSVD(n_components=50, random_state=42),
                    make_pipeline(
                        BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                        TruncatedSVD(n_components=50, random_state=42)
                    ),
                    n_jobs=1,
                ),
             )

    z = pipeline.fit_transform(preprocessed_text)
    tfidf_df = pd.DataFrame(z, columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])

    # NOTE(review): tfidf_df holds the train rows followed by the test rows, while `excerpt` and
    # `numerical_features` cover the test rows only. If Dataset looks rows up by position, test
    # row i is paired with train row i's TF-IDF vector -- confirm against Dataset before changing.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    # Pre-bind the per-batch names so the `del` below cannot raise UnboundLocalError on an empty loader.
    inputs = masks = numerical_features = tfidf = output = None
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show the number of batches.
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, _ = model(inputs, masks, numerical_features, tfidf)
            output = output.detach().cpu().numpy().tolist()
            final_output.extend(output)

    # Drop every reference before emptying the CUDA cache so the memory can actually be released.
    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    del inputs, masks, numerical_features, tfidf, output
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output)


# Experiment 124 (mean-pooling variant): one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-124/fold-0.bin",
    "../input/kaerururu-commonlit-124/fold-1.bin",
    "../input/kaerururu-commonlit-124/fold-2.bin",
    "../input/kaerururu-commonlit-124/fold-3.bin",
    "../input/kaerururu-commonlit-124/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=256) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds_roberta_large_meanpooling = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# xlnet-base-cased

class RoBERTaLarge(nn.Module):
    """``AutoModel`` backbone (xlnet-base-cased section) with a 768-d text branch.

    The backbone's first output is reduced by ``AttentionHead`` (defined elsewhere in the
    notebook). Ten numerical features and a 100-d TF-IDF vector each go through a
    Linear/BatchNorm1d/PReLU/Dropout stack. ``l0`` has two outputs, of which only column 0 is
    returned as the prediction; ``l1`` is a 12-unit sigmoid auxiliary head.
    """

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the heads (attribute names are state_dict keys)."""
        super(RoBERTaLarge, self).__init__()
        self.in_features = 768 # 1024
        self.roberta = AutoModel.from_pretrained(model_path)
        self.head = AttentionHead(self.in_features,self.in_features,1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = nn.Sequential(
            nn.Linear(10, 8),
            nn.BatchNorm1d(8),
            nn.PReLU(),
            nn.Dropout(0.1),
        )
        self.process_tfidf = nn.Sequential(
            nn.Linear(100, 32),
            nn.BatchNorm1d(32),
            nn.PReLU(),
            nn.Dropout(0.1),
        )
        self.l0 = nn.Linear(self.in_features + 8 + 32, 2)
        self.l1 = nn.Linear(self.in_features + 8 + 32, 12)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(prediction, aux_output)``.

        ``prediction`` is column 0 of ``l0`` and always has shape ``(bs,)``, including for a
        single-row batch. ``aux_output`` is ``sigmoid(l1(...))`` with 12 columns.
        """
        roberta_outputs = self.roberta(
            ids,
            attention_mask=mask
        )

        x1 = self.head(roberta_outputs[0]) # text branch (in_features = 768)

        x2 = self.process_num(numerical_features) # bs, 8

        x3 = self.process_tfidf(tfidf) # bs, 32

        x = torch.cat([x1, x2, x3], 1) # bs, 768 + 8 + 32

        logits = self.l0(self.dropout(x)) # bs, 2
        aux_logits = torch.sigmoid(self.l1(self.dropout(x)))

        # logits[:, 0] is already 1-D. A further squeeze(-1) would turn a single-row batch into
        # a 0-d tensor, which the caller cannot extend a list with.
        return logits[:, 0], aux_logits

In [None]:
def generate_predictions(pretrain_path, max_len):
    """Predict on ``test.csv`` with one fold checkpoint of the xlnet-base-cased model.

    Builds ``RoBERTaLarge`` from ``../input/xlnetbasecased/``, loads the fold weights,
    recomputes the sentence features and the TF-IDF/BM25 SVD features (fitted on train + test
    excerpts), and runs batched inference on the GPU.

    Args:
        pretrain_path: Path to the fold's saved ``state_dict``.
        max_len: Maximum sequence length forwarded to ``Dataset``.

    Returns:
        np.ndarray with the main-head outputs, in loader order (``shuffle=False``).
    """
    device = "cuda"
    model_path = '../input/xlnetbasecased/'
    model = RoBERTaLarge(model_path)
    model.to(device)
    # Deserialize on the CPU: load_state_dict copies the values into the GPU-resident
    # parameters, so the checkpoint never takes up GPU memory a second time.
    model.load_state_dict(torch.load(pretrain_path, map_location="cpu"))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')

    # The text pipeline is fitted on train + test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)

    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
                TfidfVectorizer(max_features=100000),
                make_union(
                    TruncatedSVD(n_components=50, random_state=42),
                    make_pipeline(
                        BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                        TruncatedSVD(n_components=50, random_state=42)
                    ),
                    n_jobs=1,
                ),
             )

    z = pipeline.fit_transform(preprocessed_text)
    tfidf_df = pd.DataFrame(z, columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])

    # NOTE(review): tfidf_df holds the train rows followed by the test rows, while `excerpt` and
    # `numerical_features` cover the test rows only. If Dataset looks rows up by position, test
    # row i is paired with train row i's TF-IDF vector -- confirm against Dataset before changing.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=True, shuffle=False
    )

    final_output = []
    # Pre-bind the per-batch names so the `del` below cannot raise UnboundLocalError on an empty loader.
    inputs = masks = numerical_features = tfidf = output = None
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show the number of batches.
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, _ = model(inputs, masks, numerical_features, tfidf)
            # np.atleast_1d guards against a 0-d result for a single-row final batch, which
            # would otherwise become a bare float that list.extend() cannot iterate.
            output = np.atleast_1d(output.detach().cpu().numpy()).tolist()
            final_output.extend(output)

    # Drop every reference before emptying the CUDA cache so the memory can actually be released.
    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    del inputs, masks, numerical_features, tfidf, output
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output)


# Experiment 126 (xlnet-base-cased), run with max_len=275: one checkpoint per fold.
fold_checkpoints = (
    "../input/kaerururu-commonlit-126/fold-0.bin",
    "../input/kaerururu-commonlit-126/fold-1.bin",
    "../input/kaerururu-commonlit-126/fold-2.bin",
    "../input/kaerururu-commonlit-126/fold-3.bin",
    "../input/kaerururu-commonlit-126/fold-4.bin",
)
fold_preds = [generate_predictions(path, max_len=275) for path in fold_checkpoints]

# Accumulate left to right, i.e. ((((f0 + f1) + f2) + f3) + f4) / 5.
fold_total = fold_preds[0]
for later_fold in fold_preds[1:]:
    fold_total = fold_total + later_fold
preds_xlnet_base_cased = fold_total / 5

del fold_checkpoints, fold_preds, fold_total, later_fold; gc.collect()

In [None]:
# bart-large

In [None]:
class BARTLarge(nn.Module):
    """``AutoModel`` backbone (bart-large section) fused with numerical and TF-IDF features.

    The backbone is stored under the attribute name ``roberta`` (a state_dict key). Its first
    output is reduced by ``AttentionHead`` (defined elsewhere in the notebook; expected to
    yield one 1024-d vector per example), concatenated with the 8-d numerical and 32-d TF-IDF
    branches, and fed to a 1-unit head ``l0`` and a 12-unit sigmoid head ``l1``.
    """

    @staticmethod
    def _feature_branch(n_in, n_out):
        """Build the Linear -> BatchNorm1d -> PReLU -> Dropout(0.1) stack for one side input."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.PReLU(),
            nn.Dropout(0.1),
        )

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the heads (attribute names are state_dict keys)."""
        super().__init__()
        hidden = 1024
        self.in_features = hidden
        self.roberta = AutoModel.from_pretrained(model_path)
        self.head = AttentionHead(hidden, hidden, 1)
        self.dropout = nn.Dropout(0.1)
        self.process_num = self._feature_branch(10, 8)
        self.process_tfidf = self._feature_branch(100, 32)

        fused_dim = hidden + 8 + 32
        self.l0 = nn.Linear(fused_dim, 1)
        self.l1 = nn.Linear(fused_dim, 12)

    def forward(self, ids, mask, numerical_features, tfidf):
        """Return ``(main_output, aux_output)``.

        ``main_output`` is the ``l0`` result with its trailing unit axis squeezed away;
        ``aux_output`` is ``sigmoid(l1(...))`` with 12 columns.
        """
        token_states = self.roberta(ids, attention_mask=mask)[0]

        fused = torch.cat(
            (
                self.head(token_states),                 # text branch
                self.process_num(numerical_features),    # bs, 8
                self.process_tfidf(tfidf),               # bs, 32
            ),
            dim=1,
        )

        # Dropout is applied separately for each head, main head first.
        main_out = self.l0(self.dropout(fused))
        aux_out = torch.sigmoid(self.l1(self.dropout(fused)))
        return main_out.squeeze(-1), aux_out

In [None]:
def generate_predictions(pretrain_path, max_len):
    """Score the competition test set with one fine-tuned ``BARTLarge`` checkpoint.

    Args:
        pretrain_path: Path to the fold's saved ``state_dict``.
        max_len: Maximum sequence length forwarded to ``Dataset``.

    Returns:
        ``np.ndarray`` with the model's main-head output for every test row,
        in ``test.csv`` order (the loader does not shuffle).

    Relies on ``BARTLarge``, ``Dataset``, ``get_sentence_features``,
    ``TextPreprocessor``, ``BM25Transformer``, ``numerical_cols`` and the
    sklearn pipeline helpers being defined/imported earlier in the notebook.
    """
    # Use the GPU when present, otherwise fall back to CPU instead of failing.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = '../input/bart-models-hugging-face-model-repository/bart-large/'
    model = BARTLarge(model_path)
    model.to(device)
    # map_location keeps loading working when the checkpoint was saved from a
    # different device than the one used here.
    model.load_state_dict(torch.load(pretrain_path, map_location=device))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained('../input/bartbase/')

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    df = get_sentence_features(df, 'excerpt')

    # The TF-IDF / BM25 + SVD features are fitted on train and test excerpts together.
    train = pd.read_csv("../input/step-1-create-folds/train_folds.csv")
    train = pd.concat([train, df]).reset_index(drop=True)

    TP = TextPreprocessor()
    preprocessed_text = TP.preprocess(train['excerpt'])

    pipeline = make_pipeline(
        TfidfVectorizer(max_features=100000),
        make_union(
            TruncatedSVD(n_components=50, random_state=42),
            make_pipeline(
                BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                TruncatedSVD(n_components=50, random_state=42)
            ),
            n_jobs=1,
        ),
    )

    z = pipeline.fit_transform(preprocessed_text)
    tfidf_df = pd.DataFrame(z, columns=[f'cleaned_excerpt_tf_idf_svd_{i}' for i in range(50*2)])

    # NOTE(review): tfidf_df holds one row per train+test excerpt (train rows
    # first), whereas `excerpt` and `numerical_features` cover test rows only.
    # If Dataset indexes `tfidf` positionally, test samples would receive the
    # features of train rows -- confirm against Dataset.__getitem__.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len, numerical_features=df[numerical_cols].values, tfidf=tfidf_df)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=0, pin_memory=(device == "cuda"), shuffle=False
    )

    final_output = []
    # Pre-bind the per-batch names so the cleanup below cannot raise
    # UnboundLocalError when the loader yields no batch.
    inputs = masks = numerical_features = tfidf = output = None
    with torch.no_grad():
        for data in tqdm(data_loader):
            inputs = data['input_ids'].to(device)
            masks = data['attention_mask'].to(device)
            numerical_features = data['numerical_features'].to(device)
            tfidf = data['tfidf'].to(device)
            output, _ = model(inputs, masks, numerical_features, tfidf)
            output = output.detach().cpu().numpy().tolist()
            final_output.extend(output)

    # Drop every reference to model/tensors before emptying the CUDA cache so
    # the memory can be reused by the next fold.
    del model, tokenizer, train, df, preprocessed_text, pipeline, z, tfidf_df, dataset, data_loader
    del inputs, masks, numerical_features, tfidf, output
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Score the test set with every fold checkpoint of the kaerururu-commonlit-121
# run (max_len=256) and average the five prediction vectors.
bart_fold_preds = [
    generate_predictions(checkpoint, max_len=256)
    for checkpoint in (
        "../input/kaerururu-commonlit-121/fold-0.bin",
        "../input/kaerururu-commonlit-121/fold-1.bin",
        "../input/kaerururu-commonlit-121/fold-2.bin",
        "../input/kaerururu-commonlit-121/fold-3.bin",
        "../input/kaerururu-commonlit-121/fold-4.bin",
    )
]

# sum(rest, first) adds left to right: (((p0 + p1) + p2) + p3) + p4
preds_bart_large = sum(bart_fold_preds[1:], bart_fold_preds[0]) / 5

del bart_fold_preds; gc.collect()

In [None]:
# Average the five per-fold train embeddings of the CLR RoBERTa models.
clr_train_folds = [
    np.load(npy_path)
    for npy_path in (
        '../input/commonlit-2nd-stacking-4/train_embeddings1_clr_roberta_model0.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings2_clr_roberta_model1.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings3_clr_roberta_model2.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings4_clr_roberta_model3.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings5_clr_roberta_model4.npy',
    )
]
train_embeddings_clr_roberta = sum(clr_train_folds[1:], clr_train_folds[0]) / 5
del clr_train_folds
gc.collect()

# Same for the Roberta_Base_123 models.
roberta_base_train_folds = [
    np.load(npy_path)
    for npy_path in (
        '../input/commonlit-2nd-stacking-4/train_embeddings1_Roberta_Base_123_1_h5.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings2_Roberta_Base_123_2_h5.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings3_Roberta_Base_123_3_h5.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings4_Roberta_Base_123_4_h5.npy',
        '../input/commonlit-2nd-stacking-4/train_embeddings5_Roberta_Base_123_5_h5.npy',
    )
]
train_embeddings_Roberta_Base_123 = sum(roberta_base_train_folds[1:], roberta_base_train_folds[0]) / 5
del roberta_base_train_folds
gc.collect()

In [None]:
# Average the five per-fold test embeddings of the CLR RoBERTa models
# (the .npy files are read from the current working directory).
clr_test_folds = [
    np.load(npy_path)
    for npy_path in (
        'test_embeddings1_clr_roberta_model0.npy',
        'test_embeddings2_clr_roberta_model1.npy',
        'test_embeddings3_clr_roberta_model2.npy',
        'test_embeddings4_clr_roberta_model3.npy',
        'test_embeddings5_clr_roberta_model4.npy',
    )
]
test_embeddings_clr_roberta = sum(clr_test_folds[1:], clr_test_folds[0]) / 5
del clr_test_folds
gc.collect()

# Same for the Roberta_Base_123 models.
roberta_base_test_folds = [
    np.load(npy_path)
    for npy_path in (
        'test_embeddings1_Roberta_Base_123_1_h5.npy',
        'test_embeddings2_Roberta_Base_123_2_h5.npy',
        'test_embeddings3_Roberta_Base_123_3_h5.npy',
        'test_embeddings4_Roberta_Base_123_4_h5.npy',
        'test_embeddings5_Roberta_Base_123_5_h5.npy',
    )
]
test_embeddings_Roberta_Base_123 = sum(roberta_base_test_folds[1:], roberta_base_test_folds[0]) / 5
del roberta_base_test_folds
gc.collect()

In [None]:
# Out-of-fold predictions of the first-stage models, one csv per model.
oofs_gpt2 = pd.read_csv('../input/commonlit-2nd-stacking-1/oofs_gpt2.csv')
oofs_roberta_base = pd.read_csv('../input/commonlit-2nd-stacking-1/oofs_roberta_base.csv')
oofs_bart_large = pd.read_csv('../input/commonlit-2nd-stacking-1/oofs_bart_large.csv')
oofs0457 = pd.read_csv('../input/commonlit-2nd-stacking-3/oofs0457.csv')
oofs0462 = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof/oofs0462.csv')
oofs0461 = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof/oofs0461.csv')
oofs0463 = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof/oofs0463.csv')
oofs0459 = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof/oofs0459.csv')
oofs_max_len_512 = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof/oofs_max_len_512.csv')
oofs_conv_roberta = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof/oofs_conv_roberta.csv')
oofs_electra_large = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof-2/oofs_electra_large.csv')
oofs_distil_roberta_base = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof-2/oofs_distil_roberta_base.csv')
oofs_roberta_large_meanpooling = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof-3/oofs_roberta_large_meanpooling.csv')
oofs_xlnet_base_cased = pd.read_csv('../input/commonlit-2nd-stacking-prepare-oof-4/oofs_xlnet_base_cased.csv')
oofs_sentence_transformer = pd.read_csv('../input/commonlit-stacking-prepare-ooemb-pub/sentence_transformer_oof_df.csv')[['id', 'oof']]

# Out-of-fold embedding features (the distil-roberta-base embeddings are disabled).
emb_electra_large = pd.read_csv('../input/commonlit-stacking-prepare-ooemb/emb_electra_large.csv')

# Give every frame the columns ['id', 'oof<k>'], where k is the model's
# position in this tuple (oof0 = gpt2 ... oof14 = sentence transformer).
_oof_frames = (
    oofs_gpt2, oofs_roberta_base, oofs_bart_large, oofs0457,
    oofs0462, oofs0461, oofs0463, oofs0459,
    oofs_max_len_512, oofs_conv_roberta,
    oofs_electra_large, oofs_distil_roberta_base,
    oofs_roberta_large_meanpooling, oofs_xlnet_base_cased,
    oofs_sentence_transformer,
)
for _oof_idx, _oof_frame in enumerate(_oof_frames):
    _oof_frame.columns = ['id', f'oof{_oof_idx}']
del _oof_frames, _oof_idx, _oof_frame

In [None]:
# Build the stacking table: one row per training excerpt, holding the target,
# every first-stage OOF prediction (oof0..oof14), the electra-large embedding
# features and the fold-averaged CLR / Roberta_Base_123 embeddings.
# (The distil-roberta-base embedding features are currently disabled.)
oof_all = (
    pd.read_csv('../input/commonlitreadabilityprize/train.csv')
    .merge(oofs_gpt2, on='id')
    .merge(oofs_roberta_base, on='id')
    .merge(oofs_bart_large, on='id')
    .merge(oofs0457, on='id')
    .merge(oofs0462, on='id')
    .merge(oofs0461, on='id')
    .merge(oofs0463, on='id')
    .merge(oofs0459, on='id')
    .merge(oofs_max_len_512, on='id')
    .merge(oofs_conv_roberta, on='id')
    .merge(oofs_electra_large, on='id')
    .merge(oofs_distil_roberta_base, on='id')
    .merge(oofs_roberta_large_meanpooling, on='id')
    .merge(oofs_xlnet_base_cased, on='id')
    .merge(oofs_sentence_transformer, on='id')
    .merge(emb_electra_large, on='id')
)

del oofs_gpt2, oofs_roberta_base, oofs_bart_large, oofs0457
del oofs0462, oofs0461, oofs0463
del oofs0459, oofs_max_len_512, oofs_conv_roberta
del oofs_electra_large, oofs_distil_roberta_base
del oofs_roberta_large_meanpooling, oofs_xlnet_base_cased
del emb_electra_large
gc.collect()

# The merges are inner joins, so a missing or duplicated id changes the row
# count; the positional concat below would then silently misalign (NaN rows).
assert len(oof_all) == len(train_embeddings_clr_roberta) == len(train_embeddings_Roberta_Base_123), (
    "row count of merged OOF table does not match the train embeddings: "
    f"{len(oof_all)} vs {len(train_embeddings_clr_roberta)} / {len(train_embeddings_Roberta_Base_123)}"
)

# `axis` must be passed by keyword: positional use is deprecated in pandas 1.3
# and rejected by pandas 2.
oof_all = pd.concat(
    [
        oof_all,
        pd.DataFrame(train_embeddings_clr_roberta, columns=[f'clr_{i}' for i in range(768)]),
        pd.DataFrame(train_embeddings_Roberta_Base_123, columns=[f'roberta_base_{i}' for i in range(768)]),
    ],
    axis=1,
).reset_index(drop=True)

del train_embeddings_clr_roberta, train_embeddings_Roberta_Base_123;gc.collect()
print(oof_all.shape)
oof_all.head()

In [None]:
# One column per first-stage model, in the same order as oof0..oof13.
# The sentence-transformer prediction is not part of this matrix; it is
# appended separately when X_test1 is built.
preds_all = np.stack(
    [
        preds_gpt2,
        preds_roberta_base,
        preds_bart_large,
        preds0457,
        preds0462,
        preds0461,
        preds0463,
        preds0459,
        preds_max_len_512,
        preds_conv_roberta,
        preds_electra_large,
        preds_distil_roberta_base,
        preds_roberta_large_meanpooling,
        preds_xlnet_base_cased,
    ],
    axis=1,
)

print(preds_all.shape)

In [None]:
# Test features for the 14-prediction stackers:
# [model predictions | electra-large embedding | CLR embedding | Roberta_Base_123 embedding]
X_test = np.concatenate(
    [
        preds_all,
        emb_preds_electra_large,
        test_embeddings_clr_roberta,
        test_embeddings_Roberta_Base_123,
    ],
    axis=1,
)

# Same layout with the sentence-transformer prediction inserted as a 15th
# prediction column right after preds_all.
X_test1 = np.concatenate(
    [
        preds_all,
        preds_sentence_transformer.reshape(-1, 1),
        emb_preds_electra_large,
        test_embeddings_clr_roberta,
        test_embeddings_Roberta_Base_123,
    ],
    axis=1,
)

In [None]:
# The test embeddings are no longer appended to preds_all in this cell;
# release them and report the shape of preds_all.
del test_embeddings_clr_roberta, test_embeddings_Roberta_Base_123
gc.collect()
print(preds_all.shape)

In [None]:
# Embedding feature columns shared by both training matrices.
_embedding_cols = (
    [f'emb_electra_large_{i}' for i in range(1024)]
    + [f'clr_{i}' for i in range(768)]
    + [f'roberta_base_{i}' for i in range(768)]
)

# X uses oof0..oof13; X1 additionally includes oof14.
X = oof_all[[f'oof{i}' for i in range(14)] + _embedding_cols].values
X1 = oof_all[[f'oof{i}' for i in range(15)] + _embedding_cols].values
del _embedding_cols

y = oof_all[['target']].values

In [None]:
# Two parallel feature "views", stacked along a new axis 1:
#   view 1 = oof0..oof4 + CLR embedding, view 2 = oof5..oof9 + Roberta_Base_123 embedding.
# (A disabled earlier variant also put oof10/oof12 into view 1 and oof11/oof13 into view 2.)
XX1 = oof_all[[f'oof{i}' for i in range(0, 5)] + [f'clr_{i}' for i in range(768)]].values
XX2 = oof_all[[f'oof{i}' for i in range(5, 10)] + [f'roberta_base_{i}' for i in range(768)]].values

# X_test ends with the CLR block followed by the Roberta_Base_123 block (768 columns each).
XX_test1 = np.concatenate([X_test[:, 0:5], X_test[:, -1536:-768]], axis=1)
XX_test2 = np.concatenate([X_test[:, 5:10], X_test[:, -768:]], axis=1)

XX = np.stack([XX1, XX2], axis=1)
XX_test = np.stack([XX_test1, XX_test2], axis=1)

XX.shape, XX_test.shape

In [None]:
from sklearn.linear_model import BayesianRidge, LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR


def rmse_score(y_true,y_pred):
    """Return the root of sklearn's ``mean_squared_error(y_true, y_pred)``."""
    mse_value = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse_value)

def get_preds_2nd_stages(X,y,X_test,nfolds=5,C=10,kernel='rbf'):
    """Second-stage stacker: cross-validated ``BayesianRidge`` on first-stage features.

    The continuous target is binned so that ``StratifiedKFold`` can stratify
    on it; one model is fitted per fold and the test predictions of all fold
    models are averaged.

    Args:
        X: Training feature array, indexed as ``X[idx]`` with integer arrays.
        y: Training target array (``y[idx]`` and ``y.reshape(-1)`` are used).
        X_test: Test feature array.
        nfolds: Number of folds; also the number of models averaged on test.
        C: Unused, kept for backward compatibility.
        kernel: Unused, kept for backward compatibility.

    Returns:
        Tuple ``(test_preds, oof_preds, oof_trues)``:
        ``test_preds`` -- fold-averaged test predictions, shape ``(len(X_test),)``;
        ``oof_preds`` -- out-of-fold prediction for every training row, in the
        row order of ``X`` (so it can be combined element-wise with other
        row-ordered arrays);
        ``oof_trues`` -- the matching targets, i.e. a copy of ``y``.
    """
    scores = []
    preds = np.zeros(X_test.shape[0])
    oof_preds = np.zeros(len(X))
    # Every row falls into exactly one validation fold, so the aligned
    # targets are simply y itself.
    oof_trues = np.array(y)

    # Use `nfolds` for the split as well, so it always matches the divisor
    # applied to the accumulated test predictions below.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=71)
    num_bins = int(np.floor(1 + np.log2(len(X)))) # 12
    y2 = pd.cut(y.reshape(-1),bins=num_bins,labels=False)
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X, y2)):
        # NOTE(review): newer scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        model = BayesianRidge(n_iter=30, verbose=True)
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(prediction,y_valid)

        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += np.squeeze(model.predict(X_test))
        # Write predictions back to their original row positions instead of
        # concatenating them in (shuffled) fold order.
        oof_preds[valid_idx] = prediction

    print("mean rmse",np.mean(scores))
    return preds/nfolds, oof_preds, oof_trues


# Bayesian-ridge stacker on the 15-prediction feature set (X1 / X_test1).
br_stacked_preds, br_stacked_oofs, br_stacked_trues = get_preds_2nd_stages(
    X=X1,
    y=y,
    X_test=X_test1,
)
print(br_stacked_preds.shape)

In [None]:
# MLP Stacking

In [None]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


def calc_loss(y_true, y_pred):
    """RMSE metric: square root of sklearn's ``mean_squared_error``."""
    mse_value = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse_value)
    
    
# Seed shared by the MLP stacking code: passed to seed_everything (defined
# elsewhere in the notebook) and used as random_state of the KFold splits in
# run_one_fold / calc_cv_and_inference.
SEED = 71
seed_everything(SEED)


class MLPDataset:
    """Map-style dataset over a feature container and optional targets.

    Each item is a dict with key ``'x'`` (float32 tensor of ``X[item]``) and,
    when targets were supplied, key ``'y'`` (float32 tensor of ``y[item]``).
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        sample = {'x': torch.tensor(self.X[item], dtype=torch.float32)}
        if self.y is not None:
            sample['y'] = torch.tensor(self.y[item], dtype=torch.float32)
        return sample


class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, running sum, sample count and average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count


class MetricMeter(object):
    """Accumulates targets and predictions over batches and reports their RMSE."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything accumulated so far."""
        self.y_true, self.y_pred = [], []

    def update(self, y_true, y_pred):
        """Append one batch of target / prediction tensors as Python lists."""
        self.y_true += y_true.cpu().detach().numpy().tolist()
        self.y_pred += y_pred.cpu().detach().numpy().tolist()

    @property
    def avg(self):
        """``{"RMSE": value}`` over everything seen; also cached on ``self.rmse``."""
        self.rmse = calc_loss(self.y_true, self.y_pred)
        return {"RMSE": self.rmse}
    
        
class RMSELoss(torch.nn.Module):
    """Root-mean-squared-error loss: square root of the mean-reduced MSE."""

    def __init__(self):
        super().__init__()

    def forward(self,x,y):
        # nn.functional.mse_loss defaults to reduction='mean', like nn.MSELoss().
        return torch.sqrt(nn.functional.mse_loss(x, y))


def loss_fn(logits, targets):
    """RMSE between model outputs and targets, computed with ``RMSELoss``."""
    return RMSELoss()(logits, targets)
        
        
def train_fn(model, data_loader, device, optimizer, scheduler):
    """Run one training pass over ``data_loader``.

    Batches are dicts with keys ``'x'`` and ``'y'``.  The optimizer and the
    scheduler are both stepped after every batch.

    Returns:
        ``(metrics, mean_loss)`` where ``metrics`` is ``MetricMeter.avg`` over
        all batches and ``mean_loss`` is the sample-weighted mean batch loss.
    """
    model.train()
    loss_meter = AverageMeter()
    metric_meter = MetricMeter()
    progress = tqdm(data_loader, total=len(data_loader))

    for batch in progress:
        optimizer.zero_grad()
        features, targets = batch['x'].to(device), batch['y'].to(device)
        predictions = model(features)
        batch_loss = loss_fn(predictions, targets)
        batch_loss.backward()
        optimizer.step()
        # NOTE(review): run_one_fold steps the same scheduler once more per
        # epoch -- confirm that stepping it in both places is intended.
        scheduler.step()
        loss_meter.update(batch_loss.item(), features.size(0))
        metric_meter.update(targets, predictions)
        progress.set_postfix(loss=loss_meter.avg)
    return metric_meter.avg, loss_meter.avg


def valid_fn(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        ``(metrics, mean_loss)`` where ``metrics`` is ``MetricMeter.avg`` over
        all batches and ``mean_loss`` is the sample-weighted mean batch loss.
    """
    model.eval()
    loss_meter = AverageMeter()
    metric_meter = MetricMeter()
    progress = tqdm(data_loader, total=len(data_loader))
    with torch.no_grad():
        for batch in progress:
            features, targets = batch['x'].to(device), batch['y'].to(device)
            predictions = model(features)
            batch_loss = loss_fn(predictions, targets)

            loss_meter.update(batch_loss.item(), features.size(0))
            metric_meter.update(targets, predictions)
            progress.set_postfix(loss=loss_meter.avg)
    return metric_meter.avg, loss_meter.avg


def run_one_fold(fold, X, y):
    """Train the currently defined ``MLP`` on one fold and checkpoint the best epoch.

    The 5-fold split uses ``KFold(random_state=SEED, shuffle=True)``, the same
    configuration as ``calc_cv_and_inference``.  Training runs for at most 100
    epochs with Adam (lr=1e-3) and cosine annealing; whenever the validation
    RMSE improves, the weights are written to ``fold-{fold}.bin`` in the
    working directory (overwriting any file of that name).

    Args:
        fold: Index (0-4) of the fold used for validation.
        X: Feature array, indexed as ``X[idx]``; ``X.shape[1]`` is passed to ``MLP``.
        y: Target array, indexed as ``y[idx]``.
    """
    kf = KFold(n_splits = 5, random_state = SEED, shuffle=True)
    train_idx, valid_idx = list(kf.split(X=X))[fold]

    train_loader = torch.utils.data.DataLoader(
                   MLPDataset(X=X[train_idx], y=y[train_idx]), shuffle=True,
                   batch_size=32,
                   num_workers=0, pin_memory=True)

    val_loader = torch.utils.data.DataLoader(
                 MLPDataset(X=X[valid_idx], y=y[valid_idx]), shuffle=False,
                 batch_size=32,
                 num_workers=0, pin_memory=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MLP(X.shape[1])
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

    patience = 5
    p = 0
    best_score = 999

    for epoch in range(1, 100 + 1):

        print("Starting {} epoch...".format(epoch))

        start_time = time.time()

        train_avg, train_loss = train_fn(model, train_loader, device, optimizer, scheduler)
        valid_avg, valid_loss = valid_fn(model, val_loader, device)
        # NOTE(review): train_fn already steps this scheduler after every
        # batch; this extra per-epoch step is kept as-is so the learning-rate
        # trajectory is unchanged -- confirm it is intended.
        scheduler.step()

        elapsed = time.time() - start_time

        # `epoch` is already 1-based, so report it directly (the old
        # `epoch+1` label was one ahead of the "Starting ..." line).
        print(f'Epoch {epoch} - avg_train_loss: {train_loss:.5f}  avg_val_loss: {valid_loss:.5f}  time: {elapsed:.0f}s')
        print(f"Epoch {epoch} - train_rmse:{train_avg['RMSE']:0.5f}  valid_rmse:{valid_avg['RMSE']:0.5f}")

        if valid_avg['RMSE'] < best_score:
            print(f">>>>>>>> Model Improved From {best_score} ----> {valid_avg['RMSE']}")
            torch.save(model.state_dict(), f'fold-{fold}.bin')
            best_score = valid_avg['RMSE']
            p = 0

        # The counter is reset to 0 on improvement and then incremented, so
        # training stops after `patience` consecutive epochs without improvement.
        p += 1
        if p > patience:
            print(f'Early Stopping')
            break
            
            
def calc_cv_and_inference(model_paths, X, y, X_test):
    """Compute out-of-fold and test predictions from saved per-fold ``MLP`` weights.

    The i-th checkpoint in ``model_paths`` is evaluated on the validation part
    of the i-th ``KFold`` split (same split configuration as ``run_one_fold``)
    and on the full test set; test predictions are averaged over checkpoints.

    Args:
        model_paths: Paths to ``state_dict`` files, one per fold, in fold order.
        X: Training feature array; ``X.shape[1]`` is passed to ``MLP``.
        y: Training target array, indexed as ``y[idx]``.
        X_test: Test feature array.

    Returns:
        Tuple ``(y_test_pred, y_pred, y_true)``:
        ``y_test_pred`` -- squeezed mean of the per-checkpoint test predictions;
        ``y_pred`` -- out-of-fold predictions sorted by original row index, so
        with one checkpoint per fold they are in the row order of ``X``;
        ``y_true`` -- the targets in that same order.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    models = []
    for p in model_paths:
        model = MLP(X.shape[1])
        model.to(device)
        # map_location lets weights saved from another device load here.
        model.load_state_dict(torch.load(p, map_location=device))
        model.eval()
        models.append(model)

    kf = KFold(n_splits = 5, random_state = SEED, shuffle=True)
    splits = list(kf.split(X=X))

    def predict_all(model, loader):
        """Run ``model`` over ``loader`` and return its outputs as a float64 array."""
        collected = []
        with torch.no_grad():
            for data in tqdm(loader):
                output = model(data['x'].to(device))
                collected.extend(output.detach().cpu().numpy().tolist())
        return np.array(collected)

    # The test loader does not depend on the fold, so build it once.
    test_dataloader = torch.utils.data.DataLoader(
             MLPDataset(X=X_test), shuffle=False,
             batch_size=32,
             num_workers=0, pin_memory=True)

    y_true = []
    y_pred = []
    y_test_pred = []
    seen_rows = []
    for fold, model in enumerate(models):
        valid_idx = splits[fold][1]

        valid_dataloader = torch.utils.data.DataLoader(
                 MLPDataset(X=X[valid_idx], y=y[valid_idx]), shuffle=False,
                 batch_size=32,
                 num_workers=0, pin_memory=True)

        fold_pred = predict_all(model, valid_dataloader)
        print(calc_loss(fold_pred, y[valid_idx]))
        y_pred.append(fold_pred)
        y_true.append(y[valid_idx])
        seen_rows.append(valid_idx)

        y_test_pred.append(predict_all(model, test_dataloader))

    # Undo the fold shuffling: reorder predictions and targets by the original
    # row index so they can be combined element-wise with row-ordered arrays.
    row_order = np.argsort(np.concatenate(seen_rows))
    y_pred = np.concatenate(y_pred)[row_order]
    y_true = np.concatenate(y_true)[row_order]
    y_test_pred = np.squeeze(np.mean(y_test_pred, 0))
    overall_cv_score = calc_loss(y_true, y_pred)
    print(f'cv score {overall_cv_score}')
    return y_test_pred, y_pred, y_true


class MLP(nn.Module):
    """Conv1d stacker for inputs of shape (batch, 2, 773).

    The input is transposed so the 773 features become convolution channels
    and the 2 views become the length axis; with kernel 3 and padding 2 the
    convolution returns (batch, 2, 4), which is flattened to 8 values and fed
    to a single linear layer.  ``len_features`` is accepted but not used.
    """

    def __init__(self, len_features):
        super().__init__()

        self.conv = nn.Conv1d(773, 2, 3, padding=2)
        nn.init.xavier_uniform_(self.conv.weight)

        self.regressor = nn.Sequential(nn.Linear(8, 1))

    def forward(self, features):
        # features: bs, 2, 773 -> conv input: bs, 773, 2
        conv_out = self.conv(features.transpose(1, 2))
        batch_size, _, _ = conv_out.size()
        flat = conv_out.view(batch_size, -1)
        return self.regressor(flat)

In [None]:
# Train the Conv1d stacker on the two-view features; each fold writes its
# best weights to fold-<k>.bin.
for fold in range(5):
    print(f"Starting fold {fold} ...")
    run_one_fold(fold, XX, y)

model_paths = [f'fold-{fold_id}.bin' for fold_id in range(5)]

# Out-of-fold score and fold-averaged test predictions of the Conv1d stacker.
mlp_stacked_preds, mlp_stacked_oofs, mlp_stacked_trues = calc_cv_and_inference(
    model_paths, XX, y, XX_test
)
print(rmse_score(mlp_stacked_oofs, mlp_stacked_trues))
print(mlp_stacked_preds.shape, mlp_stacked_oofs.shape, mlp_stacked_trues.shape)

In [None]:
class MLP(nn.Module):
    """Dense stacker for flat inputs of width 1550 + 1024.

    Replaces the earlier Conv1d ``MLP`` definition, so ``run_one_fold`` and
    ``calc_cv_and_inference`` pick up this class from here on.  The input
    width is hard-coded; ``len_features`` is accepted but not used.
    """

    def __init__(self, len_features):
        super().__init__()

        input_width = 1550 + 1024
        self.regressor = nn.Sequential(
            nn.Linear(input_width, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 1),
        )

    def forward(self, features):
        # features: bs, 1550+1024
        return self.regressor(features)
    
    
# Train the dense stacker on the flat feature matrix X; this overwrites the
# fold-<k>.bin checkpoints written by the Conv1d run.
for fold in range(5):
    print(f"Starting fold {fold} ...")
    run_one_fold(fold, X, y)

model_paths = [f'fold-{fold_id}.bin' for fold_id in range(5)]

# Out-of-fold score and fold-averaged test predictions of the dense stacker.
mlp_stacked_preds2, mlp_stacked_oofs2, mlp_stacked_trues2 = calc_cv_and_inference(
    model_paths, X, y, X_test
)
print(rmse_score(mlp_stacked_oofs2, mlp_stacked_trues2))
print(mlp_stacked_preds2.shape, mlp_stacked_oofs2.shape, mlp_stacked_trues2.shape)

In [None]:
# make submission

In [None]:
# CV estimate for an equal-weight blend of the Bayesian-ridge stacker and the
# first-stage model stored in column 'oof3'.
# NOTE(review): the element-wise sums below are only a valid blend score if
# br_stacked_oofs / br_stacked_trues are in the same row order as oof_all and
# y -- confirm against what get_preds_2nd_stages returns.
final_oof = (br_stacked_oofs + oof_all['oof3'])/2
final_y = (br_stacked_trues + y)/2
rmse_score(final_oof, final_y)

In [None]:
# CV estimate for an equal-weight blend of the Bayesian-ridge stacker, the
# Conv1d MLP stacker and the first-stage model stored in column 'oof3'.
# NOTE(review): the element-wise sums below are only a valid blend score if
# the stacker OOF/true arrays are in the same row order as oof_all and y --
# confirm against get_preds_2nd_stages and calc_cv_and_inference.
final_oof = (br_stacked_oofs + np.squeeze(mlp_stacked_oofs) + np.squeeze(oof_all['oof3']))/3
final_y = (br_stacked_trues + mlp_stacked_trues + y)/3
rmse_score(final_oof, final_y)

In [None]:
# CV estimate for the four-way equal-weight blend that mirrors the submission:
# Bayesian-ridge stacker, both MLP stackers and the 'oof3' first-stage model.
# NOTE(review): the element-wise sums below are only a valid blend score if
# the stacker OOF/true arrays are in the same row order as oof_all and y --
# confirm against get_preds_2nd_stages and calc_cv_and_inference.
final_oof = (br_stacked_oofs + np.squeeze(mlp_stacked_oofs) + np.squeeze(mlp_stacked_oofs2) + np.squeeze(oof_all['oof3']))/4
final_y = (br_stacked_trues + mlp_stacked_trues + mlp_stacked_trues2 + y)/4
rmse_score(final_oof, final_y)

In [None]:
# Final submission: equal-weight average of the three stackers and the
# preds0457 first-stage model, written over the sample file's `target` column.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
submission["target"] = (
    br_stacked_preds
    + mlp_stacked_preds
    + mlp_stacked_preds2
    + preds0457
) / 4
submission.to_csv("submission.csv", index=False)