In [None]:
import numpy as np
import pandas as pd 
import os
import gc
import sys
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error

from transformers import AutoModel, AutoTokenizer
import json
from tensorflow.keras.models import load_model
import re
import pandas as pd
import string
import keras
from sklearn.svm import SVR

In [None]:
# Competition data: train/test excerpts and the sample submission file.
data_dir = '../input/commonlitreadabilityprize/'
train, test, sample_submission = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Regression target as a NumPy array so it can be indexed with fold indices.
target = train['target'].to_numpy()


def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# source: https://www.kaggle.com/maunish/clrp-roberta-lgbm

def seed_everything(seed=42):
    """Seed every random number generator used by the PyTorch part of the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED (the former 'PYTHONASSEED'
    # spelling was silently ignored). It only takes effect for interpreters
    # started after this point, e.g. child processes.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between
    # runs, which contradicts the deterministic flag set above.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    This is a data container, not a network, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module`` (the previous base
    class, whose ``__init__`` was never called).

    Args:
        df: DataFrame with an ``'excerpt'`` column of raw text.
        tokenizer: Callable tokenizer; it is invoked with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        max_len: Length every excerpt is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # NOTE(review): with return_tensors='pt' each tensor presumably keeps a
        # leading batch axis of size 1; the caller flattens it after collation.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Embed every excerpt in ``df`` with the pretrained transformer stored at ``path``.

    The embedding of an excerpt is the first-token vector of the model's first
    output (``outputs[0][:, 0]``).

    Args:
        df: DataFrame with an ``'excerpt'`` column.
        path: Directory / identifier accepted by ``from_pretrained`` for both
            the model and its tokenizer.
        plot_losses: Unused; kept so existing callers keep working.
        verbose: When true, print which device is used.

    Returns:
        ``np.ndarray`` with one embedding row per excerpt, in ``df`` order.

    Reads ``config['max_len']`` and ``config['batch_size']`` from the
    module-level ``config`` dict, which must exist when this is called.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,  # keep row order aligned with df
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    embeddings = list()
    with torch.no_grad():
        # Wrap the loader itself (not enumerate(...)) so tqdm can read len(dl)
        # and show a real progress bar with a total.
        for inputs in tqdm(dl):
            # Collapse any extra per-item axis so each tensor is (batch, seq_len).
            inputs = {key: val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:, 0].cpu().numpy())

    # Several large models are loaded one after another (and another framework
    # trains afterwards), so hand the GPU memory back as soon as we are done.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return np.array(embeddings)

# Shared settings read by get_embeddings() and by the K-fold helpers further down.
config = {'batch_size': 128, 'max_len': 256, 'seed': 42}
seed_everything(seed=config['seed'])


def _embed_train_and_test(model_dir):
    """Return ``(train embeddings, test embeddings)`` for the model stored in ``model_dir``."""
    return get_embeddings(train, model_dir), get_embeddings(test, model_dir)


# One (train, test) embedding pair per fine-tuned model, computed in the same
# order as before: train first, then test, model 1 through model 5.
train_embeddings, test_embeddings = _embed_train_and_test('../input/modelf1')
train_embeddings2, test_embeddings2 = _embed_train_and_test('../input/modelf2')
train_embeddings3, test_embeddings3 = _embed_train_and_test('../input/modelf3')
train_embeddings4, test_embeddings4 = _embed_train_and_test('../input/modelf4')
train_embeddings5, test_embeddings5 = _embed_train_and_test('../input/modelf5')

In [None]:
# Sanity check: display the shape of the first model's training embeddings.
train_embeddings.shape

In [None]:
from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM,Dropout,concatenate
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.python.keras.layers import Dense, Activation, Embedding, LSTM,Dropout,Bidirectional,GRU
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Flatten ,Embedding,Input,Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,Dropout,MaxPooling1D,Bidirectional,GRU,Concatenate
from keras.models import Sequential,Model
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
def _conv_branch(input_len=768):
    """Build one 1-D convolutional tower; return ``(input_tensor, pooled_features)``."""
    branch_in = Input(shape=(input_len, 1))
    x = Conv1D(64, 5, padding='valid', kernel_initializer='normal', activation='relu')(branch_in)
    x = MaxPooling1D(2)(x)
    x = Conv1D(128, 5, padding='valid', kernel_initializer='normal', activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(256, 5, padding='valid', kernel_initializer='normal', activation='relu')(x)
    return branch_in, GlobalMaxPooling1D()(x)


def crt_model(n_branches=10, input_len=768):
    """Create and compile the multi-branch 1-D CNN regressor.

    Each branch is an independent tower (Conv1D 64 -> pool -> Conv1D 128 ->
    pool -> Conv1D 256 -> global max pool) with its own ``(input_len, 1)``
    input. The pooled branch outputs are concatenated and passed through
    Dense(120) -> Dense(240) -> Dense(1).

    Args:
        n_branches: Number of parallel towers, and therefore of model inputs.
            The default of 10 reproduces the original hand-written architecture.
        input_len: Length of each input vector (768 by default).

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer) that expects a
        list of ``n_branches`` arrays shaped ``(batch, input_len, 1)``.

    Raises:
        ValueError: If ``n_branches`` is smaller than 1.
    """
    if n_branches < 1:
        raise ValueError("n_branches must be at least 1")

    # Branches are built one after another, in the same layer order as the
    # former copy-pasted version.
    branches = [_conv_branch(input_len) for _ in range(n_branches)]
    branch_inputs = [branch_in for branch_in, _ in branches]
    branch_features = [features for _, features in branches]

    # concatenate() needs at least two tensors; a single branch is used as is.
    merged = concatenate(branch_features) if n_branches > 1 else branch_features[0]

    hidden = Dense(120, kernel_initializer='normal', activation='relu')(merged)
    hidden = Dense(240, kernel_initializer='normal', activation='relu')(hidden)
    output = Dense(1, kernel_initializer='normal')(hidden)

    model = Model(inputs=branch_inputs, outputs=output)
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=[keras.metrics.MeanSquaredError()])
    return model

In [None]:
# Build one instance for inspection only; get_res() creates its own model per fold.
model=crt_model()
keras.utils.plot_model(model)

In [None]:
# Print the layer-by-layer summary of the inspection model built above.
model.summary()

In [None]:
def get_res(train_embedd, target, test_embedd, nfolds=5, epochs=7, batch_size=8):
    """Train the CNN from ``crt_model`` with K-fold CV and average its test predictions.

    Args:
        train_embedd: 2-D array of training embeddings, one row per sample.
        target: 1-D array of regression targets aligned with ``train_embedd``.
        test_embedd: 2-D array of test embeddings.
        nfolds: Number of cross-validation folds.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size used for fitting.

    Returns:
        1-D array with the test predictions averaged over the folds.

    Uses the module-level ``config['seed']`` for the fold shuffling.
    """
    scores = []
    preds = np.zeros(test_embedd.shape[0])

    # The Conv1D branches take a trailing channel axis: (n, dim) -> (n, dim, 1).
    # Apply it once to everything so train, validation and test inputs are fed
    # with the same shape (previously only the training slice was reshaped).
    train_feats = train_embedd.reshape(train_embedd.shape + (1,))
    test_feats = test_embedd.reshape(test_embedd.shape + (1,))

    kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])
    # Split on the data actually passed in instead of the global `train` frame.
    for k, (train_idx, valid_idx) in enumerate(kf.split(train_feats)):
        model = crt_model()
        # Every model input receives the same array; ask the model how many it has.
        n_inputs = len(model.inputs)

        train_x, train_y = train_feats[train_idx], target[train_idx]
        val_x, val_y = train_feats[valid_idx], target[valid_idx]

        model.fit([train_x] * n_inputs, train_y,
                  epochs=epochs,
                  validation_data=([val_x] * n_inputs, val_y),
                  batch_size=batch_size)

        val_pred = model.predict([val_x] * n_inputs).reshape(-1)
        score = rmse_score(val_y, val_pred)
        scores.append(score)
        print(f'Fold {k} , rmse score: {score}')

        preds += model.predict([test_feats] * n_inputs).reshape(-1)

    print("mean rmse", np.mean(scores))
    return preds / nfolds

In [None]:
# CNN test predictions, one per embedding model (evaluated in order 1..5).
pred1, pred2, pred3, pred4, pred5 = (
    get_res(train_emb, target, test_emb)
    for train_emb, test_emb in (
        (train_embeddings, test_embeddings),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    )
)

In [None]:
# Equal-weight average of the CNN predictions from the five embedding sets.
preds=(pred1+pred2+pred3+pred4+pred5)/5

In [None]:
def get_preds_svm(X,y,X_test,nfolds=5,C=10,kernel='rbf'):
    """Fit an SVR per cross-validation fold and average its test predictions.

    Args:
        X: 2-D array of training features, one row per sample.
        y: 1-D array of targets aligned with ``X``.
        X_test: 2-D array of test features.
        nfolds: Number of cross-validation folds.
        C: SVR regularization parameter.
        kernel: SVR kernel name.

    Returns:
        1-D array with the test predictions averaged over the folds.

    Uses the module-level ``config['seed']`` for the fold shuffling.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])
    # Split on the features passed in rather than on the global `train` frame,
    # so the function also works for inputs of a different length.
    for k, (train_idx, valid_idx) in enumerate(kf.split(X)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        train_x, train_y = X[train_idx], y[train_idx]
        val_x, val_y = X[valid_idx], y[valid_idx]

        model.fit(train_x,train_y)
        prediction = model.predict(val_x)
        score = rmse_score(val_y, prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [None]:
# SVR test predictions, one per embedding model (evaluated in order 1..5).
svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = (
    get_preds_svm(train_emb, target, test_emb)
    for train_emb, test_emb in (
        (train_embeddings, test_embeddings),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    )
)

In [None]:
# Equal-weight average of the SVR predictions from the five embedding sets.
svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

In [None]:
# Final blend: 50/50 average of the CNN ensemble and the SVR ensemble.
pred=(preds+svm_preds)/2

In [None]:
# Flatten to 1-D so it can be used as a single DataFrame column, then display it.
pred=pred.reshape(-1)
pred

In [None]:
# Pair each test id with its blended prediction and write submission.csv (no index column).
submission = pd.DataFrame({'id':test.id,'target':pred})
submission.to_csv('submission.csv',index=False)