In [None]:
import numpy as np
import pandas as pd 
import os
import gc
import sys
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer

In [None]:
import json
from tensorflow.keras.models import load_model
import re
import pandas as pd
import string
import keras

In [None]:
# Competition input folder; the three CSV files below are read from it.
data_dir = '../input/commonlitreadabilityprize/'
train, test, sample_submission = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Regression target as a plain NumPy array so it can be indexed with fold indices.
target = train['target'].to_numpy()

In [None]:
# source: https://www.kaggle.com/maunish/clrp-roberta-lgbm

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for reproducible runs.

    Parameters
    ----------
    seed : int, default 42
        Seed applied to every generator.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the previous spelling was a typo and had
    # no effect). It is only read at interpreter start-up, so this influences
    # child processes rather than the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels non-deterministically,
    # which defeats the `deterministic` flag above.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    This is a data container, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'excerpt'`` column holding the raw texts.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='pt', max_length=...,
        padding='max_length', truncation=True)``.
    max_len : int, default 128
        Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # With return_tensors='pt' each tensor presumably keeps a leading
        # dimension of 1; callers flatten it after batching.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Encode every excerpt in ``df`` with a pretrained transformer.

    Loads the model and tokenizer found at ``path``, runs the excerpts through
    the model in inference mode and keeps, for each excerpt, the vector at
    sequence position 0 of the first model output (presumably the last hidden
    state, i.e. the first-token embedding).

    Reads ``config['max_len']`` and ``config['batch_size']`` from the
    module-level ``config`` dict, which must exist when the function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'excerpt'`` column (consumed by ``CLRPDataset``).
    path : str
        Location passed to ``AutoModel.from_pretrained`` and
        ``AutoTokenizer.from_pretrained``.
    plot_losses, verbose : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One embedding row per excerpt, in the order of ``df``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    MODEL_PATH = path
    model = AutoModel.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,  # keep row order aligned with df
                    num_workers=4,
                    # Pinned host memory only helps host-to-GPU copies.
                    pin_memory=(device.type == "cuda"),
                    drop_last=False)

    embeddings = list()
    with torch.no_grad():
        # Iterate the loader directly so tqdm can read len(dl) and show a total.
        for inputs in tqdm(dl):
            # Each item carries a leading dimension of 1 from the tokenizer;
            # flatten the collated batch to (batch, -1) before the forward pass.
            inputs = {key: val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs[0][:, 0].detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

# Settings read by get_embeddings (batch size, token limit) plus the global seed.
config = dict(batch_size=128, max_len=256, seed=42)
seed_everything(seed=config['seed'])

# First encoder: embed the training frame, then the test frame.
train_embeddings, test_embeddings = (
    get_embeddings(frame, '../input/modelf1') for frame in (train, test)
)

# Second encoder: same procedure with the other checkpoint.
train_embeddings2, test_embeddings2 = (
    get_embeddings(frame, '../input/modelf2') for frame in (train, test)
)

In [None]:
# Sanity check: display the shape of the first encoder's training embeddings.
train_embeddings.shape

In [None]:
from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM,Dropout,concatenate
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.python.keras.layers import Dense, Activation, Embedding, LSTM,Dropout,Bidirectional,GRU
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Flatten ,Embedding,Input,Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,Dropout,MaxPooling1D,Bidirectional,GRU,Concatenate
from keras.models import Sequential,Model

In [None]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
def crt_model(input_dim=768):
    """Build and compile the 1-D CNN regressor applied to an embedding vector.

    The embedding is treated as a sequence of ``input_dim`` steps with a single
    channel. It passes through a stack of ``Conv1D`` layers (kernel size 5,
    'valid' padding, ReLU) with two max-pooling stages, a global max pool, four
    dense ReLU layers and a single linear output unit.

    Parameters
    ----------
    input_dim : int, default 768
        Length of the embedding vector. The default reproduces the previous
        hard-coded input shape ``(768, 1)``.

    Returns
    -------
    keras Model
        Compiled with mean-squared-error loss, the 'adam' optimizer and a
        ``MeanSquaredError`` metric.
    """
    conv_kwargs = dict(padding='valid', kernel_initializer='normal', activation='relu')

    inputs = Input(shape=(input_dim, 1))

    # Two conv + pooling stages.
    x = Conv1D(64, 5, **conv_kwargs)(inputs)
    x = MaxPooling1D(2)(x)
    x = Conv1D(128, 5, **conv_kwargs)(x)
    x = MaxPooling1D(2)(x)

    # Widening conv stack without pooling.
    # NOTE(review): 4098 and 8196 look like typos for 4096 / 8192; they are kept
    # as-is so the architecture (and any saved weights) stays unchanged.
    for filters in (256, 512, 1024, 2048, 4098, 8196):
        x = Conv1D(filters, 5, **conv_kwargs)(x)

    x = GlobalMaxPooling1D()(x)

    # Dense head.
    for units in (120, 240, 480, 980):
        x = Dense(units, kernel_initializer='normal', activation='relu')(x)
    output = Dense(1, kernel_initializer='normal')(x)

    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=[keras.metrics.MeanSquaredError()])
    return model

In [None]:
# Build the CNN regressor that is fitted on the first encoder's embeddings
# and print its layer-by-layer summary.
model=crt_model()
model.summary()

In [None]:
# Second instance from the same factory; it is fitted on train_embeddings2.
model2=crt_model()

In [None]:
# Print the layer-by-layer summary of the second model.
model2.summary()

In [None]:
# Fit `model` on the first encoder's embeddings, one K-fold split after another.
# NOTE(review): the same `model` instance keeps training across folds, so from
# the second fold on its validation rows were already seen during training and
# the reported validation loss is optimistic.
nfolds = 3
kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])
for k, (train_idx, valid_idx) in enumerate(kf.split(train)):
    # test_x / test_y hold the held-out validation fold (not the competition test set).
    train_x, train_y = train_embeddings[train_idx], target[train_idx]
    test_x, test_y = train_embeddings[valid_idx], target[valid_idx]
    model.fit(train_x.reshape(train_x.shape+(1,)),
              train_y,
              epochs=7,
              # Give the validation features the same trailing channel axis as
              # the training features so they match the model's input shape.
              validation_data=(test_x.reshape(test_x.shape+(1,)), test_y),
              batch_size=8)

In [None]:
# Fit `model2` on the second encoder's embeddings, one K-fold split after another.
# NOTE(review): the same `model2` instance keeps training across folds, so from
# the second fold on its validation rows were already seen during training and
# the reported validation loss is optimistic.
nfolds = 3
kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])
for k, (train_idx, valid_idx) in enumerate(kf.split(train)):
    # test_x / test_y hold the held-out validation fold (not the competition test set).
    train_x, train_y = train_embeddings2[train_idx], target[train_idx]
    test_x, test_y = train_embeddings2[valid_idx], target[valid_idx]
    model2.fit(train_x.reshape(train_x.shape+(1,)),
               train_y,
               epochs=7,
               # Give the validation features the same trailing channel axis as
               # the training features so they match the model's input shape.
               validation_data=(test_x.reshape(test_x.shape+(1,)), test_y),
               batch_size=8)

In [None]:
# Predict on the first encoder's test embeddings; add the trailing channel axis
# explicitly so the input has the same rank as the data `model` was fitted on.
y_pred=model.predict(test_embeddings.reshape(test_embeddings.shape+(1,)))

In [None]:
# Predict on the second encoder's test embeddings with the network that was
# fitted on that encoder (`model2`). Using `model` here would score these
# embeddings with a network trained on the other encoder's feature space.
y_pred2=model2.predict(test_embeddings2.reshape(test_embeddings2.shape+(1,)))

In [None]:
def pr(y_pred,y_pred2):
    """Average two prediction sets row by row.

    Takes the first element of each row in ``y_pred`` and ``y_pred2`` and
    returns a list with their mean, one entry per row of ``y_pred``.
    """
    return [
        (y_pred[row][0] + y_pred2[row][0]) / 2
        for row in range(len(y_pred))
    ]
# Blend the two models' test predictions with an unweighted mean and display them.
ypred=pr(y_pred,y_pred2)
ypred

In [None]:
# Pair each test id with its blended prediction and write the submission file.
submission_columns = {'id': test['id'], 'target': ypred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final visual check.
submission