In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool, CatBoost

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours and the style-reset code.
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_, c_ = Fore.BLUE, Fore.MAGENTA, Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Read the training table, the test table and the submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Regression labels as a plain NumPy array.
target = train_data['target'].to_numpy()

# Discretise the continuous target into equal-width bins so that
# StratifiedKFold has class-like labels to stratify on.
# Number of bins = floor(1 + log2(number of training rows)).
n_rows = len(train_data)
num_bins = int(np.floor(1 + np.log2(n_rows)))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The mean squared error is computed with ``mean_squared_error`` and its
    square root is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Settings shared by the cells below.
config = dict(
    batch_size=128,  # excerpts per DataLoader batch
    max_len=256,     # every excerpt is padded / truncated to this many tokens
    seed=42,         # value handed to seed_everything
)

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    # Import locally so the standard-library module is used even when the
    # global name ``random`` has been rebound (a later cell in this notebook
    # executes ``from jax import random``).
    import random as py_random

    py_random.seed(seed)
    # The interpreter reads PYTHONHASHSEED (the key was previously misspelled
    # as 'PYTHONASSEED'). It is only read at interpreter start-up, so it
    # affects Python processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes cuDNN kernels and may pick different
    # algorithms between runs, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

# Seed all random number generators with the value stored in the shared config.
seed_everything(config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    This is a data container, not a network layer, so it derives from
    ``torch.utils.data.Dataset``. It previously subclassed ``nn.Module``
    without running ``Module.__init__``, which left the instance only
    partially initialised.

    Args:
        df: Table with an ``excerpt`` column holding the raw texts.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=max_len,
            padding='max_length', truncation=True)``.
        max_len: Length each excerpt is padded / truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # The tokenizer result is returned untouched; callers are responsible
        # for any reshaping of the tensors it contains.
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return encode

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt in ``df`` with the transformer stored at ``path``.

    The model and tokenizer are loaded from ``path``, the excerpts are fed
    through the model in batches without gradient tracking, and position 0
    of the model's first output (the last hidden state for ``AutoModel``
    outputs) is collected for every excerpt.

    Args:
        df: Table with an ``excerpt`` column (consumed by ``CLRPDataset``).
        path: Directory / identifier accepted by ``from_pretrained``.
        plot_losses: Unused; accepted for backward compatibility.
        verbose: When true (default) print the device and show a progress bar.

    Returns:
        ``np.ndarray`` with one embedding row per excerpt, in input order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Wrap the loader itself (not enumerate) so tqdm knows the total
        # number of batches; the batch index was never used.
        for inputs in tqdm(dl, disable=not verbose):
            # Each dataset item is tokenized with return_tensors='pt', so the
            # collated tensors carry an extra singleton axis; flatten every
            # tensor to (batch, -1) before moving it to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:,0].detach().cpu().numpy())

    # This function is called once per model and split; drop the model
    # explicitly so its GPU memory is returned before the next one is loaded.
    del model
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    return np.array(embeddings)

In [None]:
# Extract embeddings with each of the five model directories. For every
# directory the train table is embedded first, then the test table.
model_dirs = [
    '../input/modelf1',
    '../input/modelf2',
    '../input/modelf3',
    '../input/modelf4',
    '../input/modelf5',
]

embedding_pairs = [
    (get_embeddings(train_data, model_dir), get_embeddings(test_data, model_dir))
    for model_dir in model_dirs
]

(
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
) = embedding_pairs

## Infinite-width network regression with Neural Tangents (NNGP kernel)

In [None]:
# Install frozendict and neural_tangents from local wheel files under ../input.
# frozendict is installed first — presumably because neural_tangents depends
# on it; confirm before reordering.
!pip3 install ../input/frozendict/frozendict-2.0.2-py3-none-any.whl
!pip3 install ../input/neuraltangent/neural_tangents-0.3.6-py2.py3-none-any.whl

In [None]:
from jax import random
from neural_tangents import stax
import neural_tangents as nt

def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Cross-validated kernel regression with an infinite-width network.

    Despite the name, no SVM is fitted: for each stratified fold a Neural
    Tangents predictor is built from the training part, scored on the
    validation part (RMSE is printed), and applied to ``X_test``. The test
    predictions of all folds are averaged.

    Args:
        X: Training feature matrix, indexable by row index arrays.
        y: Targets aligned with the rows of ``X``.
        X_test: Test feature matrix.
        bins: Per-row labels used to stratify the folds.
        nfolds: Number of folds.
        C: Unused; kept so existing calls remain valid.
        kernel: Unused; kept so existing calls remain valid.

    Returns:
        1-D ``np.ndarray`` of length ``X_test.shape[0]`` holding the mean of
        the per-fold test predictions.
    """
    # The architecture does not depend on the fold, so the kernel function is
    # built once instead of once per fold.
    res_block = stax.serial(
        stax.FanOut(2),
        stax.parallel(
            stax.serial(
                stax.Erf(),
                stax.Dense(1, W_std=1.25, b_std=0.0),
                stax.Erf(),
                stax.Dense(1, W_std=1.25, b_std=0.0),
                stax.Erf(),
                stax.Dense(1, W_std=1.25, b_std=0.0),
            ),
            stax.Identity(),
        ),
        stax.FanInSum()
    )

    # Only the kernel function is needed for the closed-form predictor; the
    # finite-width init/apply functions (and the PRNG key that fed them) were
    # never used and have been dropped.
    _, _, kernel_fn = stax.serial(
        stax.Dense(1, W_std=1.0, b_std=0),
        res_block, res_block, stax.Erf(),
        stax.Dense(1, W_std=2.5, b_std=0.1)
    )

    kfold = StratifiedKFold(n_splits=nfolds)
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        # NOTE(review): confirm ``lr`` is the keyword this neural_tangents
        # version expects for the learning rate; it is passed through unchanged.
        predict_fn = nt.predict.gradient_descent_mse_ensemble(kernel_fn,
                                                              X_train,
                                                              y_train[:,np.newaxis],
                                                              diag_reg=1e-1,
                                                              lr=1)

        # Targets are passed as a column, so predictions are expected to come
        # back as (n, 1). Flatten them to 1-D NumPy arrays so they compare and
        # accumulate element-wise instead of broadcasting against 1-D arrays.
        valid_pred = np.asarray(predict_fn(x_test=X_valid, get='nngp', t=None)).reshape(-1)
        score = rmse_score(y_valid, valid_pred)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)

        preds += np.asarray(predict_fn(x_test=X_test, get='nngp', t=None)).reshape(-1)

    print("mean rmse",np.mean(scores))
    return preds/nfolds

In [None]:
# One averaged test-prediction vector per embedding set, computed in order.
embedding_sets = [
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
]

svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = [
    get_preds_svm(train_emb, target, test_emb)
    for train_emb, test_emb in embedding_sets
]

In [None]:
# Unweighted mean of the five per-model prediction vectors.
per_model_preds = (svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5)
svm_preds = sum(per_model_preds[1:], per_model_preds[0]) / 5

In [None]:
# Write the ensembled predictions into the submission template and save it.
# Item assignment is used instead of ``sample.target = ...``: attribute-style
# assignment only updates a column that already exists and would otherwise
# set a plain Python attribute that never reaches the CSV.
sample['target'] = svm_preds
sample.to_csv('submission.csv',index=False)

In [None]:
# Display the submission table as the cell output.
sample