This notebook builds on the two notebooks listed below to make its predictions.

1. LB 0.468 https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3
2. LB 0.474 https://www.kaggle.com/maunish/clrp-roberta-svm

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours, plus the style reset code,
# for colouring console output.
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_, c_ = Fore.BLUE, Fore.MAGENTA, Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Load the competition CSVs: the training set (it carries a 'target' column),
# the test set to predict on, and the sample submission file.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Discretise the continuous target into equal-width bins. The bin count is
# floor(1 + log2(n_rows)) (a Sturges-style rule); labels=False stores the
# integer bin index instead of an interval label.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

# NumPy views of the target and its bin index.
# NOTE(review): presumably intended for stratified CV; neither is used in the
# cells visible here - confirm before removing.
target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between *y_true* and *y_pred*."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Global run settings shared by the cells below.
config = dict(
    batch_size=128,  # excerpts per DataLoader batch
    max_len=256,     # tokenizer max_length (pad / truncate to this many tokens)
    nfolds=5,        # number of folds (not read by the cells visible here)
    seed=42,         # seed handed to seed_everything
)

def seed_everything(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original spelling 'PYTHONASSEED'
    # had no effect). Setting it at runtime only influences child processes,
    # e.g. DataLoader workers started via spawn; the running interpreter has
    # already fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover additional GPUs; these calls are no-ops without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, up front, with the configured seed.
seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        df: DataFrame with an 'excerpt' column.
        tokenizer: Callable applied to a single excerpt string; it is called
            with ``return_tensors='pt'`` and padding/truncation to
            ``config['max_len']``.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        # One text at a time; every item is padded/truncated to the same
        # length so that items can be batched together.
        text = self.excerpt[idx]
        return self.tokenizer(
            text,
            return_tensors='pt',
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
        )

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling over dimension 1 of the input.

    A score is computed for every position as ``V(tanh(W(x)))``, the scores
    are soft-maxed over dimension 1, and the input is summed over that
    dimension using the resulting weights.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the hidden projection used for scoring.
        num_targets: Accepted for interface compatibility; not used.

    Attributes:
        out_features: Size of the last dimension of the pooled output.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        # Sub-module names (W, V) and their creation order are kept so that
        # existing checkpoints and seeded initialisation still line up.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # forward() returns a weighted sum of the *inputs*, so the output
        # width is in_features. The original reported hidden_dim here, which
        # is only right when the two sizes happen to be equal.
        self.out_features = in_features

    def forward(self, features):
        """Pool *features* over dimension 1.

        Args:
            features: Tensor whose last dimension is ``in_features``
                (assumed layout: batch, sequence, features - TODO confirm).

        Returns:
            Tensor with dimension 1 summed out.
        """
        # One scalar score per position.
        score = self.V(torch.tanh(self.W(features)))
        # Normalise the scores across dimension 1.
        attention_weights = torch.softmax(score, dim=1)
        # Weighted sum of the input over dimension 1.
        return torch.sum(attention_weights * features, dim=1)

In [None]:
class Model(nn.Module):
    """RoBERTa backbone followed by attention pooling.

    ``forward`` returns the pooled vector produced by ``self.head``; it does
    not apply ``self.dropout`` or ``self.linear``.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        # NOTE(review): dropout and linear are never used in forward();
        # presumably they are kept so the parameter names match the trained
        # checkpoints that get loaded into this class - confirm.
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        # Element 0 of the backbone output is fed to the pooling head.
        backbone_out = self.roberta(**xb)
        pooled = self.head(backbone_out[0])
        return pooled

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt in *df* with the checkpoint stored at *path*.

    Args:
        df: DataFrame with an 'excerpt' column (consumed by CLRPDataset).
        path: Path to a state_dict file compatible with ``Model``.
        plot_losses: Accepted for interface compatibility; not used.
        verbose: When true (the default), print the selected device and show
            a progress bar.

    Returns:
        NumPy array with one row per excerpt, in the same order as *df*
        (the DataLoader does not shuffle). An empty array is returned for an
        empty *df*.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = Model()
    # map_location makes a checkpoint that was saved from GPU tensors load on
    # the CPU fallback as well; without it torch.load raises on CPU-only hosts.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    batches = []
    with torch.no_grad():
        for inputs in tqdm(dl, disable=not verbose):
            # Each dataset item was tokenized with return_tensors='pt', so the
            # collated tensors carry an extra axis; flatten everything after
            # the batch axis before calling the model.
            inputs = {key: val.reshape(val.shape[0], -1).to(device)
                      for key, val in inputs.items()}
            outputs = model(**inputs)
            batches.append(outputs.detach().cpu().numpy())

    if not batches:
        # Same result the original produced for an empty input.
        return np.array([])
    return np.concatenate(batches, axis=0)

In [None]:
# Embed the train and test excerpts with the model0 checkpoint. These two
# arrays are the only embeddings the later cells use.
train_embeddings1 =  get_embeddings(train_data,'../input/clr-roberta/model0/model0.bin')
test_embeddings1 = get_embeddings(test_data,'../input/clr-roberta/model0/model0.bin')
# The string literal below is disabled code kept for reference: the same
# embedding step for the model1 .. model4 checkpoints.
'''
train_embeddings2 =  get_embeddings(train_data,'../input/clr-roberta/model1/model1.bin')
test_embeddings2 = get_embeddings(test_data,'../input/clr-roberta/model1/model1.bin')

train_embeddings3 =  get_embeddings(train_data,'../input/clr-roberta/model2/model2.bin')
test_embeddings3 = get_embeddings(test_data,'../input/clr-roberta/model2/model2.bin')

train_embeddings4 =  get_embeddings(train_data,'../input/clr-roberta/model3/model3.bin')
test_embeddings4 = get_embeddings(test_data,'../input/clr-roberta/model3/model3.bin')

train_embeddings5 =  get_embeddings(train_data,'../input/clr-roberta/model4/model4.bin')
test_embeddings5 = get_embeddings(test_data,'../input/clr-roberta/model4/model4.bin')
'''

In [None]:
def synthesize_excerpt():
    """Placeholder: not implemented yet; does nothing and returns None."""
    return None

In [None]:
# Features and targets for the downstream regressor: the model0 embeddings
# are the inputs and the 'target' column is the label.
train_X, test_X = train_embeddings1, test_embeddings1
train_Y = train_data.target.values

# Shape check before fitting.
print('train_X: ', train_X.shape)
print('train_Y: ', train_Y.shape)
print('test_X: ', test_X.shape)


## From Embedding to Target

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge,Lasso
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [None]:
def Search_Model(train_X=train_embeddings1,train_Y=train_data.target.values):
    """Grid-search scaler, PCA size and a linear regressor on the embeddings.

    The pipeline is ``scaler -> PCA -> regressor``. The grid covers three
    scalers, PCA with 5..9 components, and Ridge / Lasso with ``alpha`` in
    ``2**-4 .. 2**3``, scored by negative MSE with 3-fold CV.

    Args:
        train_X: Feature matrix. The default is bound when the function is
            defined, not when it is called.
        train_Y: Regression targets aligned with *train_X* (same binding
            caveat).

    Returns:
        The fitted ``GridSearchCV`` object; the best pipeline is available
        as its ``best_estimator_`` attribute.
    """
    # 'scaler' and 'regressor' start as None placeholders; the grid below
    # swaps concrete estimators into those steps.
    pipe = Pipeline([
        ('scaler', None),
        ('reduce_dim', PCA()),
        ('regressor', None),
    ])

    param_grid = [{
        'scaler': [StandardScaler(), RobustScaler(), QuantileTransformer()],
        'reduce_dim__n_components': np.arange(5, 10),
        'regressor': [Ridge(), Lasso()],
        'regressor__alpha': 2.0**np.arange(-4, +4),
    }]

    # `scoring` is keyword-only in GridSearchCV; passing it positionally (as
    # the original did) raises TypeError on current scikit-learn releases.
    gridsearch = GridSearchCV(pipe, param_grid,
                              scoring='neg_mean_squared_error',
                              cv=3, verbose=3)
    return gridsearch.fit(train_X, train_Y)

In [None]:
def Try_Model(train_X=train_embeddings1,train_Y=train_data.target.values):
    """Grid-search a ``StandardScaler -> PCA -> SVR`` pipeline.

    The grid covers PCA with 25, 75, 100 or 125 components and a linear-kernel
    SVR with ``C = 8.0``, scored by negative MSE with 3-fold CV.

    Args:
        train_X: Feature matrix. The default is bound when the function is
            defined, not when it is called.
        train_Y: Regression targets aligned with *train_X* (same binding
            caveat).

    Returns:
        The fitted ``GridSearchCV`` object; the best pipeline is available
        as its ``best_estimator_`` attribute.
    """
    # 'scaler' starts as a None placeholder and is filled in by the grid.
    pipe = Pipeline([
        ('scaler', None),
        ('reduce_dim', PCA()),
        ('regressor', SVR()),
    ])

    # Candidates noted but disabled in the original cell: RobustScaler and
    # QuantileTransformer, the 'rbf' kernel, and n_components in 16..24.
    scalers_to_test = [StandardScaler()]
    c_to_test = 2.0**np.arange(3, +4)  # single value: 8.0
    kernel_to_test = ['linear']
    n_features_to_test = np.array([25, 75, 100, 125])

    param_grid = [{
        'scaler': scalers_to_test,
        'reduce_dim__n_components': n_features_to_test,
        'regressor__C': c_to_test,
        'regressor__kernel': kernel_to_test,
    }]

    # `scoring` is keyword-only in GridSearchCV; passing it positionally (as
    # the original did) raises TypeError on current scikit-learn releases.
    gridsearch = GridSearchCV(pipe, param_grid,
                              scoring='neg_mean_squared_error',
                              cv=3, verbose=2)
    return gridsearch.fit(train_X, train_Y)

In [None]:
#grd=Try_Model()

In [None]:
#grd=Search_Model()
#pd.DataFrame(grd.cv_results_)
#best_model=grd.best_estimator_
#best_model

In [None]:
# Final estimator: standardise, project onto 25 principal components, then
# fit a linear-kernel SVR with C=8.0.
# NOTE(review): these values lie inside the Try_Model grid; presumably they
# are its best result - confirm.
selected=Pipeline(steps=[('scaler', StandardScaler()),
                ('reduce_dim', PCA(n_components=25)),
                ('regressor', SVR(C=8.0, kernel='linear'))])

In [None]:
# Disabled alternative kept as a string literal: a Ridge regressor on
# 9 principal components in place of the SVR pipeline.
'''
selected=Pipeline(steps=[('scaler', StandardScaler()),
                ('reduce_dim', PCA(n_components=9)),
                ('regressor', Ridge(alpha=0.5))])
'''

In [None]:
# Fit the chosen pipeline on all training embeddings, then predict the
# target for the test embeddings.
selected.fit(train_X,train_Y)
Y_pred=selected.predict(test_X)

## Submission

In [None]:
#import pickle
#with open('best_model.pickle', 'wb') as handle:
#    pickle.dump(best_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Submission frame: the test ids next to their predicted targets.
ret = pd.DataFrame({'id': test_data['id'], 'target': Y_pred})

In [None]:
# Write the submission file without the DataFrame index column.
ret.to_csv('submission.csv',index=False)