# Model 1

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

from sklearn.model_selection import KFold
from sklearn.svm import SVR

import gc
gc.enable()

In [None]:
# Inference settings for the roberta-base ensemble ("Model 1").
BATCH_SIZE = 32  # batch size of the test DataLoader
MAX_LEN = 248  # LitDataset pads/truncates every excerpt to this many tokens
# NOTE(review): not referenced anywhere in the visible inference code —
# presumably a leftover from the training notebook; confirm before removing.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
ROBERTA_PATH = "../input/huggingface-roberta/roberta-base"  # config + weights loaded by LitModel
TOKENIZER_PATH = "../input/huggingface-roberta/roberta-base"  # tokenizer files
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available

In [None]:
# Test excerpts to score, and the submission template whose `target` column
# is overwritten with the blended predictions at the end of the notebook.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Module-level tokenizer; LitDataset reads this global when it encodes text.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [None]:
class LitDataset(Dataset):
    """Excerpts that are tokenized once at construction and served as tensors.

    Args:
        df: DataFrame with an ``excerpt`` column and, unless
            ``inference_only`` is set, a ``target`` column.
        inference_only: When True no targets are read and each item is
            ``(input_ids, attention_mask)``; otherwise each item is
            ``(input_ids, attention_mask, target)`` with a float32 target.

    Encoding uses the module-level ``tokenizer`` and pads/truncates every
    excerpt to ``MAX_LEN`` tokens.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Encode the whole corpus in a single call so that __getitem__ only
        # has to index into the result.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        ids = torch.tensor(self.encoded['input_ids'][index])
        mask = torch.tensor(self.encoded['attention_mask'][index])

        if not self.inference_only:
            return (ids, mask, self.target[index])
        return (ids, mask)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [None]:
class LitModel(nn.Module):
    """RoBERTa encoder with attention pooling and a linear regression head.

    The encoder is loaded from ``ROBERTA_PATH``.  ``forward`` returns one
    score per input row (output of a ``Linear(768, 1)`` head).
    """

    def __init__(self):
        super().__init__()

        # Expose every layer's hidden states and switch hidden dropout off.
        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Scores each token's 768-d state with a small MLP, then normalises
        # the scores with a softmax along dim=1 (the token axis).
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        # Kept inside a Sequential so the parameter names stay
        # ``regressor.0.*`` in the state dict.
        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # encoder layer; [-1] is the final layer: (batch, tokens, 768).
        hidden = encoder_out.hidden_states[-1]

        # One weight per token: (batch, tokens, 1).
        # NOTE(review): attention_mask is not applied to these weights, so
        # padded positions take part in the softmax as well.
        token_weights = self.attention(hidden)

        # Weighted sum over the token axis -> (batch, 768) context vector.
        pooled = (token_weights * hidden).sum(dim=1)

        return self.regressor(pooled)

In [None]:
def predict(model, data_loader):
    """Return an np.array with predictions of the |model| on |data_loader|.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.  It is
            switched to eval mode and left in that mode.
        data_loader: Loader yielding ``(input_ids, attention_mask)`` batches;
            ``data_loader.dataset`` must support ``len()``.

    Returns:
        1-D float array with one prediction per dataset item, in the order
        the loader produced them.
    """
    model.eval()

    # Send batches to the device the model actually lives on instead of
    # relying on a module-level constant; a model without parameters is
    # assumed to run on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            pred = model(input_ids, attention_mask)

            # The last batch may be short, so advance by the real batch size.
            batch_len = pred.shape[0]
            result[index:index + batch_len] = pred.flatten().cpu().numpy()
            index += batch_len

    return result

# Inference

In [None]:
# Tokenize the test excerpts (no targets at inference time).
# NOTE(review): the next cell builds the same dataset again, so the test set
# is tokenized twice — one of the two constructions looks redundant.
test_dataset = LitDataset(test_df, inference_only=True)

In [None]:
NUM_MODELS = 5

# Fixed order, nothing dropped: row i of every prediction vector must line up
# with row i of test_df.
test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=2,
)

# One row of test-set predictions per checkpoint.
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

for model_index in range(NUM_MODELS):
    # Checkpoint files are numbered from 1.
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[model_index] = predict(model, test_loader)

    # Release this checkpoint before the next one is loaded.
    del model
    gc.collect()

In [None]:
# Ensemble the checkpoints: average over the model axis, leaving one
# prediction per test row.
model1_predictions = all_predictions.mean(axis=0)

# Model 2
Imported from [https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3)

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, AutoConfig,
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from colorama import Fore, Back, Style
# Short aliases for colorama colour/reset codes.
# NOTE(review): none of these aliases is referenced in the visible code.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

# Competition data for the second model family.
# NOTE(review): `sample` is not read anywhere in the visible code.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Discretise the continuous target into floor(1 + log2(n)) bins (Sturges'
# rule) so that StratifiedKFold in get_preds_svm has labels to stratify on.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

# Plain NumPy views used by get_preds_svm (`bins` is its default argument).
target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
# Settings shared by the roberta-large embedding + SVR/Ridge pipeline.
config = {
    'batch_size':8,  # DataLoader batch size in get_embeddings
    'max_len':256,  # token length CLRPDataset pads/truncates to
    'nfolds':5,  # NOTE(review): never read in the visible code
    'seed':42,  # passed to seed_everything and to StratifiedKFold
}

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Value applied to Python's ``random``, NumPy and PyTorch (CPU
            and all CUDA devices).
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this does not change
    # hashing in the running process; it only reaches processes launched
    # afterwards that inherit the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the algorithm per run, which defeats the
    # determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

# Seed the RNGs before the embedding models and CV splits below are created.
seed_everything(seed=config['seed'])

class CLRPDataset(Dataset):
    """Excerpts that are tokenized lazily, one item per access.

    Args:
        df: DataFrame with an ``excerpt`` column.
        tokenizer: Callable tokenizer applied to a single excerpt.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        # Pad/truncate to the configured length and ask for PyTorch tensors.
        text = self.excerpt[idx]
        return self.tokenizer(
            text,
            return_tensors='pt',
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
        )
    
    
class Model(nn.Module):
    """roberta-large encoder returning a layer-normalised masked mean of token states.

    ``drop_out1``, ``drop_out2``, ``layer_norm1``, ``l1`` and ``l2`` are
    created but never used in ``forward``.
    NOTE(review): presumably they are kept so the parameter names match the
    saved checkpoints — confirm before removing any of them.
    """

    def __init__(self):
        super().__init__()

        roberta_dir = '../input/huggingface-roberta/roberta-large'
        # This local shadows the module-level ``config`` dict inside __init__.
        config = AutoConfig.from_pretrained(roberta_dir)
        self.model = AutoModel.from_pretrained(roberta_dir, config=config)

        self.drop_out1 = nn.Dropout(0)
        self.drop_out2 = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(1024)
        self.layer_norm1 = nn.LayerNorm(1024)
        self.l1 = nn.Linear(1024, 512)
        self.l2 = nn.Linear(512, 1)

        # Same order as the layers were declared above.
        for layer in (self.layer_norm1, self.l1, self.l2):
            self._init_weights(layer)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place according to its layer type."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the pooled embedding; ``labels`` is accepted but unused."""
        token_states = self.model(input_ids, attention_mask)[0]

        # Broadcast the mask over the hidden axis so masked tokens add zero.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_sum = torch.sum(token_states * mask, 1)

        # Clamp so a fully masked row cannot divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)

        return self.layer_norm(masked_sum / token_count)

def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Encode every excerpt in ``df`` with the ``Model`` checkpoint at ``path``.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: File holding a state dict for ``Model``.
        plot_losses: Unused; kept so existing calls keep working.
        verbose: Unused; kept so existing calls keep working.

    Returns:
        np.ndarray holding one ``Model`` output row per row of ``df``, in
        ``df`` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # map_location keeps the CPU fallback above usable: without it a
    # checkpoint saved from a GPU cannot be deserialised on a CPU-only host.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/huggingface-roberta/roberta-large')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm pick up its length.
        for inputs in tqdm(dl):
            # Flatten each encoded field to (batch, -1) before the forward pass.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs.detach().cpu().numpy())
    return np.array(embeddings)

def get_preds_svm(X,y,X_test,RidgeReg=0,bins=bins,nfolds=10,C=8,kernel='rbf'):
    """Fit one regressor per stratified fold and average their test predictions.

    Args:
        X: Training feature matrix, indexable by fold index arrays.
        y: Training targets aligned with ``X``.
        X_test: Feature matrix to predict.
        RidgeReg: Truthy -> ``Ridge(alpha=80.0)``; falsy -> ``SVR``.
        bins: Stratification labels aligned with ``X``.  The default is the
            module-level ``bins`` as it was when this function was defined.
        nfolds: Number of stratified folds (and of averaged models).
        C: SVR regularisation parameter (ignored for Ridge).
        kernel: SVR kernel (ignored for Ridge).

    Returns:
        np.ndarray with the mean test prediction over all folds.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # The split count must follow ``nfolds``: the summed predictions are
    # divided by ``nfolds`` below, so a hard-coded split count would scale the
    # result incorrectly for any non-default value.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        if(RidgeReg):
            print("ridge...")
            model = Ridge(alpha=80.0)
        else:
            model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

# version0 checkpoints 0, 2, 3 and 4: embed train and test, fit SVR and Ridge
# on the train embeddings, then free the embeddings before the next checkpoint.
# NOTE(review): ridge_preds0/2/3/4 and svm_preds3 are never read in the
# visible code, so those fits look like wasted work — confirm before removing.
train_embeddings0 =  get_embeddings(train_data,'../input/version0/model0/model0.bin')
test_embeddings0 = get_embeddings(test_data,'../input/version0/model0/model0.bin')
svm_preds0 = get_preds_svm(train_embeddings0,target,test_embeddings0)
ridge_preds0 = get_preds_svm(train_embeddings0,target,test_embeddings0,RidgeReg=1)
del train_embeddings0,test_embeddings0
gc.collect()

train_embeddings2 =  get_embeddings(train_data,'../input/version0/model2/model2.bin')
test_embeddings2 = get_embeddings(test_data,'../input/version0/model2/model2.bin')
svm_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2)
ridge_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2,RidgeReg=1)
del train_embeddings2,test_embeddings2
gc.collect()
train_embeddings3 =  get_embeddings(train_data,'../input/version0/model3/model3.bin')
test_embeddings3 = get_embeddings(test_data,'../input/version0/model3/model3.bin')
svm_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3)
ridge_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3,RidgeReg=1)
del train_embeddings3,test_embeddings3
gc.collect()

train_embeddings4 =  get_embeddings(train_data,'../input/version0/model4/model4.bin')
test_embeddings4 = get_embeddings(test_data,'../input/version0/model4/model4.bin')
svm_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4)
ridge_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4,RidgeReg=1)
del train_embeddings4,test_embeddings4
gc.collect()









In [None]:
class Model2(nn.Module):
    """roberta-large encoder returning a layer-normalised masked mean of token states.

    Unlike ``Model``, the encoder config is updated before loading (hidden
    states exposed, hidden dropout 0.0, layer-norm eps 1e-7).

    ``drop_out1``, ``drop_out2``, ``layer_norm1``, ``l1`` and ``l2`` are
    created but never used in ``forward``.
    NOTE(review): presumably they are kept so the parameter names match the
    saved checkpoints — confirm before removing any of them.
    """

    def __init__(self):
        super().__init__()

        roberta_dir = '../input/huggingface-roberta/roberta-large'
        # This local shadows the module-level ``config`` dict inside __init__.
        config = AutoConfig.from_pretrained(roberta_dir)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        })
        self.model = AutoModel.from_pretrained(roberta_dir, config=config)

        self.drop_out1 = nn.Dropout(0)
        self.drop_out2 = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(1024)
        self.layer_norm1 = nn.LayerNorm(1024)
        self.l1 = nn.Linear(1024, 512)
        self.l2 = nn.Linear(512, 1)

        # Same order as the layers were declared above.
        for layer in (self.layer_norm1, self.l1, self.l2):
            self._init_weights(layer)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place according to its layer type."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the pooled embedding; ``labels`` is accepted but unused."""
        token_states = self.model(input_ids, attention_mask)[0]

        # Broadcast the mask over the hidden axis so masked tokens add zero.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_sum = torch.sum(token_states * mask, 1)

        # Clamp so a fully masked row cannot divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)

        return self.layer_norm(masked_sum / token_count)

def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Encode every excerpt in ``df`` with the ``Model2`` checkpoint at ``path``.

    This definition replaces the earlier ``get_embeddings`` for all code that
    runs after it.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: File holding a state dict for ``Model2``.
        plot_losses: Unused; kept so existing calls keep working.
        verbose: Unused; kept so existing calls keep working.

    Returns:
        np.ndarray holding one ``Model2`` output row per row of ``df``, in
        ``df`` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model2()
    # map_location keeps the CPU fallback above usable: without it a
    # checkpoint saved from a GPU cannot be deserialised on a CPU-only host.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/huggingface-roberta/roberta-large')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm pick up its length.
        for inputs in tqdm(dl):
            # Flatten each encoded field to (batch, -1) before the forward pass.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs.detach().cpu().numpy())
    return np.array(embeddings)

def get_preds_svm(X,y,X_test,RidgeReg=0,bins=bins,nfolds=10,C=10,kernel='rbf'):
    """Fit one regressor per stratified fold and average their test predictions.

    This definition replaces the earlier ``get_preds_svm`` for all code that
    runs after it; the default ``C`` here is 10.

    Args:
        X: Training feature matrix, indexable by fold index arrays.
        y: Training targets aligned with ``X``.
        X_test: Feature matrix to predict.
        RidgeReg: Truthy -> ``Ridge(alpha=80.0)``; falsy -> ``SVR``.
        bins: Stratification labels aligned with ``X``.  The default is the
            module-level ``bins`` as it was when this function was defined.
        nfolds: Number of stratified folds (and of averaged models).
        C: SVR regularisation parameter (ignored for Ridge).
        kernel: SVR kernel (ignored for Ridge).

    Returns:
        np.ndarray with the mean test prediction over all folds.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # The split count must follow ``nfolds``: the summed predictions are
    # divided by ``nfolds`` below, so a hard-coded split count would scale the
    # result incorrectly for any non-default value.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        if(RidgeReg):
            print("ridge...")
            model = Ridge(alpha=80.0)
        else:
            model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

# nepoch2 checkpoints 0-2.  From here on get_embeddings / get_preds_svm are
# the redefinitions above (Model2 encoder, SVR default C=10).  Embeddings are
# freed after each checkpoint before the next one is loaded.
train_embeddings0 =  get_embeddings(train_data,'../input/nepoch2/model0/model0.bin')
test_embeddings0 = get_embeddings(test_data,'../input/nepoch2/model0/model0.bin')
svm0 = get_preds_svm(train_embeddings0,target,test_embeddings0)

del train_embeddings0,test_embeddings0
gc.collect()
train_embeddings1 =  get_embeddings(train_data,'../input/nepoch2/model1/model1.bin')
test_embeddings1= get_embeddings(test_data,'../input/nepoch2/model1/model1.bin')
svm1 = get_preds_svm(train_embeddings1,target,test_embeddings1)

del train_embeddings1,test_embeddings1
gc.collect()    
train_embeddings2 =  get_embeddings(train_data,'../input/nepoch2/model2/model2.bin')
test_embeddings2 = get_embeddings(test_data,'../input/nepoch2/model2/model2.bin')
svm2 = get_preds_svm(train_embeddings2,target,test_embeddings2)

del train_embeddings2,test_embeddings2
gc.collect()




# fork-of-fork-of-nepoch2 checkpoints 0-4: same embed -> SVR -> free cycle,
# with results stored as svmpreds0..svmpreds4.
train_embeddings0 =  get_embeddings(train_data,'../input/fork-of-fork-of-nepoch2/model0/model0.bin')
test_embeddings0 = get_embeddings(test_data,'../input/fork-of-fork-of-nepoch2/model0/model0.bin')
svmpreds0 = get_preds_svm(train_embeddings0,target,test_embeddings0)

del train_embeddings0,test_embeddings0
gc.collect()
train_embeddings1 =  get_embeddings(train_data,'../input/fork-of-fork-of-nepoch2/model1/model1.bin')
test_embeddings1= get_embeddings(test_data,'../input/fork-of-fork-of-nepoch2/model1/model1.bin')
svmpreds1 = get_preds_svm(train_embeddings1,target,test_embeddings1)

del train_embeddings1,test_embeddings1
gc.collect()    
train_embeddings2 =  get_embeddings(train_data,'../input/fork-of-fork-of-nepoch2/model2/model2.bin')
test_embeddings2 = get_embeddings(test_data,'../input/fork-of-fork-of-nepoch2/model2/model2.bin')
svmpreds2 = get_preds_svm(train_embeddings2,target,test_embeddings2)

del train_embeddings2,test_embeddings2
gc.collect()
train_embeddings3 =  get_embeddings(train_data,'../input/fork-of-fork-of-nepoch2/model3/model3.bin')
test_embeddings3 = get_embeddings(test_data,'../input/fork-of-fork-of-nepoch2/model3/model3.bin')
svmpreds3= get_preds_svm(train_embeddings3,target,test_embeddings3)

del train_embeddings3,test_embeddings3
gc.collect()

train_embeddings4 =  get_embeddings(train_data,'../input/fork-of-fork-of-nepoch2/model4/model4.bin')
test_embeddings4 = get_embeddings(test_data,'../input/fork-of-fork-of-nepoch2/model4/model4.bin')
svmpreds4 = get_preds_svm(train_embeddings4,target,test_embeddings4)

del train_embeddings4,test_embeddings4
gc.collect()







# Mean of the five fork-of-fork-of-nepoch2 SVR predictions.
svmpreds = (svmpreds1 + svmpreds2 + svmpreds4 + svmpreds0+svmpreds3)/5
# Equal-weight mean of three version0 SVR predictions and the average above.
# NOTE(review): svm_preds3 is computed earlier but not included here —
# confirm whether `svmpreds` was meant to be `svm_preds3`.
svms = ( svm_preds2 + svmpreds + svm_preds4 + svm_preds0)/4

# Mean of the three nepoch2 SVR predictions.
svm = (svm1 + svm2  + svm0)/3

In [None]:
# Blend of the two SVR groups: 30% nepoch2 mean, 70% version0/fork mean.
svm_preds=svm*0.3+svms*0.7

In [None]:
# Final blend: 30% roberta-base ensemble (Model 1), 70% roberta-large SVR blend.
predictions = model1_predictions * 0.3 + svm_preds * 0.7

In [None]:
# Write the blended predictions into the submission template and save it.
# NOTE(review): assumes the template rows are in the same order as test_df /
# test_data, which produced `predictions` — confirm against the CSVs.
submission_df.target = predictions
print(submission_df)
submission_df.to_csv("submission.csv", index=False)