# SVM + RoBERTa-large

Dataset links (model):

'../input/svm-robertalarge-pretrain/clrp_roberta_large' -> https://www.kaggle.com/joechan619/svm-robertalarge-pretrain

'../input/svm-robertalarge' -> https://www.kaggle.com/joechan619/svm-robertalarge


In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama ANSI codes (foreground colours plus a style reset).
# NOTE(review): none of these aliases is referenced in the visible part of this
# notebook -- confirm before relying on or removing them.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Competition data: labelled training excerpts, test excerpts and the
# submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Discretise the continuous target into equal-width bins (bin count from
# Sturges' formula, 1 + log2(n)) so that folds can be stratified on it.
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data['bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)

# Plain NumPy views used by the SVR cross-validation below.
bins = train_data['bins'].to_numpy()
target = train_data['target'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Settings shared by the SVM + RoBERTa-large cells.
config = {
    'batch_size':128,  # excerpts per forward pass when extracting embeddings
    'max_len':256,     # tokenizer max_length (excerpts are padded/truncated to it)
    'nfolds':10,       # number of cross-validation folds
    'seed':42,         # global random seed
}

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # The interpreter variable is PYTHONHASHSEED (the previous spelling,
    # 'PYTHONASSEED', was silently ignored). Setting it at runtime only affects
    # interpreters started afterwards (e.g. worker subprocesses); it does not
    # change the hash seed of the current process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN autotune and pick algorithms per run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Dataset over the `excerpt` column that tokenizes one text per item.

    Tokenization happens lazily in `__getitem__`; every item is padded or
    truncated to `config['max_len']` tokens and returned as PyTorch tensors.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        text = self.excerpt[idx]
        return self.tokenizer(
            text,
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling over dim 1 of the input.

    A scalar score is computed for every position as V(tanh(W(x))), the scores
    are softmax-normalised along dim 1, and the input is summed along dim 1
    with those weights. The result therefore keeps the last dimension of the
    input (`in_features`). `num_targets` is accepted but not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

        # W is created before V, as in the checkpoints' parameter order.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        # One unnormalised score per position.
        scores = self.V(torch.tanh(self.W(features)))
        # Normalise across positions (dim 1), then take the weighted sum.
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

In [None]:
class Model(nn.Module):
    """RoBERTa-large encoder with attention pooling, used as a feature extractor.

    `forward` returns the pooled vector produced by the attention head; the
    `dropout` and `linear` layers are constructed but not applied in `forward`.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/svm-robertalarge-pretrain/clrp_roberta_large')
        # Head width is 1024 (it was 768 for roberta-base) to match roberta-large.
        self.head = AttentionHead(1024,1024,1)
        # NOTE(review): unused in forward -- presumably kept so the fine-tuned
        # checkpoints, saved with these layers, still load; confirm.
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        token_states = self.roberta(**xb)[0]
        return self.head(token_states)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Return the pooled `Model` output for every excerpt in `df`.

    Args:
        df: DataFrame with an `excerpt` column.
        path: Path of the fine-tuned state dict loaded into `Model`.
        plot_losses: Unused; kept for interface compatibility.
        verbose: Unused; kept for interface compatibility.

    Returns:
        np.ndarray with one embedding row per row of `df`, in `df` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # map_location makes a checkpoint that was saved from a GPU run loadable
    # on a CPU-only machine, which the device fallback above is meant to allow.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/svm-robertalarge-pretrain/clrp_roberta_large')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,   # keep row order so embeddings align with df
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm show the number of batches.
        for inputs in tqdm(dl):
            # Each item is tokenized with return_tensors='pt', so batches
            # presumably arrive as (batch, 1, max_len); flatten to (batch, -1).
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs.detach().cpu().numpy())
    return np.array(embeddings)

## svm

In [None]:
def get_preds_svm(X,y,X_test,bins=bins,nfolds=10,C=10,kernel='rbf'):
    """Cross-validate an SVR on embeddings and predict the test set.

    Args:
        X: Training feature matrix (one row per training sample).
        y: Training targets aligned with `X`.
        X_test: Test feature matrix.
        bins: Per-sample bin labels used to stratify the folds.
        nfolds: Number of stratified folds; also the number of fold models
            whose test predictions are averaged.
        C: SVR regularisation parameter.
        kernel: SVR kernel name.

    Returns:
        Tuple `(train_preds, test_preds)`: out-of-fold predictions for `X` and
        the mean of the fold models' predictions for `X_test`.
    """
    scores = []
    train_preds = np.zeros(X.shape[0])
    preds = np.zeros(X_test.shape[0])

    # Split into `nfolds` folds. Previously the split count came from
    # config['nfolds'] while the average below divided by `nfolds`, which
    # mis-scaled the test predictions whenever the two values differed.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        train_preds[valid_idx] = prediction
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        # Accumulate; divided by the number of folds on return.
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return train_preds,preds/nfolds

In [None]:
def get_svm(fold):
    """Run the embedding + SVR pipeline for the checkpoint of one fold.

    Returns the `(out-of-fold train predictions, test predictions)` pair
    produced by `get_preds_svm` on that checkpoint's embeddings.
    """
    checkpoint = f'../input/svm-robertalarge/model{fold}/model{fold}.bin'
    train_emb = get_embeddings(train_data, checkpoint)
    test_emb = get_embeddings(test_data, checkpoint)
    return get_preds_svm(train_emb, target, test_emb)

In [None]:
# Run the embeddings -> SVR pipeline once per fine-tuned checkpoint
# (model0 .. model4) and collect each checkpoint's out-of-fold train
# predictions and its test predictions.
train_svm_list = []
test_svm_list = []
for fold in range(5):
    train_svm,svm_preds = get_svm(fold)
    train_svm_list.append(train_svm)
    test_svm_list.append(svm_preds)

In [None]:
# Average the per-checkpoint predictions element-wise (axis 0 = checkpoint).
train_svm = np.mean(train_svm_list, axis=0)
test_svm = np.mean(test_svm_list, axis=0)

In [None]:
from sklearn.metrics import mean_squared_error
def rmse(targets, preds):
    """Return the root-mean-squared error, rounded to 4 decimal places."""
    score = np.sqrt(mean_squared_error(targets, preds))
    return round(score, 4)
# Cross-validated score of the checkpoint-averaged SVM predictions.
svm_cv_rmse = rmse(np.array(train_data.target.values), train_svm)
print('SVM + RoBERTa CV’s RMSE:{}'.format(svm_cv_rmse))

# Model 4

Dataset links (model):

"../input/clrp-roberta-base/clrp_roberta_base" -> https://www.kaggle.com/maunish/clrp-roberta-base

"../input/commonlit-roberta-0467/" -> https://www.kaggle.com/andretugan/commonlit-roberta-0467

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import gc
gc.enable()

In [None]:
# Inference constants for Model 4 (roberta-base).
BATCH_SIZE = 32
MAX_LEN = 248
# NOTE(review): not referenced by the visible inference code -- presumably a
# leftover from the training notebook.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]

ROBERTA_PATH = "../input/clrp-roberta-base/clrp_roberta_base"
TOKENIZER_PATH = "../input/clrp-roberta-base/clrp_roberta_base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
# Rows whose target and standard_error are both 0 are treated as incomplete:
# report their index, drop them, and renumber the remaining rows from 0.
_incomplete_idx = train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index
print(_incomplete_idx)
train_df.drop(_incomplete_idx, inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# DATASET
class LitDataset(Dataset):
    """Pre-tokenized excerpts, optionally paired with their targets.

    All excerpts are encoded once in `__init__` with the module-level
    `tokenizer`, padded/truncated to `MAX_LEN`. Items are
    `(input_ids, attention_mask)` when `inference_only` is true and
    `(input_ids, attention_mask, target)` otherwise.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        # Targets exist only for labelled data.
        if not self.inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        item = (
            torch.tensor(self.encoded['input_ids'][index]),
            torch.tensor(self.encoded['attention_mask'][index]),
        )
        if not self.inference_only:
            item += (self.target[index],)
        return item
# MODEL
class LitModel(nn.Module):
    """RoBERTa encoder + attention pooling + linear regressor.

    The last hidden layer's token states are condensed into one context vector
    by a learned attention (softmax over dim 1), and that vector is mapped to
    a single score per sample. Layer widths are hard-coded to 768.
    """

    def __init__(self):
        super().__init__()

        roberta_config = AutoConfig.from_pretrained(ROBERTA_PATH)
        overrides = {
            "output_hidden_states": True,  # forward reads .hidden_states
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        roberta_config.update(overrides)

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=roberta_config)

        # Scores each token state, then normalises the scores across dim 1.
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by every encoder
        # layer; the last entry is the final layer,
        # BATCH_SIZE x MAX_LEN x 768 for roberta-base.
        token_states = encoder_out.hidden_states[-1]

        # One weight per token: BATCH_SIZE x MAX_LEN x 1.
        token_weights = self.attention(token_states)

        # Weighted sum over the token axis -> BATCH_SIZE x 768.
        pooled = (token_weights * token_states).sum(dim=1)

        # Reduce the context vector to the prediction score.
        return self.regressor(pooled)

In [None]:
def rmse(targets, preds):
    """Return the root-mean-squared error, rounded to 4 decimal places."""
    score = np.sqrt(mean_squared_error(targets, preds))
    return round(score, 4)

def set_random_seed(random_seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    # CPU generator, current CUDA device, then every CUDA device.
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    torch.backends.cudnn.deterministic = True
        
def predict(model, data_loader,is_test=False):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    With `is_test` true the loader must yield `(input_ids, attention_mask)`
    batches; otherwise it must yield `(input_ids, attention_mask, target)` and
    the target is ignored. Predictions are written in loader order.
    """
    model.eval()

    result = np.zeros(len(data_loader.dataset))
    offset = 0
    with torch.no_grad():
        for batch in data_loader:
            if is_test:
                input_ids, attention_mask = batch
            else:
                input_ids, attention_mask, _ = batch

            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            # Copy this batch's scores into the next free slice of the output.
            batch_len = pred.shape[0]
            result[offset : offset + batch_len] = pred.flatten().to("cpu")
            offset += batch_len

    return result

In [None]:
# Out-of-fold (OOF) predictions of the five pretrained Model 4 checkpoints on
# the training set, followed by their overall CV RMSE.
gc.collect()
NUM_FOLDS = 5
SEED = 1000
# NOTE(review): for these predictions to be truly out-of-fold, this split has
# to reproduce the folds used when the checkpoints were trained -- presumably
# that is why NUM_FOLDS/SEED are fixed here; confirm against the training code.
kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
valid_prediction = np.zeros(len(train_df))

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    # Checkpoint files are numbered from 1. Only the validation indices are
    # used below; train_indices is ignored.
    model_path = f"../input/commonlit-roberta-0467/model_{fold + 1}.pth"
    print(f"\nUsing {model_path}")
                        
    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))    
    model.to(DEVICE)
        
    set_random_seed(SEED + fold)

    # train_df was re-indexed from 0 after filtering, so the positional indices
    # returned by KFold can be used as labels with .loc here.
    val_dataset = LitDataset(train_df.loc[val_indices])     
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)    
    
    set_random_seed(SEED + fold)   
    
    pred = predict(model,val_loader)
    
    # Scatter this fold's predictions back to the rows they belong to.
    valid_prediction[val_indices] = pred
        
    # Free the model before the next checkpoint is loaded.
    del model
    gc.collect()
print('CV’s RMSE:{}'.format(rmse(train_df.target.values, valid_prediction)))

In [None]:
# The Model 4 cell dropped the training row whose target and standard_error are
# both 0, so `valid_prediction` is one element shorter than the full train.csv.
# Re-insert a value at that row's position so the array lines up with the other
# models' OOF predictions; the inserted 0 equals that row's actual target.
# NOTE(review): position 106 is hard-coded -- presumably the index printed by
# the filtering cell; confirm it if the data changes.
import copy
for_meta = copy.deepcopy(valid_prediction)
for_meta = np.insert(for_meta,106,0)  

In [None]:
# Test-set predictions of the five Model 4 checkpoints, one row per checkpoint.
NUM_MODELS = 5

all_predictions = np.zeros((NUM_MODELS, len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for model_index in range(NUM_MODELS):
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    checkpoint_pred = predict(model, test_loader,is_test=True)
    # blending fold 4 in meta-level: 70% checkpoint 4, 30% SVM test predictions
    if (model_index + 1) == 4:
        checkpoint_pred = (checkpoint_pred * 0.7 + test_svm * 0.3)
    all_predictions[model_index] = checkpoint_pred

    # Free the model before the next checkpoint is loaded.
    del model
    gc.collect()

# Model 19

Dataset links (model):

'../input/roberta-large-5fold-aux' -> https://www.kaggle.com/joechan619/roberta-large-5fold-aux

'../input/tokenizers/roberta-tokenizer.pt' -> https://www.kaggle.com/chamecall/tokenizers

In [None]:
import os
from pathlib import Path
import pandas as pd
from sklearn import model_selection
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

def create_folds(data, num_splits):
    """Add a `fold` column assigning every row to one validation fold.

    The split is a shuffled KFold with random_state=2021. `data` is modified
    in place and also returned.

    Args:
        data: DataFrame to annotate.
        num_splits: Number of folds.

    Returns:
        `data`, with an integer `fold` column in `range(num_splits)`.
    """
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=2021)
    # KFold.split yields *positional* indices. Filling a NumPy array by
    # position and assigning it as a column is correct for any DataFrame index,
    # whereas `.loc[v_, ...]` is only right for a default 0..n-1 index.
    fold_of_row = np.full(len(data), -1, dtype=np.int64)
    for f, (_, v_) in enumerate(kf.split(X=data)):
        fold_of_row[v_] = f
    data["fold"] = fold_of_row
    return data
kfold_df = create_folds(train_df, num_splits=5)

in_folder_path = Path('../input/roberta-large-5fold-aux')
scripts_dir = Path(in_folder_path / 'scripts')
# Execute the dataset's helper scripts in this notebook's namespace, from
# inside their own directory. The cells below use names that presumably come
# from these scripts (Config, CLRPDataset, RandomSampler, SequentialSampler);
# note that any definition with the same name as an earlier cell's replaces it.
# SECURITY NOTE: exec() runs arbitrary code -- only point this at a trusted
# dataset.
os.chdir(scripts_dir)
try:
    for script_name in ("imports.py", "config.py", "dataset.py", "model.py"):
        exec(Path(script_name).read_text())
finally:
    # Always return to the working directory, even if a script raises, so
    # later relative paths (e.g. submission.csv) are not written elsewhere.
    os.chdir('/kaggle/working')

In [None]:
def make_dataloader(data, tokenizer, is_train=True):
    """Wrap `data` in a `CLRPDataset` and return a DataLoader over it.

    Training loaders sample randomly; otherwise the dataset is built in test
    mode with `Config.max_len` and read sequentially. The batch size is
    `Config.batch_size` in both cases.
    """
    if not is_train:
        dataset = CLRPDataset(data, tokenizer=tokenizer,max_len=Config.max_len,is_test=True)
        return DataLoader(dataset, sampler=SequentialSampler(dataset),
                          batch_size=Config.batch_size)

    dataset = CLRPDataset(data, tokenizer=tokenizer)
    return DataLoader(dataset, sampler=RandomSampler(dataset),
                      batch_size=Config.batch_size)

def get_preds_k(dl,model_num,models_folder_path):
    """Predict with the saved model `best_model_{model_num}.pt` over `dl`.

    Args:
        dl: DataLoader yielding dicts of tensors accepted by the model.
        model_num: Index of the saved model to load.
        models_folder_path: Path-like folder that contains the model files.

    Returns:
        1-D np.ndarray of all model outputs, flattened, in loader order.
    """
    # map_location lets a model that was saved from a GPU run load on whatever
    # device Config.device names (including CPU).
    # NOTE: torch.load unpickles a whole model object here -- only load
    # trusted files.
    model = torch.load(models_folder_path / f'best_model_{model_num}.pt',
                       map_location=Config.device).to(Config.device)
    model.eval()

    preds = []
    with torch.no_grad():
        # Iterating the loader directly lets tqdm show the number of batches.
        for inputs in tqdm(dl):
            # Collapse any extra per-item dimension to (batch, -1).
            inputs = {key:val.reshape(val.shape[0],-1).to(Config.device) for key,val in inputs.items()}
            outputs = model(**inputs)
            preds.extend(outputs.detach().cpu().numpy().ravel().tolist())
    return np.array(preds)

def stacking_get_pred_k(models_folder_path):
    """Collect out-of-fold train predictions and summed test predictions.

    For each of the 5 saved models, predicts the test set and the rows of
    `kfold_df` whose `fold` equals the model number.

    Returns:
        `(train_preds, test_preds)`: per-row out-of-fold predictions, and the
        SUM (not the mean) of the 5 models' test predictions.
    """
    tokenizer = torch.load('../input/tokenizers/roberta-tokenizer.pt')
    n_models = 5
    test_preds = np.zeros(test_df.shape[0])
    train_preds = np.zeros(train_df.shape[0])

    for model_num in range(n_models):
        print(f'Inference#{model_num+1}/{n_models}')
        test_dataloader = make_dataloader(test_df, tokenizer, is_train=False)
        test_pred = get_preds_k(test_dataloader,model_num,models_folder_path)
        # blending fold 4 in meta-level: mix in 30% of the SVM test predictions
        if model_num == 3:
            test_pred = (test_pred * 0.7 + test_svm * 0.3)
        test_preds += test_pred

        # Rows held out for this model, written back at their own positions.
        fold_rows = kfold_df[kfold_df.fold==model_num]
        val_dl = make_dataloader(fold_rows, tokenizer, is_train=False)
        val_index = fold_rows.index.tolist()
        train_preds[val_index] = get_preds_k(val_dl,model_num,models_folder_path)
    return train_preds,test_preds

In [None]:
# Model 19: out-of-fold train predictions and summed test predictions.
models_folder_path = in_folder_path / 'models'
train_preds_k_19,test_preds_k_19 = stacking_get_pred_k(models_folder_path)

# Stacking

In [None]:
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

def create_folds(data, num_splits):
    """Add a `kfold` column assigning every row to one validation fold.

    The split is a shuffled KFold with random_state=2021. `data` is modified
    in place and also returned.

    Args:
        data: DataFrame to annotate.
        num_splits: Number of folds.

    Returns:
        `data`, with an integer `kfold` column in `range(num_splits)`.
    """
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=2021)
    # KFold.split yields *positional* indices. Filling a NumPy array by
    # position and assigning it as a column is correct for any DataFrame index,
    # whereas `.loc[v_, ...]` is only right for a default 0..n-1 index.
    fold_of_row = np.full(len(data), -1, dtype=np.int64)
    for f, (_, v_) in enumerate(kf.split(X=data)):
        fold_of_row[v_] = f
    data["kfold"] = fold_of_row
    return data
train = create_folds(train_df, num_splits=5)

# Blending fold 4 at the base level

In [None]:
# Zero-based fold index 3, i.e. "fold 4": row indices inside / outside it.
# NOTE(review): these indices come from the random_state=2021 split, while
# `for_meta` was produced with KFold(random_state=SEED) on the filtered frame,
# so they need not be Model 4's own fold-4 validation rows -- confirm intended.
i = 3
train_idx, val_idx = train.index[train['kfold'] != i].tolist(), train.index[train['kfold'] == i].tolist()

In [None]:
# On the fold-index-3 rows, replace each base model's OOF prediction with a
# 70/30 blend of itself and the SVM OOF prediction -- the same weights that are
# applied to the fourth model's test predictions in the inference cells.
for_meta[val_idx] = (for_meta[val_idx] * 0.7 + train_svm[val_idx] * 0.3)
train_preds_k_19[val_idx] = (train_preds_k_19[val_idx] * 0.7 + train_svm[val_idx] * 0.3)

In [None]:
# Training table for the meta-model: one column of out-of-fold predictions per
# base model, plus the true target and a `kfold` assignment. Only model4 and
# model19 are active; the commented-out columns are other base models that are
# not computed in this notebook.
oof_train = pd.DataFrame()
# oof_train['model1'] = oof_roberta_base_i
# oof_train['model2'] = oof_roberta_large_itpt
# oof_train['model3'] = oof_roberta_large_ii
oof_train['model4'] = for_meta
# oof_train['model5'] = clrp_train_preds
# oof_train['model6'] = clrp_tpu_train_preds
# oof_train['model7'] = k_train_pred
# oof_train['model8'] = train_preds_8
# oof_train['model9'] = distil_train
# oof_train['model10'] = distil_aux_train
# oof_train['model11'] = distil_no_aux_train
# oof_train['model12'] = train_preds_k_12
# oof_train['model13'] = train_all_predictions
# oof_train['model15'] = train_preds_k
# oof_train['model18'] = k_train_pred_18
oof_train['model19'] = train_preds_k_19
oof_train['target'] = train.target.values
# Adds the `kfold` column used by the stacking loop (same seed as `train`).
oof_train = create_folds(oof_train, num_splits=5)
display(oof_train.shape)
oof_train.head()

In [None]:
# Meta-model features: the base-model prediction columns only (no target or
# fold column).
x_oof_train = oof_train[['model4','model19']]
x_oof_train.head()

In [None]:
# Test-set features for the meta-model, with the same columns (and order) as
# x_oof_train. Each column is a base model's test prediction averaged over its
# five checkpoints.
oof_test = pd.DataFrame()
# oof_test['model1'] = pred_df1.mean(axis=1)
# oof_test['model2'] = pred_df2.mean(axis=1)
# oof_test['model3'] = pred_df3.mean(axis=1)
# all_predictions has one row per checkpoint, so average over axis 0.
oof_test['model4'] = all_predictions.mean(axis=0)
# oof_test['model5'] = clrp_test_preds/5
# oof_test['model6'] = clrp_tpu_test_preds/5
# oof_test['model7'] = k_test_pred/5
# oof_test['model8'] = test_preds_8/5
# oof_test['model9'] = distil_test/5
# oof_test['model10'] = distil_aux_test/5
# oof_test['model11'] = distil_no_aux_test/5
# oof_test['model12'] = test_preds_k_12/5
# oof_test['model13'] = test_all_predictions
# oof_test['model15'] = test_preds_k/5
# oof_test['model18'] = k_test_pred_18/5
# test_preds_k_19 is the sum over five models, hence the division by 5.
oof_test['model19'] = test_preds_k_19/5
display(oof_test.shape)
oof_test.head()

In [None]:
from sklearn.linear_model import Ridge

# Ridge meta-model: fit one regressor per fold on the other folds' OOF
# features, score it on the held-out fold, and predict the test set with it.
stacking_preds = []
oof_rmses = []

FOLDS = 5
for fold in range(FOLDS):
    print(f'\nTraining Fold {fold + 1} / {FOLDS}')

    train_idx = oof_train.index[oof_train['kfold']!=fold].tolist()
    val_idx = oof_train.index[oof_train['kfold']==fold].tolist()
    x_train, y_train = x_oof_train.iloc[train_idx], oof_train.target.iloc[train_idx]
    x_val, y_val = x_oof_train.iloc[val_idx], oof_train.target.iloc[val_idx]

    reg = Ridge(alpha = 1)
    reg.fit(x_train, y_train)

    val_pred = reg.predict(x_val)
    oof_rmse = rmse(val_pred, oof_train.target[val_idx].values)
    oof_rmses.append(oof_rmse)
    print(f"Fold {fold+1} train OOF RMSE: {oof_rmse}")

    # Every fold's regressor predicts the full test set; averaged afterwards.
    stacking_preds.append(reg.predict(oof_test))

print('Stacking Regressor: Mean OOF RMSE = {}'.format(np.mean(oof_rmses)))

In [None]:
submission = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")
# Final prediction: unweighted mean of the fold-wise meta-model predictions.
submission['target'] = np.mean(stacking_preds, axis=0)
print(submission)
submission.to_csv("submission.csv", index=False)