In [None]:
import os
import gc
import psutil
import torch
import pandas as pd
import numpy as np
import random

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
import sklearn.feature_selection as fs
from sklearn.linear_model import LassoCV,RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold,StratifiedKFold
from torch.utils.data import Dataset, DataLoader
import tokenizers

import _pickle as pickle
from tempfile import mkdtemp
from shutil import rmtree

import statistics
#from torchsummary import summary
import transformers
from tqdm.autonotebook import tqdm
from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup,get_linear_schedule_with_warmup)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and current CUDA
    device) and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python honours is PYTHONHASHSEED (the previous key was
    # misspelled and therefore ignored). It only influences interpreters
    # started after this point, e.g. spawned worker processes.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # works against the deterministic flag above, so it is switched off.
    torch.backends.cudnn.benchmark = False

class Config:
    """Static settings read by the dataset, model, training and stacking code."""

    # --- reproducibility / cross-validation ---
    seed = 10    # passed to seed_everything() by the driver cell
    kfold = 12   # number of KFold splits and of per-fold checkpoints loaded for prediction

    # --- backbone and tokenisation ---
    roberta_path = "../input/roberta-base"
    last_layer_size = 768   # input width of the regression Linear layer
    max_len = 256           # tokenizer max_length (inputs are padded/truncated to it)

    # --- optimisation ---
    learning_rate = 3e-5
    train_batch_size = 8
    valid_batch_size = 32
    epochs = 7
    stderr_epochs = 7       # not referenced by the code visible in this notebook

    # --- artefact locations ---
    file_type = '.bin'      # extension appended to per-fold checkpoint names
    save_model_path = "../input/robertabase-clrp/model"
    svm_model_path = '/kaggle/working/model/svr_model.sav'

    # Alternative locations kept from earlier runs:
    #save_model_path = "/kaggle/working/model"
    #save_stderr_model_path = "/kaggle/working/model_stderr"
    #save_stderr_model_path = "../input/robertabase-clrp/model_stderr"
    #svm_model_path = '../input/robertabase-clrp/model/svr_model.sav'

In [None]:
# Load the labelled data, shuffle it once and carve off a hold-out slice.
# The shuffle is seeded explicitly: seed_everything() only runs in the final
# cell, so without random_state the train / hold-out split changed on every
# kernel restart.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_data = train_data.sample(frac = 1, random_state = Config.seed)
test_data_1 = train_data.iloc[2700:]   # rows after the first 2700 form the hold-out set
train_data = train_data.iloc[:2700]

os.makedirs('/kaggle/working/model',exist_ok = True)
# Enable to persist the hold-out rows next to the checkpoints:
#test_data_1.to_csv('/kaggle/working/model' + '/test_data_1.csv')

# Author's note from earlier exploration: the longest excerpt had 205
# whitespace-separated words.
#print(train_data.excerpt.map(lambda x:x.split()).map(len).max()) #205

# The test set has no labels; dummy columns are added so CLRPDataset can read it.
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_data = test_data.assign(target = 0,standard_error = 0)
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
class CLRPDataset(Dataset):
    """Dataset over a frame with 'excerpt', 'target', 'standard_error' and 'id' columns.

    Each item is a dict holding the tokenizer output for one excerpt plus its
    target, standard error, id and positional index.
    """

    def __init__(self,df,tokenizer,max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Columns are copied out once so item access is a plain positional lookup.
        self.excerpt = df['excerpt'].to_numpy()
        self.target = df['target'].to_numpy()
        self.deviation = df['standard_error'].to_numpy()
        self.id = df['id'].tolist()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        # Every excerpt is padded / truncated to max_len tokens.
        encoding = self.tokenizer(
            self.excerpt[idx],
            return_tensors = 'pt',
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True,
        )
        item = {
            'input_id': encoding,   # the full tokenizer output, not only the ids
            'target': torch.tensor(self.target[idx],dtype = torch.float),
            'id': self.id[idx],
            'std_err': torch.tensor(self.deviation[idx],dtype = torch.float),
            'indx': idx,            # position within this dataset
        }
        return item

In [None]:
class RMSELoss(torch.nn.Module):
    """Root-mean-square error with a small epsilon added under the root."""

    def __init__(self):
        super().__init__()

    def forward(self,x,y):
        # 1e-6 is added to the mean squared error before the square root is taken.
        mean_sq_err = nn.MSELoss()(x, y)
        return (mean_sq_err + 1e-6).sqrt()

class SimpleRobertaModel(torch.nn.Module):
    """Pretrained encoder followed by mean pooling, dropout and a 1-unit linear head."""

    def __init__(self,model_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(Config.last_layer_size, 1)
        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, input_ids, attention_mask):
        backbone_out = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            output_hidden_states = True
        )
        # Average the last hidden state over dim 1 (presumably the token axis).
        # The attention mask is not used in the pooling itself, so every
        # position contributes equally.
        pooled = backbone_out.last_hidden_state.mean(dim=1)
        # squeeze(dim=1) only has an effect when that dimension has size 1.
        pooled = pooled.squeeze(dim=1)
        return self.l0(self.drop_out(pooled))

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dim 1 of the input features.

    Args:
        in_features: size of the last dimension of the incoming features.
        hidden_dim: width of the intermediate tanh projection.
        num_targets: accepted for interface compatibility; not used.

    Attributes:
        out_features: size of the last dimension of the pooled output.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # The pooled vector is a weighted sum of the *input* features, so its
        # width is in_features. Reporting hidden_dim here would make any layer
        # sized from out_features mismatch whenever in_features != hidden_dim.
        self.out_features = in_features

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 1."""
        att = torch.tanh(self.W(features))
        score = self.V(att)
        # Normalise the scores across dim 1 so the weights sum to one per sample.
        attention_weights = torch.softmax(score, dim=1)

        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)
        return context_vector

class RobertaModelAttention(torch.nn.Module):
    """Pretrained encoder followed by an AttentionHead and a 1-unit linear layer."""

    def __init__(self,model_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.head = AttentionHead(768,768,1)
        self.l0 = nn.Linear(self.head.out_features, 1)
        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, input_ids, attention_mask):
        backbone_out = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        # Element 0 of the encoder output is the last hidden state
        # (per the original author's note it equals out.last_hidden_state).
        token_states = backbone_out[0]
        pooled = self.head(token_states)
        return self.l0(pooled)

In [None]:
def rmse_score(y_true,y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)

def draw_chart(pred,target):
    """Plot the absolute error |pred - target| against target, twice.

    The left panel is a scatter plot and the right panel a seaborn histplot
    of the same two variables.
    """
    abs_error = np.abs(pred-target)
    fig, (scatter_ax, hist_ax) = plt.subplots(1,2,figsize=(20,10))
    sns.scatterplot(x = target,y = abs_error,ax = scatter_ax)
    sns.histplot(x = target,y = abs_error,ax = hist_ax)
    scatter_ax.set_title("scatter",font="Serif")
    hist_ax.set_title("histogram",font="Serif")
    plt.show()

def save_model(epoch,model_state_dict,optimizer_state_dict,model_path):
    """Write a checkpoint holding the epoch number and the model state dict.

    ``optimizer_state_dict`` is accepted but not stored in the checkpoint.
    """
    checkpoint = dict(epoch = epoch, model_state_dict = model_state_dict)
    torch.save(checkpoint, model_path)

In [None]:
def finetune_training(data_loader, model, optimizer, scheduler, dl_valid, kfold):
    """Train for ``Config.epochs`` further epochs, checkpointing on validation improvement.

    The previous body wrapped the epoch loop in ``while diff_count != 0``, but
    the only code that ever updated ``diff_count`` (an experiment that re-fed
    badly predicted validation rows into training) was disabled inside a
    string literal, so the loop could never terminate. With that step disabled
    there is exactly one round of work, so the round now runs once.

    Args:
        data_loader: loader yielding training batches as produced by CLRPDataset.
        model: network called as ``model(**inputs)``.
        optimizer: optimizer stepped after every batch.
        scheduler: learning-rate scheduler stepped after every batch.
        dl_valid: loader passed to ``eval_fn`` for validation.
        kfold: fold number used in the checkpoint file name.

    Returns:
        None. The best model (lowest ``eval_fn`` score, starting threshold 10)
        is written to ``<Config.save_model_path>/model_<kfold><Config.file_type>``.
    """
    target_var = 'target'
    model_path = Config.save_model_path + '/model_' + str(kfold) + Config.file_type
    last_best = 10
    loss_fn = RMSELoss()

    # NOTE(review): this function never calls model.train(); eval_fn switches
    # the model to eval mode on its first call and it stays that way.
    for epoch in range(Config.epochs):
        rmsescore = list()  # per-batch training losses of the current epoch
        for i, x in enumerate(data_loader):
            model.zero_grad()
            # Flatten each tokenizer tensor to (batch, -1) before moving it to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in x['input_id'].items()}
            outputs = model(**inputs)
            target = x[target_var]
            loss = loss_fn(outputs.squeeze(),target.to(device))
            loss.backward()
            optimizer.step()
            scheduler.step()
            rmsescore.append(loss.item())
            if i%20 == 0:
                # Validate every 20 batches and keep the best checkpoint.
                eval_err = eval_fn(dl_valid, model,target_var)
                print('step: ',i," | evaluation(finetuning) mean rmse: ",eval_err)
                print('training mean rmse: ',np.mean(rmsescore))
                if eval_err < last_best :
                    save_model(epoch,model.state_dict(),optimizer.state_dict(),model_path)
                    last_best = eval_err
        print('training mean rmse: ',np.mean(rmsescore))
        eval_err = eval_fn(dl_valid, model,target_var)
        if eval_err < last_best : # in case the last few examples improved the performance
            save_model(epoch,model.state_dict(),optimizer.state_dict(),model_path)
            last_best = eval_err
        print(f"best eval_err for epoch {epoch} in finetuning is : {last_best}")

def train_fn(data_loader, model, optimizer, scheduler, dl_valid, kfold):
    """Train one fold, checkpointing whenever the validation score improves.

    Validation runs every 10 batches and once more at the end of each epoch.
    Training stops early after two consecutive epochs in which the best
    validation score did not change.

    Args:
        data_loader: loader yielding training batches as produced by CLRPDataset.
        model: network called as ``model(**inputs)``.
        optimizer: optimizer stepped after every batch.
        scheduler: learning-rate scheduler stepped after every batch.
        dl_valid: loader passed to ``eval_fn`` for validation.
        kfold: fold number used in the checkpoint file name.
    """
    model.train()
    target_var = 'target'
    model_path = Config.save_model_path + '/model_' + str(kfold) + Config.file_type

    best_score = 10              # lowest validation score seen so far
    score_at_last_epoch = 10     # best_score as it stood after the previous epoch
    stale_epochs = 0             # consecutive epochs without a new best
    step_losses = []             # training loss of every batch across all epochs

    for epoch in range(Config.epochs):
        loss_fn = RMSELoss()
        for step, batch in enumerate(data_loader):
            model.zero_grad()
            # Flatten each tokenizer tensor to (batch, -1) before moving it to the device.
            inputs = {
                name: tensor.reshape(tensor.shape[0],-1).to(device)
                for name, tensor in batch['input_id'].items()
            }
            predictions = model(**inputs)
            loss = loss_fn(predictions.squeeze(), batch[target_var].to(device))
            loss.backward()
            optimizer.step()
            scheduler.step()
            step_losses.append(loss.item())

            if step % 10 != 0:
                continue
            score = eval_fn(dl_valid, model,target_var)
            if score < best_score:
                save_model(epoch,model.state_dict(),optimizer.state_dict(),model_path)
                best_score = score

        # One more validation pass in case the last few batches helped.
        score = eval_fn(dl_valid, model,target_var)
        if score < best_score:
            save_model(epoch,model.state_dict(),optimizer.state_dict(),model_path)
            best_score = score
        print(f"best eval_err for epoch {epoch} is : {best_score}")

        if score_at_last_epoch != best_score:
            # A new best was reached during this epoch: reset the patience counter.
            score_at_last_epoch = best_score
            stale_epochs = 0
            continue
        stale_epochs += 1
        if stale_epochs == 2:
            break
    
def eval_fn(data_loader, model,target_var):
    """Return the mean of the per-batch RMSE losses over ``data_loader``.

    Args:
        data_loader: loader yielding batches as produced by CLRPDataset.
        model: network called as ``model(**inputs)``.
        target_var: key of the batch entry to score against.

    The model is switched to eval mode and is not switched back afterwards.
    """
    model.eval()
    loss_fn = RMSELoss()
    batch_scores = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {
                name: tensor.reshape(tensor.shape[0],-1).to(device)
                for name, tensor in batch['input_id'].items()
            }
            predictions = model(**inputs)
            batch_loss = loss_fn(predictions.squeeze(), batch[target_var].to(device))
            batch_scores.append(batch_loss.item())
    # Deliberately no model.train() here: the original author noted that
    # switching back "affects the model performance".
    return np.mean(batch_scores)

In [None]:
def load_prediction_model(model_path):
    """Build a SimpleRobertaModel and load the weights stored at ``model_path``.

    Args:
        model_path: checkpoint written by ``save_model``; its
            'model_state_dict' entry is loaded into the network.

    Returns:
        The model, placed on the global ``device``.
    """
    model = SimpleRobertaModel(Config.roberta_path)
    model.to(device)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # runtime; without it torch.load tries to rebuild the tensors on CUDA.
    checkpoint = torch.load(model_path, map_location = device)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model
        
#returns index,prediction
def predict_fn(model,data_loader):
    """Run ``model`` over ``data_loader`` without gradients.

    Args:
        model: network called as ``model(**inputs)``; it is put in eval mode.
        data_loader: loader yielding batches as produced by CLRPDataset.

    Returns:
        (ids, output): ``ids`` is a list of the 'indx' values of every row as
        plain Python ints, ``output`` the flattened predictions as floats, in
        loader order.
    """
    model.eval()
    output =  list()
    ids = list()
    with torch.no_grad():
        for X in data_loader:
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in X['input_id'].items()}
            outputs = model(**inputs)
            output.extend(outputs.flatten().tolist())
            # Extending a list directly from a tensor stores 0-d tensors;
            # convert to plain ints so the ids can be used for pandas
            # indexing and set operations.
            batch_idx = X['indx']
            ids.extend(batch_idx.tolist() if hasattr(batch_idx, 'tolist') else batch_idx)
    return ids,output

def repeat_prediction(model_path,df):
    """Predict ``df`` with every fold checkpoint found under ``model_path``.

    Returns:
        (ids, preds): ``ids`` as returned by the last fold's ``predict_fn``
        call and ``preds`` an array with one row of predictions per fold.
    """
    tokenizer = AutoTokenizer.from_pretrained(Config.roberta_path)
    dl_test = DataLoader(
        CLRPDataset(df,tokenizer,Config.max_len),
        batch_size = Config.valid_batch_size,
        num_workers = 4,
        drop_last=False,
    )

    pred_folds = []
    for fold in range(Config.kfold):
        checkpoint_file = f"{model_path}/model_{fold}{Config.file_type}"
        model = load_prediction_model(checkpoint_file)
        ids, fold_pred = predict_fn(model,dl_test)
        pred_folds.append(fold_pred)

    return ids,np.array(pred_folds)

def final_submission_prediction(model_path):
    """Predict the test set with all folds, stack with the saved SVR and write submission.csv.

    Args:
        model_path: directory holding the per-fold checkpoints.

    Side effects:
        Overwrites the 'id' and 'target' columns of the global ``sample``
        frame, prints it, and writes it to 'submission.csv'.
    """
    ids,pred_folds = repeat_prediction(model_path,test_data)

    # One row per excerpt, one column per fold model.
    vec_preds = np.transpose(pred_folds)
    # The stacking model is the file written by svm_model() in this notebook.
    # NOTE: pickle executes arbitrary code on load; only load trusted files.
    with open(Config.svm_model_path, 'rb') as f:
        model = pickle.load(f)
    prediction = model.predict(vec_preds)

    # Take the ids from the rows that were actually predicted (test_data, in
    # prediction order) rather than re-deriving them from the sample file.
    positions = [int(i) for i in ids]
    sample['id'] = test_data['id'].iloc[positions].to_numpy()
    sample['target'] = prediction.tolist()
    print(sample)
    # index=False keeps the file to the id/target columns; the default would
    # prepend an unnamed index column.
    sample.to_csv('submission.csv', index = False)
    
def final_evaluation_test(model_path,df):
    """Plot, per hold-out row, every fold prediction next to the ensemble mean and the target.

    Fold predictions are read from a cached pickle rather than recomputed, so
    ``model_path`` is not used. Rows are drawn two per figure row; each point
    is labelled with its fold number, 'P' (mean prediction) or 'T' (target).
    """
    # To regenerate the cache: run repeat_prediction(model_path, df) and
    # pickle the returned fold predictions to pred_folds.bin.
    with open("../input/pred-folds/pred_folds.bin","rb") as f:
        pred_folds = pickle.load(f)

    per_row_preds = np.transpose(pred_folds)
    ensemble_preds = np.mean(per_row_preds,axis = 1)
    table = pd.concat(
        (pd.DataFrame(per_row_preds),pd.DataFrame(ensemble_preds),df[['target']]),
        axis = 1,
        ignore_index = True,
    )
    print(table.shape)

    n_rows, n_cols = table.shape
    labels = [str(fold) for fold in range(n_cols-2)] + ['P','T']
    n_pairs = int(n_rows/2)
    fig, axs = plt.subplots(ncols = 2, nrows = n_pairs,figsize = (15,385))
    fig.tight_layout()
    for pair in range(n_pairs):
        left_row = table.iloc[pair*2]
        right_row = table.iloc[pair*2+1]
        # All points sit on y = 1; only the x position carries information.
        axs[pair,0].scatter(x = left_row,y = np.ones(n_cols))
        axs[pair,1].scatter(x = right_row,y = np.ones(n_cols))
        for j, txt in enumerate(labels):
            axs[pair,0].annotate(txt, (left_row[j], 1), fontsize='large')
            axs[pair,1].annotate(txt, (right_row[j], 1), fontsize='large')
    
    #ensemble_preds = np.mean(pred_folds,axis = 0)
    #print(rmse_score(df['target'].to_numpy(), ensemble_preds))

def svm_model(model_path,df):
    """Fit the stacking SVR pipeline on cached fold predictions and pickle it.

    Args:
        model_path: unused here; fold predictions come from a cached pickle
            instead of being recomputed with repeat_prediction(model_path, df).
        df: frame whose 'target' column provides the regression labels, row
            aligned with the cached predictions.

    Side effects:
        Prints the feature-selection scores and writes the fitted pipeline to
        ``Config.svm_model_path``.
    """
    with open("../input/pred-folds/pred_folds.bin","rb") as f:
        pred_folds = pickle.load(f)
    # One row per excerpt, one column per fold model.
    vec_preds = np.transpose(pred_folds)
    # NOTE(review): this keeps all rows except the last one; it looks like the
    # remnant of a train/validation split fraction - confirm the -1 is intended.
    end_point = int(1*vec_preds.shape[0]) - 1
    train_vec_preds = vec_preds[0:end_point,:]

    score_fn = make_scorer(rmse_score, greater_is_better = False)
    model = CrossValidation_SVM(train_vec_preds, df.iloc[0:end_point]['target'], score_fn, kfold = 5)
    # Step 1 of the returned pipeline is the SelectKBest stage.
    print(model[1].scores_)
    # A context manager guarantees the file is flushed and closed before
    # final_submission_prediction() reads it back.
    with open(Config.svm_model_path, 'wb') as f:
        pickle.dump(model, f)

def CrossValidation_SVM(df_x, df_y, score_fn, kfold = 5):
    """Grid-search a StandardScaler -> SelectKBest -> SVR pipeline.

    Args:
        df_x: feature matrix (one column per fold model's prediction).
        df_y: regression targets.
        score_fn: scorer passed to GridSearchCV.
        kfold: number of cross-validation folds for the search.

    Returns:
        The best pipeline found, refit by GridSearchCV on all of ``df_x``.
    """
    # Set the parameters by cross-validation
    mod_str = 'svr__'
    param_grid = [{ mod_str+'kernel': ['rbf'], mod_str+'gamma': [1e-2, 1e-3, 1e-4, 1e-5], mod_str+'C': [1, 10, 100, 1000]},
                {mod_str+'kernel': ['linear'], mod_str+'C': [1, 10,]},
                 {mod_str+'kernel': ['poly'], mod_str+'degree': [3,4,5,6,7,8,9], mod_str+'gamma': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6], mod_str+'coef0': [ 1,2,4,5,6,7,8]}]

    # The pipeline caches fitted transformers in a temporary directory.
    cachedir = mkdtemp()
    try:
        pipeline = make_pipeline(
            preprocessing.StandardScaler(),
            fs.SelectKBest(fs.mutual_info_regression, k = 10),
            SVR(),
            memory = cachedir,
            verbose = True
        )
        grid = GridSearchCV(pipeline, param_grid = param_grid, scoring = score_fn, cv = kfold, n_jobs = -1)
        grid.fit(df_x, df_y)
        print("Best parameters set found on development set:")
        print(grid.best_estimator_,grid.best_params_,grid.best_score_)
    finally:
        # Remove the cache even when the search raises, so failed runs do not
        # leave temp directories behind.
        rmtree(cachedir)
    return grid.best_estimator_

In [None]:
def create_model(train_len):
    """Create the model, its optimizer and a linear-decay LR scheduler.

    Args:
        train_len: number of training rows; used to size the scheduler.

    Returns:
        (model, optimizer, scheduler)
    """
    #model = RobertaModelAttention(Config.roberta_path)
    model = SimpleRobertaModel(Config.roberta_path)
    model.to(device)
    param_optimizer = list(model.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        # Encoder weights get weight decay; biases and LayerNorm parameters do not.
        {'params': [param for name, param in param_optimizer if not any(nd in name for nd in no_decay)], 'weight_decay': 0.001 },
        {'params': [param for name, param in param_optimizer if any(nd in name for nd in no_decay)], 'weight_decay': 0.0 },
        # The regression head trains with its own, much larger learning rate.
        {'params': model.l0.parameters(), 'lr': 1e-2 },
    ]
    # NOTE(review): transformers.AdamW is reported as deprecated in newer
    # transformers releases - confirm against the installed version.
    optimizer = transformers.AdamW(optimizer_parameters, lr = Config.learning_rate)
    # The training loader keeps the final partial batch (drop_last = False),
    # so an epoch has ceil(train_len / batch_size) steps. Flooring the total
    # made the linear schedule reach 0 a few steps before training ended.
    steps_per_epoch = -(-train_len // Config.train_batch_size)
    num_train_steps = steps_per_epoch * Config.epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    return model, optimizer, scheduler

def CrossValidation_fn(df):
    """Train one model per KFold split of ``df``.

    Each fold builds fresh loaders, a fresh model/optimizer/scheduler and
    calls ``train_fn``, which writes that fold's best checkpoint.

    Args:
        df: frame with the columns CLRPDataset reads.
    """
    kfold = KFold(n_splits = Config.kfold)
    # The tokenizer does not depend on the fold, so it is loaded once.
    tokenizer = AutoTokenizer.from_pretrained(Config.roberta_path)
    for k, (train_idx,valid_idx) in enumerate(kfold.split(df)):
        train= df.iloc[train_idx]
        valid = df.iloc[valid_idx]
        ds_train = CLRPDataset(train,tokenizer,Config.max_len)
        dl_train = DataLoader(ds_train,
                    shuffle = True,
                    batch_size = Config.train_batch_size,
                    num_workers = 4,
                    drop_last = False
                    )
        ds_valid = CLRPDataset(valid,tokenizer,Config.max_len)
        dl_valid = DataLoader(ds_valid,
                  batch_size = Config.valid_batch_size,
                  num_workers = 4,
                  drop_last = False
                 )
        print('kfold val: ', k)
        model, optimizer, scheduler = create_model(len(train))
        train_fn(dl_train, model, optimizer, scheduler, dl_valid, k) # either target or stderr model run!
        # Drop this fold's model and optimizer state before the next fold
        # allocates its own; otherwise both stay referenced at the same time.
        del model, optimizer, scheduler
        gc.collect()
        

In [None]:
def run():
    """Fit the stacking SVR on the saved hold-out rows, then write the submission.

    Fold training is currently switched off; re-enable the first line below to
    retrain the per-fold checkpoints.
    """
    #CrossValidation_fn(train_data)
    #stderr_target_derivation()

    # Hold-out rows are read back from the CSV stored beside the checkpoints.
    holdout_csv = Config.save_model_path + '/test_data_1.csv'
    holdout_df = pd.read_csv(holdout_csv)
    svm_model(Config.save_model_path,holdout_df)

    #final_evaluation_test(Config.save_model_path,holdout_df)
    final_submission_prediction(Config.save_model_path)
    #final_submission_prediction(Config.save_stderr_model_path)

# Driver cell: choose the device, seed the RNGs, report memory use before and
# after a garbage-collection pass, then run the pipeline.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"{device} is used")
seed_everything(seed = Config.seed)

print(psutil.virtual_memory().percent)   # percentage of system memory in use
gc.collect()
print(psutil.virtual_memory().percent)

run()