In [None]:
!pip install textstat
from tqdm.autonotebook import tqdm
import shutil
import os
import gc
import psutil
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import textstat
import re
import statistics
import matplotlib.pyplot as plt

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string

from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve                # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve   # Calculate the Precision-Recall curve
from sklearn.metrics import f1_score                 # Calculate the F-score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
import sklearn.feature_selection as fs
from sklearn.linear_model import LassoCV,RidgeCV

import random
import torch.nn.functional as F
from sklearn.model_selection import KFold,StratifiedKFold
from torch.utils.data import Dataset, DataLoader
import tokenizers
import _pickle as pickle

import transformers
#from torchsummary import summary
from transformers import (AutoModel, AutoTokenizer,
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup,get_linear_schedule_with_warmup)

In [None]:
# Load the labelled training data and shuffle all rows.
# NOTE(review): no random_state is passed here and seed_everything() is only
# called in the last cell, so the shuffle (and therefore the split below)
# presumably differs between runs — confirm whether that is intended.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_data = train_data.sample(frac = 1)
# Binary label used by the classifier: 1 when target > -1, otherwise 0.
train_data['target_binary'] = 0
train_data.loc[train_data['target'] > -1,'target_binary'] = 1
# Rows from position 2700 onwards are written out as a hold-out set; the first 2700 rows stay for training.
test_data_1 = train_data[2700:]
test_data_1.to_csv('test_data_1.csv')
train_data = train_data[0:2700]
# The hold-out frame created above is immediately replaced by a previously saved copy.
# NOTE(review): because the shuffle above is not seeded, rows of this saved hold-out
# may also be present in the current train_data — verify (see the disabled filter below).
test_data_1 = pd.read_csv("../input/clrp-abovebelow-model/test_data_1.csv",index_col = 0)
#print(train_data[(train_data['target'] > -2) & (train_data['target'] < 0)].shape)
# Subsets used to train the three regressors: middle band (-2 < target < 0), above (target_binary == 1), below (target_binary == 0).
df_middle = train_data[(train_data['target'] >-2) & (train_data['target'] < 0)]
#df_middle = df_middle.loc[~df_middle['id'].isin(test_data_1.id)]
#print(train_data.excerpt.map(lambda x:x.split()).map(len).max()) #205
df_above = train_data[train_data['target_binary'] == 1 ]
df_below = train_data[train_data['target_binary'] == 0 ]
print('above, below, middle ',df_above.shape,df_below.shape,df_middle.shape)

# Competition test set; dummy target columns are added so it has the columns CLRPDataset reads.
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_data = test_data.assign(target = 0, standard_error = 0, target_binary = 0)
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
def seed_everything(seed = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to its deterministic configuration.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED. Setting it here only affects
    # interpreters started after this point (e.g. subprocesses), not this one.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels per run, which defeats
    # the deterministic flag above, so it has to be off.
    torch.backends.cudnn.benchmark = False

class Config:
    """Static hyper-parameters and filesystem locations shared across the notebook."""

    # --- reproducibility ---
    seed = 10

    # --- backbone / tokenisation ---
    roberta_path = "../input/roberta-base"
    last_layer_size = 768          # input width of the linear head in SimpleRobertaModel
    max_len = 256                  # tokenizer max_length passed to CLRPDataset

    # --- optimisation ---
    learning_rate = 3e-5
    train_batch_size = 8
    valid_batch_size = 32
    epochs = 8
    stderr_epochs = 7

    # --- cross-validation / classification ---
    kfold = 5
    kfold_classifier = 10
    classifier_threshold = 0.5     # probability cut-off for the "above" class

    # --- checkpoint directories ---
    # Alternative locations when training inside the Kaggle working directory:
    #   "/kaggle/working/abovemodel", "/kaggle/working/belowmodel",
    #   "/kaggle/working/middlemodel", "/kaggle/working/classifiermodel"
    save_model1_path = "../input/clrp-abovebelow-model/abovemodel"
    save_model2_path = "../input/clrp-abovebelow-model/belowmodel"
    save_model3_path = "../input/clrp-abovebelow-model/middlemodel"
    save_classifier_model_path = "../input/clrp-abovebelow-model/classifiermodel"

In [None]:
def rmse_score(y_true,y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def create_features(df):
    """Add hand-crafted readability features to ``df`` in place.

    Reads the ``excerpt`` column and writes:
      * ``processed_excerpt``: tokens with punctuation and English stop words removed,
      * word/syllable/length statistics and the ``textstat`` readability indices,
      * one count column per part-of-speech tag found by ``nltk.pos_tag``;
        tags made only of punctuation characters are pooled into ``PUNCT``.

    Only tags that occur in at least one excerpt get a column. A tag that is
    absent from a given excerpt is stored as 0 for that row, not NaN.

    Args:
        df: DataFrame with a string ``excerpt`` column; modified in place.

    Returns:
        None.
    """
    stop_words = set(stopwords.words('english'))

    # Tokenised excerpt without punctuation and stop words.
    df['processed_excerpt'] = [[w for w in word_tokenize(re.sub(r'[^\w\s]', '', text)) if not w.lower() in stop_words]
                               for text in df['excerpt']]

    # Statistics over the cleaned tokens.
    df['mean_syllable'] = df.processed_excerpt.map(lambda x : [textstat.syllable_count(word) for word in x]).map(lambda x: np.mean(x))
    df['char_per_word'] = df.processed_excerpt.map(lambda x : [len(word) for word in x]).map(lambda x: np.mean(x))
    df['long_word'] = df.processed_excerpt.map(lambda x : [1 for word in x if len(word) >6] ).map(lambda x: np.sum(x))
    df['LIX'] = df.excerpt.map(lambda x : textstat.lix(x))
    df['RIX'] = df.excerpt.map(lambda x : textstat.rix(x))

    # Statistics over the raw excerpt.
    df['avg_wordlen'] = df.excerpt.map(lambda x:x.split()).apply(lambda x : [len(i) for i in x]).map(lambda x: np.mean(x))
    df['text_len'] = df.excerpt.map(len)
    df['lexicon_count'] = [ textstat.lexicon_count(text,removepunct = True) for text in df['excerpt']]
    df['lexicon_count_withpunc'] = [ textstat.lexicon_count(text,removepunct = False) for text in df['excerpt']]
    df['sent_count'] = [ textstat.sentence_count(text) for text in df['excerpt']]
    df['flesch_reading_ease'] = [ textstat.flesch_reading_ease(text) for text in df['excerpt']]
    df['smog_index'] = [textstat.smog_index(text) for text in df['excerpt']]
    df['flesch_kincaid_grade'] = [textstat.flesch_kincaid_grade(text) for text in df['excerpt']]
    df['coleman_liau_index'] = [textstat.coleman_liau_index(text) for text in df['excerpt']]
    df['automated_readability_index'] = [textstat.automated_readability_index(text) for text in df['excerpt']]
    df['dale_chall_readability_score'] = [textstat.dale_chall_readability_score(text) for text in df['excerpt']]
    df['difficult_words'] = [textstat.difficult_words(text) for text in df['excerpt']]
    df['linsear_write_formula'] = [textstat.linsear_write_formula(text) for text in df['excerpt']]
    df['gunning_fog'] = [textstat.gunning_fog(text) for text in df['excerpt']]
    df['text_standard'] = [textstat.text_standard(text, float_output = True) for text in df['excerpt']]

    # Part-of-speech counts. A tag counts as punctuation when every character
    # is a punctuation mark; a plain substring test against string.punctuation
    # would miss multi-character tags such as `` and ''.
    pos_counts = [
        Counter("PUNCT" if all(ch in string.punctuation for ch in tag) else tag
                for _, tag in pos_tag(word_tokenize(text)))
        for text in df['excerpt']
    ]
    # Build all tag columns at once; missing tags become 0 so that downstream
    # estimators never see NaN counts.
    pos_frame = pd.DataFrame(pos_counts).fillna(0)
    for tag in pos_frame.columns:
        # to_numpy() assigns positionally, independent of df's index labels.
        df[tag] = pos_frame[tag].to_numpy()

def CrossValidation_SVM(df):
    """Grid-search an SVM classifier for ``target_binary`` on the hand-crafted features.

    The pipeline standardises the features, keeps the top 80% by univariate
    ANOVA F-score and fits an ``SVC``. Results of the 5-fold grid search are
    printed; nothing is returned.

    Args:
        df: DataFrame that already contains every column listed in ``features``
            (see ``create_features``) plus ``target_binary``.

    Raises:
        KeyError: if any expected feature column is missing from ``df``.
    """
    features = ['proxy_target','avg_wordlen','char_per_word','long_word','mean_syllable','LIX','RIX','text_len','lexicon_count','lexicon_count_withpunc','sent_count',
                'flesch_reading_ease','smog_index','flesch_kincaid_grade','coleman_liau_index','automated_readability_index',
                'dale_chall_readability_score','difficult_words','linsear_write_formula','gunning_fog','text_standard',
                'CC','CD','DT','EX','FW','IN','JJ','JJR','JJS','LS','MD','NN','NNS','NNP','NNPS','PDT','POS','PRP','RB','RBR',
                'RBS','RP','SYM','TO','UH','VB','VBG','VBD','VBN','VBP','VBZ','WDT','WP','WRB','PUNCT'
               ]
    missing = [name for name in features if name not in df.columns]
    if missing:
        raise KeyError(f"feature columns missing from df (run create_features first?): {missing}")

    # make_pipeline names each step after its lower-cased class, so the SVC
    # hyper-parameters must be addressed with the 'svc__' prefix.
    param_grid = [{'svc__kernel': ['rbf'], 'svc__gamma': [1e-2, 1e-3, 1e-4, 1e-5], 'svc__C': [1, 10, 100, 1000]},
                {'svc__kernel': ['linear'], 'svc__C': [1, 10,]}]
    pipeline = make_pipeline(
        preprocessing.StandardScaler(),
        # chi2 requires non-negative inputs and therefore cannot follow
        # StandardScaler; f_classif accepts standardised (signed) features.
        fs.SelectPercentile(fs.f_classif, percentile = 80),
        SVC()
    )
    grid = GridSearchCV(pipeline, param_grid = param_grid, cv = 5,verbose = 4)
    grid.fit(df[features], df['target_binary'])
    print("Best parameters set found on development set:")
    print(grid.best_params_)
    print()
    print("Grid scores on development set:")
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

def CrossValidation_lasso(df):
    """Fit a cross-validated ridge regression for ``target`` on the hand-crafted features.

    Features are standardised, reduced to the top 10% by mutual information
    with the target, and passed to ``RidgeCV`` (5 folds, negated-RMSE scorer).
    The best score and the selected alpha are printed; nothing is returned.

    Args:
        df: DataFrame holding every column in ``features`` plus ``target``.
    """
    features = ['proxy_target','avg_wordlen','char_per_word','long_word','mean_syllable','LIX','RIX','text_len','lexicon_count','lexicon_count_withpunc','sent_count',
                'flesch_reading_ease','smog_index','flesch_kincaid_grade','coleman_liau_index','automated_readability_index',
                'dale_chall_readability_score','difficult_words','linsear_write_formula','gunning_fog','text_standard',
                'CC','CD','DT','EX','FW','IN','JJ','JJR','JJS','LS','MD','NN','NNS','NNP','NNPS','PDT','POS','PRP','RB','RBR',
                'RBS','RP','SYM','TO','UH','VB','VBG','VBD','VBN','VBP','VBZ','WDT','WP','WRB','PUNCT'
               ]
    print(df[features].head(10))
    score_fn = make_scorer(rmse_score, greater_is_better = False)

    # Standardise, then keep the most informative tenth of the features.
    scaler = preprocessing.StandardScaler()
    selector = fs.SelectPercentile(fs.mutual_info_regression, percentile = 10)
    scaled_features = scaler.fit_transform(df.iloc[:][features])
    x_train = selector.fit_transform(scaled_features, df.iloc[:]['target'])

    # Ridge regression with the alpha chosen by 5-fold cross-validation.
    ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1,5,10,40,100],scoring=score_fn,cv=5)
    model = ridge.fit(x_train, df.iloc[:]['target'])
    print(model.best_score_,model.alpha_)
    # Earlier LassoCV variant, kept for reference:
    #lcv = LassoCV(cv=5, random_state=Config.seed,max_iter=2000)
    #model = lcv.fit(scaler.fit_transform(df.iloc[:][features]),df.iloc[:]['target'])

    # Disabled plotting of the LassoCV error path (inert string literal).
    '''EPSILON = 1e-4
    plt.figure()
    print(model.alpha_)
    plt.plot(model.alphas_ + EPSILON, np.sqrt(model.mse_path_.mean(axis=-1)), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k',
                label='alpha: CV estimate')

    plt.legend()
    plt.xlabel(r'$\alpha$')
    plt.ylabel('Mean square error')
    plt.axis('tight')
    plt.show()'''


In [None]:
class CLRPDataset(Dataset):
    """Dataset that tokenises one excerpt per item and carries its labels along.

    Each item is a dict with the tokenizer output (``input_id``), the float
    ``target``, the row ``id``, the float ``target_binary`` and the DataFrame
    index label of the row (``indx``).
    """

    def __init__(self,df,tokenizer,max_len=128):
        """Cache the needed columns of ``df`` and remember the tokenizer settings."""
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['excerpt'].to_numpy()
        self.target = df['target'].to_numpy()
        self.target_binary = df['target_binary'].to_numpy()
        self.id = df['id'].tolist()
        self.indx = df.index

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Tokenise excerpt ``idx`` (padded/truncated to ``max_len``) and bundle its labels."""
        tokens = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        item = {'input_id': tokens}
        item['target'] = torch.tensor(self.target[idx],dtype = torch.float)
        item['id'] = self.id[idx]
        item['target_binary'] = torch.tensor(self.target_binary[idx],dtype = torch.float)
        item['indx'] = self.indx[idx]
        return item

In [None]:
class RMSELoss(torch.nn.Module):
    """Root-mean-squared-error loss with a small epsilon under the square root."""

    def __init__(self):
        super().__init__()

    def forward(self,x,y):
        """Return ``sqrt(mean((x - y)^2) + 1e-6)``.

        The epsilon keeps the square root (and its gradient) finite when the
        mean squared error is exactly zero.
        """
        mean_squared = nn.functional.mse_loss(x, y)
        return torch.sqrt(mean_squared + 1e-6)

class SimpleRobertaModel(torch.nn.Module):
    """Pretrained transformer backbone with a mean-pooled, single-output linear head."""

    def __init__(self,model_path):
        """Load the backbone from ``model_path`` and create the dropout + linear head."""
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(Config.last_layer_size, 1)
        # Small-variance initialisation of the head weights.
        nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, input_ids, attention_mask):
        """Return one score per example.

        The last hidden state is averaged over dimension 1 (all positions —
        the attention mask is only forwarded to the backbone, not used for
        pooling), passed through dropout and projected to a single value.
        """
        backbone_out = self.model(input_ids = input_ids, attention_mask = attention_mask)
        pooled = backbone_out.last_hidden_state.mean(dim=1)
        pooled = pooled.squeeze(dim=1)
        return self.l0(self.drop_out(pooled))

In [None]:
def train_classifier(data_loader, model, optimizer, scheduler=None, dl_valid=None, kfold = 0):
    """Train ``model`` as a binary classifier on ``target_binary``.

    Every 10th step the model is scored on ``dl_valid`` with
    ``eval_classifier``; whenever accuracy improves the weights are saved to
    ``<Config.save_classifier_model_path>/model_<kfold>.bin``. Training stops
    early once two consecutive epochs end without a new best accuracy.

    Args:
        data_loader: iterable of training batches as produced by ``CLRPDataset``.
        model: network returning one logit per example.
        optimizer: optimizer stepping the model parameters.
        scheduler: optional learning-rate scheduler, stepped once per batch.
        dl_valid: validation batches used for checkpoint selection.
        kfold: fold number used in the checkpoint file name.
    """
    target_var = 'target_binary'
    checkpoint = Config.save_classifier_model_path+'/model_'+str(kfold)+'.bin'
    last_best = 0
    epoch_eval_error = 0
    breakcounter =  0 # consecutive epochs without a new best accuracy

    for epoch in range(Config.epochs):
        for i, x in enumerate(data_loader):
            # The periodic evaluation below switches the model to eval mode;
            # re-enable training behaviour (dropout) before every update.
            model.train()
            model.zero_grad()
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in x['input_id'].items()}
            outputs = torch.sigmoid(model(**inputs))
            # reshape(-1) keeps a 1-D tensor even for a batch of one, so the
            # prediction shape always matches the target shape.
            loss = F.binary_cross_entropy(outputs.reshape(-1),x[target_var].to(device))
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            if i%10 == 0:
                acc,f1score = eval_classifier(dl_valid, model,target_var)
                if acc > last_best :
                    torch.save(model.state_dict(), checkpoint)
                    last_best = acc
        print(f"best accuracy for epoch {epoch} is : {last_best}")
        if (epoch_eval_error == last_best):
            breakcounter += 1
            if breakcounter == 2:
                break
        else:
            epoch_eval_error = last_best
            breakcounter = 0
        
def eval_classifier(data_loader, model,target_var):
    """Score a binary classifier on ``data_loader``.

    Predictions are sigmoid probabilities thresholded at
    ``Config.classifier_threshold``.

    Args:
        data_loader: iterable of batches as produced by ``CLRPDataset``.
        model: network returning one logit per example.
        target_var: batch key that holds the binary labels.

    Returns:
        Tuple ``(accuracy, f1 of class 0)``; the F1 is 0.0 when class 0 does
        not appear in the classification report.
    """
    # Remember the caller's mode so evaluation does not silently leave the
    # model in eval mode in the middle of training.
    was_training = model.training
    model.eval()
    predls = list()
    target = list()
    try:
        with torch.no_grad():
            for X in data_loader:
                inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in X['input_id'].items()}
                outputs = torch.sigmoid(model(**inputs))
                target.extend(X[target_var].cpu().detach().numpy())
                # reshape(-1) stays 1-D for a single-example batch, whereas
                # squeeze() would produce a 0-d array that cannot be iterated.
                predls.extend(outputs.reshape(-1).cpu().detach().numpy())
    finally:
        model.train(was_training)

    threshold = Config.classifier_threshold
    prediction = [1 if x >= threshold else 0 for x in predls]
    report_dict = classification_report(target, prediction,output_dict = True, zero_division = 0)
    # The report is keyed by the string form of each label.
    class_zero = next((report_dict[key] for key in ('0.0', '0') if key in report_dict), None)
    f1_class_zero = class_zero['f1-score'] if class_zero is not None else 0.0
    return report_dict['accuracy'],f1_class_zero
    

In [None]:
def train_fn(data_loader, model, optimizer, scheduler, dl_valid, model_type, kfold = None):
    """Train a regression model on ``target`` and checkpoint the best validation RMSE.

    The checkpoint directory is chosen by ``model_type``: ``'above'`` and
    ``'below'`` use their dedicated folders, any other value uses the
    "middle" folder. Validation runs every 10th step and once more at the end
    of each epoch; training stops after two consecutive epochs without a new
    best score.

    Args:
        data_loader: iterable of training batches as produced by ``CLRPDataset``.
        model: network returning one value per example.
        optimizer: optimizer stepping the model parameters.
        scheduler: learning-rate scheduler, stepped once per batch.
        dl_valid: validation batches passed to ``eval_fn``.
        model_type: selects the checkpoint directory (see above).
        kfold: fold identifier used in the checkpoint file name.
    """
    model_path = (Config.save_model1_path if model_type == 'above'
                  else Config.save_model2_path if model_type == 'below'
                  else Config.save_model3_path)
    checkpoint = model_path+'/model_'+str(kfold)+'.bin'

    target_var = 'target'
    loss_fn = RMSELoss()
    # NOTE(review): eval_fn switches the model to eval mode and does not
    # switch it back, so updates after the first evaluation run without
    # dropout — confirm this is intended.
    model.train()
    last_best = 10            # best validation RMSE seen so far
    epoch_eval_error = 10     # best RMSE as of the previous epoch
    breakcounter = 0          # consecutive epochs without improvement

    for epoch in range(Config.epochs):
        for i, x in enumerate(data_loader):
            model.zero_grad()
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in x['input_id'].items()}
            predictions = model(**inputs)
            loss = loss_fn(predictions.squeeze(), x[target_var].to(device))
            loss.backward()
            optimizer.step()
            scheduler.step()
            if i%10 != 0:
                continue
            eval_err = eval_fn(dl_valid, model,target_var)
            if eval_err < last_best:
                torch.save(model.state_dict(), checkpoint)
                last_best = eval_err

        # One more evaluation in case the final steps of the epoch improved the model.
        eval_err = eval_fn(dl_valid, model,target_var)
        if eval_err < last_best:
            torch.save(model.state_dict(), checkpoint)
            last_best = eval_err
        print(f"best eval_err for epoch {epoch} is : {last_best}")

        if epoch_eval_error != last_best:
            epoch_eval_error = last_best
            breakcounter = 0
            continue
        breakcounter += 1
        if breakcounter == 2:
            break
        
def eval_fn(data_loader, model,target_var):
    """Return the mean of the per-batch RMSE losses of ``model`` on ``data_loader``.

    The model is put into eval mode and is left in that mode on return.

    Args:
        data_loader: iterable of batches as produced by ``CLRPDataset``.
        model: network returning one value per example.
        target_var: batch key that holds the regression target.
    """
    model.eval()
    criterion = RMSELoss()
    batch_losses = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {name: tensor.reshape(tensor.shape[0], -1).to(device)
                      for name, tensor in batch['input_id'].items()}
            predictions = model(**inputs)
            batch_loss = criterion(predictions.squeeze(), batch[target_var].to(device))
            batch_losses.append(batch_loss.cpu().detach().numpy())
    return np.mean(batch_losses)

In [None]:
def load_prediction_model(model_path):
    """Build a ``SimpleRobertaModel`` and load the weights stored at ``model_path``.

    Args:
        model_path: path of a state-dict file written with ``torch.save``.

    Returns:
        The model with loaded weights, placed on the global ``device``.
    """
    model = SimpleRobertaModel(Config.roberta_path)
    # map_location lets checkpoints written on a GPU load on a CPU-only
    # runtime (and vice versa) without a device-mismatch error.
    state_dict = torch.load(model_path, map_location = device)
    model.load_state_dict(state_dict)
    model.to(device)
    return model
    
'''
def final_submission(model_path):

    ids,pred_folds = repeat_prediction(model_path,test_data)
    print(pred_folds)
    ensemble_preds=np.mean(pred_folds,axis = 0)
    sample.id = #ids
    sample.target = ensemble_preds.tolist()
    print(sample)
    sample.to_csv('submission.csv',index=False)
'''
def final_submission():
    """Predict the competition test set and write ``submission.csv``.

    Runs ``final_evaluation`` on the global ``test_data`` and stores the
    returned ids and predictions in the global ``sample`` frame, which is
    printed and then saved without its index.
    """
    submission_ids, predictions = final_evaluation(test_data)
    sample['id'] = submission_ids
    sample['target'] = predictions
    print(sample)
    sample.to_csv('submission.csv',index = False)
    
def predict_fn(model,data_loader,mod_type = 'regressor'):
    """Run ``model`` over ``data_loader`` and collect one prediction per example.

    Args:
        model: network returning one value per example.
        data_loader: iterable of batches as produced by ``CLRPDataset``.
        mod_type: ``'classifier'`` applies a sigmoid to the outputs; any other
            value returns the raw outputs.

    Returns:
        Tuple ``(ids, output)``: the ``indx`` value of every example as plain
        Python values, and the predictions as a flat list of floats, both in
        loader order.
    """
    model.eval()
    output =  list()
    ids = list()
    with torch.no_grad():
        for X in data_loader:
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in X['input_id'].items()}
            outputs = model(**inputs)
            if mod_type == 'classifier':
                outputs = torch.sigmoid(outputs)
            output.extend(outputs.flatten().tolist())
            # Collated integer indices arrive as a tensor; iterating it would
            # yield 0-d tensors, which hash by identity and cannot serve as
            # index labels. Convert to plain Python values instead.
            batch_ids = X['indx']
            ids.extend(batch_ids.tolist() if torch.is_tensor(batch_ids) else list(batch_ids))
    return ids,output

def repeat_prediction(model_path,df,kfold = Config.kfold,mod_type = 'regressor'):
    """Predict ``df`` with every fold checkpoint found under ``model_path``.

    Args:
        model_path: directory holding ``model_<i>.bin`` for ``i`` in ``range(kfold)``.
        df: DataFrame with the columns ``CLRPDataset`` reads.
        kfold: number of fold checkpoints to load.
        mod_type: forwarded to ``predict_fn``.

    Returns:
        Tuple ``(ids, predictions)``: the ids reported by the last fold's
        ``predict_fn`` call, and an array with one row of predictions per fold.
    """
    tokenizer = AutoTokenizer.from_pretrained(Config.roberta_path)
    dataset = CLRPDataset(df,tokenizer,Config.max_len)
    loader = DataLoader(
        dataset,
        batch_size = Config.valid_batch_size,
        num_workers = 4,
        drop_last=False,
    )

    fold_predictions = []
    for fold in range(kfold):
        fold_model = load_prediction_model(f"{model_path}/model_{fold}.bin")
        ids, fold_output = predict_fn(fold_model, loader, mod_type)
        fold_predictions.append(fold_output)

    return ids,np.array(fold_predictions)

def final_evaluation_test(model_path,df,condition):
    """Two-stage ensemble prediction for ``df`` with RMSE reports at each stage.

    Stage 1 averages the fold models under ``model_path``. Stage 2 re-predicts
    a subset of rows with the "middle" models (``Config.save_model3_path``) and
    overwrites their stage-1 predictions. The subset is chosen from the
    stage-1 prediction: ``< -0.2`` when ``condition == 'above'``, otherwise
    ``> -1.7``.

    Args:
        model_path: directory with the stage-1 fold checkpoints.
        df: DataFrame with ``target`` and the columns ``CLRPDataset`` reads.
        condition: ``'above'`` or anything else (treated as "below"); also
            used as prefix of the intermediate CSV file name.

    Returns:
        NumPy array (a copy) of the final predictions in the order returned by
        ``repeat_prediction``.
    """
    # Stage 1: mean over the per-fold predictions.
    ids,pred_folds = repeat_prediction(model_path,df)
    ensemble_preds = np.mean(pred_folds,axis = 0)
    
    # 'id' here holds the DataFrame index labels collected by predict_fn ('indx'), not the excerpt id column.
    df1 = pd.concat((pd.DataFrame(ids),pd.DataFrame(ensemble_preds)),axis = 1)
    df1.columns = ['id','preds']
    # NOTE(review): the frame is written to CSV and read straight back, presumably to
    # coerce the collected ids into plain values — confirm before removing the round-trip.
    df1.to_csv(condition + '_df1.csv',index =  False)
    df1 = pd.read_csv(condition + "_df1.csv", index_col = False)
    print(pd.concat((df1.reset_index(drop = True),df.loc[df1.id,['target']].reset_index(drop = True)),axis = 1,ignore_index = False))
    print('df1 previous ',np.sqrt(mean_squared_error(df.loc[df1.id,'target'].to_numpy(), df1.preds)))
    # Select the rows whose stage-1 prediction falls on the middle side of the band.
    if condition == 'above':
        df2 = df1[df1['preds'] < -0.2]
    else:
        df2 = df1[df1['preds'] > -1.7]
    df_temp = df.loc[df2.id]
    print('df_temp previous',np.sqrt(mean_squared_error(df_temp['target'].to_numpy(), df2.preds)))
    # Stage 2: re-predict the selected rows with the "middle" models.
    # NOTE(review): no guard for an empty selection — behaviour of the calls below on zero rows is unverified.
    middle_ids, middle_pred_folds = repeat_prediction(Config.save_model3_path,df_temp)
    middle_ensemble_preds = np.mean(middle_pred_folds,axis = 0)
    #ensemble_preds_final = [middle_ensemble_preds[i] if i in set(middle_ids) else ensemble_preds[i] for i in ids]
    df_temp['previous_prediction'] = df2.preds.to_numpy()
    df_temp['prediction'] = middle_ensemble_preds# + df_temp['previous_prediction'].to_numpy())/2 
    #print(df_temp['prediction'])
    print('df_temp improved : ',np.sqrt(mean_squared_error(df_temp['target'].to_numpy(), df_temp['prediction'].to_numpy())))
    print(df_temp[['target','previous_prediction','prediction']],df_temp.shape )
    #print(df_temp[abs(df_temp['target'] - df_temp['prediction']) > 0.4][['target','previous_prediction','prediction']])
    # Positional assignment: relies on df_temp having the same row order as the matching rows of df1.
    df1.loc[df1['id'].isin(df_temp.index), 'preds'] = df_temp['prediction'].to_numpy()
    #print(np.isnan(df1).any(),np.isfinite(df1).all(),np.isnan(df1),np.isfinite(df1),df1)
    # Compared positionally against df['target'] — assumes df1 rows are in the same order as df.
    print(condition+' output: ',np.sqrt(mean_squared_error(df['target'].to_numpy(), df1.preds)))
    return df1.preds.to_numpy(copy = True)

def final_classficationeval_test(model_path,df):
    """Majority-vote the per-fold classifier probabilities and plot them per example.

    The fold probabilities are read from the pickle file ``pred_folds.bin`` in
    the working directory; ``model_path`` is not used while the
    ``repeat_prediction`` call below stays commented out.

    Args:
        model_path: classifier checkpoint directory (currently unused).
        df: DataFrame with ``target_binary`` and ``target``; rows are matched
            to the loaded predictions by position.

    Returns:
        NumPy array of 0/1 ensemble predictions, one per example.
    """
    
    #ids,pred_folds = repeat_prediction(model_path,df, Config.kfold_classifier,'classifier')
    #with open("pred_folds.bin","wb") as f:
    #    pickle.dump(pred_folds,f)
    # NOTE(review): pickle.load executes arbitrary code from the file — only load files this notebook wrote.
    with open("pred_folds.bin","rb") as f:
        pred_folds = pickle.load(f)
    # Each fold votes 1 when its probability reaches the threshold; an example is
    # labelled 1 when at least int(kfold_classifier/2) folds vote 1 (ties count as 1).
    thres_folds = np.where(pred_folds >= Config.classifier_threshold,1,0)
    ensemble_preds = np.sum(thres_folds,axis = 0)
    ensemble_preds = np.where(ensemble_preds >= int(Config.kfold_classifier/2),1,0)
    print(classification_report(df['target_binary'].to_numpy(), ensemble_preds, zero_division = 0))
    
    #annotate = [str(i) for i in range(pred_folds.shape[1]-2)] + ['A','T']
    #fig, axs = plt.subplots(1,figsize = (30,5))
    #col = [1 if x[0] == x[1] else 2 for x in zip(df['target_binary'].to_numpy(),ensemble_preds)]
    #axs.scatter(x = df['target'].to_numpy(), y = np.ones(ensemble_preds.shape[0]),c = col)
    
    # One row per example: fold probabilities, then 'P' (ensemble prediction),
    # 'T' (target_binary) and 'Tar' (regression target).
    pred_folds = np.transpose(pred_folds)
    annotate = [str(i) for i in range(pred_folds.shape[1])] + ['P','T','Tar']
    pred_folds = pd.concat((pd.DataFrame(pred_folds),pd.DataFrame(ensemble_preds),df[['target_binary','target']].reset_index(drop = True)), axis = 1,ignore_index = True)
    pred_folds.columns = annotate[:]
    # One subplot per example.
    # NOTE(review): the fixed figure height of 385 looks tuned to one particular hold-out size — confirm.
    fig, axs = plt.subplots(ncols = 1, nrows = pred_folds.shape[0],figsize = (15,385))#int(vec_preds.shape[0]/2))
    fig.tight_layout()
    for i in range(pred_folds.shape[0]):
        if pred_folds.iloc[i]['P'] != pred_folds.iloc[i]['T']:
            axs[i].set_title('Axis '+str(i)+' Target Unmatched: '+str(pred_folds.iloc[i]['Tar']))
        else:
            axs[i].set_title('Axis '+str(i)+' Target matched: '+str(pred_folds.iloc[i]['Tar']))
        # Plot every column except 'Tar' on a single horizontal line and label each point.
        axs[i].scatter(x = pred_folds.iloc[i][0:-1],y = np.ones(pred_folds.shape[1]-1))
        for j, txt in enumerate(annotate[:-1]):
            axs[i].annotate(txt, (pred_folds.iloc[i][j], 1), fontsize='large')
    
    return ensemble_preds

def final_evaluation(df):
    """Route every row of ``df`` to the "above" or "below" regressor and collect predictions.

    The classifier ensemble decides the route (1 = above, 0 = below); each
    group is then predicted with ``final_evaluation_test``. The overall RMSE
    against ``df['target']`` is printed.

    Args:
        df: DataFrame with ``id``, ``target`` and the columns ``CLRPDataset`` reads.

    Returns:
        Tuple ``(ids, results)``: the ``id`` values and the matching
        predictions, with all "above" rows first and all "below" rows after.
    """
    prediction = np.asarray(final_classficationeval_test(Config.save_classifier_model_path,df))
    idx_above = prediction == 1
    idx_below = prediction == 0

    def _predict_group(model_path, mask, condition):
        # A group can be empty when the classifier sends every row the other way.
        if not mask.any():
            return np.empty(0)
        return final_evaluation_test(model_path, df.loc[mask], condition)

    r1 = _predict_group(Config.save_model1_path, idx_above, 'above')
    r2 = _predict_group(Config.save_model2_path, idx_below, 'below')
    results = np.concatenate((r1,r2))

    # Targets and ids are concatenated in the same above-then-below order as the results.
    targets = np.concatenate((df.loc[idx_above,'target'].to_numpy(),df.loc[idx_below,'target'].to_numpy()))
    print(np.sqrt(mean_squared_error(targets, results)))
    ids = np.concatenate((df.loc[idx_above,'id'].to_numpy(),df.loc[idx_below,'id'].to_numpy()))
    return ids, results

In [None]:
def create_model(train_len):
    """Create a fresh model together with its optimizer and linear LR schedule.

    Backbone parameters are split into a weight-decayed group and a
    non-decayed group (biases and LayerNorm parameters); the linear head gets
    its own group with a learning rate of 1e-2.

    Args:
        train_len: number of training examples, used to size the schedule.

    Returns:
        Tuple ``(model, optimizer, scheduler)``.
    """
    model = SimpleRobertaModel(Config.roberta_path)
    model.to(device)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decayed, undecayed = [], []
    for name, param in model.model.named_parameters():
        bucket = undecayed if any(nd in name for nd in no_decay) else decayed
        bucket.append(param)

    optimizer_parameters = [
        {'params': decayed, 'weight_decay': 0.001},
        {'params': undecayed, 'weight_decay': 0.0},
        {'params': model.l0.parameters(), 'lr': 1e-2},
    ]
    # NOTE(review): transformers.AdamW is deprecated in newer transformers
    # releases — confirm the installed version still provides it.
    optimizer = transformers.AdamW(optimizer_parameters, lr = Config.learning_rate)

    # Total number of optimizer steps over all epochs (fractional part dropped).
    num_train_steps = int(train_len / Config.train_batch_size * Config.epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )
    return model, optimizer, scheduler

def CrossValidation_fn(df, k_fold, model_type = 'regressor'):
    """Train one model per fold of an (unshuffled) ``KFold`` split of ``df``.

    Args:
        df: DataFrame with the columns ``CLRPDataset`` reads.
        k_fold: number of folds.
        model_type: ``'classifier'`` trains with ``train_classifier``; any
            other value is forwarded to ``train_fn``, which uses it to pick
            the checkpoint directory.
    """
    splitter = KFold(n_splits = k_fold)
    for k, (train_idx,valid_idx) in enumerate(splitter.split(df)):
        train_rows = df.iloc[train_idx]
        valid_rows = df.iloc[valid_idx]
        tokenizer = AutoTokenizer.from_pretrained(Config.roberta_path)

        # Training batches are reshuffled every epoch; validation order is fixed.
        dl_train = DataLoader(
            CLRPDataset(train_rows,tokenizer,Config.max_len),
            shuffle = True,
            batch_size = Config.train_batch_size,
            num_workers = 4,
            drop_last=False,
        )
        dl_valid = DataLoader(
            CLRPDataset(valid_rows,tokenizer,Config.max_len),
            batch_size = Config.valid_batch_size,
            num_workers = 4,
            shuffle = False,
            drop_last=False,
        )

        print('kfold val: ', k)
        model, optimizer, scheduler = create_model(len(train_rows))
        if model_type != 'classifier':
            train_fn(dl_train, model, optimizer, scheduler, dl_valid, model_type, k)
        else:
            train_classifier(dl_train, model, optimizer, scheduler, dl_valid, k)

In [None]:
def run():
    """Entry point: make sure the checkpoint folders exist, then run the active experiment.

    Only the SVM grid search on ``train_data`` is enabled; the other pipeline
    stages are listed below as disabled calls.
    """
    checkpoint_dirs = (
        Config.save_classifier_model_path,
        Config.save_model1_path,
        Config.save_model2_path,
        Config.save_model3_path,
    )
    for directory in checkpoint_dirs:
        os.makedirs(directory,exist_ok = True)

    # Disabled: copy previously trained checkpoints into the working directory.
    #shutil.rmtree("/kaggle/working/abovemodel")
    #shutil.rmtree("/kaggle/working/belowmodel")
    #shutil.copytree("../input/clrp-abovebelow-model/abovemodel", "/kaggle/working/abovemodel")
    #shutil.copytree("../input/clrp-abovebelow-model/belowmodel", "/kaggle/working/belowmodel")

    # Disabled: train the three regressors and the classifier.
    #CrossValidation_fn(df_above,Config.kfold,'above')
    #CrossValidation_fn(df_below,Config.kfold,'below')
    #CrossValidation_fn(df_middle,Config.kfold,'middle')
    #CrossValidation_fn(train_data,Config.kfold_classifier,'classifier')

    CrossValidation_SVM(train_data)

    # Disabled: hold-out evaluation and submission.
    #final_evaluation(test_data_1)
    #final_submission()

# Global device used by every training / prediction function: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device} is used")
# Seeding happens here, i.e. after the data-loading cell has already shuffled train_data.
seed_everything(seed = Config.seed)
# Report RAM usage (percent) before and after a garbage-collection pass.
print(psutil.virtual_memory().percent)
gc.collect()
print(psutil.virtual_memory().percent)
# Start the pipeline stage currently enabled inside run().
run()