In [None]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import random

from sklearn import linear_model
from sklearn import metrics
from math import sqrt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import pickle

import torch
from torch.utils.data import random_split,DataLoader, Dataset
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

from transformers import (BertTokenizer, BertModel, AdamW,
                          get_linear_schedule_with_warmup,
                          RobertaTokenizerFast,RobertaModel,
                          RobertaConfig,PreTrainedModel,
                          get_constant_schedule_with_warmup,
                          AutoModelForSequenceClassification,
                          AutoModel,AutoConfig,
                          AutoTokenizer,get_cosine_schedule_with_warmup
                         )
import warnings
warnings.simplefilter('ignore')


In [None]:
def set_seed(seed = 0):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.

    Returns:
        A dedicated ``np.random.RandomState`` seeded with ``seed``.
    '''
    # Exported for child processes; the hash seed of the interpreter that is
    # already running is fixed at start-up and is not affected by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU; manual_seed only seeds the
    # current device, which leaves multi-GPU runs non-reproducible.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    return np.random.RandomState(seed)

# Global seed used throughout the notebook; `random_state` is the dedicated
# NumPy RandomState returned by set_seed().
seed = 82
random_state = set_seed(seed)

In [None]:
# Competition input files.
data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Output locations.
submission_file = 'submission.csv'
output_path = "./"

In [None]:
# Working directories for out-of-fold predictions, saved models and test files.
# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError because the directories already exist.
for out_dir in ('./model_preds', './models', './test'):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Set random seed and set device to GPU.
#torch.manual_seed(17)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    # Deterministic cuDNN kernels for reproducible GPU runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(device)

In [None]:
# Load the raw training data and preview the first rows.
df = pd.read_csv(train_file)
df.head()

# Create folds

In [None]:
# Number of cross-validation folds used by every model below.
k_folds = 5

In [None]:
from sklearn import model_selection

# Shuffle the rows once, then label each row with the number of the fold in
# which it is used as validation data. The shuffle draws from NumPy's global
# generator because sample() is called without random_state.
df["kfold"] = -1
df = df.sample(frac=1).reset_index(drop=True)

y = df.target.values
skf = model_selection.KFold(n_splits=k_folds)

for fold_id, (_, valid_rows) in enumerate(skf.split(X=df, y=y)):
    df.loc[valid_rows, "kfold"] = fold_id

df.to_csv("./train_folds.csv", index=False)

# Text pre-processing for scikit-learn models

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# English stop-word list, built once and shared by every call.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in STOPWORDS."""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def preprop(df):
    """Return a pre-processed copy of ``df`` for the scikit-learn text models.

    The ``excerpt`` column is lower-cased, passed through the Porter stemmer
    and the WordNet lemmatizer, and finally stripped of English stop words.

    Args:
        df: DataFrame with an ``excerpt`` text column.

    Returns:
        A new DataFrame; the caller's frame is left untouched.
    """
    # Work on a copy: mutating the caller's frame in place meant that
    # re-running a cell, or reusing the raw frame afterwards, silently
    # operated on text that had already been processed.
    df = df.copy()
    df["excerpt"] = df.excerpt.str.lower()

    # NOTE(review): stem() and lemmatize() are handed the whole excerpt as if
    # it were a single word, so they are not applied token by token. Left
    # unchanged because later cells mark this output as giving better
    # results -- confirm whether per-token processing was intended.
    ps = PorterStemmer()
    df["excerpt"] = df.excerpt.apply(ps.stem)

    wnl = WordNetLemmatizer()
    df["excerpt"] = df.excerpt.apply(wnl.lemmatize)

    df["excerpt"] = df.excerpt.apply(remove_stopwords)

    return df

In [None]:
#train
# Re-read the fold file, apply the text pre-processing and store the result
# next to the raw fold file so models can choose either variant.
df = pd.read_csv("./train_folds.csv")
df=preprop(df)

df.to_csv("./train_folds_preprop.csv", index=False)

# Create models

## Text features with TF-IDF

In [None]:
#linear model with TfidfVectorizer on Excerpt
def run_Tfidf(fold):
    """Train TF-IDF + linear regression with ``fold`` held out and score it.

    Reads ``./train_folds.csv``, fits the vectorizer and the regressor on the
    rows whose ``kfold`` differs from ``fold``, prints the RMSE on the
    held-out rows and, for the last fold only, pickles the regressor to
    ``./models/clf_lr_Tfidf.sav``.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and ``lr_Tfidf_pred``
        for the held-out rows.
    """
    df = pd.read_csv("./train_folds.csv") #better results
    df.excerpt = df.excerpt.apply(str)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Fit the vocabulary on the training rows only to avoid leaking
    # validation text into the features.
    tfv = TfidfVectorizer()
    tfv.fit(df_train.excerpt.values)

    Xtrain = tfv.transform(df_train.excerpt.values)
    Xvalid = tfv.transform(df_valid.excerpt.values)

    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    clf = linear_model.LinearRegression()
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xvalid)

    rmse = sqrt(metrics.mean_squared_error(yvalid, pred))
    print(f"fold={fold}, RMSE={rmse}")

    if fold == (k_folds-1):
        filename = './models/clf_lr_Tfidf.sav'
        # The context manager closes the handle even if pickling fails; the
        # bare open() call left it open.
        # NOTE(review): only the regressor is saved, not the fitted
        # vectorizer it depends on.
        with open(filename, 'wb') as model_file:
            pickle.dump(clf, model_file)
        print("model saved")

    df_valid.loc[:,"lr_Tfidf_pred"] = pred

    return df_valid[["id","target","kfold","lr_Tfidf_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_Tfidf(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/lr_Tfidf_excerpt.csv", index=False)

In [None]:
#linear model with CountVectorizer on Excerpt
from sklearn import linear_model
from sklearn import metrics
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer

def run_lr_cnt(fold):
    """Train bag-of-words counts + linear regression with ``fold`` held out.

    Reads ``./train_folds_preprop.csv``, fits a ``CountVectorizer`` and a
    linear regression on the rows whose ``kfold`` differs from ``fold``,
    prints the RMSE on the held-out rows and, for the last fold only, pickles
    the regressor to ``./models/clf_lr_cnt.sav``.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and ``lr_cnt_pred`` for
        the held-out rows.
    """
    df = pd.read_csv("./train_folds_preprop.csv") #Better results
    df.excerpt = df.excerpt.apply(str)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Vocabulary is learned from the training rows only.
    count_vec = CountVectorizer()
    count_vec.fit(df_train.excerpt.values)

    Xtrain = count_vec.transform(df_train.excerpt.values)
    Xvalid = count_vec.transform(df_valid.excerpt.values)

    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    clf = linear_model.LinearRegression()
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xvalid)

    rmse = sqrt(metrics.mean_squared_error(yvalid, pred))
    print(f"fold={fold}, RMSE={rmse}")

    if fold == (k_folds-1):
        filename = './models/clf_lr_cnt.sav'
        # Close the file deterministically; the bare open() call leaked it.
        with open(filename, 'wb') as model_file:
            pickle.dump(clf, model_file)
        print("model saved")

    df_valid.loc[:,"lr_cnt_pred"] = pred

    return df_valid[["id","target","kfold","lr_cnt_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_lr_cnt(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/lr_cnt_excerpt.csv", index=False)

In [None]:
#rf_svd
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

def run_rf_svd(fold):
    """Train TF-IDF -> truncated SVD -> random forest with ``fold`` held out.

    Reads ``./train_folds_preprop.csv``, fits the vectorizer, a 120-component
    ``TruncatedSVD`` and a 500-tree random forest on the rows whose ``kfold``
    differs from ``fold``, prints the RMSE on the held-out rows and, for the
    last fold only, pickles the forest to ``./models/clf_rfr.sav``.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and
        ``rf_svd_Tfidf_pred`` for the held-out rows.
    """
    df = pd.read_csv("./train_folds_preprop.csv")
    # Coerce the excerpt column itself to str, as the sibling model functions
    # do. Assigning to `df.review` only set an attribute on the frame and left
    # `excerpt` untouched, so NaN excerpts would break the vectorizer.
    df.excerpt = df.excerpt.apply(str)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    tfv = TfidfVectorizer()
    tfv.fit(df_train.excerpt.values)

    Xtrain = tfv.transform(df_train.excerpt.values)
    Xvalid = tfv.transform(df_valid.excerpt.values)

    # Reduce the sparse TF-IDF matrix to 120 dense components.
    svd = decomposition.TruncatedSVD(n_components=120)
    svd.fit(Xtrain)
    xtrain_svd = svd.transform(Xtrain)
    xvalid_svd = svd.transform(Xvalid)

    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    clf = ensemble.RandomForestRegressor(n_estimators=500, n_jobs=-1)
    clf.fit(xtrain_svd, ytrain)
    pred = clf.predict(xvalid_svd)

    rmse = sqrt(metrics.mean_squared_error(yvalid, pred))
    print(f"fold={fold}, RMSE={rmse}")

    if fold == (k_folds-1):
        filename = './models/clf_rfr.sav'
        # Close the file deterministically; the bare open() call leaked it.
        with open(filename, 'wb') as model_file:
            pickle.dump(clf, model_file)
        print("model saved")

    df_valid.loc[:,"rf_svd_Tfidf_pred"] = pred

    return df_valid[["id","target","kfold","rf_svd_Tfidf_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_rf_svd(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/rf_svd_Tfidf_excerpt.csv", index=False)

In [None]:
#xgboost
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor

def run_xgboost_Tfidf(fold):
    """Train TF-IDF -> truncated SVD -> XGBoost with ``fold`` held out.

    Reads ``./train_folds.csv``, fits the vectorizer, a 120-component
    ``TruncatedSVD`` and an ``XGBRegressor`` on the rows whose ``kfold``
    differs from ``fold``, prints the RMSE on the held-out rows and, for the
    last fold only, pickles the regressor to ``./models/clf_xgboost.sav``.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and
        ``xgboost_Tfidf_pred`` for the held-out rows.
    """
    df = pd.read_csv("./train_folds.csv") #better results
    # Coerce the excerpt column itself to str, as the sibling model functions
    # do. Assigning to `df.review` only set an attribute on the frame and left
    # `excerpt` untouched.
    df.excerpt = df.excerpt.apply(str)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    tfv = TfidfVectorizer()
    tfv.fit(df_train.excerpt.values)

    Xtrain = tfv.transform(df_train.excerpt.values)
    Xvalid = tfv.transform(df_valid.excerpt.values)

    # Reduce the sparse TF-IDF matrix to 120 dense components.
    svd = decomposition.TruncatedSVD(n_components=120)
    svd.fit(Xtrain)
    xtrain_svd = svd.transform(Xtrain)
    xvalid_svd = svd.transform(Xvalid)

    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    clf = XGBRegressor(n_estimators=500, n_jobs=-1, learning_rate=0.05)
    clf.fit(xtrain_svd, ytrain)
    pred = clf.predict(xvalid_svd)

    rmse = sqrt(metrics.mean_squared_error(yvalid, pred))
    print(f"fold={fold}, RMSE={rmse}")

    if fold == (k_folds-1):
        filename = './models/clf_xgboost.sav'
        # Close the file deterministically; the bare open() call leaked it.
        with open(filename, 'wb') as model_file:
            pickle.dump(clf, model_file)
        print("model saved")

    df_valid.loc[:,"xgboost_Tfidf_pred"] = pred

    return df_valid[["id","target","kfold","xgboost_Tfidf_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_xgboost_Tfidf(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/xgboost_Tfidf_excerpt.csv", index=False)

## Word embeddings with GloVe

In [None]:
EMBEDDING_FILE = '../input/glove6b/glove.6B.100d.txt'

# Map every word in the embedding file to its float32 vector. Each line holds
# the word followed by its vector components, separated by whitespace.
embeddings_dict = {}
# Explicit UTF-8 plus a context manager: the platform default encoding is not
# guaranteed to decode the file, and the bare open() never closed the handle.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [None]:
def get_feature_vectors(sentence, dim=100):
    """Average the embedding vectors of the known words in ``sentence``.

    The sentence is split on whitespace and each token is looked up in the
    module-level ``embeddings_dict``. Tokens without a usable vector are
    skipped and do not count towards the average.

    Args:
        sentence: Text to embed.
        dim: Embedding dimensionality; the default matches the 100-d file
            loaded into ``embeddings_dict``.

    Returns:
        float32 array of shape ``(dim,)``. All zeros when the sentence is
        empty or none of its tokens has a vector (previously this case
        divided by zero and returned NaNs).
    """
    total = np.zeros((dim,), dtype="float32")
    n_found = 0
    for word in sentence.split():
        vector = embeddings_dict.get(word)
        # Explicit checks replace the bare `except`, which hid every error,
        # not just out-of-vocabulary words. Vectors of the wrong size are
        # skipped, as they were before.
        if vector is None or np.shape(vector) != total.shape:
            continue
        total = np.add(total, vector)
        n_found += 1
    if n_found == 0:
        return total
    return np.divide(total, n_found)

In [None]:
#randomforest with Glove on excerpt
def run_rf_glove(fold):
    """Train a random forest on averaged GloVe vectors with ``fold`` held out.

    Reads ``./train_folds_preprop.csv``, embeds every excerpt with
    ``get_feature_vectors``, fits a 500-tree random forest on the rows whose
    ``kfold`` differs from ``fold`` and prints the RMSE on the held-out rows.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and ``rf_glove_pred``
        for the held-out rows.
    """
    folds_df = pd.read_csv("./train_folds_preprop.csv") #Better results
    held_out = folds_df.kfold == fold
    df_train = folds_df[~held_out].reset_index(drop=True)
    df_valid = folds_df[held_out].reset_index(drop=True)

    def embed(excerpts):
        # One averaged word vector per excerpt, stacked row-wise.
        return np.array([get_feature_vectors(text) for text in excerpts])

    Xtrain_glove = embed(df_train.excerpt.values)
    Xvalid_glove = embed(df_valid.excerpt.values)

    forest = ensemble.RandomForestRegressor(n_estimators=500, n_jobs=-1)
    forest.fit(Xtrain_glove, df_train.target.values)
    pred = forest.predict(Xvalid_glove)

    rmse = sqrt(metrics.mean_squared_error(df_valid.target.values, pred))
    print(f"fold={fold}, RMSE={rmse}")

    df_valid["rf_glove_pred"] = pred

    return df_valid[["id","target","kfold","rf_glove_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_rf_glove(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/rf_glove_excerpt.csv", index=False)

In [None]:
#xgboost with Glove on excerpt
def run_XGBR(fold):
    """Train XGBoost on averaged GloVe vectors with ``fold`` held out.

    Reads ``./train_folds_preprop.csv``, embeds every excerpt with
    ``get_feature_vectors``, fits an ``XGBRegressor`` on the rows whose
    ``kfold`` differs from ``fold`` and prints the RMSE on the held-out rows.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and ``XGBR_glove_pred``
        for the held-out rows.
    """
    df = pd.read_csv("./train_folds_preprop.csv") #Better results
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    Xtrain_glove = np.array([get_feature_vectors(sentence) for sentence in df_train.excerpt.values])
    Xvalid_glove = np.array([get_feature_vectors(sentence) for sentence in df_valid.excerpt.values])
    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    clf = XGBRegressor(n_estimators=600,
                       objective = 'reg:squarederror',
                       eval_metric = 'rmse',
                       n_jobs=-1,
                       subsample = 1.0,
                       learning_rate=0.05,
                       max_depth = 5,
                       early_stopping_rounds = 10,
                       gamma = 1,
                       colsample_bytree=0.9,
                       verbosity = 0,
                       random_state=seed
                      )
    # Early stopping needs a validation set to monitor. Without eval_set the
    # early_stopping_rounds setting cannot take effect, and XGBoost releases
    # that accept it in the constructor refuse to train.
    # NOTE(review): the held-out fold now also picks the number of trees, so
    # the fold RMSE is slightly optimistic -- confirm this is acceptable.
    clf.fit(Xtrain_glove, ytrain,
            eval_set=[(Xvalid_glove, yvalid)],
            verbose=False)
    pred = clf.predict(Xvalid_glove)

    rmse = sqrt(metrics.mean_squared_error(yvalid, pred))
    print(f"fold={fold}, RMSE={rmse}")

    df_valid.loc[:,"XGBR_glove_pred"] = pred

    return df_valid[["id","target","kfold","XGBR_glove_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_XGBR(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/xgboost_glove_excerpt.csv", index=False)

## Word embeddings with RoBERTa

In [None]:
from tqdm import tqdm

class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    This is a data container, not a network, so it derives from
    ``torch.utils.data.Dataset`` rather than ``torch.nn.Module``.

    Args:
        df: DataFrame with an ``excerpt`` column.
        tokenizer: Callable that tokenizes a single text; it is called with
            ``return_tensors='pt'``, ``max_length``, ``padding='max_length'``
            and ``truncation=True``.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return encode

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Embed every excerpt of ``df`` with the pretrained model stored at ``path``.

    For each excerpt the last four hidden-state tensors are concatenated
    along the feature axis and the vector at sequence position 0 is kept.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Location passed to ``AutoModel`` / ``AutoTokenizer``.
        plot_losses: Unused.
        verbose: Unused.

    Returns:
        NumPy array with one embedding row per excerpt, in ``df`` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = AutoModel.from_pretrained(path,output_hidden_states = True,)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    loader = DataLoader(CLRPDataset(df, tokenizer, 256),
                        batch_size=128,
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    embeddings = []
    with torch.no_grad():
        for _step, batch in tqdm(enumerate(loader)):
            # The dataset returns tensors with an extra leading axis per
            # item; flatten each field to (batch, -1) before the forward pass.
            batch = {name: tensor.reshape(tensor.shape[0], -1).to(device)
                     for name, tensor in batch.items()}
            # https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#31-running-bert-on-our-text
            hidden_states = model(**batch)[2]
            last_four = torch.cat(tuple(hidden_states[-4:]), dim=-1)
            first_token = last_four[:, 0, :].detach().cpu().numpy()
            embeddings.extend(first_token)

    del model

    return np.array(embeddings)

In [None]:
# source: https://www.kaggle.com/maunish/clrp-roberta-lgbm
#https://www.kaggle.com/c/commonlitreadabilityprize/discussion/237795
#https://www.kaggle.com/abhishek/modelf1

# Directory of the pretrained model used for embeddings and fine-tuning.
MODEL_NAME = Path('../input/modelf1')

In [None]:
# Embed the raw (not pre-processed) training excerpts once; the rows of
# `train_embeddings` follow the row order of ./train_folds.csv.
df = pd.read_csv("./train_folds.csv")
train_embeddings =  get_embeddings(df,MODEL_NAME)

In [None]:
#import xgboost as xgb
#xgboost with roberta on excerpt
def run_XGBR_Roberta(fold):
    """Train XGBoost on the precomputed transformer embeddings with ``fold`` held out.

    Reads ``./train_folds.csv`` to find the training and validation rows,
    selects the matching rows of the module-level ``train_embeddings``, fits
    an ``XGBRegressor``, prints the RMSE on the held-out rows and pickles the
    model to ``./models/XGBR_roberta_fold_{fold}.sav``.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and
        ``XGBR_roberta_pred`` for the held-out rows.
    """
    df = pd.read_csv("./train_folds.csv")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Row positions in the CSV double as row positions in train_embeddings,
    # which was computed from the same file.
    train_idx = df[df.kfold != fold].index
    valid_idx = df[df.kfold == fold].index

    Xtrain = train_embeddings[train_idx]
    Xvalid = train_embeddings[valid_idx]
    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    clf = XGBRegressor(n_estimators=500,
                       objective = 'reg:squarederror',
                       eval_metric = 'rmse',
                       n_jobs=-1,
                       subsample = 0.7,
                       learning_rate=0.05,
                       max_depth = 4,
                       early_stopping_rounds = 15,
                       gamma = 1,
                       colsample_bytree=1,
                       verbosity = 0,
                       random_state=seed,
                       tree_method='gpu_hist'
                      )
    # Early stopping needs a validation set to monitor. Without eval_set the
    # early_stopping_rounds setting cannot take effect, and XGBoost releases
    # that accept it in the constructor refuse to train.
    # NOTE(review): the held-out fold now also picks the number of trees, so
    # the fold RMSE is slightly optimistic -- confirm this is acceptable.
    clf.fit(Xtrain, ytrain,
            eval_set=[(Xvalid, yvalid)],
            verbose=False)
    pred = clf.predict(Xvalid)

    rmse = sqrt(metrics.mean_squared_error(yvalid, pred))
    print(f"fold={fold}, RMSE={rmse}")

    filename = f"./models/XGBR_roberta_fold_{fold}.sav"
    # Close the file deterministically; the bare open() call leaked it.
    with open(filename, 'wb') as model_file:
        pickle.dump(clf, model_file)
    print("model saved")

    df_valid.loc[:,"XGBR_roberta_pred"] = pred

    return df_valid[["id","target","kfold","XGBR_roberta_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_XGBR_Roberta(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/xgboost_roberta_excerpt.csv", index=False)

In [None]:
# SEARCH BEST PARAMS XGBOOST
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer
#import warnings

def rmse(predict, actual):
    """Return the root-mean-squared error between ``predict`` and ``actual``.

    Both arguments are converted to NumPy arrays, so lists and arrays are
    accepted alike.
    """
    residual = np.array(predict) - np.array(actual)
    return np.sqrt(np.mean(residual ** 2))

# Scorer for the search; greater_is_better=False makes scikit-learn negate
# the RMSE so that "higher is better" still holds internally.
rmse_score = make_scorer(rmse, greater_is_better = False)

y_train = df.target.values

# NOTE(review): train_size=0.15 fits the search on 15 % of the rows and keeps
# the remaining 85 % aside -- confirm this was meant (vs. test_size=0.15).
x_fit,x_test,y_fit,y_test = train_test_split(train_embeddings, y_train, train_size =0.15, 
                            random_state=seed)
clf = XGBRegressor(
        eval_metric = 'rmse',
        #nthread = 4,
        eta = 0.1,
        n_estimators = 500,
        max_depth = 5,
        subsample = 0.5,
        gamma = 1,
        colsample_bytree = 1.0,
        #silent = 1,
        verbosity = 0,
        random_state=seed,
        tree_method='gpu_hist'
        )
parameters = {
    'n_estimators': [200, 300, 400, 500],
    'eta': [0.01, 0.05, 0.1],
    'early_stopping_rounds':[10,15,20],
    'gamma':[0, 1, 10],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
}

# random_state pins which parameter combinations are sampled, so the search
# does not depend on how much of the global NumPy stream was already consumed.
clf1 = RandomizedSearchCV(clf, parameters, n_jobs=-1, scoring=rmse_score, cv=5,
                          return_train_score=True, random_state=seed)
# 'early_stopping_rounds' is part of the search space, so every fit needs a
# validation set to monitor; the rows held back by the split above are used.
clf1.fit(x_fit, y_fit, eval_set=[(x_test, y_test)], verbose=False)
print('Best Params: \n', clf1.best_params_ )

Result = pd.DataFrame(clf1.cv_results_)
Result

## Transformers model

In [None]:
# Tokenizer shared by the training and inference functions below; with
# model_max_length=256, `truncation=True` cuts inputs at 256 tokens.
MODEL_NAME = Path('../input/modelf1')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=256) 

In [None]:
class Data(Dataset):
    """Map-style dataset yielding ``(excerpt, target)`` pairs from a DataFrame.

    Args:
        data: DataFrame with ``excerpt`` and ``target`` columns.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the excerpt and target of the row at position ``idx``."""
        # Positional access: a DataLoader hands out positions 0..len-1, which
        # only coincide with index labels for a freshly reset index. Label
        # lookup raised KeyError (or returned the wrong row) otherwise.
        excerpt = self.data["excerpt"].iloc[idx]
        target = self.data["target"].iloc[idx]
        return excerpt, target

In [None]:
class Roberta(PreTrainedModel):
   """Pretrained encoder with an attention-pooling regression head.

   The encoder is loaded from ``MODEL_NAME`` inside ``__init__``. The head
   scores every sequence position, pools the concatenated hidden states with
   the resulting weights and maps the pooled vector to a single value.

   NOTE: ``create_optimizer`` selects parameters by their position in
   ``named_parameters()``, so the order in which the sub-modules are defined
   below must not change.
   """
   def __init__(self, conf):
       super(Roberta, self).__init__(conf)
       self.roberta = AutoModel.from_pretrained(MODEL_NAME, config=conf)
    
       # Attention scorer: W projects the 4*hidden concatenation down to
       # hidden size, V turns it into one score per position.
       self.W = torch.nn.Linear(4*self.roberta.config.hidden_size, self.roberta.config.hidden_size) 
       self.drop_out = torch.nn.Dropout(0.4)
       self.V = torch.nn.Linear(self.roberta.config.hidden_size, 1)
    
       # Regression layer applied to the pooled vector.
       self.linear = torch.nn.Linear(4*self.roberta.config.hidden_size, 1)

   def forward(self, input_ids, attention_mask):
       """Return one prediction per input sequence.

       Args:
           input_ids: Token ids passed to the encoder.
           attention_mask: Attention mask passed to the encoder.

       Returns:
           Output of the final linear layer (last dimension of size 1).
       """
       out = self.roberta(input_ids=input_ids, 
                             attention_mask=attention_mask
                            )
       # out[2] is presumably the tuple of per-layer hidden states (requires
       # output_hidden_states=True in the config -- TODO confirm); the last
       # four entries are concatenated along the feature axis.
       cat = torch.cat(tuple([out[2][i] for i in [-4, -3, -2, -1]]), dim=-1)
       att = torch.tanh(self.W(cat))
       score = self.V(att)
       # NOTE(review): the softmax runs over dim=1 without applying
       # attention_mask, so padded positions also receive weight -- confirm.
       attention_weights = torch.softmax(score, dim=1)
       context_vector = attention_weights * cat #* out[2]
       context_vector = torch.sum(context_vector, dim=1)
       out = self.drop_out(context_vector)
       out = self.linear(out)
        
       return out

In [None]:
# Training Function

def train(model,
          optimizer,
          train_iter,
          valid_iter,
          loss_fct,
          fold,
          valid_period,
          num_epochs = 5,
          scheduler = None,
          output_path = output_path):
    """Fine-tune ``model`` and return the predictions of its best checkpoint.

    Every ``valid_period`` optimisation steps the model is evaluated on
    ``valid_iter``. Whenever the mean validation loss improves, the weights
    are saved to ``./models/RobertaBase_model-fold-{fold}.pth`` and the
    validation predictions of that evaluation are remembered.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        optimizer: Optimizer stepped after every batch.
        train_iter: Iterable of ``(excerpts, target)`` training batches.
        valid_iter: Iterable of ``(excerpts, target)`` validation batches.
        loss_fct: Loss called as ``loss_fct(predictions, target)``.
        fold: Fold number, used only in the checkpoint file name.
        valid_period: Number of training steps between two evaluations.
        num_epochs: Number of passes over ``train_iter``.
        scheduler: Optional learning-rate scheduler stepped after every batch.
        output_path: Unused.

    Returns:
        1-D NumPy array with the validation predictions of the best
        evaluation, in ``valid_iter`` order.

    Raises:
        RuntimeError: If no evaluation ever improved on the initial loss
            (for example because validation never ran or the loss was NaN).
    """

    def _predict(excerpts):
        """Tokenize raw excerpts and return a flat tensor of predictions."""
        batch = tokenizer(list(excerpts), truncation=True, padding=True, return_tensors='pt', add_special_tokens=True)
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask)
        # reshape(-1) stays 1-D for a batch of one; squeeze() would produce a
        # 0-d tensor there and make the batches impossible to concatenate.
        return logits.reshape(-1)

    def _to_target(target):
        """Move a batch of targets to the device as float32."""
        # as_tensor avoids re-wrapping (and warning about) targets that the
        # DataLoader already delivers as tensors.
        return torch.as_tensor(target).to(device, dtype=torch.float)

    best_valid_loss = float('Inf')
    best_pred = None
    running_train_loss = 0.0
    global_step = 0
    total_steps = num_epochs * len(train_iter)

    model.train()
    for epoch in range(num_epochs):
        for (excerpts, target) in train_iter:
            loss = loss_fct(_predict(excerpts), _to_target(target))
            loss.backward()

            optimizer.step()
            # The scheduler is optional (default None); calling step() on it
            # unconditionally crashed whenever none was supplied.
            if scheduler is not None:
                scheduler.step()
            optimizer.zero_grad()

            running_train_loss += loss.item()
            global_step += 1

            if global_step % valid_period != 0:
                continue

            # Periodic evaluation on the validation set.
            valid_loss_sum = 0.0
            batch_preds = []
            model.eval()
            with torch.no_grad():
                for (excerpts, target) in valid_iter:
                    preds = _predict(excerpts)
                    valid_loss_sum += loss_fct(preds, _to_target(target)).item()
                    batch_preds.append(preds.detach().cpu())

            train_loss = running_train_loss / valid_period
            valid_loss = valid_loss_sum / len(valid_iter)

            print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, total_steps,
                          train_loss, valid_loss))

            # Checkpoint on improvement.
            if best_valid_loss > valid_loss:
                best_valid_loss = valid_loss
                # Concatenating works for any validation batch size;
                # np.array() over a list of tensors only worked for batches
                # of one.
                best_pred = torch.cat(batch_preds).numpy()
                save_path = f'./models/RobertaBase_model-fold-{fold}.pth'
                torch.save(model.state_dict(), save_path)

            running_train_loss = 0.0
            model.train()

    # Previously placed after the return statement and therefore unreachable.
    print('Training done!')

    if best_pred is None:
        raise RuntimeError(
            "No validation pass improved on the initial loss; "
            "check valid_period and the validation loss values.")
    return best_pred


    

In [None]:
# Model configuration; output_hidden_states=True makes the encoder return
# the per-layer hidden states that the Roberta head concatenates.
model_config = RobertaConfig.from_pretrained(MODEL_NAME)
model_config.output_hidden_states = True

In [None]:
def test(model, test_loader):
    """Run ``model`` over ``test_loader`` and return its predictions.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Network called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of ``(excerpts, target)`` batches; the target
            is ignored.

    Returns:
        1-D NumPy array with one prediction per excerpt, in loader order
        (an empty array when the loader yields nothing).
    """
    model.eval()
    all_preds=[]
    with torch.no_grad():
        for (excerpts, _) in test_loader:
            # Tokenize exactly as in train(): the model was fitted with
            # add_special_tokens=True, so dropping the special tokens here
            # fed it inputs it never saw during training.
            batch = tokenizer(list(excerpts), truncation=True, padding=True, return_tensors='pt', add_special_tokens=True)
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)

            output = model(input_ids, attention_mask)
            # reshape(-1) keeps every batch 1-D so batches of any size can be
            # concatenated below.
            all_preds.append(output.reshape(-1).detach().cpu())

    torch.cuda.empty_cache()
    if not all_preds:
        return np.array([])
    return torch.cat(all_preds).numpy()

In [None]:
#https://www.kaggle.com/chamecall/clrp-finetune

def create_optimizer(model):
    """Build an AdamW optimizer with position-dependent learning rates.

    Parameters are selected by their position in ``model.named_parameters()``:

    * positions 0-196 each get their own group with a learning rate of 2e-5
      (below 69), 5e-5 (69-132) or 1e-4 (133 and above), and a weight decay of
      0.0 for names containing "bias" and 0.01 otherwise;
    * positions 199-202 and 203 onwards form two groups that use the
      optimizer defaults;
    * positions 197-198 are not passed to the optimizer.

    Args:
        model: Module whose parameters are optimised.

    Returns:
        The configured ``AdamW`` instance.
    """
    named_parameters = list(model.named_parameters())

    def _encoder_lr(position):
        # Later positions train with larger learning rates.
        if position >= 133:
            return 1e-4
        if position >= 69:
            return 5e-5
        return 2e-5

    head_groups = [
        {"params": [params for (_, params) in named_parameters[199:203]]},
        {"params": [params for (_, params) in named_parameters[203:]]},
    ]
    encoder_groups = [
        {"params": params,
         "weight_decay": 0.0 if "bias" in name else 0.01,
         "lr": _encoder_lr(position)}
        for position, (name, params) in enumerate(named_parameters[:197])
    ]

    return AdamW(head_groups + encoder_groups)

In [None]:
# Main training loop
# Configuration options

def run_Roberta(fold):
    """Fine-tune the Roberta regressor with ``fold`` held out.

    Reads ``./train_folds.csv``, trains for three epochs on the rows whose
    ``kfold`` differs from ``fold`` (validating once per epoch) and prints the
    RMSE of the predictions returned by ``train``.

    Args:
        fold: Fold number used as the validation set.

    Returns:
        DataFrame with ``id``, ``target``, ``kfold`` and ``Roberta_pred`` for
        the held-out rows.
    """
    NUM_EPOCHS = 3
    folds_df = pd.read_csv("./train_folds.csv")

    separator = '--------------------------------'
    print(separator)
    print(f'FOLD {fold}')
    print(separator)

    held_out = folds_df.kfold == fold
    df_train = folds_df[~held_out].reset_index(drop=True)
    df_valid = folds_df[held_out].reset_index(drop=True)

    train_loader = DataLoader(Data(data = df_train),
                              batch_size=16,
                              shuffle=True)
    # Default batch size (1) for validation.
    val_loader = DataLoader(Data(data = df_valid))

    model = Roberta(model_config).to(device)
    optimizer = create_optimizer(model)

    print("======================= Start training =================================")

    steps_per_epoch = len(train_loader)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=50,
                                                num_training_steps=NUM_EPOCHS * steps_per_epoch)

    # valid_period == steps_per_epoch: evaluate once at the end of each epoch.
    pred = train(model=model,
                 optimizer=optimizer,
                 train_iter=train_loader,
                 valid_iter=val_loader,
                 loss_fct=torch.nn.MSELoss(),
                 fold=fold,
                 valid_period=steps_per_epoch,
                 num_epochs=NUM_EPOCHS,
                 scheduler=scheduler)

    rmse = sqrt(metrics.mean_squared_error(df_valid.target.values, pred))
    print(f"fold={fold}, RMSE={rmse}")

    df_valid["Roberta_pred"] = pred

    return df_valid[["id","target","kfold","Roberta_pred"]]

# Run every fold and store the combined out-of-fold predictions.
dfs = [run_Roberta(fold_id) for fold_id in range(k_folds)]
fin_valid_df = pd.concat(dfs)
fin_valid_df.to_csv("./model_preds/Roberta.csv", index=False)


In [None]:
import glob
import numpy as np

# Combine the out-of-fold prediction files into one frame keyed by "id".
# Every file repeats the id/target/kfold columns, so only the first file
# keeps them. Merging the duplicates produced target_x/target_y/... columns,
# and the suffixed names collide once more than three files are merged.
files = glob.glob("./model_preds/*.csv")
df = None
for f in files :
    temp_df = pd.read_csv(f)
    if df is None:
        df = temp_df
    else:
        temp_df = temp_df.drop(columns=["target", "kfold"], errors="ignore")
        df = df.merge(temp_df, on="id", how="left")

print(list(df.columns))
targets = df["target"].values

pred_cols = ["XGBR_roberta_pred", "Roberta_pred"]

# RMSE of each model on its out-of-fold predictions.
for col in pred_cols:
    rmse = sqrt(metrics.mean_squared_error(targets, df[col].values))
    print(f"{col}, overall_rmse={rmse}")

print("average")
avg_pred = np.mean(df[pred_cols].values, axis=1)
print(sqrt(metrics.mean_squared_error(targets, avg_pred)))

print("weighted average")
XGB_roberta_pred = df.XGBR_roberta_pred.values
Roberta_pred = df.Roberta_pred.values

# Hand-picked 1:2 weighting of the two models.
avg_pred = (Roberta_pred + 2*XGB_roberta_pred)/3
print(sqrt(metrics.mean_squared_error(targets, avg_pred)))

# Blending

In [None]:
#optimal weights
import glob
import numpy as np
from functools import partial
from scipy.optimize import fmin

class OptimizeRMSE:
    """Search for linear blending weights that minimise RMSE.

    ``fit`` starts from random weights drawn from a Dirichlet distribution
    and refines them with ``scipy.optimize.fmin``; the result is stored in
    ``coef_``.
    """

    def __init__(self):
        self.coef_ =0

    def _rmse(self, coef, X, y):
        """Return the RMSE of the blend ``sum(X * coef, axis=1)`` against ``y``."""
        blended = np.sum(X*coef, axis=1)
        return 1.0 * sqrt(metrics.mean_squared_error(y, blended))

    def fit(self, X, y):
        """Find the weights for the columns of ``X`` that best reproduce ``y``."""
        start = np.random.dirichlet(np.ones(X.shape[1]))
        self.coef_ = fmin(lambda coef: self._rmse(coef, X=X, y=y), start, disp=True)

    def predict(self, X):
        """Return the blend of the columns of ``X`` using the fitted weights."""
        return np.sum(X*self.coef_, axis=1)
    
def run_training(pred_df, fold):
    """Fit blending weights on every fold except ``fold``.

    Args:
        pred_df: Merged prediction frame whose column 1 holds the target and
            column 2 the fold number, plus the ``XGBR_roberta_pred`` and
            ``Roberta_pred`` columns.
        fold: Fold number left out of the fit.

    Returns:
        Weights found by ``OptimizeRMSE`` for ``XGBR_roberta_pred`` and
        ``Roberta_pred``, in that order.
    """
    # The held-out rows are not needed to fit the weights; the validation
    # split that was computed here before was never used.
    train_df = pred_df[pred_df.iloc[:,2] != fold].reset_index(drop=True)

    xtrain = train_df[["XGBR_roberta_pred", "Roberta_pred"]].values

    opt = OptimizeRMSE()
    opt.fit(xtrain, train_df.iloc[:,1].values)
    return opt.coef_
     
# Combine the out-of-fold prediction files into one frame keyed by "id".
# Every file repeats the id/target/kfold columns, so only the first file
# keeps them. Merging the duplicates produced target_x/target_y/... columns,
# and the suffixed names collide once more than three files are merged. The
# resulting layout (id, target, kfold, predictions...) is what run_training
# addresses by position.
files = glob.glob("./model_preds/*.csv")
df = None
for f in files :
    temp_df = pd.read_csv(f)
    if df is None:
        df = temp_df
    else:
        temp_df = temp_df.drop(columns=["target", "kfold"], errors="ignore")
        df = df.merge(temp_df, on="id", how="left")

targets = df["target"].values
pred_cols = ["XGBR_roberta_pred", "Roberta_pred"]

# One set of weights per left-out fold.
coefs = []
for j in range(k_folds):
    coefs.append(run_training(df, j))
coefs = np.array(coefs)
print(coefs)

# Final blending weights: the mean over the folds.
coefs = np.mean(coefs, axis=0)
print(coefs)

wt_avg = (coefs[0]*df.XGBR_roberta_pred.values 
          + coefs[1]*df.Roberta_pred.values
         )
print("optimal rmse after finding optimal coefs")
print(sqrt(metrics.mean_squared_error(targets, wt_avg)))

# Test

In [None]:
# Data preprocessing

# Load the test data.
test_df = pd.read_csv(test_file)

# Add a placeholder target column to the test set so that it can be fed
# through the same evaluation code as the training data.
test_df.insert(loc=1, column='target', value=0, allow_duplicates=False)
test_df.to_csv("./test/test_with_col_target.csv", index=False)

# Pre-processed variant of the test set.
test_preprop = preprop(test_df)
test_preprop.to_csv("./test/test_preprop.csv", index=False)

# Load models

In [None]:
# Embed the test excerpts with the same pretrained model that produced the
# training embeddings.
df_test = pd.read_csv("./test/test_with_col_target.csv")
MODEL_NAME = Path('../input/modelf1')
test_embeddings = get_embeddings(df_test,MODEL_NAME)

df_xgboost_roberta= pd.DataFrame({
     "id" : df_test.id.values
  })

# Predict with each fold's XGBoost model.
for j in range(k_folds):
    # The context manager closes the file; the bare open() call leaked one
    # handle per fold. Only files written by this notebook are unpickled --
    # never point this at untrusted files.
    with open(f'./models/XGBR_roberta_fold_{j}.sav','rb') as model_file:
        xgb_roberta_reloaded = pickle.load(model_file)
    df_xgboost_roberta[f"fold_{j}"] = xgb_roberta_reloaded.predict(test_embeddings)

# Average the per-fold predictions (every column after "id").
df_xgboost_roberta['target_xgb_roberta'] = df_xgboost_roberta.iloc[:, 1:].mean(axis=1)
df_xgboost_roberta = df_xgboost_roberta[['id','target_xgb_roberta']]

In [None]:
#ROBERTA_Base
df_test = pd.read_csv("./test/test_with_col_target.csv") #Better results
test_data = Data(data = df_test) 
test_loader = DataLoader(dataset = test_data, shuffle=False)#, batch_size = 64)

df_Roberta= pd.DataFrame({
     "id" : df_test.id.values
  })

# Predict with each fold's fine-tuned checkpoint.
for j in range(k_folds):
    torch.cuda.empty_cache() # PyTorch empty cache

    model = Roberta(model_config)
    load_path = f'./models/RobertaBase_model-fold-{j}.pth'
    # map_location lets a checkpoint that was saved from the GPU load on
    # whatever device is in use now; without it loading fails on CPU-only
    # machines.
    model.load_state_dict(torch.load(load_path, map_location=device))
    model = model.to(device)

    # Print about testing
    print(f'Starting testing - fold_{j}')
    model_return = test(model=model,
        test_loader = test_loader
        )

    df_Roberta[f"fold_{j}"] = model_return

# Average the per-fold predictions (every column after "id").
df_Roberta['target_Roberta'] = df_Roberta.iloc[:, 1:].mean(axis=1)
df_Roberta = df_Roberta[['id','target_Roberta']]

In [None]:
# Blend the two test predictions with the weights stored in `coefs`
# (coefs[0] for the XGBoost model, coefs[1] for the fine-tuned model).
test_avg = (coefs[0]*df_xgboost_roberta.target_xgb_roberta.values
          + coefs[1]*df_Roberta.target_Roberta.values
         )

In [None]:
# Build the submission frame (id + blended target) and write it to disk.
df= pd.DataFrame({
     "id" : df_test.id.values
  })
df['target']=test_avg

df.to_csv(f"{output_path}/submission.csv",index=False)
