#### Thanks to Abhishek Thakur for the AutoNLP-trained language models. In this kernel we load those models and extract their last-layer embeddings, which are then used as features for a machine-learning model.

In [None]:
import os
import pandas as pd
import numpy as np
import random
import gc
import tqdm


from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import DataLoader, Dataset
from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

In [None]:
# Set Configs/Constants

class config:
    """Notebook-wide constants, read as plain class attributes (never instantiated here)."""

    # Reproducibility: passed to set_seed() right after it is defined.
    SEED = 42

    # Tokenisation and batching: read by ReadabiltyDataset and get_embeddings.
    TEXT_COLUMN = 'excerpt'
    MAX_LEN = 256
    TRAIN_BATCH_SIZE = 128

    # The remaining settings are not referenced by the visible cells.
    VAL_BATCH_SIZE = 64
    ROBERTA_MODEL_PATH = '../input/roberta-base'
    EPOCHS = 3
    LR = 1e-5

In [None]:
def set_seed(seed = 0):
    """Seed every random number generator this notebook touches.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU and the current CUDA device), switches cuDNN to deterministic mode
    with benchmarking disabled, and stores the seed in ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators.

    Returns:
        A fresh ``np.random.RandomState`` initialised with ``seed``.
    """
    # Same seed for each library-level generator, NumPy first.
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    os.environ['PYTHONHASHSEED'] = str(seed)
    return np.random.RandomState(seed)

# Seed all generators once with the configured seed and keep the NumPy
# RandomState that set_seed returns.
random_state = set_seed(config.SEED)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
def create_kfolds(df, target_col, seed):
    """Return a shuffled copy of ``df`` with a 5-fold ``kfold`` assignment column.

    Args:
        df: Input DataFrame. It is not modified; a shuffled copy is returned.
        target_col: Unused; kept so the signature matches ``create_Stratkfolds``.
        seed: Random seed used for both the row shuffle and the ``KFold``
            split, so the same seed always yields the same folds.

    Returns:
        The shuffled DataFrame (index reset) with an integer ``kfold`` column
        holding the validation-fold number (0-4) of every row.
    """
    # Shuffle with the caller's seed (previously the shuffle drew from the
    # global NumPy state, so `seed` alone did not determine the folds).
    # Shuffling first also means the "kfold" column is only ever written to
    # the copy, never to the caller's frame.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df["kfold"] = -1

    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df)):
        print(len(train_idx), len(val_idx))
        # The index was reset above, so positional indices are valid labels.
        df.loc[val_idx, 'kfold'] = fold

    return df

def create_Stratkfolds(df, target_col, seed):
    """Return a shuffled copy of ``df`` with stratified 5-fold assignments.

    The continuous target is binned so that ``StratifiedKFold`` can balance
    the target distribution across folds.

    Args:
        df: Input DataFrame. It is not modified; a shuffled copy is returned.
        target_col: Name of the numeric column to stratify on.
        seed: Random seed used for both the row shuffle and the stratified
            split, so the same seed always yields the same folds.

    Returns:
        The shuffled DataFrame (index reset) with two added columns:
        ``bins`` (the integer bin of the target) and ``kfold`` (the
        validation-fold number, 0-4).
    """
    # Shuffle with the caller's seed (previously the shuffle drew from the
    # global NumPy state, so `seed` alone did not determine the folds).
    # Shuffling first also keeps the helper columns off the caller's frame.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df["kfold"] = -1

    ### This was taken from https://www.kaggle.com/abhishek/step-1-create-folds
    # Number of bins by Sturges' rule, taking the floor of the value.
    num_bins = int(np.floor(1 + np.log2(len(df))))

    # Bin the targets; labels=False yields integer bin indices.
    df["bins"] = pd.cut(df[target_col], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.bins.values)):
        print(len(train_idx), len(val_idx))
        # The index was reset above, so positional indices are valid labels.
        df.loc[val_idx, 'kfold'] = fold

    return df

In [None]:
# Reading Data
# The competition CSVs are read from the Kaggle input directory (relative paths).
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
# NOTE(review): `sample` is not referenced by any of the visible cells.
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# The 'target' column of the training data as a NumPy array; passed to
# run_lgb as the regression labels.
target = train['target'].to_numpy()

In [None]:
class ReadabiltyDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    This is a ``torch.utils.data.Dataset`` (it was previously declared as an
    ``nn.Module`` whose initializer was never called, which only worked by
    accident of how attribute assignment is handled).

    Args:
        data: DataFrame holding the texts.
        tokenizer: Callable applied to a single text; it is called with
            ``return_tensors='pt'``, ``max_length``, ``padding='max_length'``
            and ``truncation=True``.
        max_len: Optional token length to pad/truncate to. Defaults to
            ``config.MAX_LEN``.
        text_column: Optional name of the text column. Defaults to
            ``config.TEXT_COLUMN``.
    """

    def __init__(self, data, tokenizer, max_len=None, text_column=None):
        super().__init__()
        # Fall back to the notebook-wide config only when no override is given.
        column = config.TEXT_COLUMN if text_column is None else text_column
        self.sentences = data[column].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = config.MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenizer output for the text at position ``idx``.

        Every item is padded/truncated to ``self.max_len`` so items can be
        batched together.
        """
        # NOTE(review): with return_tensors='pt' a Hugging Face tokenizer
        # presumably returns tensors with a leading axis of size 1; the
        # consumer is expected to flatten that axis after batching.
        return self.tokenizer(
            self.sentences[idx],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

In [None]:
# Thanks to this kernel to help me load the embeddings : https://www.kaggle.com/lars123/neural-tangent-kernel-2
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every row of ``df`` with the pretrained model stored at ``path``.

    Args:
        df: DataFrame handed to ``ReadabiltyDataset`` for tokenization.
        path: Directory (or model id) given to ``AutoModel`` and
            ``AutoTokenizer``.
        plot_losses: Unused.
        verbose: Unused.

    Returns:
        ``np.ndarray`` built from one vector per row, in the row order of
        ``df`` (the loader does not shuffle and keeps the last partial batch).
    """
    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    loader = DataLoader(
        ReadabiltyDataset(df, tokenizer),
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    vectors = []
    with torch.no_grad():
        for _, batch in tqdm.tqdm(enumerate(loader)):
            # Collapse everything after the batch axis so each input is 2-D,
            # then move it to the model's device.
            flat_batch = {}
            for name, tensor in batch.items():
                flat_batch[name] = tensor.reshape(tensor.shape[0], -1).to(device)

            # First model output, index 0 along the second axis — presumably
            # the last hidden state at the first token position.
            first_output = model(**flat_batch)[0]
            first_position = first_output[:, 0]
            vectors.extend(first_position.detach().cpu().numpy())
    return np.array(vectors)

In [None]:
# Extract train and test embeddings from each of the five model directories
# (modelf1 ... modelf5). Every get_embeddings call loads the model and
# tokenizer again, so each directory is loaded twice.
train_embeddings1 =  get_embeddings(train,'../input/modelf1')
test_embeddings1 = get_embeddings(test,'../input/modelf1')

train_embeddings2 =  get_embeddings(train,'../input/modelf2')
test_embeddings2 = get_embeddings(test,'../input/modelf2')

train_embeddings3 =  get_embeddings(train,'../input/modelf3')
test_embeddings3 = get_embeddings(test,'../input/modelf3')

train_embeddings4 =  get_embeddings(train,'../input/modelf4')
test_embeddings4 = get_embeddings(test,'../input/modelf4')

train_embeddings5 =  get_embeddings(train,'../input/modelf5')
test_embeddings5 = get_embeddings(test,'../input/modelf5')

In [None]:
def runLGB_reg(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0, data_leaf=50, rounds=20000):
    """Train one LightGBM regressor and predict on up to two feature sets.

    Args:
        train_X: Training features.
        train_y: Training targets.
        test_X: Features to predict on; used as the validation set when
            ``test_y`` is given.
        test_y: Optional targets for ``test_X``. When given, training uses
            early stopping (200 rounds) on this set and the RMSE is reported.
        test_X2: Optional second feature set to predict on.
        dep: ``max_depth`` of the trees.
        seed: Seed for feature and bagging sub-sampling.
        data_leaf: ``min_data_in_leaf``.
        rounds: Maximum number of boosting rounds.

    Returns:
        ``(model, loss, pred_test_y, pred_test_y2)``. ``loss`` is the RMSE on
        ``test_X`` when ``test_y`` is given, otherwise ``0``.
        ``pred_test_y2`` is ``None`` when ``test_X2`` is not given.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "max_depth": dep,
        "num_leaves": 30,
        "min_data_in_leaf": data_leaf,
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.2,
        "feature_fraction_seed": seed,
        "bagging_freq": 1,
        "bagging_seed": seed,
        "lambda_l2": 3,
        "lambda_l1": 3,
        "verbosity": -1,
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are believed
        # to have been removed from lgb.train in LightGBM 4.0 in favour of
        # callbacks — confirm against the installed version before upgrading.
        model = lgb.train(params, lgtrain, rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=500)
    else:
        # No validation data: train for the full number of rounds.
        # (LightGBM has no `DMatrix`; predict() takes the raw features.)
        model = lgb.train(params, lgtrain, rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)

    # The second feature set is optional, so only predict when it was passed.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y))
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [None]:
def run_lgb(X_train, y_train, test_X):
    """Cross-validate a two-model LightGBM blend and predict the test set.

    For each of 5 shuffled folds, two LightGBM regressors with different
    depth / leaf-size / seed settings are trained on the development split;
    their predictions are averaged. The fold RMSE of that average is printed
    as the folds complete, followed by the mean RMSE over all folds.

    Args:
        X_train: 2-D NumPy array of training features (row-indexable).
        y_train: 1-D NumPy array of training targets.
        test_X: Test features to predict on.

    Returns:
        Test-set predictions averaged over all folds (and over the two models
        inside each fold).
    """
    print("Building model..")
    n_splits = 5
    # Hyper-parameter variants blended inside every fold.
    model_settings = (
        {"dep": 6, "data_leaf": 200, "seed": 2019},
        {"dep": 7, "data_leaf": 180, "seed": 9873},
    )

    cv_scores = []
    pred_test_full = 0
    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=7988)
    for dev_index, val_index in kf.split(X_train, y_train):
        dev_X, val_X = X_train[dev_index, :], X_train[val_index, :]
        dev_y, val_y = y_train[dev_index], y_train[val_index]

        pred_val = 0
        pred_test = 0
        for settings in model_settings:
            _, _, pred_v, pred_t = runLGB_reg(dev_X, dev_y, val_X, val_y, test_X, **settings)
            pred_val += pred_v
            pred_test += pred_t

        pred_val /= len(model_settings)
        pred_test /= len(model_settings)

        cv_scores.append(np.sqrt(metrics.mean_squared_error(val_y, pred_val)))
        print(cv_scores)

        # Every fold contributes equally to the final test prediction.
        pred_test_full += pred_test / n_splits

    print(np.mean(cv_scores))
    # Return the average over all folds (previously only the last fold's
    # predictions were returned and the accumulated average was discarded).
    return pred_test_full

In [None]:
# Fit the LightGBM pipeline separately on each embedding set; every call
# returns test-set predictions for that set.
preds_1 = run_lgb(train_embeddings1,target,test_embeddings1)
preds_2 = run_lgb(train_embeddings2,target,test_embeddings2)
preds_3 = run_lgb(train_embeddings3,target,test_embeddings3)
preds_4 = run_lgb(train_embeddings4,target,test_embeddings4)
preds_5 = run_lgb(train_embeddings5,target,test_embeddings5)

In [None]:
# Blend: unweighted mean of the five per-embedding predictions.
final_preds = (preds_1 + preds_2 + preds_3 + preds_4 + preds_5)/5

In [None]:
# Pair each test id with its blended prediction.
submission_df = pd.DataFrame({'id': test.id, 'target': final_preds})
# Bare expression: displayed as the cell output when run in a notebook.
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv', index = False)