In [None]:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from torchvision.utils import make_grid
from torch.utils.data import random_split

import pandas as pd
import seaborn as sns
import gc
import time
from tqdm import tqdm
import datatable as dt
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.model_selection import StratifiedKFold,KFold
warnings.filterwarnings("ignore")
%matplotlib inline

import os
import random

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel

from colorama import Fore, Back, Style
red = Fore.RED
grn = Fore.GREEN
blu = Fore.BLUE
ylw = Fore.YELLOW
wht = Fore.WHITE

import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff

from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool, CatBoost

In [None]:
# Notebook-wide settings: the random seed and the number of cross-validation folds.
config = dict(SEED=43, FOLDS=5)

In [None]:
# Directory holding the competition CSV files; each file name is appended to it.
path = '../input/commonlitreadabilityprize/'
train, test, sample = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Regression target as a plain NumPy array.
y_train = train['target'].to_numpy()

# Discretise the continuous target into `nbins` integer-coded intervals
# (labels=False yields the bin index) and keep the codes both as a column
# on `train` and as a standalone array.
nbins = 12
train['bins'] = pd.cut(train['target'], bins=nbins, labels=False)
bins = train['bins'].to_numpy()

In [None]:
def seed_everything(seed=43):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    configures cuDNN for deterministic behaviour, and records the seed in
    the global `config` dict under the 'SEED' key.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python recognises is PYTHONHASHSEED. It is read at interpreter
    # start-up, so setting it here only affects processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # undermines the deterministic flag above; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False
    config['SEED'] = seed
seed_everything(43)

In [None]:
class CommonLitDataset:
    """Indexable collection that tokenizes one excerpt per lookup.

    Args:
        excerpt: Indexable sequence of texts; each element is converted
            with `str()` before tokenization.
        tokenizer: Callable invoked as
            `tokenizer(text, max_length=..., padding="max_length", truncation=True)`
            and expected to return a mapping with "input_ids" and
            "attention_mask" entries.
        max_len: Value forwarded to the tokenizer as `max_length`.
    """

    def __init__(self, excerpt, tokenizer, max_len):
        self.excerpt, self.tokenizer, self.max_len = excerpt, tokenizer, max_len

    def __len__(self):
        """Number of excerpts held by the dataset."""
        return len(self.excerpt)

    def __getitem__(self, item):
        """Tokenize the excerpt at position `item`.

        Returns:
            Dict with "input_ids" and "attention_mask", each a
            `torch.long` tensor built from the tokenizer output.
        """
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

In [None]:
def generateEmbeddings(data,model_path, max_len = 256):
    """Embed every excerpt in `data` with a pretrained transformer.

    Loads the model and tokenizer from `model_path`, runs the excerpts
    through the model in batches of 32 without gradient tracking, and keeps
    the vector at sequence position 0 of the model's first output for each
    excerpt (presumably the last hidden state at the [CLS]-style token —
    confirm against the checkpoint's architecture).

    Args:
        data: Object exposing an `excerpt` attribute with a `.values` array
            of texts (e.g. a DataFrame with an `excerpt` column).
        model_path: Path passed to `AutoModel.from_pretrained` and
            `AutoTokenizer.from_pretrained`.
        max_len: Tokenizer `max_length`; excerpts are padded/truncated to it.

    Returns:
        `np.ndarray` built from the per-excerpt embedding vectors, in the
        same order as `data.excerpt`.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = AutoModel.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    dataset = CommonLitDataset(excerpt=data.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        num_workers=4,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
        pin_memory=(device.type == 'cuda'),
        shuffle=False,
    )

    embeddings = list()

    with torch.no_grad():
        # Iterate the loader directly: tqdm can then read len(data_loader) and
        # show a real total, which wrapping it in enumerate() used to hide.
        for inputs in tqdm(data_loader):
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs[0][:,0].detach().cpu().numpy()
            embeddings.extend(outputs)

    # Drop the model before emptying the cache; while it is still referenced,
    # its weights are "in use" and empty_cache() cannot release them.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return np.array(embeddings)

In [None]:
# One (train checkpoint path, test checkpoint path) pair per fine-tuned model.
# Both paths of a pair point at the same checkpoint directory; the very first
# train path is simply written without a trailing slash.
_embedding_sources = [
    ("../input/experimental-models-clrp/CLRPmodel_0_1", "../input/experimental-models-clrp/CLRPmodel_0_1/"),
    ("../input/experimental-models-clrp/CLRPmodel_1_1/", "../input/experimental-models-clrp/CLRPmodel_1_1/"),
    ("../input/experimental-models-clrp/CLRPmodel_2_1/", "../input/experimental-models-clrp/CLRPmodel_2_1/"),
    ("../input/experimental-models-clrp/CLRPmodel_3_2/", "../input/experimental-models-clrp/CLRPmodel_3_2/"),
    ("../input/experimental-models-clrp/CLRPmodel_4_5/", "../input/experimental-models-clrp/CLRPmodel_4_5/"),
    ("../input/experimental-models-clrp/CLRPmodel_5_4/", "../input/experimental-models-clrp/CLRPmodel_5_4/"),
    ("../input/experimental-models-clrp/CLRPmodel_6_5/", "../input/experimental-models-clrp/CLRPmodel_6_5/"),
    ("../input/experimental-models-clrp/CLRPmodel_7_3/", "../input/experimental-models-clrp/CLRPmodel_7_3/"),
    ("../input/experimental-models-clrp/CLRPmodel_8_4/", "../input/experimental-models-clrp/CLRPmodel_8_4/"),
    ("../input/experimental-models-clrp/CLRPmodel_9_3/", "../input/experimental-models-clrp/CLRPmodel_9_3/"),
]

# For each model, embed the train set first and then the test set, so the
# calls happen in the order train1, test1, train2, test2, ...
_embedding_pairs = [
    (generateEmbeddings(train, train_dir), generateEmbeddings(test, test_dir))
    for train_dir, test_dir in _embedding_sources
]

# Expose the results under the numbered names the later cells use.
(
    (trainEmbeddings1, testEmbeddings1),
    (trainEmbeddings2, testEmbeddings2),
    (trainEmbeddings3, testEmbeddings3),
    (trainEmbeddings4, testEmbeddings4),
    (trainEmbeddings5, testEmbeddings5),
    (trainEmbeddings6, testEmbeddings6),
    (trainEmbeddings7, testEmbeddings7),
    (trainEmbeddings8, testEmbeddings8),
    (trainEmbeddings9, testEmbeddings9),
    (trainEmbeddings10, testEmbeddings10),
) = _embedding_pairs

# Final Fitting

In [None]:
def rmse_score(targets,outputs):
    """Return the root mean squared error between `targets` and `outputs`.

    Computes `mean_squared_error(targets, outputs)` and takes its square root.
    """
    mse = mean_squared_error(targets, outputs)
    return np.sqrt(mse)

In [None]:
import lightgbm as lgb

In [None]:
# Parameter set handed to lgb.train for every fold.
# num_threads is not set here (a value of 16 was previously commented out).
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric=['RMSE'],
    learning_rate=0.005,
    num_leaves=64,
    max_bin=256,
    num_iterations=2000,
    early_stopping_rounds=100,
    force_col_wise=True,
)

In [None]:
def generatePreds(train_embeds, test_embeds, mod = 'lgbm', y_train = y_train, bins=bins, folds=config["FOLDS"]):
    """Cross-validate a regressor on embeddings and average its test predictions.

    Splits `train_embeds` with an unshuffled `StratifiedKFold` stratified on
    `bins`, fits one model per fold, prints the per-fold and mean validation
    RMSE, and returns the fold-averaged predictions for `test_embeds`.

    Note: the defaults for `y_train`, `bins` and `folds` are evaluated once,
    when this function is defined, from the globals of the same names /
    `config["FOLDS"]`; later changes to those globals are not picked up.

    Args:
        train_embeds: 2-D array of training features, indexable by fold indices.
        test_embeds: 2-D array of test features; `shape[0]` sets the output length.
        mod: 'lgbm' to train with `lgb.train(lgbm_params, ...)`, or 'svm' to
            fit `SVR(C=10, kernel='rbf', gamma='auto')`.
        y_train: Regression targets aligned with `train_embeds`.
        bins: Discrete labels aligned with `train_embeds`, used for stratification.
        folds: Number of cross-validation splits.

    Returns:
        1-D `np.ndarray` with the mean test prediction over all folds.

    Raises:
        ValueError: If `mod` is neither 'lgbm' nor 'svm'.
    """
    # Fail fast: previously an unknown `mod` left `model` unbound and surfaced
    # as an UnboundLocalError inside the first fold.
    if mod not in ('lgbm', 'svm'):
        raise ValueError(f"Unsupported mod {mod!r}; expected 'lgbm' or 'svm'")

    kFold = StratifiedKFold(n_splits=folds)
    rmse = list()
    preds = np.zeros((test_embeds.shape[0]))
    for fold , (train_idx,valid_idx) in enumerate(kFold.split(X=train_embeds,y=bins)):
        train_x,valid_x = train_embeds[train_idx],train_embeds[valid_idx]
        train_y,valid_y = y_train[train_idx],y_train[valid_idx]

        if mod == 'lgbm':
            lgb_train = lgb.Dataset(train_x, train_y)
            lgb_valid = lgb.Dataset(valid_x, valid_y, reference = lgb_train)
            model = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_valid], verbose_eval=100)
        else:  # mod == 'svm' (validated above)
            model = SVR(C=10,kernel='rbf',gamma='auto')
            model.fit(train_x,train_y)

        pred = model.predict(valid_x)
        score = rmse_score(valid_y, pred)
        print(f'Fold: {fold} , RMSE : {score}')
        rmse.append(score)
        # Accumulate test predictions; they are averaged over folds on return.
        preds += model.predict(test_embeds)

    print(f'RMSE mean : {np.mean(rmse)}')
    return np.array(preds) / folds

In [None]:
# Pair each model's train embeddings with its test embeddings, then run the
# cross-validated fit once per pair, in model order 1..10.
_prediction_inputs = [
    (trainEmbeddings1, testEmbeddings1),
    (trainEmbeddings2, testEmbeddings2),
    (trainEmbeddings3, testEmbeddings3),
    (trainEmbeddings4, testEmbeddings4),
    (trainEmbeddings5, testEmbeddings5),
    (trainEmbeddings6, testEmbeddings6),
    (trainEmbeddings7, testEmbeddings7),
    (trainEmbeddings8, testEmbeddings8),
    (trainEmbeddings9, testEmbeddings9),
    (trainEmbeddings10, testEmbeddings10),
]
(
    preds1, preds2, preds3, preds4, preds5,
    preds6, preds7, preds8, preds9, preds10,
) = [
    generatePreds(train_part, test_part)
    for train_part, test_part in _prediction_inputs
]

In [None]:
# Unweighted ensemble: add the ten per-model predictions left to right,
# then divide by the number of models.
_model_preds = [preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, preds10]
preds = _model_preds[0]
for _single_model_preds in _model_preds[1:]:
    # `a + b` builds a new array each time, so preds1 itself is never modified.
    preds = preds + _single_model_preds
preds = preds / 10

In [None]:
# Use the sample submission as a template, overwrite its target column with
# the ensemble predictions, and write the file for submission.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
# Item assignment always writes a column; attribute assignment
# (`submission.target = ...`) would only set a plain Python attribute if the
# column were ever missing, leaving the CSV without predictions.
submission["target"] = preds
submission.to_csv("submission.csv", index=False)

In [None]:
# Bare last expression so the notebook renders the submission frame as this cell's output.
submission