In [None]:
# ========================================
# library
# ========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import mean_squared_error
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import transformers
from transformers import RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
import logging
import sys
from contextlib import contextmanager
import time
import random
from tqdm import tqdm
import os
import lightgbm as lgb
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import Ridge,Lasso,ElasticNet

In [None]:
# ==================
# Constant
# ==================
# Experiment id; it prefixes the log file name and the model artifact base name.
ex = "015"
# Fixed: the file extension was misspelled as ".csvv", which pointed at a
# file that does not exist.
TRAIN_PATH = "../input/commonlitreadabilityprize/train.csv"
LOGGER_PATH = f"ex{ex}.txt"
FOLD_PATH = "../input/fe001-step-1-create-folds/fe001_train_folds.csv"
MODEL_PATH_BASE = f"ex{ex}"
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# ===============
# Settings
# ===============
# Seed / fold configuration and DataLoader worker count.
SEED = 0
N_SPLITS = 5
SHUFFLE = True
num_workers = 4

# Batch size and epoch / early-stopping settings.
BATCH_SIZE = 24
n_epochs = 5
es_patience = 3

# Tokenizer sequence length and optimizer-related values.
max_len = 256
weight_decay = 0.1
lr = 5e-5
num_warmup_steps = 10

# Pretrained RoBERTa directory; the tokenizer is loaded from the same place.
MODEL_PATH = "../input/roberta-transformers-pytorch/roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH)

# Booster parameters handed to LightGBM.
LGBM_PARAMS = {
    "num_leaves": 64,
    "min_data_in_leaf": 16,
    "objective": "regression",
    "max_depth": -1,
    "learning_rate": 0.05,
    "boosting": "gbdt",
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": SEED,
    "verbosity": -1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.3,
    "colsample_bytree": 0.7,
    "metrics": "rmse",
    "num_threads": 4,
}

# Extra keyword arguments unpacked into the ``lgb.train`` call.
LGBM_FIT_PARAMS = dict(
    num_boost_round=3500,
    early_stopping_rounds=50,
    verbose_eval=50,
)

In [None]:
# ===============
# Functions
# ===============

class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Every item is a dict of ``torch.long`` tensors under the keys
    ``input_ids``, ``attention_mask`` and ``token_type_ids``. When ``target``
    was supplied, a ``float32`` tensor is added under the key ``target``.
    """

    def __init__(self, excerpt, tokenizer, max_len, target=None):
        self.excerpt, self.tokenizer = excerpt, tokenizer
        self.max_len, self.target = max_len, target

    def __len__(self):
        # One sample per excerpt.
        return len(self.excerpt)

    def __getitem__(self, item):
        # Pad / truncate every excerpt to exactly ``max_len`` tokens.
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        # The label is attached only when the dataset was built with targets.
        if self.target is not None:
            sample["target"] = torch.tensor(self.target[item], dtype=torch.float32)
        return sample

class roberta_model(nn.Module):
    """RoBERTa encoder followed by a small fully connected head.

    ``forward`` returns a pair: the head's output (one value per sample) and
    the encoder's ``pooler_output`` that was fed into the head.
    """

    def __init__(self):
        super().__init__()
        # Attribute names become state_dict keys, so they must stay as they are.
        self.roberta = RobertaModel.from_pretrained(MODEL_PATH)
        self.drop = nn.Dropout(0.2)
        self.fc = nn.Linear(768, 256)
        self.layernorm = nn.LayerNorm(256)
        self.drop2 = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.out = nn.Linear(256, 1)

    def forward(self, ids, mask, token_type_ids):
        # Pooled sentence representation from the encoder.
        encoder_out = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids)
        emb = encoder_out['pooler_output']
        # Head: dropout -> linear -> layer norm -> dropout -> ReLU -> linear.
        hidden = self.layernorm(self.fc(self.drop(emb)))
        prediction = self.out(self.relu(self.drop2(hidden)))
        return prediction, emb
    
    
def calc_loss(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
    
def set_seed(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device; this call is a
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can select different kernels from run to run, so it
    # has to be switched off for results to be repeatable.
    torch.backends.cudnn.benchmark = False



def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """(Re)configure the module-level ``LOGGER`` and return it.

    Relies on the module-level ``LOGGER`` and ``FORMATTER`` objects existing
    by the time it is called.

    Args:
        out_file: Path of a log file to write to; ``None`` disables file logging.
        stderr: Whether to also log to ``sys.stderr``.
        stderr_level: Minimum level emitted on ``sys.stderr``.
        file_level: Minimum level written to ``out_file``.

    Returns:
        The configured ``LOGGER``.
    """
    # Detach AND close the previous handlers. Merely replacing the list would
    # leave an earlier FileHandler's file open each time this is called again
    # (for example when the notebook cell is re-run).
    for old_handler in list(LOGGER.handlers):
        LOGGER.removeHandler(old_handler)
        old_handler.close()

    # The logger itself must let through the most verbose of the two levels.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER

def train_lgbm(X_train, y_train, X_valid, y_valid, X_test, categorical_features, feature_name, fold_id, lgb_params, fit_params, loss_func, calc_importances=True):
    """Fit one LightGBM model and score it on the optional validation / test data.

    Args:
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels, or ``None`` to train
            without a validation set.
        X_test: Features to predict after training, or ``None``.
        categorical_features: Passed to ``lgb.Dataset`` as ``categorical_feature``.
        feature_name: Feature names passed to ``lgb.Dataset``; also the
            ``feature`` column of the importance table.
        fold_id: Value stored in the ``fold`` column of the importance table.
        lgb_params: Booster parameters for ``lgb.train``.
        fit_params: Extra keyword arguments unpacked into ``lgb.train``.
        loss_func: Callable ``loss_func(y_true, y_pred)`` used for the
            validation score.
        calc_importances: Whether to build the feature importance table.

    Returns:
        Tuple ``(y_pred_valid, y_pred_test, valid_loss, importances,
        best_iteration, model)``. ``y_pred_valid`` and ``valid_loss`` are
        ``None`` without validation data, ``y_pred_test`` is ``None`` without
        ``X_test`` and ``importances`` is ``None`` when ``calc_importances``
        is false.
    """
    has_valid = X_valid is not None

    train = lgb.Dataset(X_train, y_train,
                        categorical_feature=categorical_features,
                        feature_name=feature_name)

    if has_valid:
        valid = lgb.Dataset(X_valid, y_valid,
                            categorical_feature=categorical_features,
                            feature_name=feature_name)
        model = lgb.train(
            lgb_params,
            train,
            valid_sets=[train, valid],
            **fit_params
        )
    else:
        # Early stopping needs an evaluation set; forwarding
        # ``early_stopping_rounds`` without one makes LightGBM raise, so it
        # is dropped when there is no validation data.
        fit_params_no_es = {
            key: value for key, value in fit_params.items()
            if key != 'early_stopping_rounds'
        }
        model = lgb.train(
            lgb_params,
            train,
            **fit_params_no_es
        )

    # validation score
    if has_valid:
        y_pred_valid = model.predict(X_valid)
        valid_loss = loss_func(y_valid, y_pred_valid)
    else:
        y_pred_valid = None
        valid_loss = None

    # test prediction
    y_pred_test = model.predict(X_test) if X_test is not None else None

    if calc_importances:
        importances = pd.DataFrame()
        importances['feature'] = feature_name
        importances['gain'] = model.feature_importance(importance_type='gain')
        importances['split'] = model.feature_importance(importance_type='split')
        importances['fold'] = fold_id
    else:
        importances = None

    return y_pred_valid, y_pred_test, valid_loss, importances, model.best_iteration, model


@contextmanager
def timer(name):
    """Context manager that logs, via ``LOGGER``, how long its body took.

    The message is emitted only when the body finishes without raising.
    """
    started = time.time()
    yield
    elapsed = time.time() - started
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')
    
    
# Root logger and the single message format shared by all of its handlers.
LOGGER = logging.getLogger()
FORMATTER = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s")
# Attach the stderr handler and the log-file handler.
setup_logger(LOGGER_PATH)

In [None]:
# ================================
# Main
# ================================
# Training table, its "target" column (used as the label), and the fold
# assignment table read from FOLD_PATH.
train = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
y = train.loc[:, "target"]
fold_df = pd.read_csv(FOLD_PATH)

In [None]:
# Per-row fold id as a NumPy array; the training loop compares it against the
# fold index to build the train / validation row masks.
fold_array = fold_df["kfold"].values

In [None]:
# ================================
# train
# ================================
def _extract_embeddings(encoder, loader):
    """Return the encoder's pooled embeddings for every row served by ``loader``.

    The loader must not shuffle, so that row ``i`` of the result belongs to
    row ``i`` of the data the dataset was built from.
    """
    encoder.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            _, emb = encoder(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["token_type_ids"].to(device),
            )
            chunks.append(emb.detach().cpu().numpy())
    if not chunks:
        return np.empty((0, 768), dtype=np.float64)
    # Concatenate once instead of re-copying the growing array per batch, and
    # keep float64 - the dtype the earlier per-batch concatenation produced.
    return np.concatenate(chunks, axis=0).astype(np.float64)


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle right away."""
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)


with timer("lgbm"):
    set_seed(SEED)
    # Out-of-fold predictions, one array per second-stage model.
    oof_lgb = np.zeros([len(train)])
    oof_svr = np.zeros([len(train)])
    oof_ridge = np.zeros([len(train)])
    oof_lasso = np.zeros([len(train)])
    oof_elastic = np.zeros([len(train)])

    for fold in range(N_SPLITS):
        val_mask = fold_array == fold
        train_df, y_train = train.iloc[~val_mask], y.iloc[~val_mask]
        val_df, y_val = train.iloc[val_mask], y.iloc[val_mask]

        # loader: shuffle stays off so the embeddings line up with y_train / y_val
        train_loader = DataLoader(
            dataset=CommonLitDataset(train_df["excerpt"].values, tokenizer, max_len),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers,
        )
        val_loader = DataLoader(
            dataset=CommonLitDataset(val_df["excerpt"].values, tokenizer, max_len),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers,
        )

        # model: load the weights onto the CPU first so a checkpoint saved on
        # a GPU can still be restored on a CPU-only machine.
        encoder = roberta_model()
        encoder.load_state_dict(
            torch.load(f"../input/ex014-roberta/ex014_{fold}.pth", map_location="cpu")
        )
        encoder.to(device)

        # make embedding
        train_emb = _extract_embeddings(encoder, train_loader)
        val_emb = _extract_embeddings(encoder, val_loader)
        del encoder  # the encoder is not needed once the embeddings exist

        # lgbm
        features = [f"emb_{i}" for i in range(train_emb.shape[1])]
        x_train = pd.DataFrame(train_emb, columns=features)
        x_val = pd.DataFrame(val_emb, columns=features)
        categorical_features = []
        y_pred_valid, y_pred_test, valid_loss, importances, best_iter, lgb_model = train_lgbm(
            x_train, y_train, x_val, y_val, None,
            categorical_features=categorical_features,
            feature_name=features,
            fold_id=fold,
            lgb_params=LGBM_PARAMS,
            fit_params=LGBM_FIT_PARAMS,
            loss_func=calc_loss,
            calc_importances=True,
        )
        oof_lgb[val_mask] = y_pred_valid
        print(f"fold_lgb:{fold}:{valid_loss}")
        _save_pickle(lgb_model, f"ex015_lgb_roberta_emb_{fold}.pkl")

        # svr / ridge / lasso / elastic: fresh estimators every fold, fitted
        # and saved in the same order as before.
        sk_models = (
            ("svr", SVR(C=10, kernel="rbf", gamma='auto'), oof_svr),
            ("ridge", Ridge(alpha=1), oof_ridge),
            ("lasso", Lasso(alpha=0.0001), oof_lasso),
            ("elastic", ElasticNet(alpha=0.0001), oof_elastic),
        )
        for name, estimator, oof in sk_models:
            estimator.fit(x_train, y_train)
            pred = estimator.predict(x_val)
            oof[val_mask] = pred
            score = calc_loss(y_val, pred)
            print(f"fold_{name}:{fold}:{score}")
            _save_pickle(estimator, f"ex015_{name}_roberta_emb_{fold}.pkl")

In [None]:
# Write each model's out-of-fold prediction array to its own .npy file.
for tag, oof_values in (
    ("lgb", oof_lgb),
    ("svr", oof_svr),
    ("lasso", oof_lasso),
    ("ridge", oof_ridge),
    ("elastic", oof_elastic),
):
    np.save(f"ex015_{tag}.npy", oof_values)