# Introduction

In this notebook, I would like to show that **standard_error is a key feature**.  
After adding standard_error to the OOF predictions, an LGBM model is trained on these features and evaluated with cross-validation.

Here is the result that summarizes this finding:

| Stacking Layer                | CV Score | Boost   | 
| ----------------------------- | -------- | ------- | 
| Linear (baseline)             | 0.4567   | -       | 
| LGBM (without standard_error) | 0.4596   | -0.0029 | 
| LGBM (with standard_error)    | 0.3959   | 0.0608  | 

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style='darkgrid')

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

import optuna
import optuna.integration.lightgbm as lgb
optuna.logging.disable_default_handler()

In [None]:
# Random seed used wherever this notebook needs a fixed seed (e.g. LightGBM params).
SEED = 28


def RMSE_(y_pred, y_gt):
    """Return the root mean squared error between two aligned sequences.

    Both arguments are forwarded, in the given order, to
    ``mean_squared_error``; the square root of that value is returned.
    """
    return np.sqrt(mean_squared_error(y_pred, y_gt))

# BERT Variants with Custom Head
I trained many models, varying the:
* architecture (Roberta, Electra, XLNet and Funnel)
* size (base and large)
* head (attention, conv1D, meanpooling and so on).

This is for model diversity.

In [None]:
# CSV that is loaded as the base training frame.
TRAIN = '../input/clrp-stratify-on-predictability/train_oof_stratified.csv'

# (dataset directory under ../input, model class name) for each stacked model.
# The position in this list is the model's `model_type` id.
# Disabled experiments (not part of the stack): 'clrp-electra-base-attentions'
# and 'clrp-xlnet-base-attentions', both with model class 'CLRPModel'.
_STACK_MEMBERS = [
    ('clrp-robertalarge-attentions-mask-act', 'CLRPModelLarge'),
    ('clrp-robertalarge-conv1d-attentions-mask', 'CLRPModelLarge'),
    ('clrp-robertabase-from-colab', 'CLRPModelColab'),
    ('clrp-electralarge-attentions-mask-act', 'CLRPModelLarge'),
    ('clrp-xlnetlarge-attentions-mask', 'CLRPModelLarge'),
    ('clrp-electralarge-attentions-conv1d', 'CLRPModelLarge'),
    ('clrp-robertalarge-meanpooling', 'CLRPModelLarge'),
    ('clrp-funnellarge-attentions-act', 'CLRPModelLarge'),
]

CV_PATHS = [
    {'model_type': idx, 'path': f'../input/{dataset}', 'model_name': class_name}
    for idx, (dataset, class_name) in enumerate(_STACK_MEMBERS)
]

In [None]:
# Base frame; one OOF prediction column per stacked model is attached below.
df = pd.read_csv(TRAIN)

# Names of the attached OOF columns, in CV_PATHS order.
oofs = [f"oof_{cv_path['model_type']}" for cv_path in CV_PATHS]

for oof_col, cv_path in zip(oofs, CV_PATHS):
    oof_frame = pd.read_csv(os.path.join(cv_path['path'], 'oof_df.csv'))
    # Plain column assignment: rows are matched on the index of the two frames.
    df[oof_col] = oof_frame['oof']

## VIF Calculation
I roughly check the model diversity using the VIF (variance inflation factor).  
Even when the heads are different, RoBERTa-large-based models have large VIF values.

In [None]:
# All OOF predictions as one 2-D array: one column per stacked model.
oof_matrix = df.loc[:, oofs].values

# One row per model: its label, the VIF of its OOF column and its own RMSE against the target.
vif_df = pd.DataFrame({
    'model_type': [f'model_{i}' for i in range(len(CV_PATHS))],
    'vif': [vif(oof_matrix, col_idx) for col_idx in range(len(oofs))],
    'score': [RMSE_(df['target'], df[f'oof_{i}']) for i in range(len(CV_PATHS))],
})

vif_df

# Model Selection By OOF to Stacking

In [None]:
# Greedy forward selection of OOF columns for a linear stacking layer.
# Starting from the single best model, every round fits an intercept + linear
# blend for each remaining candidate and keeps the candidate with the lowest RMSE.
# NOTE(review): each blend is fitted and scored on the same rows, so the printed
# 'CV score (stacking)' is an in-sample RMSE -- confirm this is intended.
flag = True
vif_df['use'] = False

# Maps the label used in vif_df ('model_3') to the OOF column in df ('oof_3').
model_oof_dict = {f'model_{i}': f'oof_{i}' for i in range(len(CV_PATHS))}

score = vif_df['score']
best_single = score.idxmin()
print('Single Best Model: ', vif_df.loc[best_single])
print()
vif_df.loc[best_single, 'use'] = True

best_score_before = score.min()
# Seed the history with the single-model score so that entry k (1-based) of
# best_scores is the score of a k-model stack; the plot's '# Models' axis then
# matches the number of models actually stacked.
best_scores = [best_score_before]

# Number of models in the stack being evaluated (the single best one is already in).
n_models = 1

targets = df['target'].values

while flag:
    models_to_add = vif_df.loc[~vif_df['use'], 'model_type'].tolist()
    if not models_to_add:
        # Nothing left to try (e.g. only one model is configured).
        break

    n_models += 1
    print('-' * 40)
    print(f'# Model = {n_models}')

    stacking_score = []
    weights = []
    intercept = []
    for model in models_to_add:
        fe_cols = vif_df.loc[vif_df['use'] | (vif_df['model_type'] == model), 'model_type'].tolist()
        print(fe_cols)
        # Deliberately NOT named `oofs`: that global list must keep holding
        # every OOF column so earlier cells can be re-run safely.
        stack_cols = [model_oof_dict[name] for name in fe_cols]
        features = df[stack_cols].values

        lm = linear_model.LinearRegression(fit_intercept=True)
        lm.fit(features, targets)

        # intercept + sum(coef_i * oof_i), i.e. the fitted linear blend.
        df['oof_stacking'] = lm.predict(features)
        temp_score = RMSE_(df['target'], df['oof_stacking'])
        stacking_score.append(temp_score)
        weights.append(lm.coef_)
        intercept.append(lm.intercept_)
        print('CV score (stacking): ', temp_score)
        print()

    stacking_score = np.array(stacking_score)
    best_arg = np.argmin(stacking_score)

    best_score = stacking_score[best_arg]
    best_model = models_to_add[best_arg]
    best_weight = weights[best_arg]
    best_intercept = intercept[best_arg]

    # Positive delta == lower (better) RMSE than the previous stack.
    delta = best_score_before - best_score

    print('Best Model: ', best_model)
    print('Best Score: ', best_score)
    print('Delta: ', delta)
    print('Weight: ', best_weight)
    print('Intercept: ', best_intercept)

    if delta < 0:
        # The best candidate makes the stack worse: stop WITHOUT selecting it.
        flag = False
        print('Stacking Stop. Score does not increase anymore.')
    else:
        vif_df.loc[vif_df['model_type'] == best_model, 'use'] = True
        best_scores.append(best_score)
        best_score_before = best_score

        # Sum only the boolean column; summing the whole frame also "adds up" the string columns.
        if vif_df['use'].sum() == len(CV_PATHS):
            flag = False

    print()
    print()

plt.figure(figsize=(10, 6))
plt.plot(range(1, len(best_scores) + 1), best_scores, marker='o')
plt.xticks(range(1, len(best_scores) + 1))
plt.title('Stacking Score')
plt.xlabel('# Models')
plt.ylabel('Score')
plt.show()

As more models are added, the boost from each additional model decreases.  
In this stacking, model diversity plays an important role (see the weight for each model).

# Features: BERTs OOF + Standard Error

In [None]:
def show_importance(fe, opt):
    """Plot the gain-based feature importance of a trained booster as a bar chart.

    Args:
        fe: Feature names; expected to line up one-to-one with the values
            returned by ``opt.feature_importance``.
        opt: Object exposing ``feature_importance(importance_type=...)``.
    """
    importance_df = pd.DataFrame({
        'feature': fe,
        'importance': opt.feature_importance(importance_type='gain'),
    })

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=importance_df, y='feature', x='importance', ax=ax)
    ax.set_title('Feature Importance')
    plt.show()
    

class OptLGBM:
    """Fold-wise LightGBM regressor trained through optuna's LightGBM integration.

    The frame passed in is mutated in place: ``__init__`` adds an
    ``'opt_fe_oof'`` column (filled by :meth:`optimize`) and :meth:`retrain`
    adds/overwrites the column named by its ``col`` argument.

    Args:
        df: DataFrame holding the feature columns, the column to predict and a
            ``fold`` column whose values are compared against ``range(n_folds)``.
        fe: List of feature column names.
        opt_fe: Name of the column to predict.
        n_folds: Number of folds to iterate over (fold ids ``0 .. n_folds-1``).

    Attributes (one entry appended per fold by :meth:`optimize`):
        best_scores: Validation RMSE reported by the trained booster.
        best_params: ``params`` of the trained booster.
        best_models: ``best_iteration`` of the trained booster.
    """

    def __init__(self, df, fe, opt_fe='standard_error', n_folds=5):
        self.df = df
        self.fe = fe
        self.opt_fe = opt_fe
        self.n_folds = n_folds

        self.best_scores = []
        self.best_params = []
        self.best_models = []

        # Out-of-fold predictions written by optimize(); NaN until then.
        self.df['opt_fe_oof'] = np.nan

    def _fold_frames(self, fold):
        """Return (train_df, val_df) for ``fold``, restricted to features + target."""
        cols = self.fe + [self.opt_fe]
        is_val = self.df.fold == fold
        train_df = self.df.loc[~is_val, cols].reset_index(drop=True)
        val_df = self.df.loc[is_val, cols].reset_index(drop=True)
        return train_df, val_df

    def _fold_datasets(self, train_df, val_df):
        """Wrap the train/validation frames of one fold into LightGBM datasets."""
        train_ds = lgb.Dataset(train_df.loc[:, self.fe], train_df.loc[:, self.opt_fe])
        val_ds = lgb.Dataset(val_df.loc[:, self.fe], val_df.loc[:, self.opt_fe])
        return train_ds, val_ds

    def optimize(self, num_boost_round=1000, early_stopping_rounds=500):
        """Train one booster per fold and store its score, params and OOF predictions.

        Args:
            num_boost_round: Forwarded to ``lgb.train``.
            early_stopping_rounds: Forwarded to ``lgb.train``.
        """
        for fold in range(self.n_folds):
            train_df, val_df = self._fold_frames(fold)
            train_ds, val_ds = self._fold_datasets(train_df, val_df)

            params = {
                'objective': 'regression',
                'metric': 'rmse',
                'verbosity': -1,
                'boosting_type': 'gbdt',
                'seed': SEED,
            }

            # `lgb` is optuna.integration.lightgbm, not plain LightGBM.
            opt = lgb.train(params,
                            train_ds,
                            valid_sets=val_ds,
                            num_boost_round=num_boost_round,
                            verbose_eval=False,
                            early_stopping_rounds=early_stopping_rounds,
                            show_progress_bar=False)

            fold_rmse = opt.best_score['valid_0']['rmse']
            print(f'fold {fold}: ', fold_rmse)
            self.best_scores.append(fold_rmse)
            self.best_params.append(opt.params)
            self.best_models.append(opt.best_iteration)

            preds = opt.predict(val_df.loc[:, self.fe], num_iteration=opt.best_iteration)
            self.df.loc[self.df.fold == fold, 'opt_fe_oof'] = preds

            self._show_importance(opt)

        print('CV Score: ', RMSE_(self.df[self.opt_fe], self.df['opt_fe_oof']))

    def _show_importance(self, opt):
        """Plot the feature importance of one fold's booster."""
        show_importance(fe=self.fe, opt=opt)

    def retrain(self, col='se'):
        """Refit every fold with the params stored by :meth:`optimize` and save the models.

        Writes the out-of-fold predictions to ``self.df[col]`` and one model
        file ``lgb_{col}_fold{fold}.pkl`` per fold into the working directory.

        Args:
            col: Name of the prediction column; also used in the model file names.

        Raises:
            IndexError: If :meth:`optimize` has not stored params for a fold yet.
        """
        # Refitting with fixed params must go through plain LightGBM, not the
        # optuna wrapper bound to `lgb`. Imported here because the notebook's
        # import cell never defines `lgb_original`.
        import lightgbm as lgb_original

        self.df[col] = np.nan

        for fold in range(self.n_folds):
            train_df, val_df = self._fold_frames(fold)
            train_ds, val_ds = self._fold_datasets(train_df, val_df)

            params = self.best_params[fold]

            model = lgb_original.train(params,
                                       train_ds,
                                       valid_sets=val_ds,
                                       verbose_eval=False)

            # NOTE(review): Booster.save_model does not pickle; the '.pkl'
            # suffix is kept only so existing file names do not change.
            model.save_model(f'lgb_{col}_fold{fold}.pkl')
            self.df.loc[self.df.fold == fold, col] = model.predict(val_df.loc[:, self.fe])

        print('CV Score (retrained): ', RMSE_(self.df[self.opt_fe], self.df[col]))

## Without standard_error ......

In [None]:
# OOF columns of every stacked model: the only inputs of this run.
fe_oofs = ['oof_{}'.format(i) for i in range(len(CV_PATHS))]

# LGBM stacker that predicts the target from the OOF predictions alone.
opt = OptLGBM(
    df=df,
    fe=fe_oofs,
    opt_fe='target',
)
opt.optimize(
    num_boost_round=100,
    early_stopping_rounds=50,
)

## With standard_error ......

In [None]:
# Same OOF inputs, plus the standard_error column as an extra feature.
fe = [*fe_oofs, 'standard_error']

In [None]:
# LGBM stacker on the OOF predictions plus standard_error, same training budget as the run without it.
opt = OptLGBM(
    df=df,
    fe=fe,
    opt_fe='target',
)
opt.optimize(
    num_boost_round=100,
    early_stopping_rounds=50,
)

# Results

From these experiments, we get the result table below.  
As you can see, the **standard_error plays a very important role** in predicting the target.  
Feature importance for standard_error is comparable to target predictions from BERT variants (see above).

However, since standard_error is not given, we must predict it in addition to the target if we want to take it into account.  
That is another difficult task.

| Stacking Layer                | CV Score | Boost   | 
| ----------------------------- | -------- | ------- | 
| Linear (baseline)             | 0.4567   | -       | 
| LGBM (without standard_error) | 0.4596   | -0.0029 | 
| LGBM (with standard_error)    | 0.3959   | 0.0608  | 