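Seed averaging as an ensemble method: the same model is trained several times with different random seeds and the predictions are averaged.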

# Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
from sklearn import linear_model
import xgboost as xgb
import operator
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
# Global plotting setup applied to every figure drawn later in the notebook:
# seaborn "talk" context plus a matplotlib style sheet.
# NOTE(review): recent matplotlib releases renamed the bundled seaborn style
# sheets (e.g. 'seaborn-v0_8-colorblind'); confirm that 'seaborn-colorblind'
# still resolves in the target environment.
sns.set_context("talk")
style.use('seaborn-colorblind')

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Config

In [None]:
# Base random seed: stored in the model parameter dicts and used as the
# starting point for the per-round CV seeds.
SEED: int = 42
# Number of cross-validation folds.
NFOLD: int = 10
# Number of seed-averaging rounds (independent CV runs that get averaged).
NSA: int = 5

# Load data

In [None]:
# Read the competition tables from the read-only Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')

# Column names: the regression target and the feature columns cont1..cont14.
target_col = 'target'
features = [f'cont{i}' for i in range(1, 15)]

# Modelling inputs: the target vector, and both tables without the row id
# (the training table additionally loses its target column).
y_train = train['target']
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns='id')

In [None]:
# Print the (rows, columns) of the training features and preview the first rows.
print(X_train.shape)
X_train.head()

In [None]:
# Print the (rows, columns) of the test features and preview the first rows.
print(X_test.shape)
X_test.head()

# Target
Is the target normally distributed?

In [None]:
# Histogram of the training target, to inspect its distribution visually.
y_train.hist()

# GBDT

In [None]:
# Settings for the XGBoost model; fit_gbdt unpacks this dict into
# xgb.XGBRegressor(**params) when modelname == 'xgb'.
xgb_params = {
    # tree shape / sampling
    "colsample_bytree": 0.4,
    "learning_rate": 0.01,
    "max_depth": 7,
    "subsample": 1,
    "min_child_weight": 4,
    # regularisation
    "gamma": 0.24,
    "alpha": 1,
    "lambda": 1,
    # reproducibility and training length
    "seed": SEED,
    "n_estimators": 800,
    # loss and validation metric
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}

# Settings for the LightGBM model; fit_gbdt unpacks this dict into
# lgb.LGBMRegressor(**params) when modelname == 'lgb'.
lgb_params = {
    # tree shape and task
    "num_leaves": 1024,
    "objective": "regression",
    "boosting_type": "gbdt",
    "max_depth": 11,
    "learning_rate": 0.01,
    # row / column sampling
    "subsample": 0.72,
    "subsample_freq": 4,
    "feature_fraction": 0.6,
    # regularisation
    "lambda_l1": 1,
    "lambda_l2": 1,
    # reproducibility, stopping rule and validation metric
    "seed": SEED,
    "early_stopping_rounds": 80,
    "metric": "rmse",
}

# Settings for the CatBoost model; fit_gbdt unpacks this dict into
# CatBoostRegressor(**params) when modelname == 'catb'.
catb_params = {
    "task_type": "CPU",
    "learning_rate": 0.01,
    "iterations": 1200,
    "colsample_bylevel": 0.5,
    "random_seed": SEED,
    # stopping rule / model selection on the validation fold
    "use_best_model": True,
    "early_stopping_rounds": 80,
    # loss and validation metric
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
}
            
def fit_gbdt(params, X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED, modelname='xgb'):
    """Fit one GBDT model per CV fold and average their test predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments unpacked into the model constructor.
    X_train : pandas.DataFrame
        Training table; only the ``features`` columns are used.
    y_train : pandas.Series
        Training target, aligned row-by-row with ``X_train``.
    X_test : pandas.DataFrame
        Test table; only the ``features`` columns are used.
    features : list of str
        Columns fed to the model.
    n_fold : int
        Number of KFold splits.
    seed : int
        Random state of the shuffled KFold splitter.  It only changes how
        rows are assigned to folds; the model's own random seed is whatever
        ``params`` contains.
    modelname : {'xgb', 'lgb', 'catb'}
        Which library to train with.

    Returns
    -------
    oof_train : numpy.ndarray
        Out-of-fold prediction for every training row.
    y_preds : numpy.ndarray
        Test predictions averaged over the ``n_fold`` models.
    models : list
        The fitted model of each fold, in fold order.
    fi_df : pandas.DataFrame
        A ``features`` column plus one ``importance_cv{fold}`` column per fold.

    Raises
    ------
    ValueError
        If ``modelname`` is not one of the supported names.
    """
    # Fail before any training instead of hitting a NameError on `model` later.
    if modelname not in ('xgb', 'lgb', 'catb'):
        raise ValueError(
            f"unknown modelname {modelname!r}; expected 'xgb', 'lgb' or 'catb'")

    cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

    models = []
    oof_train = np.zeros((len(X_train),))
    y_preds = np.zeros((len(X_test),))

    # feature importance
    fi_df = pd.DataFrame()
    fi_df['features'] = features

    # Column selection does not depend on the fold, so do it once.
    X_all = X_train[features]
    X_te = X_test[features]

    for fold_id, (train_index, valid_index) in tqdm(enumerate(cv.split(X_train, y_train)), total=n_fold):
        # split -- KFold yields *positional* indices, so use .iloc; .loc would
        # only be correct when the frame carries a default 0..n-1 index.
        X_tr = X_all.iloc[train_index]
        X_val = X_all.iloc[valid_index]
        y_tr = y_train.iloc[train_index].values
        y_val = y_train.iloc[valid_index].values

        # model
        if modelname == 'xgb':
            model = xgb.XGBRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                        early_stopping_rounds=40, verbose=100)

            # feature importance: get_score only lists features that were
            # used, so features missing from the dict become NaN here.
            importance = model.get_booster().get_score(importance_type='gain')
            fi_df[f'importance_cv{fold_id}'] = fi_df['features'].map(importance)

        elif modelname == 'lgb':
            model = lgb.LGBMRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                verbose=-1, categorical_feature=[])
            fi_df[f'importance_cv{fold_id}'] = model.booster_.feature_importance(importance_type="gain")

        else:  # 'catb'
            model = CatBoostRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=(X_val, y_val),
                verbose=100, cat_features=[])
            fi_df[f'importance_cv{fold_id}'] = model.get_feature_importance()

        # predict: each validation row is written exactly once; the test
        # prediction is the plain mean over the fold models.
        oof_train[valid_index] = model.predict(X_val)
        y_preds += model.predict(X_te) / n_fold
        models.append(model)

    return oof_train, y_preds, models, fi_df

## XGB

In [None]:
%%time

# Accumulators for the seed-averaged out-of-fold and test predictions.
oof = np.zeros(len(train))
y_pred = np.zeros(len(test))

for n in range(NSA):
    # One full cross-validated XGBoost run; the seed handed to fit_gbdt is
    # SEED + n**2, i.e. SEED, SEED+1, SEED+4, ...
    oof_train, y_preds, models, fi_df = fit_gbdt(
        xgb_params,
        X_train,
        y_train,
        X_test,
        features=features,
        n_fold=NFOLD,
        seed=SEED + n**2,
        modelname='xgb',
    )

    # Each run contributes 1/NSA of the final average.
    oof += oof_train / NSA
    y_pred += y_preds / NSA

In [None]:
# Average the per-fold importances and plot them, largest first.
# Only the 'importance_cv*' columns are averaged, so re-running this cell does
# not fold a previously computed 'importance_mean' back into the mean, and the
# computation stays numeric instead of going through an object array.
importance_cols = [col for col in fi_df.columns if col.startswith('importance_cv')]
# A missing value means the feature was absent from that fold's importance
# scores (xgboost leaves out features it never split on); count it as 0 gain.
fi_df['importance_mean'] = fi_df[importance_cols].fillna(0).mean(axis=1)
sns.barplot(x='importance_mean', y='features', data=fi_df.sort_values(by='importance_mean', ascending=False))

# Score

In [None]:
# RMSE of the seed-averaged out-of-fold predictions.  Taking the square root
# of the MSE ourselves avoids the `squared=` keyword, which newer scikit-learn
# releases deprecate and then remove.
print(f'CV score: {np.sqrt(mean_squared_error(y_train, oof))}')

# Submit

In [None]:
# Fill the sample submission with the averaged test predictions, write it to
# the working directory and preview the first rows.
sample_path = '../input/tabular-playground-series-jan-2021/sample_submission.csv'
sub = pd.read_csv(sample_path)
sub = sub.assign(target=y_pred)
sub.to_csv('submission.csv', index=False)
sub.head()