Yes, linear stacking of GBDT and NN! A promising method :)

# Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
from sklearn import linear_model
import xgboost as xgb
import operator
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
# Newer matplotlib releases renamed the bundled seaborn styles to "seaborn-v0_8-*",
# so use whichever colorblind style name this installation actually provides.
_colorblind_style = 'seaborn-colorblind'
if _colorblind_style not in style.available:
    _colorblind_style = 'seaborn-v0_8-colorblind'
style.use(_colorblind_style)

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Config

In [None]:
# Shared random seed: used by the KFold splitters, the model parameter dicts and seed_everything().
SEED = 42
# Number of cross-validation folds (KFold n_splits) used by the fit_* helpers.
NFOLD = 10

# Load data

In [None]:
# Names of the model inputs (cont1 ... cont14) and of the regression target.
target_col = 'target'
features = [f'cont{i}' for i in range(1, 15)]

# Raw competition tables.
train = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')

# Separate the target from the inputs; the `id` column is dropped from both frames.
y_train = train[target_col]
X_train = train.drop(columns=['id', target_col])
X_test = test.drop(columns='id')

In [None]:
# Sanity check: dimensions and first rows of the training features.
print(X_train.shape)
X_train.head()

In [None]:
# Sanity check: dimensions and first rows of the test features.
print(X_test.shape)
X_test.head()

# Target
Is the target roughly normally distributed?

In [None]:
# Histogram of the training target values.
y_train.hist()

# GBDT

In [None]:
# XGBoost settings; unpacked into xgb.XGBRegressor(**params) by fit_gbdt.
xgb_params = {
    'colsample_bytree': 0.4,                 
    'learning_rate': 0.01,
    'max_depth': 7,
    'subsample': 1,
    'min_child_weight': 4,
    'gamma': 0.24,
    'alpha': 1,
    'lambda': 1,
    'seed': SEED,
    'n_estimators': 800,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

# LightGBM settings; unpacked into lgb.LGBMRegressor(**params) by fit_gbdt.
# NOTE(review): no n_estimators / num_iterations is set here, so the library default
# applies -- confirm that is enough boosting rounds for a 0.01 learning rate.
lgb_params = {
    'num_leaves': 512,
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'max_depth': 12,
    'learning_rate': 0.01,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.6,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'seed': SEED,
    'early_stopping_rounds': 80,
    'metric': 'rmse'
    
}

# CatBoost settings; unpacked into CatBoostRegressor(**params) by fit_gbdt.
catb_params = { 
    'task_type': "CPU",
    'learning_rate': 0.01, 
    'iterations': 1200,
    'colsample_bylevel': 0.5,
    'random_seed': SEED,
    'use_best_model': True,
    'early_stopping_rounds': 80,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE'
}
            
def fit_gbdt(params, X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED, modelname='xgb'):
    """
    Fit a GBDT regressor with shuffled K-fold cross-validation.

    :INPUT:

    :params: keyword arguments for the model constructor (dict)
    :X_train: training features (pd.DataFrame)
    :y_train: training target (pd.Series)
    :X_test: test features (pd.DataFrame)
    :features: names of the feature columns to use (list of str)
    :n_fold: number of CV folds (int)
    :seed: random state of the KFold shuffle (int)
    :modelname: 'xgb', 'lgb' or 'catb' (str)

    :OUTPUT:

    :oof_train: out-of-fold predictions for the rows of X_train (np.ndarray)
    :y_preds: test predictions averaged over the folds (np.ndarray)
    :models: the fitted model of each fold (list)
    :fi_df: 'features' column plus one `importance_cv{fold}` column per fold (pd.DataFrame)

    :RAISES:

    ValueError if modelname is not one of the supported names
    """
    if modelname not in ('xgb', 'lgb', 'catb'):
        raise ValueError(f"modelname must be 'xgb', 'lgb' or 'catb', got {modelname!r}")

    cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

    models = []
    oof_train = np.zeros((len(X_train),))
    y_preds = np.zeros((len(X_test),))

    # feature importance
    fi_df = pd.DataFrame()
    fi_df['features'] = features

    # the column subset is identical for every fold, so select it once
    X_all = X_train[features]
    X_test_sub = X_test[features]

    for fold_id, (train_index, valid_index) in tqdm(enumerate(cv.split(X_train, y_train)), total=n_fold):
        # split: KFold yields row positions, so index with .iloc rather than .loc labels
        X_tr = X_all.iloc[train_index]
        X_val = X_all.iloc[valid_index]
        y_tr = y_train.iloc[train_index].values
        y_val = y_train.iloc[valid_index].values

        # model
        # NOTE(review): the fit-time early_stopping_rounds / verbose arguments below follow
        # older XGBoost / LightGBM APIs -- confirm against the installed versions.
        if modelname == 'xgb':
            model = xgb.XGBRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                        early_stopping_rounds=40, verbose=100)

            # feature importance: left-merge on the feature name so that the row order
            # of fi_df is kept and features missing from the score dict become NaN
            importance = model.get_booster().get_score(importance_type='gain')
            importance = pd.DataFrame(list(importance.items()), columns=['features', f'importance_cv{fold_id}'])
            fi_df = fi_df.merge(importance, how='left', on='features')

        elif modelname == 'lgb':
            model = lgb.LGBMRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                verbose=-1, categorical_feature=[])
            fi_df[f'importance_cv{fold_id}'] = model.booster_.feature_importance(importance_type="gain")

        elif modelname == 'catb':
            model = CatBoostRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=(X_val, y_val),
                verbose=100, cat_features=[])
            fi_df[f'importance_cv{fold_id}'] = model.get_feature_importance()

        # predict: each validation row is predicted exactly once; the test set is
        # predicted by every fold model and averaged
        oof_train[valid_index] = model.predict(X_val)
        y_preds += model.predict(X_test_sub) / n_fold
        models.append(model)

    return oof_train, y_preds, models, fi_df

## XGB

In [None]:
# 10-fold XGBoost: out-of-fold predictions, averaged test predictions, fold models, importances.
oof_train_xgb, y_preds_xgb, xgb_models, fi_df = fit_gbdt(
    xgb_params,
    X_train,
    y_train,
    X_test,
    features=features,
    n_fold=NFOLD,
    seed=SEED,
    modelname='xgb',
)

In [None]:
# Average the per-fold importances selected by column name (not by position), so the
# result is numeric and does not change if this cell is run again after
# `importance_mean` has been added. skipna=False keeps a NaN fold as NaN, as before.
fi_df['importance_mean'] = fi_df.filter(regex=r'^importance_cv\d+$').mean(axis=1, skipna=False)
sns.barplot(x='importance_mean', y='features', data=fi_df.sort_values(by='importance_mean', ascending=False))

## LGB

In [None]:
# 10-fold LightGBM: out-of-fold predictions, averaged test predictions, fold models, importances.
oof_train_lgb, y_preds_lgb, lgb_models, fi_df = fit_gbdt(
    lgb_params,
    X_train,
    y_train,
    X_test,
    features=features,
    n_fold=NFOLD,
    seed=SEED,
    modelname='lgb',
)

In [None]:
# Average the per-fold importances selected by column name (not by position), so the
# result is numeric and does not change if this cell is run again after
# `importance_mean` has been added. skipna=False keeps a NaN fold as NaN, as before.
fi_df['importance_mean'] = fi_df.filter(regex=r'^importance_cv\d+$').mean(axis=1, skipna=False)
sns.barplot(x='importance_mean', y='features', data=fi_df.sort_values(by='importance_mean', ascending=False))

## CatB

In [None]:
# 10-fold CatBoost: out-of-fold predictions, averaged test predictions, fold models, importances.
oof_train_catb, y_preds_catb, catb_models, fi_df = fit_gbdt(
    catb_params,
    X_train,
    y_train,
    X_test,
    features=features,
    n_fold=NFOLD,
    seed=SEED,
    modelname='catb',
)

In [None]:
# Average the per-fold importances selected by column name (not by position), so the
# result is numeric and does not change if this cell is run again after
# `importance_mean` has been added. skipna=False keeps a NaN fold as NaN, as before.
fi_df['importance_mean'] = fi_df.filter(regex=r'^importance_cv\d+$').mean(axis=1, skipna=False)
sns.barplot(x='importance_mean', y='features', data=fi_df.sort_values(by='importance_mean', ascending=False))

# GBDT scores

In [None]:
# Out-of-fold RMSE per GBDT model, computed as sqrt(MSE): the `squared=False` shortcut
# of mean_squared_error is deprecated and then removed in newer scikit-learn releases.
print(f'CV (XGB): {np.sqrt(mean_squared_error(y_train, oof_train_xgb))}')
print(f'CV (LGB): {np.sqrt(mean_squared_error(y_train, oof_train_lgb))}')
print(f'CV (CATB): {np.sqrt(mean_squared_error(y_train, oof_train_catb))}')

# NN
We use a simple MLP!

## Scaling
Standardize the features so that they all share a similar range.

In [None]:
# Fit a single StandardScaler on the stacked train + test features, then write the
# standardized values back into both frames (X_train / X_test are modified in place).
# NOTE(review): the scaler statistics therefore include the test rows -- confirm this is intended.
prep = StandardScaler()
df = pd.concat([X_train[features], X_test[features]])
df[features] = prep.fit_transform(df[features].values)

# the first len(train) rows of the stacked frame are the training rows, the rest are test rows
n_train_rows = len(train)
X_train[features] = df[features].iloc[:n_train_rows]
X_test[features] = df[features].iloc[n_train_rows:]

In [None]:
# Training features after standardization: the shape is unchanged, the values are rescaled.
print(X_train.shape)
X_train.head()

In [None]:
# Test features after standardization: the shape is unchanged, the values are rescaled.
print(X_test.shape)
X_test.head()

## MLP

In [None]:
import math
import random
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict

# tf keras
import tensorflow as tf
import tensorflow_addons as tfa

def seed_everything(seed : int):
    """
    Seed the random number generators used in this notebook.

    :INPUT:

    :seed: value given to PYTHONHASHSEED, Python's `random`, NumPy and TensorFlow (int)
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

seed_everything(SEED)

# adapted from https://github.com/ghmagazine/kagglebook/blob/master/ch06/ch06-03-hopt_nn.py
# Hyperparameters shared by nn_model, cnn_model and fit_model.
params = {
    'input_dropout': 0.0,  # NOTE(review): not read by any code visible in this notebook
    'hidden_layers': 3,  # total number of Dense layers in the feed-forward part
    'hidden_units': 128,  # width of the first Dense layer; later layers are narrower
    'hidden_activation': 'relu',
    'dropout': 0.2,  # rate of the Dropout layer placed after every hidden Dense layer
    'lr': 1e-2,  # optimizer learning rate
    'batch_size': 128,
    'epochs': 196  # upper bound; fit_model also uses early stopping
}
    
def nn_model(params, L):
    """
    Build and compile the MLP regressor.

    :INPUT:

    :params: hyperparameters; reads 'hidden_units', 'hidden_layers',
             'hidden_activation', 'dropout' and 'lr' (dict)
    :L: the number of features (int)

    :OUTPUT:

    compiled tf.keras model with one linear output named 'out1',
    MSE loss and RMSE as the reported metric
    """

    # NN model architecture
    n_neuron = params['hidden_units']
    activation = params['hidden_activation']
    drop_rate = params['dropout']

    inputs = tf.keras.layers.Input(shape=(L, ))

    x = tf.keras.layers.Dense(n_neuron, activation=activation)(inputs)
    x = tf.keras.layers.Dropout(drop_rate)(x)

    # stack more layers: the k-th extra layer has n_neuron // (2 * k) units
    for k in range(1, int(params['hidden_layers'])):
        x = tf.keras.layers.Dense(n_neuron // (2 * k), activation=activation)(x)
        x = tf.keras.layers.Dropout(drop_rate)(x)

    # output
    out1 = tf.keras.layers.Dense(1, activation='linear', name = 'out1')(x)
    model = tf.keras.models.Model(inputs=inputs, outputs=out1)

    # compile (`learning_rate` is the current keyword; `lr` is a deprecated alias)
    loss = 'mse'
    opt = tfa.optimizers.RectifiedAdam(learning_rate=params['lr'])
    model.compile(loss=loss, optimizer=opt, metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

# build once just to inspect the architecture
model = nn_model(params, len(features))
model.summary()

## 1DCNN
Inspired by https://www.kaggle.com/sishihara/1dcnn-for-tabular-from-moa-2nd-place

Make sure you upvote the kernel.

In [None]:
def cnn_model(params, L):
    """
    Build and compile the 1D-CNN regressor.

    The input is expanded by a Dense layer to 4096 values, reshaped to
    (256, 16), passed through one Conv1D + MaxPooling1D stage, flattened
    and fed into the same feed-forward stack as nn_model.

    :INPUT:

    :params: hyperparameters; reads 'hidden_units', 'hidden_layers',
             'hidden_activation', 'dropout' and 'lr' (dict)
    :L: the number of features (int)

    :OUTPUT:

    compiled tf.keras model with one linear output named 'out1',
    MSE loss and RMSE as the reported metric
    """

    # NN model architecture
    n_neuron = params['hidden_units']
    activation = params['hidden_activation']
    drop_rate = params['dropout']

    inputs = tf.keras.layers.Input(shape=(L, ))

    # 1dcnn: 4096 = 256 * 16, so the Dense output can be viewed as a (256, 16) sequence
    x = tf.keras.layers.Dense(4096, activation=activation)(inputs)
    x = tf.keras.layers.Reshape((256, 16))(x)
    x = tf.keras.layers.Conv1D(filters=16,
                      kernel_size=5,
                      strides=1,
                      activation=activation)(x)
    x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
    x = tf.keras.layers.Flatten()(x)

    # ffn
    x = tf.keras.layers.Dense(n_neuron, activation=activation)(x)
    x = tf.keras.layers.Dropout(drop_rate)(x)

    # stack more layers: the k-th extra layer has n_neuron // (2 * k) units
    for k in range(1, int(params['hidden_layers'])):
        x = tf.keras.layers.Dense(n_neuron // (2 * k), activation=activation)(x)
        x = tf.keras.layers.Dropout(drop_rate)(x)

    # output
    out1 = tf.keras.layers.Dense(1, activation='linear', name = 'out1')(x)
    model = tf.keras.models.Model(inputs=inputs, outputs=out1)

    # compile (`learning_rate` is the current keyword; `lr` is a deprecated alias)
    loss = 'mse'
    opt = tfa.optimizers.RectifiedAdam(learning_rate=params['lr'])
    model.compile(loss=loss, optimizer=opt, metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

# build once just to inspect the architecture
model = cnn_model(params, len(features))
model.summary()

In [None]:
def fit_model(params, X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED, modelname='mlp'):
    """
    Fit a neural network regressor with shuffled K-fold cross-validation.

    :INPUT:

    :params: hyperparameters for the model builder; 'epochs' and 'batch_size' are read here (dict)
    :X_train: training features (pd.DataFrame)
    :y_train: training target (pd.Series)
    :X_test: test features (pd.DataFrame)
    :features: names of the feature columns to use (list of str)
    :n_fold: number of CV folds (int)
    :seed: random state of the KFold shuffle (int)
    :modelname: 'mlp' (nn_model) or 'cnn' (cnn_model) (str)

    :OUTPUT:

    :oof_train: out-of-fold predictions for the rows of X_train (np.ndarray)
    :y_preds: test predictions averaged over the folds (np.ndarray)
    :models: the fitted model of each fold (list)

    :RAISES:

    ValueError if modelname is not one of the supported names

    Side effect: writes `mybestweight{fold}_{modelname}.hdf5` to the working directory.
    """
    builders = {'mlp': nn_model, 'cnn': cnn_model}
    if modelname not in builders:
        raise ValueError(f"modelname must be 'mlp' or 'cnn', got {modelname!r}")
    build_model = builders[modelname]

    cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

    models = []
    oof_train = np.zeros((len(X_train),))
    y_preds = np.zeros((len(X_test),))

    # the feature matrices are the same for every fold, so extract them once
    X_all = X_train[features].values
    X_test_values = X_test[features].values

    for fold_id, (train_index, valid_index) in tqdm(enumerate(cv.split(X_train, y_train)), total=n_fold):
        # split: KFold yields row positions, so index positionally (not by .loc labels)
        X_tr = X_all[train_index]
        X_val = X_all[valid_index]
        y_tr = y_train.iloc[train_index].values
        y_val = y_train.iloc[valid_index].values

        # model (clear the previous fold's graph first)
        tf.keras.backend.clear_session()
        model = build_model(params, len(features))

        # callbacks
        # NOTE(review): ReduceLROnPlateau uses the same patience (8) as EarlyStopping, so the
        # learning-rate drop may never take effect before training stops -- confirm intended.
        weight_path = f'mybestweight{fold_id}_{modelname}.hdf5'
        er = tf.keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True, monitor='val_loss')
        ReduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=1, mode='min')
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=weight_path,
                                                              save_weights_only=True, verbose=0, monitor='val_loss', save_best_only=True)

        # fit
        model.fit(X_tr, y_tr, callbacks=[er, ReduceLR, model_checkpoint_callback],
                  verbose=2, epochs=params['epochs'], batch_size=params['batch_size'],
                  validation_data=(X_val, y_val))

        # EarlyStopping restores the best weights when it actually stops training; if all
        # epochs run, the final-epoch weights may be kept instead. Loading the checkpoint
        # (lowest val_loss) makes the prediction use the best epoch in both cases.
        if os.path.exists(weight_path):
            model.load_weights(weight_path)

        # predict: each validation row is predicted exactly once; the test set is
        # predicted by every fold model and averaged
        oof_train[valid_index] = model.predict(X_val).ravel()
        y_preds += model.predict(X_test_values).ravel() / n_fold
        models.append(model)

    return oof_train, y_preds, models

In [None]:
# 10-fold MLP: out-of-fold predictions, averaged test predictions and the fold models.
oof_train_mlp, y_preds_mlp, mlp_models = fit_model(
    params,
    X_train,
    y_train,
    X_test,
    features=features,
    n_fold=NFOLD,
    seed=SEED,
    modelname='mlp',
)

In [None]:
# 10-fold 1D-CNN: out-of-fold predictions, averaged test predictions and the fold models.
oof_train_cnn, y_preds_cnn, cnn_models = fit_model(
    params,
    X_train,
    y_train,
    X_test,
    features=features,
    n_fold=NFOLD,
    seed=SEED,
    modelname='cnn',
)

# NN scores (MLP and 1DCNN)

In [None]:
# Out-of-fold RMSE per neural network, computed as sqrt(MSE): the `squared=False` shortcut
# of mean_squared_error is deprecated and then removed in newer scikit-learn releases.
print(f'CV (MLP): {np.sqrt(mean_squared_error(y_train, oof_train_mlp))}')
print(f'CV (1DCNN): {np.sqrt(mean_squared_error(y_train, oof_train_cnn))}')

# Linear model

In [None]:
# Ridge settings; unpacked into linear_model.Ridge(**params) by fit_linear.
# Used both for the first-level linear model and for the stacking model.
lin_params = {
    'alpha': 80, 
    'fit_intercept': True,
    'max_iter': 8000, 
    'tol': 1e-04,
    'random_state': SEED,
}
def fit_linear(params, X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED):
    """
    Fit a Ridge regressor with shuffled K-fold cross-validation.

    :INPUT:

    :params: keyword arguments for linear_model.Ridge (dict)
    :X_train: training features (pd.DataFrame)
    :y_train: training target (pd.Series)
    :X_test: test features (pd.DataFrame)
    :features: names of the feature columns to use (list of str)
    :n_fold: number of CV folds (int)
    :seed: random state of the KFold shuffle (int)

    :OUTPUT:

    :oof_train: out-of-fold predictions for the rows of X_train (np.ndarray)
    :y_preds: test predictions averaged over the folds (np.ndarray)
    :models: the fitted model of each fold (list)
    :fi_df: 'features' column plus the Ridge coefficients of each fold
            as `importance_cv{fold}` columns (pd.DataFrame)
    """
    cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

    models = []
    oof_train = np.zeros((len(X_train),))
    y_preds = np.zeros((len(X_test),))

    # feature importance
    fi_df = pd.DataFrame()
    fi_df['features'] = features

    # the column subset is identical for every fold, so select it once
    X_all = X_train[features]
    X_test_sub = X_test[features]

    for fold_id, (train_index, valid_index) in tqdm(enumerate(cv.split(X_train, y_train)), total=n_fold):
        # split: KFold yields row positions, so index with .iloc rather than .loc labels
        X_tr = X_all.iloc[train_index]
        X_val = X_all.iloc[valid_index]
        y_tr = y_train.iloc[train_index].values

        # model
        model = linear_model.Ridge(**params)
        model.fit(X_tr, y_tr)

        # feature importance: the signed coefficient of each feature
        fi_df[f'importance_cv{fold_id}'] = model.coef_.ravel()

        # predict: each validation row is predicted exactly once; the test set is
        # predicted by every fold model and averaged
        oof_train[valid_index] = model.predict(X_val)
        y_preds += model.predict(X_test_sub) / n_fold
        models.append(model)

    return oof_train, y_preds, models, fi_df

# first-level linear model on the (standardized) raw features
oof_train_lin, y_preds_lin, lin_models, fi_df = fit_linear(lin_params, 
    X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED)

In [None]:
# Out-of-fold RMSE of the linear model, computed as sqrt(MSE): the `squared=False` shortcut
# of mean_squared_error is deprecated and then removed in newer scikit-learn releases.
print(f'CV (Linear): {np.sqrt(mean_squared_error(y_train, oof_train_lin))}')

# Stacking

In [None]:
# train
stack_train_df = pd.DataFrame()
stack_train_df['mlp'] = oof_train_mlp
stack_train_df['1dcnn'] = oof_train_cnn
stack_train_df['xgb'] = oof_train_xgb
stack_train_df['lgb'] = oof_train_lgb
stack_train_df['catb'] = oof_train_catb
stack_train_df['lin'] = oof_train_lin

# test
stack_test_df = pd.DataFrame()
stack_test_df['mlp'] = y_preds_mlp
stack_test_df['1dcnn'] = y_preds_cnn
stack_test_df['xgb'] = y_preds_xgb
stack_test_df['lgb'] = y_preds_lgb
stack_test_df['catb'] = y_preds_catb
stack_test_df['lin'] = y_preds_lin

In [None]:
# Second level: the same Ridge CV routine, now fed with the base-model predictions.
# NOTE(review): this rebinds oof_train_lin / y_preds_lin / lin_models / fi_df, which held the
# first-level linear model until now; re-running the cell that builds stack_train_df after
# this one would feed the stacker's own predictions back in as the 'lin' column.
stack_features = list(stack_test_df.columns)
oof_train_lin, y_preds_lin, lin_models, fi_df = fit_linear(
    lin_params,
    stack_train_df,
    y_train,
    stack_test_df,
    features=stack_features,
    n_fold=NFOLD,
    seed=SEED,
)

In [None]:
# Average the per-fold Ridge coefficients selected by column name (not by position), so the
# result is numeric and does not change if this cell is run again after
# `importance_mean` has been added. skipna=False keeps a NaN fold as NaN, as before.
fi_df['importance_mean'] = fi_df.filter(regex=r'^importance_cv\d+$').mean(axis=1, skipna=False)
sns.barplot(x='importance_mean', y='features', data=fi_df.sort_values(by='importance_mean'))

# Stacking score

In [None]:
# Out-of-fold RMSE of the stacked model, computed as sqrt(MSE): the `squared=False` shortcut
# of mean_squared_error is deprecated and then removed in newer scikit-learn releases.
print(f'CV (stacking): {np.sqrt(mean_squared_error(y_train, oof_train_lin))}')

In [None]:
# Average only the per-fold coefficient columns, selected by name: an `importance_mean`
# column may already exist in fi_df and must not be folded into the new mean.
# skipna=False keeps a NaN fold as NaN, as before.
fi_df['importance_mean'] = fi_df.filter(regex=r'^importance_cv\d+$').mean(axis=1, skipna=False)
sns.barplot(x='importance_mean', y='features', data=fi_df.sort_values(by='importance_mean'))

# Submit

In [None]:
# Fill the sample submission with the stacked test predictions and save it.
sub = pd.read_csv('../input/tabular-playground-series-jan-2021/sample_submission.csv').assign(
    target=y_preds_lin,
)
sub.to_csv('submission.csv', index=False)
sub.head()