In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import re

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
import category_encoders as ce
%matplotlib inline

#from xfeat import (SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, 
#                   ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer)

from catboost import CatBoost
from catboost import CatBoostClassifier
from catboost import Pool
from catboost import cv
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm

import os
from glob import glob

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import shap

from optuna.integration import _lightgbm_tuner as lgb_tuner
import optuna
from collections import Counter
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition train/test tables and the sample submission template.
train_df = pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv")
test_df = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv")
submission = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# Category frequency bar charts for cat0..cat9 in the training set (2 x 5 grid).
fig = plt.figure(figsize = (16,8))
for i in range(0,10):
    ax = fig.add_subplot(2,5,i+1)
    # Pass the column by keyword: newer seaborn releases bind the first positional
    # argument to `data`, not `x`, so a positional Series is no longer counted per category.
    sns.countplot(x=train_df["cat"+str(i)], ax=ax)
    ax.set_title("train")
# The layout only needs to be computed once, after every panel exists.
fig.tight_layout()

In [None]:
# Category frequency bar charts for cat0..cat9 in the test set (2 x 5 grid).
fig = plt.figure(figsize = (16,8))
for i in range(0,10):
    ax = fig.add_subplot(2,5,i+1)
    # Pass the column by keyword: newer seaborn releases bind the first positional
    # argument to `data`, not `x`, so a positional Series is no longer counted per category.
    sns.countplot(x=test_df["cat"+str(i)], ax=ax)
    ax.set_title("test")
# The layout only needs to be computed once, after every panel exists.
fig.tight_layout()

In [None]:
# Overlay the train and test distributions of each continuous feature cont0..cont13.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot
# are the documented replacements); it is kept so the figure looks the same as before.
fig = plt.figure(figsize = (16,16))
for i in range(0,14):
    ax = fig.add_subplot(4,4,i+1)
    # Draw on the explicit axes instead of relying on pyplot's "current axes" state.
    sns.distplot(train_df["cont"+str(i)], label='train', ax=ax)
    sns.distplot(test_df["cont"+str(i)], label='test', ax=ax)
    ax.legend()
    ax.set_title("cont"+str(i))
# tight_layout is loop-invariant: compute it once after all panels are drawn.
fig.tight_layout()

# Preprocessing

In [None]:
# https://www.guruguru.science/competitions/13/discussions/41b4ac2d-690b-4ba5-8ff7-be3639578bc1/

# BaseBlock 
class BaseBlock:
    """Minimal feature-block interface; subclasses are expected to override ``transform``."""

    def fit(self, input_df, y=None):
        """Default fit: nothing is learned, the input is simply transformed and returned."""
        result = self.transform(input_df)
        return result

    def transform(self, input_df):
        """Turn ``input_df`` into features. Not implemented in the base class."""
        raise NotImplementedError()

# OneHotEncoding
class OneHotEncodingBlock(BaseBlock):
    """One-hot encodes ``cols`` with ``ce.OneHotEncoder``; output columns get an ``OHE_`` prefix."""

    def __init__(self, cols):
        self.encoder = None  # created and fitted in fit()
        self.cols = cols

    def fit(self, input_df, y=None):
        """Fit a fresh encoder on ``input_df[self.cols]`` and return the encoded frame."""
        self.encoder = ce.OneHotEncoder(use_cat_names=True)
        selected = input_df[self.cols]
        self.encoder.fit(selected)
        return self.transform(selected)

    def transform(self, input_df):
        """Encode ``input_df[self.cols]`` with the encoder fitted in ``fit``."""
        encoded = self.encoder.transform(input_df[self.cols])
        return encoded.add_prefix("OHE_")
    
# CountEncoding
class CountEncodingBlock(BaseBlock):
    """Count-encodes ``cols`` with ``ce.CountEncoder``; output columns get a ``CE_`` prefix.

    The encoder is learned in ``fit`` and reused by ``transform``, matching the
    other encoding blocks. Previously ``transform`` re-fitted the encoder on
    every call, so any frame passed to it was encoded with its own counts.
    """

    def __init__(self, cols):
        self.cols = cols
        self.encoder = None  # fitted ce.CountEncoder, set by fit()

    def fit(self, input_df, y=None):
        """Fit the count encoder on ``input_df[self.cols]`` and return the encoded frame."""
        self.encoder = ce.CountEncoder()
        self.encoder.fit(input_df[self.cols])
        return self.transform(input_df)

    def transform(self, input_df):
        """Encode ``input_df[self.cols]`` using the counts learned in ``fit``.

        For backward compatibility, calling ``transform`` before ``fit`` fits the
        encoder on ``input_df`` first instead of failing.
        """
        if self.encoder is None:
            # fit() sets self.encoder and then calls transform() again, so this cannot recurse forever.
            return self.fit(input_df)
        return self.encoder.transform(input_df[self.cols]).add_prefix("CE_")
    
# OrdinalEncoding
class OrdinalEncodingBlock(BaseBlock):
    """Ordinal-encodes ``cols`` with ``ce.OrdinalEncoder``; output columns get an ``OE_`` prefix."""

    def __init__(self, cols):
        self.encoder = None  # created and fitted in fit()
        self.cols = cols

    def fit(self, input_df, y=None):
        """Fit a fresh encoder on ``input_df[self.cols]`` and return the encoded frame."""
        self.encoder = ce.OrdinalEncoder()
        selected = input_df[self.cols]
        self.encoder.fit(selected)
        return self.transform(selected)

    def transform(self, input_df):
        """Encode ``input_df[self.cols]`` with the encoder fitted in ``fit``."""
        encoded = self.encoder.transform(input_df[self.cols])
        return encoded.add_prefix("OE_")

In [None]:
def get_ce_features(input_df):
    """Return count-encoded features for the categorical columns ``cat0`` .. ``cat9``.

    Args:
        input_df: Frame containing the ten ``cat*`` columns.

    Returns:
        The frame produced by ``CountEncodingBlock.fit`` (columns prefixed ``CE_``).
    """
    cols = [f"cat{i}" for i in range(10)]
    encoder = CountEncodingBlock(cols=cols)
    # Only the categorical columns reach the encoder, so cast just those to str
    # instead of copying the whole frame and stringifying every column.
    output_df = encoder.fit(input_df[cols].astype(str))
    return output_df

def get_oe_features(input_df):
    """Return ordinal-encoded features for the categorical columns ``cat0`` .. ``cat9``.

    Args:
        input_df: Frame containing the ten ``cat*`` columns.

    Returns:
        The frame produced by ``OrdinalEncodingBlock.fit`` (columns prefixed ``OE_``).
    """
    # The former `_input_df = pd.concat([input_df])` copy was never used and has been dropped.
    cols = [f"cat{i}" for i in range(10)]
    encoder = OrdinalEncodingBlock(cols=cols)
    output_df = encoder.fit(input_df)
    return output_df


def get_ohe_features(input_df):
    """Return one-hot-encoded features for the categorical columns ``cat0`` .. ``cat9``."""
    categorical_cols = [f"cat{idx}" for idx in range(10)]
    block = OneHotEncodingBlock(cols=categorical_cols)
    return block.fit(input_df)

In [None]:
def create_continuous_features(input_df):
    """Select the continuous columns ``cont0`` .. ``cont13`` of ``input_df``, in that order."""
    continuous_cols = [f"cont{idx}" for idx in range(14)]
    return input_df[continuous_cols]

In [None]:
# preprocess
def to_features(train, test):
    """Build aligned feature matrices for the train and test frames.

    Train and test are stacked so every encoder sees the same categories, each
    feature function is applied to the stacked frame, and the result is split
    back by row position.

    Args:
        train: Training frame (rows come first in the stacked frame).
        test: Test frame.

    Returns:
        ``(train_x, test_x)``: the first ``len(train)`` feature rows, and the
        remaining rows with the index reset to start at 0.
    """
    input_df = pd.concat([train, test]).reset_index(drop=True)

    processes = [
        get_oe_features,
        get_ce_features,
        get_ohe_features,
        create_continuous_features
    ]

    # Collect the per-function frames and concatenate once, rather than
    # re-copying a growing frame on every iteration.
    feature_frames = []
    for func in tqdm(processes):
        _df = func(input_df)
        # Every feature function must return exactly one row per input row.
        assert len(_df) == len(input_df), func.__name__
        feature_frames.append(_df)
    output_df = pd.concat(feature_frames, axis=1)

    n_train = len(train)
    train_x = output_df.iloc[:n_train]
    test_x = output_df.iloc[n_train:].reset_index(drop=True)
    return train_x, test_x

In [None]:
# Name of the label column in train_df.
target_data = "target" 

# Build the train/test feature matrices and pull out the training labels.
train_x, test_x = to_features(train_df, test_df)
train_ys = train_df[target_data]

In [None]:
# Summarise the generated feature columns (dtypes, non-null counts, memory usage).
train_x.info()

In [None]:
# Display the training target series.
train_ys

In [None]:
from contextlib import contextmanager
from time import time

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock time spent inside the ``with`` body.

    Args:
        logger: Object with an ``info`` method; when falsy the message is printed instead.
        format_str: ``str.format`` template that receives the elapsed seconds.
        prefix: Optional text placed before the template (ignored when falsy).
        suffix: Optional text placed after the template (ignored when falsy).
    """
    template = format_str
    if prefix:
        template = str(prefix) + template
    if suffix:
        template = template + str(suffix)

    began = time()
    yield
    elapsed = time() - began

    message = template.format(elapsed)
    report = logger.info if logger else print
    report(message)

In [None]:
def fit_lgbm(X, y, cv, params: dict=None, verbose: int=50):
    """Train one LightGBM regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix indexable by the fold index arrays (the notebook passes ``train_x.values``).
        y: Target values indexable by the same fold indices.
        cv: Iterable of ``(idx_train, idx_valid)`` pairs.
        params: Keyword arguments for ``lgb.LGBMRegressor``; ``None`` means no overrides.
        verbose: Passed to ``fit`` as both ``verbose`` and ``early_stopping_rounds``.

    Returns:
        ``(oof_pred, models)``: the out-of-fold predictions (one per element of
        ``y``) and the list of fitted models, one per fold.
    """
    metric_func = mean_squared_error

    if params is None:
        params = {}

    models = []

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv): 

        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgb.LGBMRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            # NOTE(review): `verbose` doubles as the early-stopping patience here — confirm this is intended.
            clf.fit(x_train, y_train, 
                    eval_set=[(x_valid, y_valid)],  
                    early_stopping_rounds=verbose,
                    verbose=verbose)

        pred_i = clf.predict(x_valid)

        # Each validation row receives its prediction from the model that did not train on it.
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} RMSE: {metric_func(y_valid, pred_i) ** .5:.4f}')
        
    score = metric_func(y, oof_pred) ** .5
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
def fit_xgb(X, y, cv, params: dict=None, verbose: int=50):
    """Train one XGBoost regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix indexable by the fold index arrays.
        y: Target values indexable by the same fold indices.
        cv: Iterable of ``(idx_train, idx_valid)`` pairs.
        params: Keyword arguments for ``xgb.XGBRegressor``; ``None`` means no overrides.
        verbose: Accepted for signature parity with the other fit helpers; not used here.

    Returns:
        ``(oof_pred, models)``: the out-of-fold predictions and the list of
        fitted models, one per fold.
    """
    metric_func = mean_squared_error
    if params is None:
        params = {}

    models = []
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        
        model_xgb = xgb.XGBRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
            
        pred_i = model_xgb.predict(x_valid)

        # Each validation row receives its prediction from the model that did not train on it.
        oof_pred[idx_valid] = pred_i
        models.append(model_xgb)

        print(f'Fold {i} RMSE: {metric_func(y_valid, pred_i) ** .5:.4f}')

    score = metric_func(y, oof_pred) ** .5
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
def fit_cb(X, y, cv, params: dict=None, verbose: int=50):
    """Train one CatBoost model per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix indexable by the fold index arrays.
        y: Target values indexable by the same fold indices.
        cv: Iterable of ``(idx_train, idx_valid)`` pairs.
        params: Parameter dict handed to ``CatBoost``; ``None`` means an empty dict.
        verbose: Accepted for signature parity with the other fit helpers; not used
            here (training is run with ``silent=True``).

    Returns:
        ``(oof_pred, models)``: the out-of-fold predictions and the list of
        fitted models, one per fold.
    """
    metric_func = mean_squared_error
    if params is None:
        params = {}

    models = []
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        
        train_pool = Pool(x_train, label = y_train)
        valid_pool = Pool(x_valid, label = y_valid)
        
        model_cb = CatBoost(params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            # The validation pool is the eval set; use_best_model keeps the best iteration on it.
            model_cb.fit(train_pool,
              eval_set = valid_pool,
              use_best_model = True,
              silent = True,
              plot = False)
            
        print(model_cb.get_best_score())
        
        pred_i = model_cb.predict(x_valid)

        # Each validation row receives its prediction from the model that did not train on it.
        oof_pred[idx_valid] = pred_i
        models.append(model_cb)

        print(f'Fold {i} RMSE: {metric_func(y_valid, pred_i) ** .5:.4f}')

    score = metric_func(y, oof_pred) ** .5
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

# Stratified folds for regression

Thanks to the author of this discussion for the helpful information:
https://www.kaggle.com/c/tabular-playground-series-feb-2021/discussion/216576

In [None]:
def create_stratified_folds_for_regression(data_df, n_splits=5, random_state=None):
    """
    Assign a stratified fold id to every row of a frame with a continuous target.

    The target is cut into ``floor(1 + log2(n_rows))`` equal-width bins and those
    bins are used as the class labels for ``StratifiedKFold``. Rows are visited in
    a random order when folds are assigned, but the returned frame keeps the
    caller's row order (with a fresh 0..n-1 index). The positions produced from
    its ``kfold`` column therefore line up with arrays taken from the original,
    unshuffled data.

    @param data_df: training data with a ``target`` column; it is not modified
    @param n_splits: number of splits
    @param random_state: optional seed for the row shuffle (None = unseeded, as before)
    @return: a copy of the training data with an added ``kfold`` column
    """
    # reset_index returns a new frame, so the caller's frame is left untouched.
    out_df = data_df.reset_index(drop=True)
    n_rows = len(out_df)

    # calculate the number of bins based on log2 of the row count
    # (`np.int` was removed in NumPy 1.24; the builtin int is equivalent here)
    num_bins = int(np.floor(1 + np.log2(n_rows)))
    print(f"Num bins: {num_bins}")

    # bins play the role of class labels so StratifiedKFold spreads the target
    # range evenly over the folds
    bins = pd.cut(pd.to_numeric(out_df['target'], downcast="signed"),
                  bins=num_bins, labels=False).to_numpy()

    # Shuffle row *positions* rather than the rows themselves, so fold ids can be
    # written back to the original positions.
    shuffled_pos = pd.Series(np.arange(n_rows)).sample(frac=1, random_state=random_state).to_numpy()

    kf = StratifiedKFold(n_splits=n_splits)
    fold_ids = np.full(n_rows, -1, dtype=np.int64)
    for f, (_, v_) in enumerate(kf.split(X=shuffled_pos, y=bins[shuffled_pos])):
        # v_ indexes into the shuffled order; map it back to original positions.
        fold_ids[shuffled_pos[v_]] = f

    out_df['kfold'] = fold_ids
    return out_df

In [None]:
def kfold_splits(n_splits, train_df):
    """
    Build (train indexes, validation indexes) pairs from a precomputed ``kfold`` column.

    @param n_splits: number of folds; fold ids 0 .. n_splits-1 are looked up
    @param train_df: training data with a ``kfold`` column
    @return: a list with one ``(train index, validation index)`` tuple per fold,
             each taken from ``train_df.index``
    """
    kf_splits = []
    for fold in range(n_splits):
        in_fold = (train_df.kfold == fold).values
        trn_idx = train_df.index[~in_fold]
        val_idx = train_df.index[in_fold]
        kf_splits.append((trn_idx, val_idx))
    return kf_splits

In [None]:
# Join features and target column-wise so folds can be stratified on the target.
train_df_re = pd.concat([train_x, train_ys], axis=1)

In [None]:
# Number of CV folds used by every model below.
n_splits = 9
# Adds a `kfold` column holding each row's fold id.
train_df_re = create_stratified_folds_for_regression(train_df_re, n_splits)

In [None]:
# List of (train index, validation index) pairs, one per fold, shared by all models.
stratified_cv = kfold_splits(n_splits, train_df_re)

# Hyperparameter tuning with Optuna

In [None]:
def fit_lgbm_param_optuna(X, 
             y, 
             cv, 
             params: dict=None, 
             verbose: int=50):
    """Cross-validate a LightGBM parameter set and return its out-of-fold RMSE.

    Args:
        X: Feature matrix indexable by the fold index arrays.
        y: Target values indexable by the same fold indices.
        cv: Iterable of ``(idx_train, idx_valid)`` pairs.
        params: Keyword arguments for ``lgb.LGBMRegressor``; ``None`` means no overrides.
        verbose: Passed to ``fit`` as both ``verbose`` and ``early_stopping_rounds``.

    Returns:
        The RMSE between ``y`` and the out-of-fold predictions.
    """
    metric_func = mean_squared_error

    if params is None:
        params = {}

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv): 

        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgb.LGBMRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            # NOTE(review): `verbose` doubles as the early-stopping patience here — confirm this is intended.
            clf.fit(x_train, y_train, 
                    eval_set=[(x_valid, y_valid)],  
                    early_stopping_rounds=verbose,
                    verbose=verbose)

        # Only the score is returned, so the fitted models are not retained.
        oof_pred[idx_valid] = clf.predict(x_valid)

    score = metric_func(y, oof_pred) ** .5
    return score

def objective(trial):
    """Optuna objective for LightGBM: sample a parameter set and return its CV RMSE.

    Reads the module-level ``train_x``, ``train_ys`` and ``stratified_cv``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The out-of-fold RMSE reported by ``fit_lgbm_param_optuna`` (lower is better).
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # parameter names and ranges are unchanged.
    optuna_paramas_lgb = {
        'num_leaves': trial.suggest_int('num_leaves', 32, 512),
        'boosting_type': 'gbdt',
        'max_bin': trial.suggest_int('max_bin', 700, 900),
        'objective': 'huber',
        'metric': 'mae',
        'learning_rate': trial.suggest_float('learning_rate',0.0155,0.05),
        'random_state' : 71,
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 16),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 4, 80),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 1.0, log=True),
        'early_stopping_rounds': 10
    }
    score = fit_lgbm_param_optuna(train_x.values,  train_ys, stratified_cv, params=optuna_paramas_lgb)
    
    return score

#study = optuna.create_study(direction="minimize", study_name='lgbm_train')
#study.optimize(objective, n_trials=50)

In [None]:
#study.best_params

"""
lgb
{'num_leaves': 385,
 'max_bin': 887,
 'learning_rate': 0.049867328104748844,
 'max_depth': 14,
 'min_child_weight': 10,
 'feature_fraction': 0.4511004151880547,
 'bagging_fraction': 0.6559039807249963,
 'bagging_freq': 2,
 'min_child_samples': 77,
 'lambda_l1': 4.638151021025029e-08,
 'lambda_l2': 0.2937304195136803}"""

In [None]:
def fit_xgb_optuna(X, y, cv, params: dict=None, verbose: int=50):
    """Cross-validate an XGBoost parameter set and return its out-of-fold RMSE.

    Args:
        X: Feature matrix indexable by the fold index arrays.
        y: Target values indexable by the same fold indices.
        cv: Iterable of ``(idx_train, idx_valid)`` pairs.
        params: Keyword arguments for ``xgb.XGBRegressor``; ``None`` means no overrides.
        verbose: Accepted for signature parity with the other fit helpers; not used here.

    Returns:
        The RMSE between ``y`` and the out-of-fold predictions.
    """
    metric_func = mean_squared_error
    if params is None:
        params = {}

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        
        model_xgb = xgb.XGBRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
            
        # Only the score is returned, so the fitted models are not retained.
        oof_pred[idx_valid] = model_xgb.predict(x_valid)

    score = metric_func(y, oof_pred) ** .5

    return score

def objective_xgb(trial):
    """Optuna objective for XGBoost: sample a parameter set and return its CV RMSE.

    Reads the module-level ``train_x``, ``train_ys`` and ``stratified_cv``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The out-of-fold RMSE reported by ``fit_xgb_optuna`` (lower is better).
    """
    # The KFold split that used to be rebuilt here on every trial was never used
    # (scoring always ran on `stratified_cv`), so it has been removed.
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # parameter names and ranges are unchanged.
    optuna_paramas_xgb = {
        'booster': 'gbtree',
        'max_bin': trial.suggest_int('max_bin', 700, 900),
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'learning_rate': trial.suggest_float('learning_rate',0.0155,0.05),
        'random_state' : 71,
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 16),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'early_stopping_rounds': 10
    }
    
    score = fit_xgb_optuna(train_x.values,  train_ys, stratified_cv, params=optuna_paramas_xgb)
    
    return score

#study = optuna.create_study(direction="minimize", study_name='xgb_train')
#study.optimize(objective_xgb, n_trials=10)

In [None]:
#study.best_params
"""
{'max_bin': 830,
 'learning_rate': 0.048518442248912635,
 'max_depth': 15,
 'min_child_weight': 7,
 'subsample': 0.9080463485454009,
 'lambda': 5.370896698434827e-07,
 'alpha': 0.005799175899438967}"""

In [None]:
# LightGBM parameters: the first eleven values match the Optuna result recorded
# earlier in this notebook; the remaining keys fix the seed, objective, metric
# and boosting/early-stopping settings.
params_best = {
    'num_leaves': 385,
    'max_bin': 887,
    'learning_rate': 0.049867328104748844,
    'max_depth': 14,
    'min_child_weight': 10,
    'feature_fraction': 0.4511004151880547,
    'bagging_fraction': 0.6559039807249963,
    'bagging_freq': 2,
    'min_child_samples': 77,
    'lambda_l1': 4.638151021025029e-08,
    'lambda_l2': 0.2937304195136803,
    "random_state": 71,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100,
    'objective': 'regression',
    'metric': 'rmse',
    "boosting": "gbdt",
}

#fold = KFold(n_splits=5, shuffle=True, random_state=71)
#cv = list(fold.split(train_x, train_ys))

# Train one LightGBM model per stratified fold; `oof` holds the out-of-fold predictions.
oof, models = fit_lgbm(train_x.values, train_ys, stratified_cv, params=params_best)

In [None]:
import xgboost as xgb
# XGBoost parameters; the values match the Optuna result recorded earlier in this notebook.
params_xgb = {
    'max_bin': 830,
    'learning_rate': 0.048518442248912635,
    'max_depth': 15,
    'min_child_weight': 7,
    'subsample': 0.9080463485454009,
    'lambda': 5.370896698434827e-07,
    'alpha': 0.005799175899438967
}

# Training uses the shared stratified folds. The plain KFold split that used to
# be built here was never passed to fit_xgb (and its `cv` name shadowed the
# catboost import), so it has been removed.
oof_xgb, models_xgb = fit_xgb(train_x.values, train_ys, stratified_cv, params=params_xgb)

In [None]:
# CatBoost parameters (hand-set; no tuning run for CatBoost appears in this notebook).
params_cb = {
    'loss_function': 'RMSE',
    'max_depth': 3, 
    'learning_rate': 0.08, 
    'subsample': 0.8, 
    #'colsample_bytree': 0.7,
    'num_boost_round': 1000,
    'early_stopping_rounds': 100,
}

# Train one CatBoost model per stratified fold; `oof_cb` holds the out-of-fold predictions.
oof_cb, models_cb = fit_cb(train_x.values, train_ys, stratified_cv, params=params_cb)

In [None]:
def visualize_importance(models, feat_train_df):
    """Plot the per-fold feature importances of the 50 most important features.

    Args:
        models: Fitted models exposing ``feature_importances_`` (one per fold).
        feat_train_df: Feature frame whose columns name the importances, in the
            same order as each model's ``feature_importances_``.

    Returns:
        ``(fig, ax)`` of the boxen plot, features ordered by summed importance.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("visualize_importance needs at least one fitted model")

    # Build one frame per fold and concatenate once instead of growing a frame in the loop.
    per_fold = [
        pd.DataFrame({
            'feature_importance': model.feature_importances_,
            'column': feat_train_df.columns,
            'fold': i + 1,
        })
        for i, model in enumerate(models)
    ]
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Rank features by importance summed over folds and keep the top 50.
    order = feature_importance_df.groupby('column')\
        .sum()[['feature_importance']]\
        .sort_values('feature_importance', ascending=False).index[:50]

    # Figure width grows with the number of features shown.
    fig, ax = plt.subplots(figsize=(max(6, len(order) * .4), 7))
    sns.boxenplot(data=feature_importance_df, x='column', y='feature_importance', order=order, ax=ax, palette='viridis')
    ax.tick_params(axis='x', rotation=90)
    ax.grid()
    fig.tight_layout()
    return fig, ax

In [None]:
# Feature importances of the per-fold LightGBM models.
fig, ax = visualize_importance(models, train_x)

In [None]:
# Average the per-fold LightGBM predictions on the test set, then replace negatives with 0.
pred_lgb = np.mean([fold_model.predict(test_x.values) for fold_model in models], axis=0)
pred_lgb[pred_lgb < 0] = 0

In [None]:
# Average the per-fold XGBoost predictions on the test set, then replace negatives with 0.
pred_xgb = np.mean([fold_model.predict(test_x.values) for fold_model in models_xgb], axis=0)
pred_xgb[pred_xgb < 0] = 0

In [None]:
# Average the per-fold CatBoost predictions on the test set, then replace negatives with 0.
pred_cb = np.mean([fold_model.predict(test_x.values) for fold_model in models_cb], axis=0)
pred_cb[pred_cb < 0] = 0

In [None]:
# Weighted blend of the out-of-fold predictions: LightGBM 0.3, XGBoost 0.1, CatBoost 0.6.
# (The commented line below is the equal-weight alternative.)
#oof_em = (oof+oof_xgb+oof_cb)/3
oof_em = oof*0.3+oof_xgb*0.1+oof_cb*0.6

metric_func = mean_squared_error

# RMSE of the blended out-of-fold predictions against the training target.
score = metric_func(train_ys, oof_em) ** .5

print(score)
# Presumably the score printed by an earlier run:
#0.8439838083404265
#0.8439838083404265

In [None]:
# Blend the test predictions with the same weights as the out-of-fold blend
# (LightGBM 0.3, XGBoost 0.1, CatBoost 0.6); the commented line is the equal-weight alternative.
#pred = (pred_lgb + pred_xgb + pred_cb)/3
pred_em = pred_lgb*0.3 + pred_xgb*0.1 +pred_cb*0.6

In [None]:
# Write the blended predictions into the submission template and save it without the index column.
submission["target"] = pred_em
submission.to_csv('./submission.csv', index=False)

In [None]:
# Compare the distribution of the LightGBM out-of-fold predictions with the
# blended test predictions written to the submission.
fig, ax = plt.subplots(figsize=(8, 8))
# The legend labels were previously swapped: `oof` holds the out-of-fold
# predictions and `submission["target"]` holds the test predictions.
sns.distplot(oof, label='Out Of Fold', ax=ax)
sns.distplot(submission["target"], label='Test Predict', ax=ax)
ax.legend()
ax.grid()