In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition training set and preview its first rows.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-aug-2021/train.csv'
)
train.head()

In [None]:
# Distribution of the target column `loss`, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 8), dpi=120)
sns.histplot(data=train, x='loss', ax=ax)
plt.show()

In [None]:
# One histogram per feature column (f0 ... f99) on a 20 x 5 grid of axes.
fig, axes = plt.subplots(20, 5, figsize=(20, 70))
for idx, panel in enumerate(axes.flat):  # row-major walk over the grid
    feature = f'f{idx}'
    sns.histplot(data=train, x=feature, ax=panel)
    # The title already names the feature, so both axis labels are blanked.
    panel.set(xlabel='', ylabel='', title=feature)

In [None]:
# Features: every column except the row id and the target.
X = train.drop(columns=['id', 'loss'])
# Target as a plain NumPy array.
y = train['loss'].to_numpy()

In [None]:
# Per-column skewness of `train`, shaded with the default colour gradient.
train.skew().to_frame().style.background_gradient()

Skewness is not too large, so there is no need to apply a log or exponential transform.

In [None]:
# Summary statistics for every feature, without the `count` row.
# axis=None shades against the whole table rather than per column.
X.describe().iloc[1:].style.background_gradient(cmap='YlOrRd', axis=None)

In [None]:
# Same summary with column f60 left out of the table-wide colour scale.
(
    X.drop(columns='f60')
    .describe()
    .iloc[1:]
    .style.background_gradient(cmap='YlOrRd', axis=None)
)

In [None]:
# Same summary with columns f60 and f16 left out of the table-wide colour scale.
(
    X.drop(columns=['f60', 'f16'])
    .describe()
    .iloc[1:]
    .style.background_gradient(cmap='YlOrRd', axis=None)
)

In [None]:
# One box plot per feature column (f0 ... f99) on a 20 x 5 grid of axes.
fig, axes = plt.subplots(20, 5, figsize=(20, 70))
for idx, panel in enumerate(axes.flat):  # row-major walk over the grid
    feature = f'f{idx}'
    sns.boxplot(data=train, x=feature, ax=panel)
    # The title already names the feature, so both axis labels are blanked.
    panel.set(title=feature, xlabel='', ylabel='')

Use RobustScaler to standardize the data.

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the raw feature columns taken from `train` instead of on
# `X` itself. The previous `X = scale.transform(X)` overwrote its own input,
# so re-running the cell re-fitted the scaler on already-scaled data and
# `scale` no longer matched the raw test set it is applied to later.
# Deriving from `train` makes the cell idempotent; the first-run result is
# the same as before.
scale = RobustScaler()
X = scale.fit_transform(train.drop(['id', 'loss'], axis = 1))
# Preview only the first rows of the scaled array.
X[:5]

In [None]:
# Hold out 20% of the rows for validation.
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the hold-out set (and therefore every tuning
# score computed on it) identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

In [None]:
# Shapes of the training and hold-out feature arrays.
(X_train.shape, X_test.shape)

### model lgbm

In [None]:
# model LGBM
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor

from lightgbm import LGBMRegressor

In [None]:
import optuna

In [None]:
def objective_lgbm(trial):
    """Optuna objective: hold-out RMSE of an LGBMRegressor with sampled hyper-parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters. Every suggested name is an
        LGBMRegressor keyword, so ``study.best_trial.params`` can be passed
        directly to ``LGBMRegressor().set_params``.

    Returns
    -------
    float
        Root mean squared error on ``X_test`` / ``y_test`` (lower is better).
    """
    params = {
        'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        # Upper bound lowered from 1e2 to 1.0: shrinkage above 1 overshoots on
        # every boosting round and only wastes trials on divergent models.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        # `step` is passed by keyword; the positional form is deprecated in Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 200, 2400, step=200),
        'subsample': trial.suggest_float('subsample', 0.05, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100, step=5),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-8, 1.0, log=True),
    }
    lgbm = LGBMRegressor().set_params(**params)
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_test)
    # Take the square root explicitly: `squared=False` is deprecated and has
    # been removed from mean_squared_error in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    return rmse

In [None]:
# Search for the LightGBM hyper-parameters with the lowest hold-out RMSE
# over 100 trials.
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(func=objective_lgbm, n_trials=100)

In [None]:
# Hyper-parameters of the best LightGBM trial.
study_lgbm.best_params

### model xgb

In [None]:
from xgboost import XGBRegressor

In [None]:
def objective_xgb(trial):
    """Optuna objective: hold-out RMSE of an XGBRegressor with sampled hyper-parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters. ``objective`` and
        ``eval_metric`` are fixed here and are therefore not part of
        ``study.best_trial.params``.

    Returns
    -------
    float
        Root mean squared error on ``X_test`` / ``y_test`` (lower is better).
    """
    # Parameters shared by both tree boosters ('gbtree' and 'dart').
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        # Lower bound raised from 0 to 1: XGBoost reads max_depth=0 as
        # "no depth limit", which the exact tree method rejects and which can
        # otherwise grow unbounded trees (max_leaves may also be sampled as 0).
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'subsample': trial.suggest_float('subsample', 1e-8, 1),
        'max_bin': trial.suggest_int('max_bin', 256, 2048, step = 32),
        'max_leaves': trial.suggest_int('max_leaves', 0, 20),
    }

    # Dropout-specific parameters are only sampled for the DART booster.
    if params['booster'] == 'dart':
        params['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        params['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        params['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
        params['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)

    xgbreg = XGBRegressor().set_params(**params)
    xgbreg.fit(X_train, y_train)
    y_pred = xgbreg.predict(X_test)
    # Take the square root explicitly: `squared=False` is deprecated and has
    # been removed from mean_squared_error in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    return rmse

In [None]:
# Search for the XGBoost hyper-parameters with the lowest hold-out RMSE
# over 100 trials.
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(func=objective_xgb, n_trials=100)

In [None]:
# Hyper-parameters of the best XGBoost trial.
study_xgb.best_params

In [None]:
# Previously hard-coded hyper-parameters, kept for reference only
# (presumably the best values from an earlier tuning run):
#
# lgbm = LGBMRegressor(
#     boosting_type='dart',
#     num_leaves=25,
#     max_depth=2,
#     learning_rate=0.27705597165794427,
#     n_estimators=1200,
#     subsample=0.3901780564911078,
#     min_child_samples=80,
#     reg_lambda=0.0002860193200825624,
#     reg_alpha=0.18863986328479146,
#     colsample_bytree=0.02750628848161378,
# )
#
# xgb = XGBRegressor(
#     booster='gbtree',
#     reg_lambda=1.3715924919210447e-06,
#     reg_alpha=5.3840029128828335e-08,
#     max_depth=3,
#     eta=0.261274409181448,
#     gamma=0.00046895403732967537,
#     grow_policy='lossguide',
#     subsample=0.9007410046689116,
#     max_bin=448,
#     max_leaves=6,
# )

# Rebuild each base learner from the best trial of its own Optuna study.
lgbm = LGBMRegressor().set_params(**study_lgbm.best_trial.params)
# Bug fix: the XGBoost study's parameters must configure an XGBRegressor.
# This line previously created a second LGBMRegressor and fed it
# XGBoost-only keys such as `booster` and `eta`.
xgb = XGBRegressor().set_params(**study_xgb.best_trial.params)

# Stack the two tuned regressors; StackingRegressor's default final
# estimator combines their predictions.
stacking_model = StackingRegressor([
    ('lgbm', lgbm),
    ('xgb', xgb)
])

In [None]:
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams, then show the stack.
set_config(display='diagram')
stacking_model

In [None]:
# Fit the stacked ensemble on the full scaled training set (not just X_train).
stacking_model.fit(X=X, y=y)

In [None]:
# Read the competition test set, using its first column as the index,
# and preview the first rows.
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-aug-2021/test.csv',
    index_col=0,
)
test.head()

In [None]:
# Scale into a new variable instead of overwriting `test`. The previous
# `test = scale.transform(test)` made the cell non-idempotent: running it a
# second time scaled the already-scaled array again and changed the predictions.
test_scaled = scale.transform(test)
y_pred = stacking_model.predict(test_scaled)
y_pred

In [None]:
# Fill the sample submission's `loss` column with the predictions and
# write the result without the DataFrame index.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv'
).assign(loss=y_pred)
submission.to_csv('./submission.csv', index=False)