In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneGroupOut

from sklearn.ensemble import VotingRegressor

import optuna

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
#########################################################
# Single place to change when the data lives somewhere else.
INPUT_DIR = '../input/tabular-playground-series-aug-2021'

# train / test feature tables and the sample-submission template
# (`ss` is filled with predictions and written out at the end of the notebook).
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')
ss = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

# Basic information

In [None]:
# Peek at the first three rows of the training table.
train.head(n = 3)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training table.
train.info()

In [None]:
# Remove the row identifier from both tables so it is not used as a feature.
# `errors = 'ignore'` keeps the cell safe to re-run: once 'id' is gone a second
# execution is a no-op instead of raising KeyError.
for frame in (train, test):
    frame.drop(columns = 'id', inplace = True, errors = 'ignore')

# EDA

In [None]:
# Grid of kernel-density plots, one panel per feature (first 100 columns of `train`).

# Set the style before any figure/axes are created so that every panel,
# including the first one, is drawn with the same style.
sns.set_style("white")

# Columns to plot; computed once instead of on every use inside the loop.
feature_cols = train.columns.tolist()[:100]

fig = plt.figure(figsize = (15, 60))
for i, col in enumerate(feature_cols):
    plt.subplot(20, 5, i + 1)
    plt.title(col, size = 12, fontname = 'monospace')
    # `fill` is the current name of the deprecated `shade` argument.
    a = sns.kdeplot(train[col], color = '#34675c', fill = True, alpha = 0.9, linewidth = 1.5, edgecolor = 'black')
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    # Keep only a slightly thicker bottom axis line.
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)

plt.show()

In [None]:
# Lower-triangle correlation heatmap of all columns in `train`.

# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = train.corr()
# Non-zero entries of the upper triangle (diagonal included) are masked out.
matrix = np.triu(corr)

plt.figure(figsize = (15, 12))
# The colour scale is clipped to +/-0.03 so that very small correlations remain visible.
sns.heatmap(corr, annot = False, cmap = 'Greens', mask = matrix, vmin = -0.03, vmax = 0.03, linewidths = 0.1, linecolor = 'white', cbar = True)
plt.xticks(size = 8, fontname = 'monospace')
plt.yticks(size = 8, fontname = 'monospace')
plt.figtext(0.77, 0.8, '''All 100 features and the target variable
have a very small
correlation''', fontsize = 20, fontname = 'monospace', ha = 'right', color = '#34675c')
plt.show()

In [None]:
# Kernel-density plot of the target column `loss`.
plt.figure(figsize = (14, 7))
sns.set_style("white")
plt.title('Distribution of loss (target)', size = 25, y = 1.03, fontname = 'monospace', color = '#34675c')
plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0,  dashes = (1,7))
# `fill` is the current name of the deprecated `shade` argument.
a = sns.kdeplot(train['loss'], color = '#34675c', fill = True, alpha = 0.9, linewidth = 1.5, edgecolor = 'black')
plt.ylabel('')
plt.xlabel('')
plt.xticks(fontname = 'monospace')
plt.yticks([])
# Keep only a slightly thicker bottom axis line.
for j in ['right', 'left', 'top']:
    a.spines[j].set_visible(False)
a.spines['bottom'].set_linewidth(1.2)

# Preprocessing

In [None]:
# Separate the features from the target column.
X = train.drop('loss', axis = 1)
y = train['loss']
feature_cols = X.columns.tolist()

# Standardise the features. The scaler is fitted on the training features only
# and the same mean/std are then applied to `test`, so both tables live on the
# same scale. (Fitting a second time on `test` would standardise it with
# different statistics than the ones the models are trained on.)
sc = StandardScaler()
X[feature_cols] = sc.fit_transform(X[feature_cols])
# NOTE: `test` is rewritten in place; re-running this cell without reloading
# the data would scale it a second time.
test[feature_cols] = sc.transform(test[feature_cols])

X.head(3)

# Best CB/XGB/LGBM parameters with Optuna

In [None]:
# Hyper-parameters for the three gradient-boosting models.
# The score comments next to each dict are the author's recorded results
# ("esr" presumably stands for early-stopping rounds - TODO confirm).

# --- CatBoost ---------------------------------------------------------------
# esr = 500
# Mean RMSE on 2 folds - 7.8466
# Solo result - 7.87348
paramsCB = {
    # tuned values
    'depth': 5,
    'learning_rate': 0.011283180425637522,
    'iterations': 19562,
    'max_bin': 152,
    'min_data_in_leaf': 273,
    'l2_leaf_reg': 0.91729332782104,
    'subsample': 0.6160764186759223,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    # fixed settings
    'random_seed': 228,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'task_type': 'GPU',
}

# --- LightGBM ---------------------------------------------------------------
# esr = 500
# Mean RMSE on 2 folds - 7.8471
# Solo result - 7.87401
paramsLGBM = {
    # tuned values
    'reg_alpha': 4.0695962262784615,
    'reg_lambda': 9.190653872177396,
    'num_leaves': 397,
    'min_child_samples': 30,
    'max_depth': 6,
    'n_estimators': 8989,
    'learning_rate': 0.010633230770718524,
    'colsample_bytree': 0.4385122330057919,
    'cat_smooth': 79,
    'cat_l2': 12,
    'min_data_per_group': 126,
    # fixed settings
    'device_type': 'gpu',
    'boosting_type': 'gbdt',
    'random_state': 228,
    'metric': 'rmse',
}

# --- XGBoost ----------------------------------------------------------------
# esr = 500
# Mean RMSE on 2 Folds - 7.8466
# Solo result - 7.87065
paramsXGB = {
    # tuned values
    'max_depth': 10,
    'learning_rate': 0.010512283852839102,
    'n_estimators': 2432,
    'min_child_weight': 185,
    'gamma': 0.00010339779073732135,
    'alpha': 0.00573215966018785,
    'lambda': 0.00013592165632140884,
    'colsample_bytree': 0.5825502178882395,
    'subsample': 0.6245584427453496,
    # fixed settings
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'random_state': 228,
    'use_label_encoder': False,
    'eval_metric': 'rmse',
}

# Voting time

In [None]:
# Base regressors for the voting ensemble, each configured from its
# parameter dictionary.
xgb_model = XGBRegressor(**paramsXGB)
lgbm_model = LGBMRegressor(**paramsLGBM)
cb_model = CatBoostRegressor(**paramsCB)

In [None]:
# 10-fold cross-validated voting ensemble.
# For each fold a weighted VotingRegressor (CatBoost / LightGBM / XGBoost) is
# fitted on the training part, scored on the held-out part, and its test
# prediction is added to a running average over all folds.
folds = KFold(n_splits = 10, random_state = 228, shuffle = True)

predictions = np.zeros(len(test))
fold_scores = []

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):

    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = VotingRegressor(
            estimators = [
                ('cb', cb_model),
                ('lgbm', lgbm_model),
                ('xgb', xgb_model)
            ],
            # Relative weight of each model's prediction, in the order listed above.
            weights = [0.15, 0.15, 0.7],
            n_jobs = -1
        )

    model.fit(X_train, y_train)

    # Out-of-fold check: RMSE of the ensemble on the part it has not seen.
    fold_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    fold_scores.append(fold_rmse)
    print(f'Fold {fold + 1}/{folds.n_splits} - validation RMSE: {fold_rmse:.5f}')

    # Each fold contributes an equal share to the final test prediction.
    predictions += model.predict(test) / folds.n_splits

print(f'Mean validation RMSE over {folds.n_splits} folds: {np.mean(fold_scores):.5f}')

ss['loss'] = predictions

In [None]:
# Write the submission table to 'voting.csv' without the DataFrame index column.
ss.to_csv('voting.csv', index = False)