In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneGroupOut

import optuna

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

pd.set_option('display.max_columns', None)
#########################################################
# Load the competition tables from the Kaggle input directory.
# `ss` is the sample-submission frame; the prediction cells further down
# overwrite its `loss` column and write it back out as a CSV.
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
ss = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Basic information

In [None]:
# First three rows of the training frame.
train.head(3)

In [None]:
# Per-column dtype / non-null summary of the training frame.
train.info()

In [None]:
# Total number of missing cells per frame: per-column NA counts summed over all columns.
for label, frame in (('train', train), ('test', test)):
    na_total = frame.isna().sum().sum()
    print(f'NA values in {label} df: {na_total}')

In [None]:
# Remove the `id` column from both frames so it is not passed to the models
# as a feature (X is later built from every column of `train` except `loss`).
# errors = 'ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'id' has already been dropped in place.
for frame in (train, test):
    frame.drop(columns = 'id', inplace = True, errors = 'ignore')

# EDA

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Summary statistics of the test columns, for comparison with the training frame.
test.describe()

**The distributions of all 100 features are almost the same.**

In [None]:
# One KDE panel per feature, laid out on a 20 x 5 grid.

# The style must be set BEFORE any axes are created: seaborn styles only affect
# axes created afterwards, so setting it inside the loop (after plt.subplot)
# left the first panel with whatever style was active before this cell.
sns.set_style("white")

# First 100 columns of `train`; built once instead of twice per iteration.
feature_cols = train.columns.tolist()[:100]

fig = plt.figure(figsize = (15, 60))
for idx, col in enumerate(feature_cols):
    plt.subplot(20, 5, idx + 1)
    plt.title(col, size = 12, fontname = 'monospace')
    ax = sns.kdeplot(train[col], color = '#34675c', shade = True, alpha = 0.9, linewidth = 1.5, edgecolor = 'black')
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    # Keep only a slightly thicker bottom spine.
    for side in ['right', 'left', 'top']:
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)

plt.show()

In [None]:
# Correlation matrix of every column in `train` (features and target).
# Computed once and reused for both the mask and the heatmap; the original
# called train.corr() twice, doubling the most expensive step of this cell.
corr = train.corr()

# np.triu keeps the diagonal and upper triangle and zeroes the rest; used as a
# mask, its non-zero entries hide those cells so only the lower triangle is drawn.
matrix = np.triu(corr)

plt.figure(figsize = (15, 12))
sns.heatmap(corr, annot = False, cmap = 'Greens', mask = matrix, vmin = -0.03, vmax = 0.03, linewidths = 0.1, linecolor = 'white', cbar = True)
plt.xticks(size = 8, fontname = 'monospace')
plt.yticks(size = 8, fontname = 'monospace')
plt.figtext(0.77, 0.8, '''All 100 features and the target variable
have a very small
correlation''', fontsize = 20, fontname = 'monospace', ha = 'right', color = '#34675c')
plt.show()

In [None]:
# Kernel-density estimate of the target column `loss`.
plt.figure(figsize = (14, 7))
sns.set_style("white")
plt.title(
    'Distribution of loss (target)',
    size = 25, y = 1.03, fontname = 'monospace', color = '#34675c',
)
plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0, dashes = (1,7))
ax = sns.kdeplot(
    train['loss'],
    color = '#34675c', shade = True, alpha = 0.9,
    linewidth = 1.5, edgecolor = 'black',
)
ax.set(ylabel = '', xlabel = '')
plt.xticks(fontname = 'monospace')
plt.yticks([])

# Only the bottom spine stays visible, drawn slightly thicker.
ax.spines['bottom'].set_linewidth(1.2)
for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

# Preprocessing for modeling

In [None]:
# Split the training frame into the feature matrix and the regression target.
X = train.drop('loss', axis = 1)
y = train['loss']

feature_cols = X.columns.tolist()

# Fit the scaler on the training features ONLY and apply the same mean/std to
# the test set. The original called fit_transform on `test`, which standardises
# it with its own statistics, so an identical raw value mapped to different
# model inputs in train and test.
# NOTE: `test` is overwritten in place. Re-run the data-loading cells before
# executing this cell again, otherwise `test` would be scaled twice.
sc = StandardScaler()
X[feature_cols] = sc.fit_transform(X[feature_cols])
test[feature_cols] = sc.transform(test[feature_cols])

X.head(3)

# CatBoost

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: mean 5-fold validation RMSE of a CatBoostRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters below.
    data : feature frame, indexed positionally with ``.iloc``
        (defaults to the global ``X``).
    target : target series aligned row-by-row with ``data``
        (defaults to the global ``y``).

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds; the study minimises this value.

    Notes
    -----
    Each validation fold is used both for early stopping and for scoring, so
    the returned RMSE is somewhat optimistic.
    """
    params = {
        'depth': trial.suggest_int('depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'iterations': trial.suggest_int('iterations', 500, 5000),
        # NOTE(review): CatBoost documents a smaller allowed range for
        # max_bin / border_count on GPU than on CPU -- confirm that values up
        # to 300 are accepted with task_type = 'GPU'.
        'max_bin': trial.suggest_int('max_bin', 1, 300),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Newton', 'Gradient']),
        'random_seed': 228,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'bootstrap_type': 'Bernoulli',
        'task_type': 'GPU'
    }

    val_scores = []
    k = KFold(n_splits = 5, random_state = 228, shuffle = True)
    # Split the arguments, not the globals: the original ignored `data` and
    # `target` and always trained on the module-level X / y.
    for i, (trn_idx, val_idx) in enumerate(k.split(data)):

        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # A new estimator per fold makes it explicit that folds share no state.
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 150, verbose = False, use_best_model = True)

        # Only the validation score feeds the objective; the train-set
        # prediction of the original was computed but never used.
        val_preds = model.predict(X_val)
        val_score = np.sqrt(mean_squared_error(y_val, val_preds))
        val_scores.append(val_score)

        print(f"Fold {i+1} | RMSE: {val_score}")

    return float(np.mean(val_scores))

# Run the CatBoost hyperparameter search.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs. TPESampler is Optuna's default sampler for a
# single-objective study, so only the seeding changes.
# (GPU training itself may still introduce small run-to-run differences.)
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = 228))
study.optimize(objective, n_trials = 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Hyperparameters used for the final CatBoost models.
# Recorded score: mean RMSE on 5 folds - 7.8448
paramsCB = {
    # values from the tuned search space
    'depth': 5,
    'learning_rate': 0.04278413956100119,
    'iterations': 2116,
    'max_bin': 238,
    'min_data_in_leaf': 251,
    'l2_leaf_reg': 0.004676455789227335,
    'subsample': 0.3773307810571105,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    # fixed settings
    'random_seed': 228,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'task_type': 'GPU',
}

In [None]:
# 5-fold ensembling with CatBoost: one model per fold, early-stopped on the
# held-out part; the test prediction is the equally weighted average of the
# per-fold predictions. `folds` is also reused by later cells.
folds = KFold(n_splits = 5, shuffle = True, random_state = 228)
predictions = np.zeros(len(test))

for trn_idx, val_idx in folds.split(X):
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostRegressor(**paramsCB)
    model.fit(
        X_train, y_train,
        eval_set = [(X_val, y_val)],
        early_stopping_rounds = 150,
        use_best_model = True,
        verbose = False,
    )

    # Each fold contributes 1 / n_splits of the final prediction.
    predictions += model.predict(test) / folds.n_splits

# Store the averaged predictions in the submission frame.
ss['loss'] = predictions

In [None]:
# Write the CatBoost submission file (without the index column).
ss.to_csv('cb.csv', index = False)

**Result - 7.88071**

# XGB

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: mean 5-fold validation RMSE of an XGBRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters below.
    data : feature frame, indexed positionally with ``.iloc``
        (defaults to the global ``X``).
    target : target series aligned row-by-row with ``data``
        (defaults to the global ``y``).

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds; the study minimises this value.

    Notes
    -----
    Each validation fold is used both for early stopping and for scoring, so
    the returned RMSE is somewhat optimistic.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 500, 10000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 200),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 228,
        'use_label_encoder': False,
        'eval_metric': 'rmse'
    }

    val_scores = []
    k = KFold(n_splits = 5, random_state = 228, shuffle = True)
    # Split the arguments, not the globals: the original ignored `data` and
    # `target` and always trained on the module-level X / y.
    for i, (trn_idx, val_idx) in enumerate(k.split(data)):

        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # A new estimator per fold makes it explicit that folds share no state.
        model = XGBRegressor(**params)
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 150, verbose = False)

        # Only the validation score feeds the objective; the train-set
        # prediction of the original was computed but never used.
        val_preds = model.predict(X_val)
        val_score = np.sqrt(mean_squared_error(y_val, val_preds))
        val_scores.append(val_score)

        print(f"Fold {i+1} | RMSE: {val_score}")

    return float(np.mean(val_scores))

# Run the XGBoost hyperparameter search.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs. TPESampler is Optuna's default sampler for a
# single-objective study, so only the seeding changes.
# (GPU training itself may still introduce small run-to-run differences.)
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = 228))
study.optimize(objective, n_trials = 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Hyperparameters used for the final XGBoost models.
# Recorded score: mean RMSE on 5 folds - 7.8462
paramsXGB = {
    # values from the tuned search space
    'max_depth': 2,
    'learning_rate': 0.010707053059983151,
    'n_estimators': 9688,
    'min_child_weight': 168,
    'gamma': 0.0006130691869231192,
    'alpha': 0.0015540336440723174,
    'lambda': 0.012133281664909838,
    'colsample_bytree': 0.5945187331960007,
    'subsample': 0.3432887319679862,
    # fixed settings
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'random_state': 228,
    'use_label_encoder': False,
    'eval_metric': 'rmse',
}

In [None]:
# 5-fold ensembling with XGBoost: one model per fold, early-stopped on the
# held-out part; the test prediction is the average of the per-fold predictions.

# Create the splitter here instead of relying on the `folds` object left over
# from the CatBoost section, so this section also runs when that one is skipped.
# Same settings (5 folds, shuffle, seed 228), hence the same splits.
folds = KFold(n_splits = 5, random_state = 228, shuffle = True)

predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):

    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = XGBRegressor(**paramsXGB)

    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 150)

    # Each fold contributes 1 / n_splits of the final prediction.
    predictions += model.predict(test) / folds.n_splits

# Store the averaged predictions in the submission frame.
ss['loss'] = predictions

In [None]:
# Write the XGBoost submission file (without the index column).
ss.to_csv('xgb.csv', index = False)

**Result - 7.88625**

# LGBM

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: mean 5-fold validation RMSE of an LGBMRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters below.
    data : feature frame, indexed positionally with ``.iloc``
        (defaults to the global ``X``).
    target : target series aligned row-by-row with ``data``
        (defaults to the global ``y``).

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds; the study minimises this value.

    Notes
    -----
    Each validation fold is used both for early stopping and for scoring, so
    the returned RMSE is somewhat optimistic.
    """
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'n_estimators': trial.suggest_int('n_estimators', 500, 10000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.6),
        # NOTE(review): cat_smooth / cat_l2 / min_data_per_group are documented
        # by LightGBM as categorical-feature settings, and no categorical
        # features are declared in fit() below -- these three dimensions
        # probably only widen the search; confirm and consider dropping them.
        'cat_smooth' : trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 1, 200),
        'device_type': 'gpu',
        'boosting_type': 'gbdt',
        'random_state': 228,
        'metric': 'rmse'
    }

    val_scores = []
    k = KFold(n_splits = 5, random_state = 228, shuffle = True)
    # Split the arguments, not the globals: the original ignored `data` and
    # `target` and always trained on the module-level X / y.
    for i, (trn_idx, val_idx) in enumerate(k.split(data)):

        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # A new estimator per fold makes it explicit that folds share no state.
        model = LGBMRegressor(**params)
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 150, verbose = False)

        # Only the validation score feeds the objective; the train-set
        # prediction of the original was computed but never used.
        val_preds = model.predict(X_val)
        val_score = np.sqrt(mean_squared_error(y_val, val_preds))
        val_scores.append(val_score)

        print(f"Fold {i+1} | RMSE: {val_score}")

    return float(np.mean(val_scores))

# Run the LightGBM hyperparameter search.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs. TPESampler is Optuna's default sampler for a
# single-objective study, so only the seeding changes.
# (GPU training itself may still introduce small run-to-run differences.)
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = 228))
study.optimize(objective, n_trials = 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Hyperparameters used for the final LightGBM models.
# Recorded score: mean RMSE on 5 folds - 7.8405
paramsLGBM = {
    # values from the tuned search space
    'reg_alpha': 8.682795832798263,
    'reg_lambda': 8.688528314713478,
    'num_leaves': 35,
    'min_child_samples': 5,
    'max_depth': 8,
    'n_estimators': 4461,
    'learning_rate': 0.010109446255049337,
    'colsample_bytree': 0.104662962036166,
    'cat_smooth': 56,
    'cat_l2': 13,
    'min_data_per_group': 5,
    # fixed settings
    'device_type': 'gpu',
    'boosting_type': 'gbdt',
    'random_state': 228,
    'metric': 'rmse',
}

In [None]:
# 5-fold ensembling with LightGBM: one model per fold, early-stopped on the
# held-out part; the test prediction is the average of the per-fold predictions.

# Create the splitter here instead of relying on the `folds` object left over
# from the CatBoost section, so this section also runs when that one is skipped.
# Same settings (5 folds, shuffle, seed 228), hence the same splits.
folds = KFold(n_splits = 5, random_state = 228, shuffle = True)

predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):

    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMRegressor(**paramsLGBM)

    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 150)

    # Each fold contributes 1 / n_splits of the final prediction.
    predictions += model.predict(test) / folds.n_splits

# Store the averaged predictions in the submission frame.
ss['loss'] = predictions

In [None]:
# Write the LightGBM submission file (without the index column).
ss.to_csv('lgbm.csv', index = False)

**Beautiful result - 7.88000**

# Conclusion

**Interesting. When I first started studying machine learning, I often read about how powerful XGB is, but for the second competition in a row, XGB shows the worst result.**