In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data processing
%matplotlib inline
import numpy as np
import pandas as pd
# import pandas_profiling as pp
# import lux
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Machine Learning
import optuna
from optuna.samplers import TPESampler
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error,roc_curve,auc,accuracy_score,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense,Dropout,BatchNormalization

In [None]:
# Load the competition files: training set, test set and the submission template.
input_dir = Path('../input') / 'tabular-playground-series-aug-2021'
train, test, submission = (
    pd.read_csv(input_dir / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Feature columns are every training column except the row id and the target.
columns = [name for name in train.columns if name != "id" and name != 'loss']

# Plain NumPy arrays for the models: features, target, and test features.
X = train.loc[:, columns].to_numpy()
y = train.loc[:, 'loss'].to_numpy()
X_test = test.loc[:, columns].to_numpy()

# EDA

In [None]:
# Summary of the target: number of distinct values, relative frequency of each
# value, and the running total of those frequencies.
bins = np.unique(y).size
freq = train["loss"].value_counts(normalize=True, sort=False)
cum_sum = freq.cumsum()

# Two stacked panels: a count plot on top, the empirical CDF below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
panels = (
    (sns.countplot, f'Distribution of the Loss, {bins} unique values'),
    (sns.ecdfplot, 'Cumulative sum per loss'),
)
for axis, (plot_fn, panel_title) in zip(ax, panels):
    plot_fn(x=y, label='Train_loss', ax=axis)
    axis.set_title(panel_title, color="crimson")
    axis.set_xlabel('Loss value')
    axis.legend()

fig.tight_layout(pad=3.0);

In [None]:
%%time
# One KDE panel per feature on a 10x10 grid, filled column by column
# (feature i goes to row i % 10, column i // 10).
fig, ax = plt.subplots(10, 10, figsize=(24, 24))
for i, col in enumerate(columns):
    panel = ax[i % 10][i // 10]
    # `fill` is the current name of the deprecated `shade` argument.
    sns.kdeplot(train[col], legend=False, fill=True, ax=panel)
    # Title with the plotted column itself. Indexing train.columns by i is
    # misaligned because `columns` leaves out "id" and "loss".
    panel.set_title(f"{col}", fontsize=10, weight='bold')
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.set_yticks([])

fig.tight_layout()

# XGB/LGB/CATBOOST - HYPERPARAMETER TUNING

In [None]:
%%time
def cat_estimation(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of a CatBoost regressor for one trial.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters listed in ``params``.
    data : array-like
        Feature matrix to split; defaults to the notebook-level ``X``.
    target : array-like
        Target values aligned with ``data``; defaults to ``y``.

    Returns
    -------
    float
        RMSE on the 40 % validation split (lower is better).
    """
    # Split the arguments that were passed in, not the globals, so the
    # objective can be reused on other data.
    x_train, x_valid, y_train, y_valid = train_test_split(
        data, target, test_size=0.4, random_state=1
    )
    params = {'iterations':trial.suggest_int("iterations", 1000, 4000),
              'od_wait':trial.suggest_int('od_wait', 300, 500),
              'task_type':'GPU',
              'eval_metric':'RMSE',
              'learning_rate' : trial.suggest_uniform('learning_rate', 0.008,0.02),
              'grow_policy': trial.suggest_categorical('grow_policy', ['Depthwise','SymmetricTree']),
              'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.2 , 0.3),
              'subsample': trial.suggest_uniform('subsample',0.5,1.0),
              'random_strength': trial.suggest_uniform('random_strength',10,50),
              'depth': trial.suggest_int('depth',3,15),
              'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
              'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',10,40),
              'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bernoulli', 'Poisson']),
               }

    model = CatBoostRegressor(**params, random_state=1)
    model.fit(x_train, y_train,eval_set=[(x_valid,y_valid)], verbose=False)

    preds = model.predict(x_valid)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))

    return rmse

In [None]:
# Tune CatBoost for a fixed wall-clock budget, then report the best trial.
train_time = 1 * 30 * 60  # search budget in seconds (30 minutes)
study_cat = optuna.create_study(
    direction='minimize',
    # Seeded so the sampler proposes the same sequence of candidates on a re-run.
    sampler=TPESampler(seed=1),
    study_name='CatRegressor',
)
study_cat.optimize(cat_estimation, timeout=train_time)

print('Number of finished trials: ', len(study_cat.trials))
print('Best trial:')
trial_cat = study_cat.best_trial

print('\tValue: {}'.format(trial_cat.value))
print('\tParams: ')
for key, value in trial_cat.params.items():
    print('\t\t{}: {}'.format(key, value))

Number of finished trials:  10
Best trial:
	Value: 7.843146985719494
	Params: 
		iterations: 3084
		od_wait: 332
		learning_rate: 0.019584817897930977
		grow_policy: Depthwise
		reg_lambda: 0.2587005604240873
		subsample: 0.6264800857693638
		random_strength: 46.10328452158566
		depth: 3
		min_data_in_leaf: 27
		leaf_estimation_iterations: 28
		bootstrap_type: Poisson

In [None]:
#Now let's get a lightgbm
def lgb_estimation(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of a LightGBM regressor for one trial.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters listed in ``params``.
    data : array-like
        Feature matrix to split; defaults to the notebook-level ``X``.
    target : array-like
        Target values aligned with ``data``; defaults to ``y``.

    Returns
    -------
    float
        RMSE on the 40 % validation split (lower is better).
    """
    # Split the arguments that were passed in, not the globals.
    x_train, x_valid, y_train, y_valid = train_test_split(
        data, target, test_size=0.4, random_state=1
    )
    params = {
        # The Optuna names equal the estimator keywords, so `trial.params` can be
        # passed straight back to LGBMRegressor. In LightGBM a bare 'alpha' is a
        # different parameter from the L1 penalty.
        'reg_alpha' : trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda' : trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'num_leaves' : trial.suggest_int('num_leaves' , 40 , 70),
        'learning_rate' : trial.suggest_uniform('learning_rate' , 0.008 , 0.04),
        'max_depth' : trial.suggest_int('max_depth', 3 , 15),
        'n_estimators' : trial.suggest_int('n_estimators', 400 , 4000),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 100),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.05), 
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.15, 0.9, 0.05),
        'min_child_samples' : trial.suggest_int('min_child_samples', 20, 65),
        # Fixed (not tuned) settings.
        'metric' : 'rmse',
        'subsample_freq' : 1,
        'device_type' : 'gpu',}

    model = LGBMRegressor(**params, random_state=1, n_jobs=-1)
    model.fit(x_train, y_train,eval_set=[(x_valid,y_valid)], verbose=False)

    preds = model.predict(x_valid)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))

    return rmse

#https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

In [None]:
# Tune LightGBM for a fixed wall-clock budget, then report the best trial.
train_time = 1 * 30 * 60  # search budget in seconds (30 minutes)
study_lgb = optuna.create_study(
    direction='minimize',
    # Seeded so the sampler proposes the same sequence of candidates on a re-run.
    sampler=TPESampler(seed=1),
    study_name='LGBRegressor',
)
study_lgb.optimize(lgb_estimation, timeout=train_time)

print('Number of finished trials: ', len(study_lgb.trials))
print('Best trial:')
trial_lgb = study_lgb.best_trial

print('\tValue: {}'.format(trial_lgb.value))
print('\tParams: ')
for key, value in trial_lgb.params.items():
    print('\t\t{}: {}'.format(key, value))

Number of finished trials:  28
Best trial:
	Value: 7.833826248634097
	Params: 
		alpha: 0.002844219172632396
		lambda: 0.04988448769361392
		num_leaves: 58
		learning_rate: 0.010295669290960142
		max_depth: 8
		n_estimators: 3663
		min_child_weight: 2.185521963310407
		subsample: 0.75
		colsample_bytree: 0.35
		min_child_samples: 57

In [None]:
def xgb_estimation(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of an XGBoost Tweedie regressor for one trial.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters listed in ``param``.
    data : array-like
        Feature matrix to split; defaults to the notebook-level ``X``.
    target : array-like
        Target values aligned with ``data``; defaults to ``y``.

    Returns
    -------
    float
        RMSE on the 40 % validation split (lower is better).
    """
    # Split the arguments that were passed in, not the globals.
    x_train, x_valid, y_train, y_valid = train_test_split(
        data, target, test_size=0.4, random_state=1
    )
    param = {
        'tweedie_variance_power': trial.suggest_discrete_uniform('tweedie_variance_power', 1.0, 2.0, 0.1),
        'lambda': trial.suggest_loguniform('lambda', 1, 100),
        'alpha': trial.suggest_loguniform('alpha', 1, 100),
        'gamma': trial.suggest_loguniform('gamma', 1e-3, 1e4),
        # Listed once (the original repeated this key); `step` is passed by keyword.
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.05),
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 0.9, 0.05),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.05),
        'eta': trial.suggest_float('eta', 0.007,0.020),
        'max_depth': trial.suggest_int('max_depth', 3, 15, step=1),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1, 1000),
    }
    model = xgb.XGBModel(
        objective='reg:tweedie',
        tree_method='gpu_hist',   #which tree to chose: https://xgboost.readthedocs.io/en/latest/treemethod.html
        predictor='gpu_predictor',
        n_jobs=-1,
        **param
    ) 

    model.fit(x_train, y_train,
            eval_set=[(x_valid, y_valid)], eval_metric='rmse',
            verbose=False)

    preds = model.predict(x_valid)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))

    return rmse

In [None]:
# Tune XGBoost for a fixed wall-clock budget, then report the best trial.
train_time = 1 * 30 * 60  # search budget in seconds (30 minutes)
study_xgb = optuna.create_study(
    direction='minimize',
    # Seeded so the sampler proposes the same sequence of candidates on a re-run.
    sampler=TPESampler(seed=1),
    study_name='XGBRegressor',
)
study_xgb.optimize(xgb_estimation, timeout=train_time)

print('Number of finished trials: ', len(study_xgb.trials))
print('Best trial:')
trial_xgb = study_xgb.best_trial

print('\tValue: {}'.format(trial_xgb.value))
print('\tParams: ')
for key, value in trial_xgb.params.items():
    print('\t\t{}: {}'.format(key, value))

Number of finished trials:  47
Best trial:
	Value: 7.835773902876162
	Params: 
		tweedie_variance_power: 1.0
		lambda: 2.1531451037845706
		alpha: 75.55781034374724
		gamma: 0.003783218732688592
		n_estimators: 3600
		colsample_bytree: 0.8500000000000001
		colsample_bylevel: 0.25
		subsample: 0.8500000000000001
		eta: 0.008050760753145963
		max_depth: 7
		min_child_weight: 27.35935907624855

In [None]:
# cat_params = {'iterations': 2352,
#  'od_wait': 423,
#  'learning_rate': 0.016226794416482,
#  'grow_policy': 'Depthwise',
#  'reg_lambda': 0.22443418652185604,
#  'subsample': 0.6489507065409703,
#  'random_strength': 33.31677512363275,
#  'depth': 5,
#  'min_data_in_leaf': 28,
#  'leaf_estimation_iterations': 23,
#  'bootstrap_type': 'Bernoulli',}

# lgb_params = {'reg_alpha': 0.5127042132407919, 'reg_lambda': 5.357028975534249, 
#           'num_leaves': 50, 'learning_rate': 0.011363404060130413, 
#           'max_depth': 10, 'n_estimators': 3279, 
#           'min_child_weight': 0.07984529371109039,
#           'subsample': 0.65, 
#           'colsample_bytree': 0.35, 
#           'min_child_samples': 43, 'subsample_freq' : 1}

# xgb_params = {'tweedie_variance_power': 1.1,
#  'max_depth': 6,
#  'n_estimators': 3200,
#  'eta': 0.011245712330378816,
#  'subsample': 0.9,
#  'min_child_weight': 318.8784492865065,
#  'colsample_bytree': 0.45,
#  'colsample_bylevel': 0.30000000000000004,
#  'subsample': 0.9,
#  'lambda': 30.619480207080088,
#  'alpha': 42.00321964378956,
#  'gamma': 0.24759763410326613,
#  'tree_method':'gpu_hist'
#              }    

# XGB/LGB/CATBOOST - VALIDATION AND TEST SET

In [None]:
# Refit the three tuned gradient-boosting models with K-fold cross-validation.
# For each fold: fit on the training part, score RMSE on the validation part,
# store out-of-fold predictions, and add the fold's share of the test prediction.

# Work on copies so the fixed settings added below do not modify the dicts
# held by the studies' best-trial objects.
cat_params = dict(trial_cat.params)
lgb_params = dict(trial_lgb.params)
xgb_params = dict(trial_xgb.params)

# If the LightGBM study recorded the L1/L2 penalties under the short names
# 'alpha' / 'lambda', rename them to the estimator keywords. In LightGBM a bare
# 'alpha' is a different parameter, so the tuned L1 penalty would be dropped.
for optuna_name, lgb_name in (('alpha', 'reg_alpha'), ('lambda', 'reg_lambda')):
    if optuna_name in lgb_params:
        lgb_params[lgb_name] = lgb_params.pop(optuna_name)
# The tuning objective fixes subsample_freq=1 outside the sampled parameters;
# without it LightGBM ignores the tuned `subsample`.
lgb_params.setdefault('subsample_freq', 1)

xgb_params['objective'] = 'reg:tweedie'
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'

n_splits = 10

cat_preds = np.zeros((X_test.shape[0],))
oof_cat_preds = np.zeros((X.shape[0],))
kf_cat_rmse = []

lgb_preds = np.zeros((X_test.shape[0],))
oof_lgb_preds = np.zeros((X.shape[0],))
kf_lgb_rmse = []

xgb_preds = np.zeros((X_test.shape[0],))
oof_xgb_preds = np.zeros((X.shape[0],))
kf_xgb_rmse = []


# Seeded shuffle so the fold assignment is the same on every run.
for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=n_splits, shuffle=True, random_state=1).split(X, y)):
    # Fetch the train-validation indices.
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # Create and fit the models using optuna hyperparameters
    model_cat = CatBoostRegressor(**cat_params, task_type='GPU') #https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
    model_cat.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    model_lgb = LGBMRegressor(**lgb_params, device='gpu') #https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
    model_lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False,early_stopping_rounds=200,)

    model_xgb = xgb.XGBModel(**xgb_params)
    model_xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='rmse', verbose=False)

    # Validation predictions: predict once per model and reuse the result for
    # both the fold score and the out-of-fold array.
    # NOTE: out-of-fold values are stored divided by n_splits, unchanged from the
    # original, so cells that read these arrays see the same values as before.
    cat_pred = model_cat.predict(X_valid)
    cat_rmse = float(np.sqrt(mean_squared_error(y_valid, cat_pred)))
    kf_cat_rmse.append(cat_rmse)
    oof_cat_preds[valid_idx] += cat_pred / n_splits

    lgb_pred = model_lgb.predict(X_valid)
    lgb_rmse = float(np.sqrt(mean_squared_error(y_valid, lgb_pred)))
    kf_lgb_rmse.append(lgb_rmse)
    oof_lgb_preds[valid_idx] += lgb_pred / n_splits

    xgb_pred = model_xgb.predict(X_valid)
    xgb_rmse = float(np.sqrt(mean_squared_error(y_valid, xgb_pred)))
    kf_xgb_rmse.append(xgb_rmse)
    oof_xgb_preds[valid_idx] += xgb_pred / n_splits

    # Test predictions: every fold model predicts the full test set, so the
    # running sum of pred / n_splits ends up as the mean over folds.
    cat_preds += model_cat.predict(X_test) / n_splits
    lgb_preds += model_lgb.predict(X_test) / n_splits
    xgb_preds += model_xgb.predict(X_test) / n_splits

# Per-fold report; iterates over the collected scores instead of a hard-coded 10.
for model_label, fold_scores in (('CATBOOST', kf_cat_rmse), ('LGBM', kf_lgb_rmse), ('XGB', kf_xgb_rmse)):
    for i, fold_score in enumerate(fold_scores):
        print(f'Fold {i+1}/{n_splits} {model_label} RMSE: {fold_score:.4f}')

In [None]:
# Grid-search the blend of the three boosted models: integer weights
# [lgb, cat, xgb] in steps of 1, summing to upper - 1 (= 10), applied as tenths.
upper = 11
min_value = float('inf')
mae_plot = []  # despite the name, this holds the RMSE of every candidate blend

# The cross-validation cell stores each out-of-fold prediction divided by
# n_splits, although every training row is predicted in exactly one fold.
# Multiply back so the blend is compared with y on the target's own scale.
lgb_oof = oof_lgb_preds * n_splits
cat_oof = oof_cat_preds * n_splits
xgb_oof = oof_xgb_preds * n_splits

for i in range(upper):
    for j in range(upper - i):
        k = upper - 1 - i - j  # remaining weight goes to XGBoost
        pred_ensemble = (0.1 * i) * lgb_oof + (0.1 * j) * cat_oof + (0.1 * k) * xgb_oof
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
        mae = float(np.sqrt(mean_squared_error(y, pred_ensemble)))
        mae_plot.append(mae)
        if mae < min_value:
            min_value = mae
            weights = [i, j, k]
print(f'Min_RMSE : {min_value} Best_Weights: {weights}')
plt.plot(mae_plot);

# BASIC NEURAL NETWORK 
**(added at last minute to check how it worked with optuna)**

In [None]:
#We will now build a simple Neural Network using Keras
def nn_model(h1, h2, h3, lr, dr):
    """Build and compile a three-hidden-layer regression network.

    Each hidden block is Dense(ELU) -> BatchNormalization -> Dropout(dr), with
    widths h1, h2 and h3; the output is a single linear unit. The input width is
    taken from the notebook-level ``X``. The model is compiled with Adam at
    learning rate ``lr``, MSE loss, and RMSE as the reported metric.
    """
    he_uniform = tf.keras.initializers.HeUniform()
    network = Sequential()

    # First hidden block: the only layer that declares the input width and
    # uses the He-uniform initializer.
    network.add(Dense(input_dim=X.shape[1], units=h1, kernel_initializer=he_uniform, activation="elu"))
    network.add(BatchNormalization())
    network.add(Dropout(dr))

    # Remaining hidden blocks share the same layout.
    for width in (h2, h3):
        network.add(Dense(units=width, activation="elu"))
        network.add(BatchNormalization())
        network.add(Dropout(dr))

    # Single linear output for regression.
    network.add(Dense(units=1, activation='linear'))

    network.compile(
        optimizer=tf.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return network

In [None]:
%%time
def nn_estimation(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of the Keras network for one trial.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the layer widths, learning rate and dropout rate.
    data : array-like
        Feature matrix to split; defaults to the notebook-level ``X``.
    target : array-like
        Target values aligned with ``data``; defaults to ``y``.

    Returns
    -------
    float
        The RMSE metric reported by ``model.evaluate`` on the 40 % validation split.
    """
    # Split the arguments that were passed in, not the globals.
    X_train, X_valid, y_train, y_valid = train_test_split(
        data, target, test_size=0.4, random_state=1
    )
    # Standardise with statistics from the training part only.
    SS = StandardScaler().fit(X_train)
    X_train = SS.transform(X_train)
    X_valid = SS.transform(X_valid)
    params = {'h1': trial.suggest_int('h1' ,3 , 100),
              'h2': trial.suggest_int('h2' ,3 , 100),
              'h3': trial.suggest_int('h3' ,3 , 100),
              'lr' : trial.suggest_uniform('lr' , 0.002 , 0.1),
              'dr' : trial.suggest_uniform('dr' ,0.01 , 0.2),
               }

    model = nn_model(**params)
    model.fit(X_train, y_train, validation_data = (X_valid, y_valid), epochs = 30, batch_size = 3000, verbose = 0)

    # evaluate() returns [loss, rmse_metric]; the unused predict() call was dropped.
    rmse = model.evaluate(X_valid, y_valid, verbose=0)

    return rmse[1]

In [None]:
# Tune the neural network for a fixed wall-clock budget, then report the best trial.
train_time = 1 * 30 * 60  # search budget in seconds (30 minutes)
study_nn = optuna.create_study(
    direction='minimize',
    # Seeded so the sampler proposes the same sequence of candidates on a re-run.
    sampler=TPESampler(seed=1),
    study_name='DeepNN',
)
study_nn.optimize(nn_estimation, timeout=train_time)

print('Number of finished trials: ', len(study_nn.trials))
print('Best trial:')
trial_nn = study_nn.best_trial

print('\tValue: {}'.format(trial_nn.value))
print('\tParams: ')
for key, value in trial_nn.params.items():
    print('\t\t{}: {}'.format(key, value))

Number of finished trials:  105
Best trial:
	Value: 7.8848958015441895
	Params: 
		h1: 9
		h2: 50
		h3: 75
		lr: 0.005007293309139238
		dr: 0.19501276039972268

In [None]:
# Refit the tuned network with K-fold cross-validation: per-fold RMSE,
# out-of-fold predictions, and the fold-averaged test prediction.
n_splits = 10

nn_params = trial_nn.params

nn_preds = np.zeros((X_test.shape[0],))
oof_nn_preds = np.zeros((X.shape[0],))
kf_nn_rmse = []

# NOTE(review): the scaler is fit on all training rows, so each validation fold
# contributes to the scaling statistics. Fit it per fold if that matters.
SS = StandardScaler().fit(X)
X_scaled = SS.transform(X)
X_test_scaled = SS.transform(X_test)

# Seeded shuffle so the fold assignment is the same on every run.
for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=n_splits, shuffle=True, random_state=1).split(X_scaled, y)):
    # Slice the already-scaled matrix instead of re-transforming raw rows each fold.
    X_train, y_train = X_scaled[train_idx], y[train_idx]
    X_valid, y_valid = X_scaled[valid_idx], y[valid_idx]

    # Create and fit the model using optuna hyperparameters
    model_nn = nn_model(**nn_params)
    model_nn.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=30, batch_size=3000, verbose=0)

    # evaluate() returns [loss, rmse_metric].
    nn_rmse = model_nn.evaluate(X_valid, y_valid, verbose=0)
    kf_nn_rmse.append(nn_rmse[1])
    # NOTE: out-of-fold values are stored divided by n_splits, unchanged from the
    # original, so cells that read this array see the same values as before.
    oof_nn_preds[valid_idx] += model_nn.predict(X_valid).reshape(-1,) / n_splits

    # Every fold model predicts the full test set; the running sum of
    # pred / n_splits ends up as the mean over folds.
    nn_preds += model_nn.predict(X_test_scaled).reshape(-1,) / n_splits


# Per-fold report; iterates over the collected scores instead of a hard-coded 10.
for i, fold_rmse in enumerate(kf_nn_rmse):
    print(f'Fold {i+1}/{n_splits} NN RMSE: {fold_rmse:.4f}')

In [None]:
# Grid-search the share given to the boosted-model blend versus the neural
# network, in steps of 0.1, scoring each candidate by out-of-fold RMSE.
min_value = float('inf')
mae_plot_2 = []  # despite the name, this holds the RMSE of every candidate blend

# The cross-validation cells store out-of-fold predictions divided by n_splits,
# although every training row is predicted in exactly one fold. Multiply back so
# both parts are on the target's scale, and normalise the integer GBM weights by
# their sum rather than relying on that sum matching the fold count.
# Assumes both cross-validation cells used the current n_splits.
gbm_oof = n_splits * (weights[0] * oof_lgb_preds + weights[1] * oof_cat_preds + weights[2] * oof_xgb_preds) / sum(weights)
nn_oof = n_splits * oof_nn_preds

for i in range(0, 11):
    gbm_share = 0.1 * i
    pred_ensemble = gbm_share * gbm_oof + (1 - gbm_share) * nn_oof
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    mae = float(np.sqrt(mean_squared_error(y, pred_ensemble)))
    mae_plot_2.append(mae)
    if mae < min_value:
        min_value = mae
        weights_2 = [gbm_share, 1 - gbm_share]
print(f'Min_RMSE : {min_value} Best_Weights: {weights_2}')
plt.plot(mae_plot_2);

# SUBMISSION

In [None]:
# Final blend on the test set. `weights` are integer weights [lgb, cat, xgb] from
# the grid search, so divide by their sum (not a hard-coded 10) to get a weighted
# average; `weights_2` splits the result between the GBM blend and the network.
gbm_test_blend = (weights[0] * lgb_preds + weights[1] * cat_preds + weights[2] * xgb_preds) / sum(weights)
submission["loss"] = weights_2[0] * gbm_test_blend + weights_2[1] * nn_preds
submission.to_csv('submission_final.csv', index=False)

In [None]:
# Alternative submission with hand-picked weights: 20 % LightGBM, 60 % CatBoost
# and 20 % XGBoost inside the boosted blend, then 95 % blend / 5 % neural network.
gbm_fixed_blend = 0.2 * lgb_preds + 0.6 * cat_preds + 0.2 * xgb_preds
submission["loss"] = (0.95 * gbm_fixed_blend) + (0.05 * nn_preds)
submission.to_csv('submission_average_model.csv', index=False)