In [None]:
!pip3 install -qq optuna
import numpy as np 
import pandas as pd

import xgboost
import catboost
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.linear_model import LinearRegression,Lasso, Ridge
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor,ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import optuna
sns.set()

# Objective

The objective of this task is to build a regression model to predict a target from a set of 14 features. The features are anonymised and are all *continuous*. The scoring metric is the Root Mean Squared Error (RMSE): $$RMSE = \sqrt{\frac{1}{N}\sum_{i = 1}^{N} (y_{i} - \hat{y}_{i})^{2}} $$

In [None]:
# Read the competition files and report their dimensions.
train = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/test.csv")
for frame in (train, test):
    print(frame.shape)

# Positional layout of `train`: the first column is skipped (presumably the row
# id -- confirm), the last column is used as the target, and everything in
# between is a feature.
feat_names = list(train.columns[1:-1])
X = train.iloc[:, 1:-1].values
y = train.iloc[:, -1].values

# Shared 5-fold splitter for every cross-validated experiment, plus a single
# 85/15 hold-out split; both use the same fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=2021)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.15, random_state=2021
)

# Data Exploration

+ Train has 300000 observations with 14 features, test has 200000 observations to predict

+ There are no missing values

+ Some suggest removing the outlier `y = 0` for preprocessing



## Univariate Analysis

+ The features all appear to come from non-normal distributions. Target appears to be bimodal, possibly from a mixture of gaussians

+ Features are roughly between 0 and 1; the target is roughly between 0 and 10


+ Some have suggested that the data may be *time series*. The autocorrelation of each column is low, and the correlation between `y` and the differenced features `X_{i + 1} - X_{i}` is also low, so this is likely not the case

+ https://www.kaggle.com/c/tabular-playground-series-jan-2021/discussion/210484
suggests that the distribution of the features in the training and testing sets is the same, so optimising for hyperparameters using cross validation in the training set is key


In [None]:
# Summary statistics for every column except the first one.
train[train.columns[1:]].describe()

In [None]:
%%time
# 3 x 5 grid: one distribution plot for each of the 15 columns that follow the
# first column of `train`.
fig, ax = plt.subplots(nrows=3, ncols=5, figsize=(30, 15))
for col_pos, panel in enumerate(ax.ravel(), start=1):
    sns.distplot(train.iloc[:, col_pos], ax=panel)

In [None]:
# Autocorrelation of each column at positions 1..15 of `train`, printed next to
# its position; used to check whether row order carries any signal.
for col_pos in range(1, 16):
    column = train.iloc[:, col_pos]
    print(col_pos, column.autocorr())

In [None]:
# Row-to-row differences of the feature columns, correlated against the last
# column of `train` (the target).
feature_steps = train.iloc[:, 1:-1].diff()
feature_steps.corrwith(train.iloc[:, -1])

## Bivariate Analysis

+ The features appear to have a low correlation in both pearson, spearman with the target

+ Some clusters of correlated features

In [None]:
# Annotated heatmap of the pairwise correlation matrix (pandas' default method)
# over every column except the first.
fig, ax = plt.subplots(figsize=(20, 10))
default_corr = train.iloc[:, 1:].corr()
sns.heatmap(default_corr, annot=True)

In [None]:
# Same heatmap as the previous cell, but using Spearman rank correlation.
fig, ax = plt.subplots(figsize=(20, 10))
spearman_corr = train.iloc[:, 1:].corr(method='spearman')
sns.heatmap(spearman_corr, annot=True)

In [None]:
%%time
# Scatter each of the 14 feature columns of X against y; the 3 x 5 grid has one
# spare panel, which stays empty.
fig, ax = plt.subplots(nrows=3, ncols=5, figsize=(30, 15))
panels = ax.ravel()
for feat_pos in range(14):
    sns.scatterplot(x=X[:, feat_pos], y=y, ax=panels[feat_pos])

# Modelling



Others have suggested a 2-step approach:
+ Use unsupervised learning: a Gaussian Mixture Model to determine 2 clusters / mixtures for `y` 
+ Predict E[Y | cluster = i] using a regression model

+ https://www.kaggle.com/iamleonie/handling-multimodal-distributions-fe-techniques

+ https://www.kaggle.com/chrisbradley/tab-playground-predicting-bimodal-distribution


Evaluate models using 5 fold CV to see if there are any that seem to be better without parameter tuning


## Linear Models



| | Baseline |  Linear Regression | Lasso | Ridge |
| -- | -- | -- | -- | -- |
| **RMSE** | 0.7330696085805828 | 0.7262295101854237| 0.7330691414275544 | 0.7262363800486569 | 

https://scikit-learn.org/stable/modules/linear_model.html#linear-model


The baseline is to set all the predictions to the in-sample mean: let $\hat{y} = \bar{y}$. For Ridge and Linear Regression, the residuals look almost the same as the original y. Ridge performs slightly better than Lasso, which may suggest that all features carry *some* information, as opposed to having their coefficients shrunk to 0 by Lasso. Fitting on the full dataset, Lasso shrinks all the coefficients to 0 (i.e. it predicts the mean). Overall, a nonlinear learner may be better.



In [None]:
# Baseline: predict the in-sample mean of y for every row.
# RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.
baseline_pred = np.full(y.shape[0], np.mean(y))
baseline_score = np.sqrt(mean_squared_error(y, baseline_pred))
print("Baseline", baseline_score)

# 5-fold CV RMSE for three linear models with default settings. The scorer
# returns *negative* RMSE, hence the sign flip when printing.
lr = LinearRegression()
lr_scores = cross_val_score(lr, X, y, cv=kfold, scoring='neg_root_mean_squared_error')

lasso = Lasso()
lasso_scores = cross_val_score(lasso, X, y, cv=kfold, scoring='neg_root_mean_squared_error')

ridge = Ridge()
ridge_scores = cross_val_score(ridge, X, y, cv=kfold, scoring='neg_root_mean_squared_error')

print(-lr_scores, np.mean(-lr_scores))
print(-lasso_scores, np.mean(-lasso_scores))
print(-ridge_scores, np.mean(-ridge_scores))

# Refit Ridge and plain linear regression on the full data to inspect their
# coefficients and residuals. Predictions are computed once and reused.
ridge.fit(X, y)
lr.fit(X, y)
ridge_pred = ridge.predict(X)
lr_pred = lr.predict(X)

# Row 0 = Ridge, row 1 = LinearRegression. Columns: coefficients sorted by
# absolute size, residual vs. y, residual distribution, prediction distribution.
# x/y are passed by keyword because recent seaborn releases reject positional
# data vectors.
fig, ax = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))

feat_names2, coef_order = zip(*sorted(zip(feat_names, ridge.coef_), key=lambda k: abs(k[1]), reverse=True))
sns.barplot(x=list(coef_order), y=list(feat_names2), ax=ax[0, 0])
sns.scatterplot(x=y - ridge_pred, y=y, ax=ax[0, 1])
sns.distplot(y - ridge_pred, ax=ax[0, 2])
sns.distplot(ridge_pred, ax=ax[0, 3])

feat_names2, coef_order2 = zip(*sorted(zip(feat_names, lr.coef_), key=lambda k: abs(k[1]), reverse=True))
sns.barplot(x=list(coef_order2), y=list(feat_names2), ax=ax[1, 0])
sns.scatterplot(x=y - lr_pred, y=y, ax=ax[1, 1])
sns.distplot(y - lr_pred, ax=ax[1, 2])
sns.distplot(lr_pred, ax=ax[1, 3])

print("Ridge on full dataset", np.sqrt(mean_squared_error(y, ridge_pred)))

# Gradient Boosted Decision Trees

Some experiments on CPU, evaluating RMSE using 5-fold CV, and fixed parameters / parameters from other peoples' notebooks

| | XGB | CatBoost | LGBM | HistGradientBoosting |
| -- | -- | -- | -- | -- |
| **RMSE** | 0.70263173 | 0.6989878380818764 | 0.7001857209114114 | 0.700016686352503 |
| **time** | 23min 14s| 5min 47s |  9.48 s  |  31s |
| **params** | | iterations = 1000 | |  max_iter = 1000, learning_rate=0.08, max_depth |


https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db

### XGBoost
+ comparably slower on CPU
+ https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna

### CatBoost
+ https://catboost.ai/docs/concepts/python-reference_parameters-list.html
+ https://catboost.ai/docs/concepts/parameter-tuning.html


### LightGBM

+ https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

### scikit-learn's HistGradientBoosting 

+ https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor.
+ optimised some parameters one by one using GridSearchCV
+ Quite fast, but performance seems to be capped at around 0.7







# Hyperparameter tuning

The most important hyperparameters are probably:
+ N_estimators / iterations - number of trees in boosting. greater number of trees might decrease bias but lead to overfitting
+ max_depth - depth of each tree; complexity of the model
+ max_leaf_nodes - lower max_leaf_nodes might imply more regularisation
+ learning_rate - higher learning rate speeds up training
+ l2_regularization - strength of the L2 penalty; larger values mean more regularisation

How?

+ Select using 5-Fold Cross Validation with GridSearchCV, RandomizedSearchCV
+ many public notebooks favour `optuna`

## Optuna

+ https://www.kaggle.com/bowaka/tps21-optuna-lgb-fast-hyper-parameter-tunning

In [None]:
# %%time
# hr = HistGradientBoostingRegressor(max_iter=750, max_depth=None, early_stopping=True, validation_fraction=0.1,
#                                    learning_rate=0.08, max_leaf_nodes=128,l2_regularization=0, random_state=2021)
# hr_scores = cross_val_score(hr,X, y, cv=kfold,scoring='neg_root_mean_squared_error')
# print(-hr_scores, -np.mean(hr_scores))

# %%time 
# from sklearn.model_selection import GridSearchCV
# # params = {'learning_rate' : [0.05, 0.06, 0.07, 0.08, 0.1]}
# # params = {'max_iter':[500, 750, 1000], "max_depth":[5,8, None]}
# params = {'max_leaf_nodes':[64, 128, 256], "max_depth":[5,8, None]}


# hr = HistGradientBoostingRegressor(learning_rate=0.08,l2_regularization=0, max_iter=750,
#                                    early_stopping=True,validation_fraction=0.1,
#                                    random_state=2021)



# grid_search = GridSearchCV(hr, 
#                            param_grid = params, 
#                            cv=kfold,
#                            scoring = 'neg_root_mean_squared_error', 
#                            n_jobs = -1, 
#                            verbose = 0)
# grid_search.fit(X, y)

# pd.DataFrame(grid_search.cv_results_)

In [None]:
# %%time
# xgbr = XGBRegressor(n_estimators=500, max_depth=3, learning_rate=0.02, objective="reg:squarederror")
# xgbr_scores = cross_val_score(xgbr,X, y, cv=kfold,scoring='neg_root_mean_squared_error')
# print(-xgbr_scores)

# %%time
# xgb_params = {'lambda': 0.0030282073258141168, 'alpha': 0.01563845128469084, 'colsample_bytree': 0.5,
#              'subsample': 0.7,'n_estimators': 4000, 'learning_rate': 0.01,'max_depth': 15,
#              'random_state': 2020, 'min_child_weight': 257}

# xgbr = XGBRegressor(**xgb_params)
# xgbr.fit(X, y)

In [None]:
# %%time
# cr = CatBoostRegressor(iterations=1000,verbose=False)
# cr_scores = cross_val_score(cr,train.iloc[:,1:-1], train.iloc[:,-1], cv=kfold,scoring='neg_root_mean_squared_error')
# print(-cr_scores, np.mean(-cr_scores))

In [None]:
# %%time
# from lightgbm import LGBMRegressor

# lb_params = {'learning_rate':0.005, 'num_iterations':5000,'objective': 'regression','metric': 'rmse','verbosity': -1,
#              'boosting_type': 'gbdt','feature_pre_filter': False,'lambda_l1': 4.616521116348607,'lambda_l2': 1.9781272803424497,
#              'num_leaves': 102,'feature_fraction': 0.4,'bagging_fraction': 1.0,'bagging_freq': 0,'min_child_samples': 20, "seed":2021}
# lb = LGBMRegressor(**lb_params)
# lb_scores = cross_val_score(lb,X, y, cv=kfold,scoring='neg_root_mean_squared_error')
# print(-lb_scores, np.mean(-lb_scores))

In [None]:
# from catboost import CatBoostRegressor     

# def objective(trial):
#     param = {
#         'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.03, 0.05, 0.08, 0.1]),
#         'iterations': trial.suggest_categorical("iterations",[750, 1000, 2000]),
#         'max_depth': trial.suggest_int("depth", 5, 12),
#         "random_strength": trial.suggest_int("random_strength", 0, 100),
#         'random_state': 2021,
#         "verbose":0,
#         "task_type":"GPU"
#     }
#     cbr = CatBoostRegressor(**param)  
        
#     cbr_scores = cross_val_score(cbr,X, y, cv=kfold,scoring='neg_root_mean_squared_error')
    
#     return np.mean(-cbr_scores)

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)

# Ensembling

+ Ensembling from different models (XGB + CatBoost + LGBM) : https://www.kaggle.com/shkanda/ensemble-lgb-xgb-cat

+ Averaging predictions from a model trained on K-folds : https://www.kaggle.com/shogosuzuki/0-69713-lightgbm-with-small-learning-rate

+ Averaging from a model using different seeds?

In [None]:
%%time
# {'learning_rate': 0.01, 'iterations': 2000, 'depth': 12, 'random_strength': 52}


# n_seeds = 10
# rand_seeds = np.random.randint(2048, size=n_seeds)
# print(rand_seeds)    

# Freeze the test feature matrix BEFORE any prediction columns are appended to
# `test`. Slicing `test.iloc[:, 1:]` inside the loop would pick up the
# `target{k}` columns written by earlier folds (and by earlier runs of this
# cell). Taking exactly X.shape[1] columns after the first one keeps the test
# matrix aligned with the training features.
X_test = test.iloc[:, 1:1 + X.shape[1]].values

n_folds = kfold.get_n_splits()
fold_cols = [f"target{fold}" for fold in range(n_folds)]

# Train one CatBoost model per CV fold (on that fold's training rows only) and
# store its predictions for all training rows and all test rows.
# NOTE: this appends columns to `train` and `test`, so cells that slice those
# frames by position (e.g. `train.iloc[:, -1]`) must not be re-run afterwards.
for fold, (train_index, _) in enumerate(tqdm(kfold.split(X), total=n_folds)):
    cr = CatBoostRegressor(verbose=False, iterations=2000, learning_rate=0.01,
                           random_strength=52, max_depth=12, random_seed=2021)
    cr.fit(X[train_index, :], y[train_index])
    train[f'target{fold}'] = cr.predict(X)
    test[f'target{fold}'] = cr.predict(X_test)

# Average the per-fold predictions. Every training row was seen by all but one
# of the fold models, so this "in sample" RMSE is optimistic compared with a
# true out-of-fold score.
train['target_final'] = train[fold_cols].mean(axis=1)
print("In Sample: ", np.sqrt(mean_squared_error(y, train['target_final'])))

test['target'] = test[fold_cols].mean(axis=1)
test[['id', 'target']].to_csv("submission.csv", index=False)

## Public Leaderboard

| **Method** | **params** | in-sample | **public leaderboard** |
| -- | -- | -- | -- |
| CatBoost + Averaging using 10 random seeds |  iterations = 500, max_depth= 8|  | 0.69856 | 


## Model Interpretability

Could use Shap or Feature Importances

**Catboost**
+ All features look important
+ Residuals still look bimodal
+ https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Catboost%20tutorial.html

In [None]:
# Diagnostics for the fold-averaged CatBoost predictions, left to right:
# feature importances, residual vs. y, residual distribution, prediction
# distribution.
fig, ax = plt.subplots(ncols=4, figsize=(20, 5))

# `cr` is whichever CatBoost model was fitted last in the ensembling cell, so
# the importances describe that single model, not the averaged ensemble.
feat_names2, coef_order = zip(*sorted(zip(feat_names, cr.get_feature_importance()), key=lambda k: abs(k[1]), reverse=True))

# Residuals of the averaged prediction, computed once and reused below.
residuals = y - train['target_final']

# x/y are passed by keyword because recent seaborn releases reject positional
# data vectors.
sns.barplot(x=list(coef_order), y=list(feat_names2), ax=ax[0])
sns.scatterplot(x=residuals, y=y, ax=ax[1])
sns.distplot(residuals, ax=ax[2])
sns.distplot(train['target_final'], ax=ax[3])

# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X)

# Other approaches

## Deep Neural Network

DNNs could be a worthwhile approach, but I have yet to explore them

In [None]:
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers
# from keras.utils import to_categorical
# from tensorflow.keras import callbacks
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.layers import Dense, Input
# from sklearn.model_selection import train_test_split

# inputs = Input(shape=(14,))
# x = layers.Dense(128, activation="relu")(inputs)
# x = layers.Dense(64, activation="relu")(x)
# output = layers.Dense(1)(x)
# model = keras.Model(inputs, output)    

# model.compile(Adam(lr=1e-3), "mse", metrics=["mse", tf.keras.metrics.RootMeanSquaredError()])

# X_train, X_test, y_train, y_test = train_test_split(train.iloc[:,1:-1],train.iloc[:,-1],train_size=0.8, random_state=2021)
# history = model.fit(X_train, y_train, batch_size=64, epochs=10)

## Other Models

+ Gaussian Process - runs out of memory using sklearn
+ Kernel Ridge - runs out of memory
+ KNN, SVR - will likely have the same problems

In [None]:
# # from sklearn.gaussian_process import GaussianProcessRegressor
# # gpr = GaussianProcessRegressor()
# # gpr_scores = cross_val_score(gpr,X, y, cv=kfold,scoring='neg_root_mean_squared_error')

In [None]:
# from sklearn.kernel_ridge import KernelRidge
# kr = KernelRidge()
# kr_scores = cross_val_score(kr,X, y, cv=kfold,scoring='neg_root_mean_squared_error')

# Trees

Unable to get 5 fold cross validation for Random Forest and Extra Trees to run in a reasonable amount of time. The previous results with linear/Ridge regression also suggest that column subsampling may be less effective than using all features (so perhaps Boosting would be better)

In [None]:
# %%time
# rf = RandomForestRegressor(n_estimators=500,max_depth=5)
# rf_scores = cross_val_score(rf,X, y, cv=kfold,scoring='neg_root_mean_squared_error')
# print(-rf_scores, -np.mean(rf_scores), -np.std(rf_scores))

In [None]:
# %%time
# rf = ExtraTreesRegressor(n_estimators=500, max_depth=5)
# rf_scores = cross_val_score(rf,X, y, cv=kfold,scoring='neg_root_mean_squared_error')
# print(-rf_scores, -np.mean(rf_scores), -np.std(rf_scores))