In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from sklearn.linear_model import Ridge, Lasso, LinearRegression
import lightgbm as lgb
from sklearn.ensemble import AdaBoostRegressor
import catboost as cgb
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.ensemble import StackingRegressor

from sklearn.metrics import mean_absolute_error

from matplotlib import pyplot as plt 
%matplotlib inline
import seaborn as sns

In [None]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Competition files as mounted by Kaggle for "Tabular Playground Series - Jan 2021".
train_csv_path = '/kaggle/input/tabular-playground-series-jan-2021/train.csv'
test_csv_path = '/kaggle/input/tabular-playground-series-jan-2021/test.csv'

# train_df carries the `target` column; test_df is the frame predictions are made for.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# EDA

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

In [None]:
# Per-column summary statistics of the training frame (shown as the cell output).
train_summary = train_df.describe()
train_summary

In [None]:
# Annotated heat map of the pairwise correlations between all training columns.
train_corr = train_df.corr()
train_corr_fig, train_corr_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(train_corr, annot=True, ax=train_corr_ax)
plt.show()

In [None]:
# Determinant of the training correlation matrix (shown as the cell output).
# A value near 0 would point to strongly collinear columns.
train_corr_det = np.linalg.det(train_df.corr())
train_corr_det

In [None]:
# Box plot of every continuous feature (columns whose name contains 'cont'),
# drawn on a 2 x 7 grid, i.e. the first 14 such columns.
X_cols = [name for name in train_df.columns.tolist() if 'cont' in name]
plt.figure(figsize=(20,10))
for subplot_count in range(1, 15):
    # Panels are 1-indexed while X_cols is 0-indexed.
    panel = plt.subplot(2, 7, subplot_count)
    train_df[X_cols[subplot_count - 1]].plot.box(ax=panel)
plt.show()

In [None]:
# Print column dtypes, non-null counts and memory usage of the test frame.
test_df.info()

In [None]:
# Per-column summary statistics of the test frame (shown as the cell output).
test_summary = test_df.describe()
test_summary

In [None]:
# Annotated heat map of the pairwise correlations between all test columns.
test_corr = test_df.corr()
test_corr_fig, test_corr_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(test_corr, annot=True, ax=test_corr_ax)
plt.show()

In [None]:
# Determinant of the test correlation matrix (shown as the cell output).
test_corr_det = np.linalg.det(test_df.corr())
test_corr_det

## Data Processing for Lasso and Ridge Regression

In [None]:
# Code from https://www.kaggle.com/tosinabase/jan-21-regularized-regression-ridge-and-lasso

# Target and raw feature matrix for the linear models (`id` and `target` are not features).
y = train_df['target']
features_lr = train_df.drop(['id', 'target'], axis=1)

# Split BEFORE scaling: the scaler must only see the training rows, otherwise the
# validation rows leak into the mean/std used to standardise the features.
X_train_raw, X_val_raw, y_train_lr, y_val_lr = train_test_split(
    features_lr, y, test_size=0.3, random_state=17, shuffle=False)

scaler = StandardScaler()
scaler.fit(X_train_raw)

# Every matrix goes through the same fitted scaler so that all of them live in the
# feature space the linear models are trained in.
X_train_lr = scaler.transform(X_train_raw)
X_val_lr = scaler.transform(X_val_raw)
X_lr = scaler.transform(features_lr)
X_lr_test = scaler.transform(test_df.drop('id', axis=1).values)

# Model

In [None]:
# Feature frame / target used by the tree-based models.
# NOTE(review): only `target` is dropped, so `id` remains a column of X and is fed
# to the models as a feature — confirm this is intended.
X, y = train_df.drop(columns=['target']), train_df['target']

# Ordered 70/30 hold-out split (no shuffling).
X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(
    X, y, test_size=0.3, shuffle=False)

# The raw training frame is released here; cells above this one cannot be re-run
# afterwards without reloading the data.
del train_df

## Lasso

In [None]:
# L1-regularised linear regression with a fixed regularisation strength and seed.
lasso_settings = {'alpha': 0.001, 'random_state': 123}
m1 = Lasso(**lasso_settings)

In [None]:
# Fit the Lasso on the standardised training split and report the score on the
# same rows (a training score, not a validation score).
m1_fit = m1.fit(X_train_lr, y_train_lr)
lasso_train_score = m1.score(X_train_lr, y_train_lr)
print(f'Score reached: {lasso_train_score} ')

# Score without scaling: 0.0176265428750495 
# Score with scaling: 0.01859676573900082 

In [None]:
# The Lasso was trained on standardised features, so the test rows must go through
# the same fitted scaler. `X_lr_test` is that scaled test matrix (built in the data
# processing cell); predicting on the raw frame would give meaningless outputs.
X_test_lr = X_lr_test
y_test_lasso = m1.predict(X_test_lr)

In [None]:
# Bar chart of the fitted Lasso coefficients, one bar per feature.
# The first column of X is skipped so the labels line up with the coefficients.
lasso_feature_names = X.columns.values[1:]
lasso_fig, lasso_ax = plt.subplots(figsize=(20, 10))
lasso_ax.bar(x=lasso_feature_names, height=m1_fit.coef_)
lasso_ax.set_title("Feature importances via coefficients")
plt.show()

## Ridge

In [None]:
# Parameter from https://www.kaggle.com/tosinabase/jan-21-regularized-regression-ridge-and-lasso
# L2-regularised linear regression with a fixed regularisation strength.
ridge_settings = {'alpha': 0.1}
m2 = Ridge(**ridge_settings)

In [None]:
# Fit the Ridge model on the standardised training split and report the score on
# the same rows (a training score, not a validation score).
m2_fit = m2.fit(X_train_lr, y_train_lr)
ridge_train_score = m2.score(X_train_lr, y_train_lr)
print(f'Score reached: {ridge_train_score} ')
# Score 0.01865954277402282 

In [None]:
# The Ridge model was trained on standardised features, so the test rows must go
# through the same fitted scaler. `X_lr_test` is that scaled test matrix (built in
# the data processing cell); predicting on the raw frame would give meaningless outputs.
X_test_lr = X_lr_test
y_test_ridge = m2.predict(X_test_lr)

In [None]:
# Bar chart of the fitted Ridge coefficients, one bar per feature.
# The first column of X is skipped so the labels line up with the coefficients.
ridge_feature_names = X.columns.values[1:]
ridge_fig, ridge_ax = plt.subplots(figsize=(20, 10))
ridge_ax.bar(x=ridge_feature_names, height=m2_fit.coef_)
ridge_ax.set_title("Feature importances via coefficients")
plt.show()

## LightGBM

In [None]:
# Native LightGBM datasets for the training and validation splits.
# free_raw_data=False keeps the underlying frames available after construction.
lgb_train = lgb.Dataset(X_train_df, y_train_df, free_raw_data=False)
# A validation Dataset has to reference the training Dataset so that both share
# the same feature binning.
lgb_eval = lgb.Dataset(X_val_df, y_val_df, reference=lgb_train, free_raw_data=False)

In [None]:
# # param values c.f. https://www.kaggle.com/zephyrwang666/riiid-lgbm-bagging2
# param = {'num_leaves': sp_randint(10, 500), 'n_estimators': sp_randint(10, 6000), 'max_bin':sp_randint(100, 800), 'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4], 
#          'feature_fraction': sp_uniform(0, 1), 'bagging_fraction': sp_uniform(0, 1), "bagging_seed": [47], 
#          'objective': ['regression'], 'max_depth': [-1], 
#          'learning_rate': sp_uniform(0, 1), "boosting_type": ["gbdt"], 
#          'metric': ['rmse'], "verbosity": [-1], 
#          'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100], 'reg_lambda': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100], 
#          'random_state': [47]}

# m3 = lgb.LGBMRegressor(verbose_eval = 30, num_boost_round = 10000, early_stopping_rounds = 10, n_estimators=3000)

# '''
# Hyperparameter optimisation
# '''
# # Code from https://www.kaggle.com/rtatman/lightgbm-hyperparameter-optimisation-lb-0-761#Model-fitting-with-HyperParameter-optimisation
# #This parameter defines the number of hyperparameter points to be tested
# n_HP_points_to_test = 150

# gsLGBM = RandomizedSearchCV(
#     estimator=m3, param_distributions=param, 
#     n_iter=n_HP_points_to_test,
#     cv=5,
#     refit=True,
#     random_state=47,
#     verbose=True)

In [None]:
# gsLGBM.fit(X_train_df, y_train_df, eval_set = (X_val_df, y_val_df), eval_metric = 'rmse')
# print('Best score reached: {} with params: {} '.format(gsLGBM.best_score_, gsLGBM.best_params_))

In [None]:
# Best hyper-parameters reported by the randomized search. The search itself is
# commented out, so `gsLGBM` does not exist on a fresh kernel; the result is
# hard-coded here so that the model cell below can run without repeating the search.
# Best score: 0.0937872506205801
opt_parameters_LGBM = {
    'bagging_fraction': 0.7997942505658034,
    'bagging_seed': 47,
    'boosting_type': 'gbdt',
    'feature_fraction': 0.31477581669804067,
    'learning_rate': 0.03875307567633712,
    'max_bin': 491,
    'max_depth': -1,
    'metric': 'rmse',
    'min_child_weight': 100.0,
    'n_estimators': 2559,
    'num_leaves': 272,
    'objective': 'regression',
    'random_state': 47,
    'reg_alpha': 10,
    'reg_lambda': 1,
    'verbosity': -1,
}
opt_parameters_LGBM

In [None]:
# LightGBM regressor built from the tuned hyper-parameters.
# - `valid_sets` and `verbose_eval` are arguments of `lgb.train`, not of the
#   scikit-learn wrapper, so they are no longer passed to the constructor; the
#   validation data is supplied through `eval_set` in `fit`.
# - `num_boost_round` is an alias of `n_estimators`, which the tuned parameters
#   already set; passing both makes the effective number of rounds ambiguous.
# - early stopping halts training once the validation RMSE has not improved for
#   10 consecutive rounds.
m3 = lgb.LGBMRegressor(early_stopping_rounds=10, n_jobs=4, **opt_parameters_LGBM)
m3.fit(X_train_df, y_train_df, eval_set=[(X_val_df, y_val_df)], eval_metric='rmse')

In [None]:
# LightGBM predictions for the test rows. The full test frame is passed, matching
# the columns the model was fitted on.
X_test = test_df
y_test_lgbm = m3.predict(test_df)

In [None]:
# Feature importances of the fitted LightGBM model.
# The library is imported as `lgb` (the bare name `lightgbm` is not defined), and
# the axes are passed explicitly: without `ax`, plot_importance opens its own
# figure and leaves the 20x10 figure created here empty.
importance_fig, importance_ax = plt.subplots(figsize=(20, 10))
lgb.plot_importance(m3, ax=importance_ax, title="Feature importances")
plt.show()

## AdaBoost

In [None]:
# m4 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3, min_samples_leaf=1, min_impurity_decrease=10, random_state=47), random_state=47)

In [None]:
# param = {'learning_rate': sp_uniform(0, 1), 'n_estimators': sp_randint(5, 100)}

# '''
# Hyperparameter optimisation
# '''
# # Code from https://www.kaggle.com/rtatman/lightgbm-hyperparameter-optimisation-lb-0-761#Model-fitting-with-HyperParameter-optimisation
# # This parameter defines the number of HP points to be tested
# n_HP_points_to_test = 50

# gsADA = RandomizedSearchCV(
#     estimator=m4, param_distributions=param, 
#     n_iter=n_HP_points_to_test,
#     cv=3,
#     refit=True,
#     random_state=47,
#     verbose=True)

In [None]:
# gsADA.fit(X_train_df, y_train_df)
# print('Best score reached: {} with params: {} '.format(gsADA.best_score_, gsADA.best_params_))

In [None]:
# Best hyper-parameters reported by the randomized search, hard-coded here so the
# (commented-out) search does not have to be re-run.
# Score: -3.3217493390580444e-05
opt_parameters_ADA = dict(
    learning_rate=0.028555288989857153,
    n_estimators=36,
)

In [None]:
# AdaBoost over shallow decision trees, using the tuned learning rate / tree count.
# NOTE(review): min_impurity_decrease=10 looks very large; together with the
# near-zero search score recorded above, the trees may never split — confirm.
weak_learner = DecisionTreeRegressor(max_depth=3, min_samples_leaf=1, min_impurity_decrease=10, random_state=47)

# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, whereas the first positional argument works in every version.
m4 = AdaBoostRegressor(weak_learner, random_state=47, **opt_parameters_ADA)
m4.fit(X_train_df, y_train_df)

In [None]:
# AdaBoost predictions for the test rows. The full test frame is passed, matching
# the columns the model was fitted on.
X_test = test_df
y_test_ada = m4.predict(test_df)

## CatBoost

In [None]:
# Base CatBoost regressor for the randomized search below.
# catboost is imported as `cgb`, so the class has to be reached through that alias;
# the bare name `CatBoostRegressor` is not defined in this notebook.
m5 = cgb.CatBoostRegressor(random_seed=47)

In [None]:
# Search space for CatBoost.
# The original literal mixed `key=value` entries into a dict (a SyntaxError), and
# those entries (`eta`, `num_trees`) are aliases of `learning_rate` and
# `n_estimators`, which are already present, so only the canonical keys are kept.
param = {'learning_rate': sp_uniform(0, 1), 'n_estimators': sp_randint(5, 100)}

# Hyperparameter optimisation
# Code from https://www.kaggle.com/rtatman/lightgbm-hyperparameter-optimisation-lb-0-761#Model-fitting-with-HyperParameter-optimisation
# This parameter defines the number of HP points to be tested
n_HP_points_to_test = 50

# refit=True retrains the best configuration on the whole data passed to fit, so
# the fitted search object can be used directly for prediction afterwards.
gsCB = RandomizedSearchCV(
    estimator=m5, param_distributions=param,
    n_iter=n_HP_points_to_test,
    cv=3,
    refit=True,
    random_state=47,
    verbose=True)

In [None]:
# Run the randomized search on the training split and report the winning
# cross-validated score together with its hyper-parameters.
gsCB.fit(X_train_df, y_train_df)
best_cb_score, best_cb_params = gsCB.best_score_, gsCB.best_params_
print(f'Best score reached: {best_cb_score} with params: {best_cb_params} ')

In [None]:
# CatBoost predictions for the test rows.
# RandomizedSearchCV fits clones of `m5`, so `m5` itself is never trained; the
# fitted model is the refit best estimator held by the search object, which
# `gsCB.predict` delegates to.
X_test = test_df
y_test_cb = gsCB.predict(X_test)

## Ensembling the Models

In [None]:
# Code from https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python#Second-Level-Predictions-from-the-First-level-Output
# Second-level model: an XGBoost regressor trained on the base models' predictions.
# NOTE(review): 'reg:squaredlogerror' differs from the RMSE objective used for the
# base models — confirm this is the intended loss for the meta-learner.
final_estimator = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squaredlogerror',
    nthread=-1,
    verbosity=3,
    random_state=20)

# Code from https://stackoverflow.com/questions/65713104/sklearn-asking-for-eval-dataset-when-there-is-one/65714374?noredirect=1#comment116194594_65714374
# The LightGBM model is `m3` (`m1` is the Lasso, which has no such parameter and
# would make set_params raise). Early stopping is disabled because
# StackingRegressor refits each base model without an evaluation set.
lgbm_params = m3.get_params()
lgbm_params["early_stopping_rounds"] = None
m3.set_params(**lgbm_params)

# Labels now match the models they name: m1=Lasso, m2=Ridge, m3=LightGBM, m4=AdaBoost.
estimators = [('lasso', m1), ('ridge', m2), ('lgbm', m3), ('ada', m4)]

# `gbm` is the stacked ensemble used by the following cells.
gbm = StackingRegressor(estimators=estimators, final_estimator=final_estimator, cv=5, verbose=1)

In [None]:
# del m1
# del m2
# del gsLGBM
# del gsADA
# del opt_parameters_LGBM
# del opt_parameters_ADA

In [None]:
# Train the stacked ensemble (base models plus final estimator) on the training split.
gbm.fit(X=X_train_df, y=y_train_df)

In [None]:
# gbm.score(X_train_df, y_train_df)

In [None]:
# Stacked-ensemble predictions for the test rows.
X_test = test_df
y_test_gbm = gbm.predict(test_df)

# Submission

In [None]:
# Submission file for the Lasso model: one row per test id with its prediction.
submission_columns = {'id': test_df['id'], 'target': y_test_lasso}
lasso_submission = pd.DataFrame(submission_columns)
lasso_submission.to_csv(path_or_buf='lasso_submission.csv', index=False)

In [None]:
# Submission file for the Ridge model: one row per test id with its prediction.
submission_columns = {'id': test_df['id'], 'target': y_test_ridge}
ridge_submission = pd.DataFrame(submission_columns)
ridge_submission.to_csv(path_or_buf='ridge_submission.csv', index=False)

In [None]:
# Submission file for the LightGBM model: one row per test id with its prediction.
submission_columns = {'id': test_df['id'], 'target': y_test_lgbm}
lgbm_submission = pd.DataFrame(submission_columns)
lgbm_submission.to_csv(path_or_buf='lgbm_submission.csv', index=False)

In [None]:
# Submission file for the AdaBoost model: one row per test id with its prediction.
submission_columns = {'id': test_df['id'], 'target': y_test_ada}
ada_submission = pd.DataFrame(submission_columns)
ada_submission.to_csv(path_or_buf='ada_submission.csv', index=False)

In [None]:
# Submission file for the stacked ensemble: one row per test id with its prediction.
submission_columns = {'id': test_df['id'], 'target': y_test_gbm}
gbm_submission = pd.DataFrame(submission_columns)
gbm_submission.to_csv(path_or_buf='gbm_submission.csv', index=False)