## Basic Exploratory Data Analysis (EDA)

## preparations

In [None]:
#load packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

**Load data**

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2021/train.csv",
        "../input/tabular-playground-series-jan-2021/test.csv",
    )
)

## **Basic information**


In [None]:
# Preview the first rows of the training frame to see its columns and values.
train.head()

In [None]:
# Print per-column dtypes and non-null counts to check for missing values.
train.info()

Great! There are no null values in this dataset.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
train.describe()

### Distribution

In [None]:
# Visualize the target distribution: a density-normalized histogram with a KDE
# overlay, plus a rug marking the individual observations on the x-axis.
# `sns.distplot` is deprecated and scheduled for removal from seaborn, so the
# equivalent `histplot` + `rugplot` pair is used instead. `bins=50` mirrors the
# cap that `distplot` applied to its automatic bin count.
target_ax = sns.histplot(data=train, x='target', stat='density', kde=True, bins=50)
sns.rugplot(data=train, x='target', ax=target_ax);

Notice that there is a training sample whose target value is "abnormally" small.

In [None]:
# Show the training rows whose target value is below 4.
train.loc[train['target'].lt(4)]

In [None]:
# Visualization of the feature distributions: one count histogram per column
# on a 5x3 grid.
# `train_feature` (every column except `id` and `target`) is reused by later cells.
train_feature = train.drop(columns=['id', 'target'])

fig, axes = plt.subplots(5, 3, figsize=(18, 16))
axes = axes.ravel()
for ax, col in zip(axes, train_feature.columns):
    # `histplot` replaces the deprecated `distplot(..., kde=False)`;
    # bins=50 mirrors the cap `distplot` applied to its automatic bin count.
    sns.histplot(train_feature[col], bins=50, ax=ax)

# Hide any grid cells that have no feature to show.
for ax in axes[len(train_feature.columns):]:
    ax.set_visible(False)

fig.tight_layout(pad=1.0)

### Correlation

In [None]:
# Correlation heatmap of the features, drawn for the strict lower triangle only.
# The correlation matrix is symmetric, so the upper triangle and the diagonal
# are masked out (cells where the mask is True are not drawn).
feature_corr = train_feature.corr()  # computed once and reused for mask and plot
mask = np.triu(np.ones_like(feature_corr, dtype=bool))

sns.heatmap(feature_corr, cmap="Blues", mask=mask)


## Baseline Regression

Train/validation set split

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Hold out 20% of the training rows for validation.
# A fixed random_state keeps the shuffled split -- and therefore the reported
# validation RMSE -- reproducible across kernel restarts.
train_X, val_X, train_Y, val_Y = train_test_split(
    train_feature, train['target'], test_size=0.2, shuffle=True, random_state=7)


### CatBoost Baseline

Let's use CatBoostRegressor as our baseline model.

In [None]:
from catboost import CatBoostRegressor
# Fit the baseline CatBoost regressor on the training split, then report the
# RMSE of its predictions on the held-out validation split.
cat = CatBoostRegressor(loss_function='RMSE', verbose=False, random_state=7)
cat.fit(train_X, train_Y)

val_pred = cat.predict(val_X)
score = np.sqrt(mean_squared_error(val_Y, val_pred))

print("CB model RMSE: ", score, sep="")

Export baseline model prediction.

In [None]:

# Predict on the test features (every column except `id`) and write the
# two-column (`id`, `target`) submission file.
test_pred = cat.predict(test.drop(columns="id"))

submission = test[["id"]].assign(target=test_pred)
submission.to_csv('baseline_cat.csv', index=False)

### Feature Importance

Moreover, we can easily derive feature importances after training the CatBoostRegressor model. For more information, you may refer to the official documentation on [Feature importance - CatBoost](https://catboost.ai/docs/features/feature-importances-calculation.html#feature-importances-calculation).

In [None]:

# Horizontal bar chart of the fitted CatBoost model's per-feature importances.
importance_fig, importance_ax = plt.subplots(figsize=(10, 10))
importance_ax.barh(cat.feature_names_, cat.feature_importances_, height=0.5)


## model tuning

We may use LightGBM, XGBoost and CatBoost as our base models for model stacking. Before applying [model stacking](https://machinelearningmastery.com/stacking-ensemble-machine-learning-with-python/), we shall fine-tune the base models. [Bayesian Optimization](https://towardsdatascience.com/shallow-understanding-on-bayesian-optimization-324b6c1f7083) is an efficient optimization method in practice.

### LGBM tuning

In [None]:
from bayes_opt import BayesianOptimization
import lightgbm

# The code below is adapted from https://www.kaggle.com/yevonnaelandrew/lgbm-cat-xgb-optimization-stacking


# Training data wrapped once as a LightGBM Dataset; `hyp_lgbm` reads it as a global.
dtrain = lightgbm.Dataset(data=train_feature, label=train['target'])


def hyp_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight, learning_rate):
    """Bayesian-optimization objective: negated 3-fold CV RMSE of LightGBM.

    The optimizer proposes continuous values for every argument, so
    ``num_leaves`` and ``max_depth`` are rounded to integers and the two
    fraction arguments are clamped to [0, 1] before being passed to LightGBM.
    The remaining arguments are forwarded unchanged.

    Returns
    -------
    float
        The negative of the smallest ``rmse-mean`` value in the CV result.
        The sign is flipped because Bayesian optimization maximizes its
        objective.
    """
    params = {
        'application': 'regression',
        'num_iterations': 5000,
        'early_stopping_round': 100,
        'metric': 'rmse',
        # LightGBM ignores `bagging_fraction` unless `bagging_freq` is non-zero;
        # without it the optimizer would be tuning a parameter with no effect.
        'bagging_freq': 1,
    }
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    params['learning_rate'] = learning_rate

    cv_result = lightgbm.cv(params, dtrain, nfold=3,
                            seed=7, stratified=False,
                            verbose_eval=None, metrics=['rmse'])

    # Negated because Bayesian optimization can only search for maxima.
    return -np.min(cv_result['rmse-mean'])

In [None]:
# LightGBM search space: (lower, upper) bounds for each argument of `hyp_lgbm`.
pds = dict(
    num_leaves=(5, 50),
    feature_fraction=(0.2, 1),
    bagging_fraction=(0.2, 1),
    max_depth=(2, 20),
    min_split_gain=(0.001, 0.1),
    min_child_weight=(10, 50),
    learning_rate=(0.01, 0.5),
)

In [None]:
# The code below takes a long time to execute; uncomment it to see the process.
# optimizer = BayesianOptimization(hyp_lgbm,pds,random_state=7)
# optimizer.maximize(init_points=10, n_iter=50)

In [None]:
# optimizer.max['params']

### CatBoost Tuning

In [None]:
import catboost as cgb

def cat_hyp(depth, bagging_temperature, l2_leaf_reg, learning_rate):
    """Bayesian-optimization objective: negated 3-fold CV RMSE of CatBoost.

    ``depth`` is rounded to the nearest integer; the other arguments are
    forwarded to CatBoost unchanged. Each CV run trains for 100 iterations.

    Returns
    -------
    float
        The negative of the smallest ``test-RMSE-mean`` value in the CV
        result, so that maximizing this function minimizes the RMSE.
    """
    params = {
        "iterations": 100,
        "loss_function": "RMSE",
        "verbose": False,
        "depth": int(round(depth)),
        "bagging_temperature": bagging_temperature,
        "learning_rate": learning_rate,
        "l2_leaf_reg": l2_leaf_reg,
    }

    # This dataset has no categorical features, hence the empty list.
    cv_dataset = cgb.Pool(data=train_feature, label=train['target'], cat_features=[])

    scores = cgb.cv(cv_dataset, params, fold_count=3)
    return -np.min(scores['test-RMSE-mean'])

In [None]:
# Search space
pds = {'depth': (3, 10),
       'bagging_temperature': (0.1,10),
       'l2_leaf_reg': (0.1, 10),
       'learning_rate': (0.05, 0.3),
        }

In [None]:
# optimizer = BayesianOptimization(cat_hyp, pds, random_state=7)
# optimizer.maximize(init_points=10, n_iter=80)

In [None]:
# optimizer.max['params']

### XGBoost tuning

In [None]:
import xgboost as xgb

# Training data wrapped once as an XGBoost DMatrix; `hyp_xgb` reads it as a global.
# It is deliberately NOT named `dtrain`: `hyp_lgbm` reads a global `dtrain`
# that must remain a LightGBM Dataset, and rebinding that name here would make
# a re-run of the LightGBM tuning pass an XGBoost DMatrix to `lightgbm.cv`.
xgb_dtrain = xgb.DMatrix(train_feature, train['target'], feature_names=train_feature.columns.values)


def hyp_xgb(max_depth, subsample, colsample_bytree,min_child_weight, gamma, learning_rate):
    """Bayesian-optimization objective: negated 3-fold CV RMSE of XGBoost.

    The optimizer proposes continuous values, so ``max_depth`` is rounded,
    ``min_child_weight`` is truncated to an integer, the two sampling
    fractions are clamped to [0, 1] and ``gamma`` is floored at 0 before
    being passed to XGBoost. CV runs for at most 500 boosting rounds with
    ``early_stopping_rounds=10``.

    Returns
    -------
    float
        The negative of ``test-rmse-mean`` in the last row of the CV history,
        so that maximizing this function minimizes the RMSE.
    """
    params = {
    'objective': 'reg:squarederror',
    'eval_metric':'rmse',
    'nthread':-1
     }

    params['max_depth'] = int(round(max_depth))
    params['subsample'] = max(min(subsample, 1), 0)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['learning_rate'] = learning_rate
    scores = xgb.cv(params, xgb_dtrain, num_boost_round=500,verbose_eval=False,
                    early_stopping_rounds=10, nfold=3)
    return -scores['test-rmse-mean'].iloc[-1]

In [None]:
# XGBoost search space: (lower, upper) bounds for each argument of `hyp_xgb`.
pds = dict(
    min_child_weight=(3, 20),
    gamma=(0, 5),
    subsample=(0.7, 1),
    colsample_bytree=(0.1, 1),
    max_depth=(3, 10),
    learning_rate=(0.01, 0.5),
)

In [None]:
# optimizer = BayesianOptimization(hyp_xgb, pds, random_state=7)
# optimizer.maximize(init_points=4, n_iter=15)

## model stacking

In [None]:
## Parameters derived from Bayesian Optimization fine-tuning.
## Integer-valued parameters are converted the same way the tuning objectives
## (`hyp_lgbm`, `cat_hyp`, `hyp_xgb`) convert them, so the final models use the
## configuration that was actually scored during tuning.
param_lgbm = {
     # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0,
     # and bagging_freq is not set here -- confirm whether bagging is intended.
     'bagging_fraction': 0.973905385549851,
     'feature_fraction': 0.2945585590881137,
     'learning_rate': 0.03750332268701348,
     'max_depth': int(round(7.66)),     # hyp_lgbm rounds (-> 8); plain int() gave 7
     'min_child_weight': 41.36,         # hyp_lgbm passes this through as a float
     'min_split_gain': 0.04033836353603582,
     'num_leaves': int(round(46.42)),   # hyp_lgbm rounds (-> 46)
     'application':'regression',
     'num_iterations': 5000,
     'metric': 'rmse'
}

param_cat = {
     'bagging_temperature': 0.31768713094131684,
     'depth': int(round(8.03)),         # cat_hyp rounds (-> 8)
     'l2_leaf_reg': 1.3525686450404295,
     'learning_rate': 0.18,
     'iterations': 150,
     'loss_function': 'RMSE',
     'verbose': False
}


param_xgb = {
     'colsample_bytree': 0.8119098377889549,
     'gamma': 2.244423418642122,
     'learning_rate': 0.015800631696721114,
     'max_depth': int(round(9.846)),    # hyp_xgb rounds (-> 10); plain int() gave 9
     'min_child_weight': int(15.664),   # hyp_xgb truncates with int() (-> 15)
     'subsample': 0.82345,
     'objective': 'reg:squarederror',
     'eval_metric':'rmse',
     # XGBRegressor's boosting-round count is `n_estimators`. The previous key,
     # the misspelled `num_boost_roun`, is not a recognized parameter, so the
     # intended 500 rounds were never applied.
     'n_estimators': 500
}

In [None]:
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor

# Base learners of the stack, each built from its tuned parameter dict.
estimators = [
        ('lgbm', lightgbm.LGBMRegressor(**param_lgbm, random_state=7, n_jobs=-1)),
        ('xgbr', XGBRegressor(**param_xgb, random_state=7, nthread=-1)),
        ('cat', CatBoostRegressor(**param_cat))
]

# Stack the base learners under a LightGBM final estimator, using 5-fold CV
# to produce the base-learner predictions the final estimator is trained on.
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=lightgbm.LGBMRegressor(),
    n_jobs=-1,
    cv=5
)

# Hold out 20% of the training rows for validation. A fixed random_state keeps
# the shuffled split -- and therefore the reported RMSE -- reproducible.
train_X, val_X, train_Y, val_Y = train_test_split(
    train_feature, train['target'], test_size=0.2, shuffle=True, random_state=7)

reg.fit(train_X,train_Y)

val_pred = reg.predict(val_X)
score = np.sqrt(mean_squared_error(val_Y, val_pred))

print("Final model RMSE: ",end = "")
print(score)



Finally, we can make predictions on the test set.

In [None]:
# Build a fresh stack from the same base-learner definitions and fit it on the
# complete training set (no validation hold-out) before predicting the test set.
reg = StackingRegressor(
    cv=5,
    n_jobs=-1,
    final_estimator=lightgbm.LGBMRegressor(),
    estimators=estimators,
)

reg.fit(train_feature, train['target'])


In [None]:
# Predict on the test features (every column except `id`) with the stacked
# model and write the two-column (`id`, `target`) submission file.
test_pred = reg.predict(test.drop(columns="id"))

submission = test[["id"]].assign(target=test_pred)
submission.to_csv('stacking_sub.csv', index=False)