In [None]:
#Import libraries

import pandas as pd
import numpy as np

!pip install -q fastai==2.2.5 fastcore==1.3.19 fast-tabnet==0.2.0

from fastai.tabular.all import *
from fast_tabnet.core import *

!pip install -Uqq fastbook 
import fastbook
fastbook.setup_book()

from fastbook import *

In [None]:
# Point at the Kaggle competition folder and read the train / test CSVs into DataFrames
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021')
train_df, test_df = (
    pd.read_csv(input_path/csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
def r_mse(pred, y):
    """Return the root-mean-squared error between `pred` and `y`, rounded to 6 decimals.

    The difference `pred - y` must support `** 2` and expose a `.mean()` method
    (as NumPy arrays and pandas Series do).
    """
    squared_errors = (pred - y) ** 2
    rmse = math.sqrt(squared_errors.mean())
    return round(rmse, 6)

def m_rmse(m, xs, y):
    """Score model `m` on features `xs` against targets `y`.

    Calls `m.predict(xs)` and passes the predictions to `r_mse`.
    """
    predictions = m.predict(xs)
    return r_mse(predictions, y)

In [None]:
from IPython.display import Image, display_svg, SVG

# Keep rendered DataFrames compact: at most 20 rows and 8 columns are shown
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 8)

In [None]:
# Keep an untouched backup of the raw training frame (still containing 'target').
# A deep copy is used so the backup owns its data: with deep=False the backup shares
# the underlying values with train_df, so in-place edits to one could show up in the other.
train_df_bkp = train_df.copy(deep=True)

In [None]:
from sklearn.model_selection import train_test_split

# Split the label off the feature frame (pop removes 'target' from train_df in place).
target = train_df.pop('target')

# 80/20 train / hold-out split. random_state is fixed so the split, and every score
# computed from it, is reproducible across kernel restarts.
# NOTE: 'id' is still one of the feature columns at this point; it is only removed
# further down, before the LightGBM section.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, target, train_size=0.80, random_state=42
)

# Simple model #1: Decision tree with stopping criteria (max leaves = 4)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree whose growth is stopped at 4 leaf nodes, fitted on the training split
m = DecisionTreeRegressor(max_leaf_nodes=4).fit(X_train, y_train)

In [None]:
# RMSE of the 4-leaf tree on the training and hold-out splits
print(f"training error {m_rmse(m, X_train, y_train)}")
print(f"test error {m_rmse(m, X_test, y_test)}")

In [None]:
# Render the fitted 4-leaf tree against the training features
draw_tree(
    m,
    X_train,
    size=15,
    leaves_parallel=True,
    precision=3,
)

### Decision tree with 25 max leaf nodes

In [None]:
# Same decision tree, but allowed to grow up to 25 leaf nodes
m25 = DecisionTreeRegressor(max_leaf_nodes=25).fit(X_train, y_train)

In [None]:
# RMSE of the 25-leaf tree on the training and hold-out splits
print(f"training error {m_rmse(m25, X_train, y_train)}")
print(f"test error {m_rmse(m25, X_test, y_test)}")

# Simple model #2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def rf(xs, y, n_estimators=40, max_samples=50000,
       max_features='sqrt', min_samples_leaf=5, **kwargs):
    """Fit and return a RandomForestRegressor on `xs` / `y`.

    Args:
        xs: feature matrix passed to `fit`.
        y: target values passed to `fit`.
        n_estimators: number of trees in the forest.
        max_samples: samples drawn to train each tree.
        max_features: features considered at each split.
        min_samples_leaf: minimum samples required in a leaf.
        **kwargs: extra RandomForestRegressor constructor arguments
            (e.g. `random_state`). They override the defaults set here,
            including `n_jobs=-1` and `oob_score=True`.

    Returns:
        The fitted RandomForestRegressor.
    """
    params = dict(
        n_jobs=-1,
        oob_score=True,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
    )
    # **kwargs used to be accepted but silently discarded; forward it so callers
    # can actually configure the estimator. update() avoids a duplicate-keyword
    # error if a caller overrides n_jobs / oob_score.
    params.update(kwargs)
    return RandomForestRegressor(**params).fit(xs, y)

In [None]:
# Random forest with the default settings of rf(), fitted on the training split
mrf = rf(xs=X_train, y=y_train)

In [None]:
# Random-forest RMSE on the raw features (before the Box-Cox experiment)
print(f"training error {m_rmse(mrf, X_train, y_train)}")
print(f"test error {m_rmse(mrf, X_test, y_test)}")

In [None]:
def rf_feat_importance(m, df):
    """Tabulate the feature importances of model `m`, most important first.

    Pairs `df.columns` with `m.feature_importances_` in a two-column frame
    ('cols', 'imp') and returns it sorted by 'imp' in descending order.
    """
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values('imp', ascending=False)

In [None]:
# Importance table for the random forest; display its first 14 rows
fi = rf_feat_importance(mrf, X_train)
fi.head(14)

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of 'imp' per 'cols' entry and return the plot object."""
    chart = fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)
    return chart

# Chart the first 14 rows of the importance table (trailing ';' hides the repr)
plot_fi(fi.head(14));

# LightGBM

In [None]:
# Remove the row identifier so it is not used as a predictor.
# drop() with reassignment (rather than an in-place pop) keeps this cell idempotent:
# re-running it no longer raises KeyError, and it no longer echoes the popped id Series.
X_train = X_train.drop(columns='id', errors='ignore')
X_test = X_test.drop(columns='id', errors='ignore')

In [None]:
import lightgbm as lgb

# LightGBM gradient-boosted regressor: many trees (5000) with a small learning rate.
# Fix: the depth argument was misspelled `max_dept`, which LightGBM does not recognise;
# it is now the real `max_depth` parameter (-1 = no depth limit).
# NOTE(review): min_child_samples and min_data_in_leaf are documented aliases of the same
# LightGBM parameter but are given different values (20 vs 100) - confirm which is intended.
LGB = lgb.LGBMRegressor(
    random_state=33, n_estimators=5000, min_data_per_group=5, boosting_type='gbdt',
    num_leaves=246, max_depth=-1, learning_rate=0.005, subsample_for_bin=200000,
    lambda_l1=1.07e-05, lambda_l2=2.05e-06, n_jobs=-1, cat_smooth=1.0,
    importance_type='split', metric='rmse', min_child_samples=20, min_gain_to_split=0.0,
    feature_fraction=0.5, bagging_freq=6, min_sum_hessian_in_leaf=0.001,
    min_data_in_leaf=100, bagging_fraction=0.80,
)

In [None]:
# Train the LightGBM regressor on the training split (id column already removed)
m_LGB = LGB.fit(X=X_train, y=y_train)

In [None]:
# LightGBM RMSE on the training and hold-out splits
print(f"training error {m_rmse(m_LGB, X_train, y_train)}")
print(f"test error {m_rmse(m_LGB, X_test, y_test)}")

In [None]:
# Plot the split-count feature importance of the fitted LightGBM model,
# hiding features whose importance is zero
lgb.plot_importance(
    m_LGB,
    ax=None,
    height=0.2,
    xlim=None,
    ylim=None,
    title='Feature importance',
    xlabel='Feature importance',
    ylabel='Features',
    importance_type='split',
    max_num_features=None,
    ignore_zero=True,
    figsize=None,
    dpi=None,
    grid=True,
    precision=7,
)

# Hypothesis 1: feature scaling with a Box-Cox transformation

In [None]:

# Re-read the raw CSVs so this experiment starts from a frame that still has its
# 'target' column (the earlier split popped it from train_df)
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021')
train_df, test_df = (
    pd.read_csv(input_path/csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
from scipy.stats import boxcox

# Box-Cox transform of 'cont5' with a fixed lambda of 0, overwriting the column in place.
# Because the column is overwritten, re-running this cell transforms it a second time.
train_df['cont5'] = boxcox(train_df['cont5'], lmbda=0)

In [None]:
# Split the label off the Box-Cox-transformed frame (pop removes 'target' in place).
target = train_df.pop('target')

# 80/20 train / hold-out split with a fixed random_state, so the split is reproducible
# and this experiment's scores do not vary with a fresh random partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, target, train_size=0.80, random_state=42
)

In [None]:
# Remove the row identifier so it is not used as a predictor.
# drop() with reassignment (rather than an in-place pop) keeps this cell idempotent:
# re-running it no longer raises KeyError, and it no longer echoes the popped id Series.
X_train = X_train.drop(columns='id', errors='ignore')
X_test = X_test.drop(columns='id', errors='ignore')

In [None]:
import lightgbm as lgb

# LightGBM regressor for the Box-Cox experiment, fitted on the transformed training split.
# Fix: the depth argument was misspelled `max_dept`, which LightGBM does not recognise;
# it is now the real `max_depth` parameter (-1 = no depth limit).
# NOTE(review): min_child_samples and min_data_in_leaf are documented aliases of the same
# LightGBM parameter but are given different values (20 vs 100) - confirm which is intended.
LGB = lgb.LGBMRegressor(
    random_state=33, n_estimators=5000, min_data_per_group=5, boosting_type='gbdt',
    num_leaves=246, max_depth=-1, learning_rate=0.005, subsample_for_bin=200000,
    lambda_l1=1.07e-05, lambda_l2=2.05e-06, n_jobs=-1, cat_smooth=1.0,
    importance_type='split', metric='rmse', min_child_samples=20, min_gain_to_split=0.0,
    feature_fraction=0.5, bagging_freq=6, min_sum_hessian_in_leaf=0.001,
    min_data_in_leaf=100, bagging_fraction=0.80,
)

m_LGB_box_cox = LGB.fit(X_train, y_train)

In [None]:
# LightGBM RMSE after the Box-Cox transform of 'cont5'
print(f"training error {m_rmse(m_LGB_box_cox, X_train, y_train)}")
print(f"test error {m_rmse(m_LGB_box_cox, X_test, y_test)}")

# Hypothesis 2: Normalizing leads to better performance
##### Excluded from the article

In [None]:
# Seed every RNG fastai controls so the split and preprocessing are reproducible.
SEED = 42
set_seed(SEED, reproducible=True)

y_names = ['target']

# Continuous features = every column of the backup frame except the row id and the label.
# train_df_bkp was copied before 'target' was popped, so it still contains 'target';
# previously only 'id' was removed, which left the label inside cont_names (target
# leakage into the model inputs, and a KeyError when indexing X_train by these names).
cont_names = [col for col in train_df_bkp.columns if col not in ('id', *y_names)]
contl = cont_names  # kept as an alias: both names referred to the same list before

# cat_names is blank because the dataset does not contain any categorical variables
cat_names = []

procs = [FillMissing, Normalize]

# Build the split from the frame that is actually handed to TabularPandas.
splits = RandomSplitter(seed=SEED)(range_of(train_df_bkp))

In [None]:
# Wrap the backup frame in a fastai TabularPandas: applies `procs`, separates the
# continuous inputs from the regression target, and partitions rows by `splits`
to = TabularPandas(
    train_df_bkp,
    procs=procs,
    cat_names=cat_names, cont_names=cont_names,
    y_names=y_names, y_block=RegressionBlock(),
    splits=splits,
)

In [None]:
#visualisation before Normalisation
import seaborn as sns

# Overlay the train (blue) and hold-out (red) distribution of every continuous feature
# on a 5x3 grid, one feature per axis.
# - The stray plt.figure() was removed: it only produced an extra empty figure.
# - Axes are taken from plt.subplots() directly instead of a manual counter + plt.subplot().
# - Names in cont_names that are not columns of X_train are skipped instead of raising KeyError.
fig, ax = plt.subplots(5, 3, figsize=(12, 22))
for axis, feature in zip(ax.flat, [f for f in cont_names if f in X_train.columns]):
    sns.distplot(X_train[feature], color="blue", kde=True, bins=120, label='train', ax=axis)
    sns.distplot(X_test[feature], color="red", kde=True, bins=120, label='test', ax=axis)
    axis.set_xlabel(feature, fontsize=9)
    axis.legend()
plt.show()

In [None]:
# Before normalisation: summary statistics of the features in X_train.
# Note that X_train here comes from the split made after the Box-Cox step on 'cont5'.
X_train.describe()

In [None]:
#visualisation after Normalisation

# Overlay the train (blue) and validation (red) distribution of every continuous feature
# after the TabularPandas procs were applied, on a 5x3 grid with one feature per axis.
# - The stray plt.figure() was removed: it only produced an extra empty figure.
# - Axes are taken from plt.subplots() directly instead of a manual counter + plt.subplot().
fig, ax = plt.subplots(5, 3, figsize=(12, 22))
for axis, feature in zip(ax.flat, cont_names):
    sns.distplot(to.train.xs[feature], color="blue", kde=True, bins=120, label='train', ax=axis)
    sns.distplot(to.valid.xs[feature], color="red", kde=True, bins=120, label='test', ax=axis)
    axis.set_xlabel(feature, fontsize=9)
    axis.legend()
plt.show()

In [None]:
# After normalisation: summary statistics of the TabularPandas training features,
# i.e. after the FillMissing / Normalize procs were applied
to.train.xs.describe()

In [None]:
import lightgbm as lgb

# LightGBM regressor for the normalisation experiment (same hyper-parameters as before).
# Fix: the depth argument was misspelled `max_dept`, which LightGBM does not recognise;
# it is now the real `max_depth` parameter (-1 = no depth limit).
# NOTE(review): min_child_samples and min_data_in_leaf are documented aliases of the same
# LightGBM parameter but are given different values (20 vs 100) - confirm which is intended.
LGB = lgb.LGBMRegressor(
    random_state=33, n_estimators=5000, min_data_per_group=5, boosting_type='gbdt',
    num_leaves=246, max_depth=-1, learning_rate=0.005, subsample_for_bin=200000,
    lambda_l1=1.07e-05, lambda_l2=2.05e-06, n_jobs=-1, cat_smooth=1.0,
    importance_type='split', metric='rmse', min_child_samples=20, min_gain_to_split=0.0,
    feature_fraction=0.5, bagging_freq=6, min_sum_hessian_in_leaf=0.001,
    min_data_in_leaf=100, bagging_fraction=0.80,
)

In [None]:
# Train LightGBM on the TabularPandas training split (features after the procs)
m_LGB_norm = LGB.fit(X=to.train.xs, y=to.train.y)

In [None]:
# LightGBM RMSE on the TabularPandas training and validation splits
print(f"training error {m_rmse(m_LGB_norm, to.train.xs, to.train.y)}")
print(f"test error {m_rmse(m_LGB_norm, to.valid.xs, to.valid.y)}")

In [None]:
# Summary statistics of the target on the TabularPandas training split
to.train.y.describe()

In [None]:
# Summary statistics of the target on the TabularPandas validation split
to.valid.y.describe()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# 4-leaf decision tree again, this time fitted on the TabularPandas training split
m = DecisionTreeRegressor(max_leaf_nodes=4).fit(to.train.xs, to.train.y)

In [None]:
# RMSE of the 4-leaf tree on the TabularPandas training and validation splits
print(f"training error {m_rmse(m, to.train.xs, to.train.y)}")
print(f"test error {m_rmse(m, to.valid.xs, to.valid.y)}")

In [None]:
from sklearn.tree import DecisionTreeRegressor

# 4-leaf decision tree fitted on X_train / y_train for comparison with the
# TabularPandas-based tree
m = DecisionTreeRegressor(max_leaf_nodes=4).fit(X_train, y_train)

In [None]:
# RMSE of the 4-leaf tree on the X_train / X_test splits
print(f"training error {m_rmse(m, X_train, y_train)}")
print(f"test error {m_rmse(m, X_test, y_test)}")