# Here we take a closer look at feature selection, using the Bike Rentals dataset as an example.
Data source and description:

http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset

# Plan:
1. Data exploration.
2. Random forest model.
3. Feature importance understanding and selection.

# References:
 - https://arxiv.org/pdf/1309.6392.pdf
 - Friedman, Jerome H. “Greedy function approximation: A gradient boosting machine.” Annals of Statistics (2001): 1189-1232.



In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import plot_partial_dependence
import pdpbox
from pdpbox import pdp, get_dataset, info_plots
import xgboost as xgb
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

np.random.seed(1)


In [None]:
hour = pd.read_csv("../input/bike-sharing-dataset/hour.csv")

# First, we will make a data exploration.

In [None]:
hour.head(6)

In [None]:
hour.shape

In [None]:
# Plotting data.

In [None]:
hour['season'].unique()

In [None]:
hour['yr'].unique()

In [None]:
hour['mnth'].unique()

In [None]:
hour['hr'].unique()

In [None]:
hour['holiday'].unique()

In [None]:
hour['weekday'].unique()

In [None]:
hour['workingday'].unique()

In [None]:
hour['weathersit'].unique()

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['temp'], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['temp'])

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['atemp'], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['atemp'])
# are they related?

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['hum'][:200], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['hum'])

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['windspeed'], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['windspeed'])

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['casual'], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['casual'])

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['registered'], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['registered'])

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(hour['cnt'], '.')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(hour['cnt'])

In [None]:
# Exclude casual and registered and use cnt as target.

In [None]:
# Number of days since the 01.01.2011 (the first day in the dataset).
# This feature was introduced to take account of the trend over time.

hour['date'] = pd.to_datetime(hour['dteday'])

basedate = pd.Timestamp('2011-01-01')
# Subtracting a Timestamp from a datetime column yields a timedelta column;
# .dt.days extracts the whole-day component for every row at once, so no
# per-row Python lambda is needed.
hour['days_since'] = (hour['date'] - basedate).dt.days

In [None]:
# Humidity over time. The time variable (days_since) goes on the x-axis, as in
# the days_since regression plot further down, and both axes are labelled so
# the scatter can be read without looking at the code.
plt.figure(figsize=(12, 6))
plt.plot(hour['days_since'], hour['hum'], '.')
plt.xlabel('days_since')
plt.ylabel('hum')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.regplot(x=hour["temp"], y=hour["cnt"])

In [None]:
plt.figure(figsize=(12, 6))
sns.jointplot(x=hour["temp"], y=hour["cnt"], kind='scatter')

In [None]:
plt.figure(figsize=(12, 6))
sns.regplot(x=hour["atemp"], y=hour["cnt"])

Temp and atemp show clear influence on cnt.

In [None]:
plt.figure(figsize=(12, 6))
sns.regplot(x=hour["hum"], y=hour["cnt"])

In [None]:
plt.figure(figsize=(12, 6))
sns.regplot(x=hour["days_since"], y=hour["cnt"])

In [None]:
# For categorical features.
plt.figure(figsize=(12, 6))
sns.violinplot(x="season", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.boxplot(x="season", y="cnt", data=hour)

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="yr", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="mnth", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="hr", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.boxplot(x="hr", y="cnt", data=hour)

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="holiday", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="weekday", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="workingday", y="cnt", data=hour, palette="muted")

In [None]:
plt.figure(figsize=(12, 6))
sns.violinplot(x="weathersit", y="cnt", data=hour, palette="muted")

In [None]:
# Cyclical features: hour, weekday and month. This encoding is useful for NN algorithms; tree-based algorithms are robust without it.

def encode_cyclical(data, col_name, max_val):
    """Add sine/cosine encodings of a periodic column to ``data``.

    The column ``col_name`` is mapped onto an angle ``2 * pi * value / max_val``
    and two new columns, ``<col_name>_sin`` and ``<col_name>_cos``, are written
    into ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode; it is modified in place.
    col_name : str
        Name of the periodic column.
    max_val : number
        Length of the period (e.g. 24 for hours).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two extra columns.
    """
    angle = 2 * np.pi * data[col_name] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col_name + suffix] = trig(angle)
    return data


hour = encode_cyclical(hour, 'hr', 24)
hour = encode_cyclical(hour, 'mnth', 12)
hour = encode_cyclical(hour, 'weekday', 7)


In [None]:
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
            'hum', 'windspeed', 'days_since']

X, y = hour[features], hour['cnt']

# Since the features are far from linear, we will first try a simple tree algorithm, namely Random Forest.

In [None]:
def rand_forest_model(X, y):
    """Cross-validate a random forest regressor and report the RMSE per fold.

    A shuffled 5-fold split (``random_state=1``) is used. For every fold a new
    ``RandomForestRegressor`` is fitted on the training part and scored on the
    held-out part. Negative predictions are clipped to zero before scoring.
    The per-fold RMSE list and its mean are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected by position.
    y : pandas.Series
        Target values, aligned row-by-row with ``X``.

    Returns
    -------
    dict
        ``'rmse_arr'`` holds one RMSE per fold. ``'y_pred'``, ``'y_val'``,
        ``'train_X'`` and ``'model'`` hold the predictions, true targets,
        training matrix and fitted estimator of the LAST fold only.
    """
    rmse_arr = []

    kf = KFold(n_splits=5, random_state=1, shuffle=True)

    for n, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f'fold: {n}')
        train_X = X.iloc[train_index].values
        val_X = X.iloc[val_index].values
        # KFold yields positional indices, so the target must be sliced by
        # position as well. Plain y[...] is label-based and only matches when
        # y carries a default 0..n-1 index.
        train_y = y.iloc[train_index].values
        val_y = y.iloc[val_index].values

        regr = RandomForestRegressor(max_depth=20, n_estimators=140, random_state=0)
        regr.fit(train_X, train_y)

        # Predicted values should be non negative.
        y_pred = np.clip(regr.predict(val_X), 0, None)

        rmse = np.sqrt(mean_squared_error(val_y, y_pred))
        rmse_arr.append(rmse)

    print('RMSE list:', rmse_arr)
    print('RMSE AVG:', np.mean(rmse_arr))
    return {'rmse_arr': rmse_arr, 'y_pred': y_pred, 'y_val': val_y, 'train_X': train_X, 'model': regr}


res = rand_forest_model(X, y)

In [None]:
# Plot predictions.
plt.figure(figsize=(12, 6))
plt.plot(res['y_pred'], '.', label='pred')
plt.plot(res['y_val'], '.', label='original')
plt.legend()
plt.show()


In [None]:
# plotting absolute deviation.
plt.figure(figsize=(12, 6))
plt.plot(np.abs(res['y_pred'] - res['y_val']), '.')
plt.title('Deviation from val.')
plt.show()


# Here, we analyze feature influence on the result (feature importance).

In [None]:
# Evaluating features importance for RFregressor model.
# res['model'].feature_importances_
plt.figure(figsize=(12, 6))
sns.barplot(x=res['model'].feature_importances_, y=features)
plt.title('Feature importances')

# Partial dependence plots.

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[(0, 2), 2], feature_names=features) 

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[0, 1], feature_names=features) 

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[2, 3], feature_names=features) 


Here we can see a strong dependence on "hour": obviously, people rent more during the day than at night, and at specific hours.

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[4, 5], feature_names=features) 

Weekday plays a significant role.

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[6, 7], feature_names=features) 

The weather situation shows high variation in the PDP, which means a strong dependence. Working day plays a smaller role.

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[8, 9], feature_names=features) 

Temperature plays a big role; no surprise, since biking is not pleasant when it's cold. These two features might depend on each other (check it later).

In [None]:
print(np.corrcoef(hour["temp"], hour["atemp"]))

In [None]:
plt.figure(figsize=(12, 6))
sns.regplot(x=hour["temp"], y=hour["atemp"])

Temp or atemp can be excluded (one of them).

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[10, 11], feature_names=features) 

Here humidity plays a big role.

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[12], feature_names=features) 

It seems this feature captures the trend in rentals.

Now we are going to exclude a couple of important and unimportant features and see how this influences the resulting score.

In [None]:
# Excluding holiday, "not very important" feature.
features = ['season', 'yr', 'mnth', 'hr',
            'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
            'hum', 'windspeed', 'days_since']

X, y = hour[features], hour['cnt']

rand_forest_model(X, y)

The RMSE does not change much.

In [None]:
# Excluding atemp, "not very important" feature.
# The list below is the full feature set with 'atemp' left out; previously
# 'atemp' was still listed, so this run just repeated the full-feature model.
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp',
            'hum', 'windspeed', 'days_since']

X, y = hour[features], hour['cnt']

rand_forest_model(X, y)

As expected, removing atemp barely changes the RMSE (almost the same value).

In [None]:
# Excluding hour, "very important" feature.
features = ['season', 'yr', 'mnth', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
            'hum', 'windspeed', 'days_since']

X, y = hour[features], hour['cnt']

rand_forest_model(X, y)

RMSE rises a lot. So the hour is obviously an important feature.

In [None]:
# Let's try removing the days_since feature.
# Note: 'atemp' is left out of this list as well, so compared with the full
# feature set this run drops two columns (days_since and atemp).
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp',
            'hum', 'windspeed']

X, y = hour[features], hour['cnt']

rand_forest_model(X, y)

Days since is an important feature.

In [None]:
# Leave-one-feature-out experiment: drop each feature in turn, cross-validate
# the random forest on the remaining ones and record the mean RMSE.
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp',
            'hum', 'windspeed']

rmse_ft_arr = []

for n, removed in enumerate(features):
    print(f'step {n}')
    # Build the reduced list without modifying `features`, so after the loop
    # it still holds every candidate in the same order as rmse_ft_arr.
    kept = [name for name in features if name != removed]
    X, y = hour[kept], hour['cnt']

    res1 = rand_forest_model(X, y)

    rmse_ft_arr.append(np.mean(res1['rmse_arr']))


In [None]:
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp',
            'hum', 'windspeed']

plt.figure(figsize=(12, 6))
sns.barplot(x=rmse_ft_arr, y=features)
plt.title('RMSE vs removed feature.')


# ICE plots.

In [None]:
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
            'hum', 'windspeed', 'days_since']

data_df = pd.DataFrame(res['train_X'], columns=features)

pdp_hr = pdp.pdp_isolate(
    model=res['model'], dataset=data_df, model_features=features, feature='hr', num_grid_points=200
)

fig, axes = pdp.pdp_plot(pdp_hr, 'hr', plot_lines=True, frac_to_plot=400)

In [None]:
plot_partial_dependence(estimator=res['model'], X=res['train_X'], features=[3], feature_names=features, grid_resolution=200) 

# XGB Model.

In [None]:
features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit', 'temp',
            'hum', 'windspeed', 'days_since']

X, y = hour[features], hour['cnt']

pars = {
    'learning_rate': 0.1,
    'max_depth': 12,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'gamma': 0.25,
    'n_estimators': 280
}

def xgb_model(X, y, pars):
    """Cross-validate an XGBoost regressor and report the RMSE per fold.

    A shuffled 5-fold split (``random_state=1``) is used. For every fold a
    booster is trained with ``xgb.train`` using early stopping (30 rounds) on
    the held-out part, and the RMSE of its predictions on that part is stored.
    The per-fold RMSE list and its mean are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected by position.
    y : pandas.Series
        Target values, aligned row-by-row with ``X``.
    pars : dict
        Booster parameters. An optional ``'n_estimators'`` entry is taken as
        the maximum number of boosting rounds (default 800) and is not
        forwarded to the booster. The dict itself is left unmodified.

    Returns
    -------
    dict
        ``'rmse_arr'`` holds one RMSE per fold. ``'y_pred'``, ``'y_val'``,
        ``'train_X'`` and ``'model'`` hold the predictions, true targets,
        training frame and trained booster of the LAST fold only.
    """
    # Copy so that popping below does not alter the caller's dict.
    params = dict(pars)
    # 'n_estimators' is a parameter of the scikit-learn wrapper; xgb.train
    # takes the number of rounds through num_boost_round. Previously the value
    # was passed along in the params dict, where it had no effect, and every
    # run used up to 800 rounds.
    num_boost_round = params.pop('n_estimators', 800)

    rmse_arr = []

    kf = KFold(n_splits=5, random_state=1, shuffle=True)

    for train_index, val_index in kf.split(X, y):
        train_X = X.iloc[train_index]
        val_X = X.iloc[val_index]
        # KFold yields positional indices, so slice the target by position
        # too (plain y[...] is label-based).
        train_y = y.iloc[train_index]
        val_y = y.iloc[val_index]

        xgb_train = xgb.DMatrix(train_X, label=train_y)
        xgb_eval = xgb.DMatrix(val_X, label=val_y)

        # The same held-out fold drives early stopping and is scored below,
        # so the reported RMSE is somewhat optimistic.
        booster = xgb.train(
            params,
            xgb_train,
            num_boost_round=num_boost_round,
            evals=[(xgb_train, 'train'), (xgb_eval, 'val')],
            verbose_eval=False,
            early_stopping_rounds=30,
        )

        # NOTE(review): depending on the installed xgboost version, predict()
        # may use all trained rounds rather than the best early-stopping
        # iteration -- confirm and restrict the iteration range if needed.
        y_pred = booster.predict(xgb.DMatrix(val_X))

        rmse = np.sqrt(mean_squared_error(val_y, y_pred))
        rmse_arr.append(rmse)

    print('RMSE list:', rmse_arr)
    print('RMSE AVG:', np.mean(rmse_arr))
    return {'rmse_arr': rmse_arr, 'y_pred': y_pred, 'y_val': val_y, 'train_X': train_X, 'model': booster}


features = ['season', 'yr', 'mnth', 'hr', 'holiday',
            'weekday', 'workingday', 'weathersit',
            'temp', 'hum', 'windspeed', 'days_since']

X, y = hour[features], hour['cnt']

res = xgb_model(X, y, pars)


# Simple hyperparameters tuning.
reference:

https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html

In [None]:
# Exhaustive search over a small hyperparameter grid: every combination is
# cross-validated with xgb_model and the one with the lowest mean RMSE is kept.
# Other knobs that could be added to the grid:
#   "learning_rate": [0.05, 0.15, 0.25, 0.30]
#   "colsample_bytree": [0.3, 0.4, 0.5, 0.7, 0.9]
#   "eta": [0.01, 0.5, 0.1, 0.2]
#   "subsample": [0.5, 0.7, 1.0]
#   "lambda": [0.5, 1.0, 2.0]
par_grid = {
    "max_depth": [3, 6, 7, 8, 9],
    "min_child_weight": [0.5, 1, 3],
    "gamma": [0.25, 0.5, 0.8, 0.9, 1.1],
    "n_estimators": [60, 80, 100, 140],
}
candidates = ParameterGrid(par_grid)

# Best score seen so far and the grid point that produced it.
rmse_avg_min = 1e10
min_pars = None

print('total:', len(candidates))
for n, par in enumerate(candidates):
    print(f'step {n}')

    # Fixed settings first, then the grid point on top of them.
    model_pars = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', **par}

    res = xgb_model(X, y, model_pars)
    rmse_avg = np.mean(res['rmse_arr'])

    if not rmse_avg < rmse_avg_min:
        continue
    rmse_avg_min, min_pars = rmse_avg, par


print(f'Best AVG RMSE: {rmse_avg_min}')
print('Best parameters:', min_pars)

Usually, if a single instance of the model trains slowly, it can take a lot of time to find optimal hyperparameters in one kernel. This procedure can be spread over several cores via "lazy parallelism".

In [None]:
print(f'With tunned parameters:', min_pars)
print('xgb model gives:')
res = xgb_model(X, y, min_pars)

The average RMSE with xgb is smaller than with random forest.

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(np.abs(res['y_pred'] - res['y_val']), '.')
plt.title('Abs deviation from validation set.')
plt.ylabel('Rentals')
plt.show()