In [107]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from math import sqrt
from sklearn.metrics import mean_squared_error

In [91]:
# Load dataset
df =pd.read_csv("day.csv")

In [92]:
# removing outliers if any
q1 = df["cnt"].quantile(0.25)
q3 = df["cnt"].quantile(0.75)
iqr = q3-q1 #Interquartile range
fence_low  = q1-1.5*iqr
fence_high = q3+1.5*iqr
df = df.loc[(df["cnt"] > fence_low) & (df["cnt"] < fence_high)]

# Another way to detect outlier via 3 standard deviation rule
len(df[((df.cnt - df.cnt.mean()) / df.cnt.std()).abs() < 3])


# Seems like no outliers present

731

In [93]:
# Converting categorical variables as dummies
y=df["cnt"].tolist()
df = pd.concat([df, pd.get_dummies(df['mnth'],prefix="mnth")], axis=1)
df = pd.concat([df, pd.get_dummies(df['weekday'],prefix="weekday")], axis=1)
df = pd.concat([df, pd.get_dummies(df['weathersit'],prefix="weathersit")], axis=1)
df.drop( ['instant','mnth','weekday','weathersit', 'dteday', 'casual','registered','cnt'], axis=1, inplace=True)
X=df.copy(deep=False)

In [94]:
# Split dataset into train-test as 80:20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
# Loading different models
clf1=RandomForestRegressor()
clf2=GradientBoostingRegressor()
clf3=LinearRegression()

In [96]:
# Training all 3 models
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [97]:
# RMSE for all 3 models

y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

print("RMSE for Random Forest Regressor: ",sqrt(mean_squared_error(y_pred1,y_test)))
print("RMSE for Gradient Boosting Regressor: ",sqrt(mean_squared_error(y_pred2,y_test)))
print("RMSE for Linear Regression Model: ",sqrt(mean_squared_error(y_pred3,y_test)))

RMSE for Random Forest Regressor:  693.2658981792133
RMSE for Gradient Boosting Regressor:  651.8561014138761
RMSE for Linear Regression Model:  793.2677819676381


In [98]:
#Seems Like Gradient Boosting is performing better than rest, Tuning Hyper Parameters for the same

In [109]:
param_grid = {
            'learning_rate': [0.1,0.01,0.001],
            'max_depth': [3,4, 6, 10, 15],
            'max_features': [0.8, 0.6,None,'log2','sqrt'],
            'n_estimators': [50,100, 200, 300, 1000]
        }
        # Create a based model
gbr = GradientBoostingRegressor()
#Instantiate the grid search model
clf = GridSearchCV(estimator = gbr, param_grid = param_grid,cv = 3)
clf.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1, 0.01, 0.001], 'max_depth': [3, 4, 6, 10, 15], 'max_features': [0.8, 0.6, None, 'log2', 'sqrt'], 'n_estimators': [50, 100, 200, 300, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [116]:
y_pred=clf.predict(X_test)
print("RMSE for Gradient Boosting Regressor after HP tuning: ",sqrt(mean_squared_error(y_pred,y_test)))

RMSE for Gradient Boosting Regressor after HP tuning:  630.69867852846


In [121]:
# Example Input file
X_test.to_csv("sample_input.csv",index=False)

In [123]:
newDF=pd.DataFrame()
newDF["Predicted_values"]=y_pred

In [125]:
newDF.to_csv("sample_output.csv",index=False)