In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
day=pd.read_csv("day.csv");day.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
hour=pd.read_csv("hour.csv")
hour.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
day=day.drop(columns=["instant"])
hour=hour.drop(columns=["instant"])

In [5]:
hour.dtypes

dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [6]:
# - instant: record index
# - dteday : date
# - season : season (1:winter, 2:spring, 3:summer, 4:fall)
# - yr : year (0: 2011, 1:2012)
# - mnth : month ( 1 to 12)
# - hr : hour (0 to 23)
# - holiday : weather day is holiday or not (extracted from [Web Link])
# - weekday : day of the week
# - workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
# + weathersit :
# - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
# - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
# - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
# - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
# - temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
# - atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
# - hum: Normalized humidity. The values are divided to 100 (max)
# - windspeed: Normalized wind speed. The values are divided to 67 (max)
# - casual: count of casual users
# - registered: count of registered users
# - cnt: count of total rental bikes including both casual and registered

In [7]:
day["year"]=[i.split("-")[0] for i in day["dteday"]]
day["month"]=[i.split("-")[1] for i in day["dteday"]]
day["date"]=[i.split("-")[2] for i in day["dteday"]]

hour["year"]=[i.split("-")[0] for i in hour["dteday"]]
hour["month"]=[i.split("-")[1] for i in hour["dteday"]]
hour["date"]=[i.split("-")[2] for i in hour["dteday"]]

In [8]:
day=day.drop(columns=["dteday","year","month","date"])

hour=hour.drop(columns=["dteday","year","month","date"])

In [9]:
day

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095
728,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341
729,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364,1432,1796


In [10]:
hour

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


## for the hour

In [11]:
from sklearn.model_selection import train_test_split
# from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
X = hour.drop(columns=["cnt"])
y = hour.cnt

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,shuffle=True)


X = hour.drop(columns=["cnt"])
y = hour.cnt

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,shuffle=True)

In [12]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
import xgboost as xg
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# from sklearn.linear_model import LogisticRegression

#generic function to fit model and return metrics for every algorithm
def boost_models(x):
    #transforming target variable through quantile transformer
    regr_trans = TransformedTargetRegressor(regressor=x, transformer=QuantileTransformer(output_distribution='normal'))
    regr_trans.fit(X_train, y_train)
    yhat = regr_trans.predict(X_test)
    algoname= x.__class__.__name__
    return algoname, round(r2_score(y_test, yhat),3), round(mean_absolute_error(y_test, yhat),2), round(np.sqrt(mean_squared_error(y_test, yhat)),2),round(mean_squared_error(y_test,yhat),2)

algo=[GradientBoostingRegressor(), lgbm.LGBMRegressor(), xg.XGBRFRegressor(),DecisionTreeRegressor(),LinearRegression(),
     KNeighborsRegressor(),RandomForestRegressor(),BaggingRegressor(ExtraTreeRegressor(), random_state=42),
     Ridge(alpha=0.1,normalize=True,max_iter=1e5)]
score=[]
for a in algo:
    score.append(boost_models(a))

 #Collate all scores in a table
pd.DataFrame(score, columns=['Model', 'Score', 'MAE', 'RMSE','MSE'])

Unnamed: 0,Model,Score,MAE,RMSE,MSE
0,GradientBoostingRegressor,0.999,3.54,5.74,32.97
1,LGBMRegressor,0.999,1.92,4.02,16.19
2,XGBRFRegressor,0.996,6.23,11.36,129.14
3,DecisionTreeRegressor,0.999,2.37,4.91,24.06
4,LinearRegression,0.942,32.06,43.06,1853.82
5,KNeighborsRegressor,1.0,1.41,2.56,6.57
6,RandomForestRegressor,1.0,0.97,2.62,6.85
7,BaggingRegressor,0.999,1.78,4.05,16.43
8,Ridge,0.946,31.29,41.53,1725.06


In [13]:
parameters ={'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
     'max_features': [0.3,0.5,0.7,0.9],
     'min_samples_leaf': [3,5,7,10,15],
     'min_samples_split': [2,5,10],
     'n_estimators': [50,100,200,400,600]}
from sklearn.model_selection import ParameterGrid
param_size = ParameterGrid(parameters)
len(param_size)

3000

In [14]:
from sklearn.model_selection import RandomizedSearchCV

# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.ensemble import RandomForestRegressor
# random_search=RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions=parameters,verbose=1, n_jobs=-1,
#                             n_iter=200)
# random_result = random_search.fit(X_train, y_train)
# print('Best Score: ', random_result.best_score_*100)
# print('Best Params: ', random_result.best_params_)

## for the day

In [15]:
X1 = day.drop(columns=["cnt"])
y1 = day.cnt

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.25, random_state=42,shuffle=True)

In [16]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
import xgboost as xg
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
# from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

#generic function to fit model and return metrics for every algorithm
def boost_models(x):
    #transforming target variable through quantile transformer
    regr_trans = TransformedTargetRegressor(regressor=x)
#                                             transformer=QuantileTransformer(output_distribution='normal'))
    regr_trans.fit(X_train1, y_train1)
    yhat1 = regr_trans.predict(X_test1)
    algoname= x.__class__.__name__
    return algoname, round(r2_score(y_test1, yhat1),3), round(mean_absolute_error(y_test1, yhat1),2), round(np.sqrt(mean_squared_error(y_test1, yhat1)),2),round(mean_squared_error(y_test1,yhat1),2)

algo=[GradientBoostingRegressor(), lgbm.LGBMRegressor(), xg.XGBRFRegressor(),LinearRegression(),
     KNeighborsRegressor(),RandomForestRegressor(),BaggingRegressor(ExtraTreeRegressor(), random_state=42),
     Ridge(alpha=0.1,normalize=True,max_iter=1e5)]
score=[]
for a in algo:
    score.append(boost_models(a))

 #Collate all scores in a table
pd.DataFrame(score, columns=['Model', 'Score', 'MAE', 'RMSE','MSE'])

Unnamed: 0,Model,Score,MAE,RMSE,MSE
0,GradientBoostingRegressor,0.997,79.03,109.28,11941.75
1,LGBMRegressor,0.997,75.73,113.97,12989.42
2,XGBRFRegressor,0.995,91.2,131.95,17410.84
3,LinearRegression,1.0,0.0,0.0,0.0
4,KNeighborsRegressor,0.998,51.79,77.31,5976.69
5,RandomForestRegressor,0.997,65.93,100.27,10054.14
6,BaggingRegressor,0.996,85.46,131.15,17200.07
7,Ridge,0.99,145.5,196.31,38536.26


In [17]:
# Define our candidate hyperparameters
parameters_k  = [{'n_neighbors': [2,3,4,5,6], 'weights': ['uniform','distance']}]

In [18]:
#doesn't show any significant change

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
random_search=RandomizedSearchCV(estimator = KNeighborsRegressor(), param_distributions=parameters_k,verbose=1, n_jobs=-1,
                            n_iter=200)
random_result = random_search.fit(X_train1, y_train1)
print('Best Score: ', random_result.best_score_*100)
print('Best Params: ', random_result.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Score:  99.80337468749771
Best Params:  {'weights': 'distance', 'n_neighbors': 6}


In [None]:
# better choose RandomForestRegressor for hours regression

# better choose LinearRegression for day regression

In [19]:
model_random=RandomForestRegressor().fit(X_train, y_train)
model_linear=LinearRegression().fit(X_train1, y_train1) 

In [20]:
#different ways to make package in python 


from joblib import Parallel, delayed
import joblib


# Save the model as a pickle in a file
joblib.dump(model_linear, 'model_linear.pkl')
joblib.dump(model_random, 'model_random.pkl')

# Load the model from the file
random_from_joblib = joblib.load('model_random.pkl')
linear_from_joblib = joblib.load('model_linear.pkl')

# Use the loaded model to make predictions
random_from_joblib .predict(X_test)


array([426.42,  87.35,   4.  , ...,  88.72,  13.95, 147.64])

In [21]:
X_test

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
12830,3,1,6,19,0,6,0,1,0.80,0.6970,0.27,0.1940,185,240
8688,1,1,1,20,1,1,0,1,0.24,0.2273,0.41,0.2239,5,83
7091,4,0,10,2,0,5,1,1,0.32,0.3030,0.66,0.2836,1,3
12230,2,1,5,19,0,2,1,1,0.78,0.7121,0.52,0.3582,69,457
431,1,0,1,0,0,4,1,1,0.26,0.2273,0.56,0.3881,5,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4731,3,0,7,1,0,4,1,2,0.74,0.7273,0.84,0.2239,5,16
15987,4,1,11,20,0,6,0,1,0.34,0.3333,0.53,0.1343,23,158
10347,1,1,3,6,0,1,1,1,0.34,0.3485,0.57,0.1045,2,86
4111,3,0,6,5,0,6,0,1,0.62,0.6061,0.61,0.1343,6,8


In [22]:
linear_from_joblib .predict(X_test1)

array([6606., 1550., 3747., 6041., 7538., 7264., 1605., 2209., 7499.,
       5743., 1796., 3068., 4891., 5260., 2133., 2471., 2046., 8156.,
       5362., 2298., 7697., 5463., 5409., 1872., 1807., 5130., 2121.,
       7436., 3830., 5557., 2743., 3644., 6196., 7494., 5918., 3372.,
       7582., 6053., 2566., 1263., 3944., 3956., 7580., 4906., 6966.,
        705., 4458., 5298., 6043., 4996., 3351., 2431., 1011., 4475.,
       4725., 4727., 2395., 3351., 4788., 7175., 6153., 7442., 1471.,
       7865., 6530., 6211., 7403., 4302., 2077., 7333., 3117., 1635.,
       3811., 4595., 4363., 2034., 5686., 4748., 1416., 4401., 2114.,
       2028., 5668.,   22., 3940., 4118., 5423., 1495., 3620., 3403.,
       1501., 7040., 5992., 4990., 3095., 2832., 4713., 2368., 3409.,
       4991., 7713., 3785., 6664., 2496., 2077., 5267., 1162., 2739.,
       6861., 1360., 4602., 7282., 3570., 8167., 6230., 4511., 1461.,
       3272., 5923., 2177., 6398., 1817., 3239., 1349., 3867., 5146.,
       4541., 3368.,

In [23]:
X_test1

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
703,4,1,12,0,2,1,1,0.475833,0.469054,0.733750,0.174129,551,6055
33,1,0,2,0,4,1,1,0.186957,0.177878,0.437826,0.277752,61,1489
300,4,0,10,0,5,1,2,0.330833,0.318812,0.585833,0.229479,456,3291
456,2,1,4,0,0,0,2,0.425833,0.417287,0.676250,0.172267,2347,3694
633,4,1,9,0,2,1,1,0.550000,0.544179,0.570000,0.236321,845,6693
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,2,0,4,0,4,1,1,0.459167,0.453892,0.407083,0.325258,745,3444
82,2,0,3,0,4,1,2,0.285000,0.270833,0.805833,0.243787,166,1699
51,1,0,2,1,1,0,2,0.303333,0.284075,0.605000,0.307846,195,912
522,2,1,6,0,3,1,1,0.554167,0.542292,0.611250,0.077125,1027,6028
