In [1]:
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from pprint import pprint

## Loading Data


In [2]:
df = pd.read_csv('train.csv') 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [4]:
X = df.drop(['count', 'casual', 'registered'], axis=1) 
y = df['count']
#drop 'casual', 'registered here because test.csv doesn't have these columns

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [6]:
def extract(df):
        df.datetime = pd.to_datetime(df.datetime)
        df['day'] = df.datetime.dt.day
        df['hour'] = df.datetime.dt.hour
        df['weekday'] = df.datetime.dt.weekday
        df['year'] = df.datetime.dt.year
        df['month'] = df.datetime.dt.month
        
        return pd.concat([df[['day']],pd.get_dummies(df[['year','month','weekday','hour']],columns=['year','month','weekday','hour'])],axis=1)



## Create pipeline

In [7]:
preprosessor = ColumnTransformer([
    ('do_nothing', 'passthrough', ['holiday', 'workingday', 'season', 'weather', 'windspeed', 'temp', 'humidity']),
    ('time_extact', FunctionTransformer(extract), ['datetime']),
    #('one_hot_encoding', OneHotEncoder(sparse = False), ['season','weather']),
    #('0_imputer', SimpleImputer(strategy='mean', fill_value=0), ['windspeed']),
    ], 
    remainder='drop')  

In [8]:
# create the model pipeline
pipeline = make_pipeline(preprosessor, RandomForestRegressor())

In [9]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('do_nothing', 'passthrough',
                                                  ['holiday', 'workingday',
                                                   'season', 'weather',
                                                   'windspeed', 'temp',
                                                   'humidity']),
                                                 ('time_extact',
                                                  FunctionTransformer(func=<function extract at 0x7fa322c17430>),
                                                  ['datetime'])])),
                ('randomforestregressor', RandomForestRegressor())])

In [10]:
X_train

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
2930,2011-07-11 00:00:00,3,0,1,1,28.70,32.575,65,12.9980
7669,2012-05-18 22:00:00,2,0,1,1,22.96,26.515,52,22.0028
1346,2011-04-01 23:00:00,2,0,1,1,12.30,15.910,61,6.0032
9432,2012-09-16 09:00:00,3,0,0,1,23.78,27.275,60,8.9981
453,2011-02-01 23:00:00,1,0,1,3,8.20,9.850,93,12.9980
...,...,...,...,...,...,...,...,...,...
5734,2012-01-14 02:00:00,1,0,0,1,6.56,8.335,47,11.0014
5191,2011-12-10 09:00:00,4,0,0,1,11.48,12.880,61,19.0012
5390,2011-12-18 16:00:00,4,0,0,1,11.48,13.635,48,16.9979
860,2011-02-19 07:00:00,1,0,0,1,15.58,19.695,17,35.0008


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8164 entries, 2930 to 7270
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    8164 non-null   object 
 1   season      8164 non-null   int64  
 2   holiday     8164 non-null   int64  
 3   workingday  8164 non-null   int64  
 4   weather     8164 non-null   int64  
 5   temp        8164 non-null   float64
 6   atemp       8164 non-null   float64
 7   humidity    8164 non-null   int64  
 8   windspeed   8164 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 637.8+ KB


In [12]:
pipeline.score(X_train, y_train)

0.9886268208446299

In [13]:
pipeline.score(X_test, y_test)

0.927260363191386

## Cross Validation

In [14]:
pipeline = make_pipeline(preprosessor, RandomForestRegressor()) #an unfitted model, default value

In [15]:
cross_acc = cross_val_score(estimator=pipeline,        # estimator: # the model you want to evaluate/unfitted 
                            X=X_train,            # the training input data 
                            y=y_train,            # the training output data  
                            cv=5,                # number of cross validation datasets 
                            scoring='r2'   # evaluation metric 
                            ) 

In [16]:
cross_acc

array([0.90039583, 0.91246852, 0.91274002, 0.91729668, 0.90043166])

In [17]:
cross_acc.mean()   

0.9086665411208805

## Grid Search Tuning

In [18]:
pipeline_opt = make_pipeline(preprosessor, RandomForestRegressor())  #an unfitted model

In [19]:
pprint(pipeline_opt.get_params())

{'columntransformer': ColumnTransformer(transformers=[('do_nothing', 'passthrough',
                                 ['holiday', 'workingday', 'season', 'weather',
                                  'windspeed', 'temp', 'humidity']),
                                ('time_extact',
                                 FunctionTransformer(func=<function extract at 0x7fa322c17430>),
                                 ['datetime'])]),
 'columntransformer__do_nothing': 'passthrough',
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__time_extact': FunctionTransformer(func=<function extract at 0x7fa322c17430>),
 'columntransformer__time_extact__accept_sparse': False,
 'columntransformer__time_extact__check_inverse': True,
 'columntransformer__time_extact__func': <function extract at 0x7fa322c17430>,
 'columntransformer__time_extact__inv_kw_args': None,
 'columntransformer__time_extact__inverse_func': None,
 

In [20]:
# define our hyperparameters to combine
hyperparam_grid = {
    'randomforestregressor__max_depth': [2, 5, 10, 20, 30],
    'randomforestregressor__n_estimators': [5, 10, 50, 100, 200],
    'randomforestregressor__min_samples_leaf': [1,5,10]
}


In [21]:
grid_cv = GridSearchCV(estimator=pipeline_opt,            # unfitted model/estimator
                       param_grid=hyperparam_grid,  # hyperparameters dict
                       cv=5,                        # number of folds, k
                       scoring='r2')                # scoring metric

In [22]:
# fit all models with all the different hyperparamters
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('do_nothing',
                                                                         'passthrough',
                                                                         ['holiday',
                                                                          'workingday',
                                                                          'season',
                                                                          'weather',
                                                                          'windspeed',
                                                                          'temp',
                                                                          'humidity']),
                                                                        ('time_extact',
                                                          

In [23]:
results_df = pd.DataFrame(grid_cv.cv_results_)

In [24]:
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestregressor__max_depth,param_randomforestregressor__min_samples_leaf,param_randomforestregressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.027260,0.003888,0.006815,0.000193,2,1,5,"{'randomforestregressor__max_depth': 2, 'rando...",0.272442,0.286807,0.300588,0.291957,0.247325,0.279824,0.018643,70
1,0.045762,0.005518,0.008383,0.002022,2,1,10,"{'randomforestregressor__max_depth': 2, 'rando...",0.287372,0.297735,0.266821,0.290645,0.268343,0.282183,0.012393,69
2,0.158643,0.001113,0.009107,0.000129,2,1,50,"{'randomforestregressor__max_depth': 2, 'rando...",0.293064,0.290371,0.265576,0.284289,0.277800,0.282220,0.009848,68
3,0.328379,0.015753,0.012924,0.001767,2,1,100,"{'randomforestregressor__max_depth': 2, 'rando...",0.289348,0.298006,0.266678,0.297413,0.280802,0.286449,0.011704,64
4,0.623534,0.016642,0.017367,0.000680,2,1,200,"{'randomforestregressor__max_depth': 2, 'rando...",0.295696,0.298718,0.269894,0.297752,0.280312,0.288474,0.011445,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.096717,0.004891,0.007651,0.000207,30,10,5,"{'randomforestregressor__max_depth': 30, 'rand...",0.765539,0.821676,0.829374,0.822125,0.802977,0.808338,0.023113,30
71,0.255365,0.121492,0.013229,0.008723,30,10,10,"{'randomforestregressor__max_depth': 30, 'rand...",0.802153,0.824201,0.845197,0.831736,0.814571,0.823572,0.014659,27
72,0.897316,0.103138,0.018721,0.004763,30,10,50,"{'randomforestregressor__max_depth': 30, 'rand...",0.810777,0.840669,0.844405,0.848798,0.822490,0.833428,0.014435,23
73,1.705207,0.078575,0.026521,0.001340,30,10,100,"{'randomforestregressor__max_depth': 30, 'rand...",0.813602,0.843083,0.839957,0.845081,0.829419,0.834228,0.011641,21


In [25]:
results_df.shape

(75, 16)

In [26]:
results_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_randomforestregressor__max_depth',
       'param_randomforestregressor__min_samples_leaf',
       'param_randomforestregressor__n_estimators', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [27]:
results_df[['param_randomforestregressor__max_depth', 'param_randomforestregressor__min_samples_leaf', 'param_randomforestregressor__n_estimators', 'mean_test_score',
       'std_test_score', 'rank_test_score']].sort_values('rank_test_score')

Unnamed: 0,param_randomforestregressor__max_depth,param_randomforestregressor__min_samples_leaf,param_randomforestregressor__n_estimators,mean_test_score,std_test_score,rank_test_score
64,30,1,200,0.909772,0.006090,1
63,30,1,100,0.909237,0.006358,2
62,30,1,50,0.908590,0.004589,3
49,20,1,200,0.897027,0.006024,4
48,20,1,100,0.895857,0.006149,5
...,...,...,...,...,...,...
12,2,10,50,0.278764,0.011962,71
6,2,5,10,0.272110,0.022013,72
10,2,10,5,0.271200,0.025128,73
11,2,10,10,0.267622,0.015453,74


In [28]:
grid_cv.best_params_  

{'randomforestregressor__max_depth': 30,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__n_estimators': 200}

In [29]:
#estimating the test performance after tuning
pipeline_best = make_pipeline(preprosessor,RandomForestRegressor(max_depth=30, n_estimators=200, min_samples_leaf=1))

cross_acc = cross_val_score(estimator=pipeline_best,  # estimator: # the model you want to evaluate 
                            X=X_train,          # the training input data 
                            y=y_train,          # the training output data  
                            cv=5,               # number of cross validation datasets, k   
                            scoring='r2')       # evaluation metric 

In [30]:
cross_acc

array([0.9026446 , 0.9118887 , 0.91296451, 0.91856648, 0.90263365])

In [31]:
cross_acc.mean()

0.9097395888279612

In [32]:
pipeline_best.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('do_nothing', 'passthrough',
                                                  ['holiday', 'workingday',
                                                   'season', 'weather',
                                                   'windspeed', 'temp',
                                                   'humidity']),
                                                 ('time_extact',
                                                  FunctionTransformer(func=<function extract at 0x7fa322c17430>),
                                                  ['datetime'])])),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=30, n_estimators=200))])

In [33]:
pipeline_best.score(X_train, y_train)

0.9884791844130679

In [34]:
pipeline_best.score(X_test, y_test)

0.9276784363277204

## RMSLE

In [35]:
def rmsle(y_pred, y,convertExp=True):
    if convertExp:
        y_pred = np.exp(y_pred),
        y = np.exp(y)
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y_pred]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

In [36]:
y_pred = pipeline_best.predict(X_train) 

In [37]:
rmsle(y_pred, y_train, convertExp=True)

#doesn't work

  y_pred = np.exp(y_pred),
  result = getattr(ufunc, method)(*inputs, **kwargs)
  calc = (log1 - log2) ** 2


inf

## Kaggle Submission

In [38]:
kaggle_data = pd.read_csv('test.csv')

In [39]:
kaggle_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014
...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981


In [40]:
predictions = pipeline_best.predict(kaggle_data)

In [41]:
submission = pd.DataFrame({'datetime':kaggle_data['datetime'],'count':predictions})

In [42]:
submission

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,23.204564
1,2011-01-20 01:00:00,17.896035
2,2011-01-20 02:00:00,16.639171
3,2011-01-20 03:00:00,14.498180
4,2011-01-20 04:00:00,12.206628
...,...,...
6488,2012-12-31 19:00:00,207.067119
6489,2012-12-31 20:00:00,134.504325
6490,2012-12-31 21:00:00,114.728435
6491,2012-12-31 22:00:00,107.664905


In [43]:
#Convert DataFrame to a csv file that can be uploaded
filename = 'Bike Sharing Demand RF.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)        

Saved file: Bike Sharing Demand RF.csv
