### Import Dependencies and Load the data

In [42]:
import os
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor

In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline

In [44]:
path_hour = None
for root, dirs, filename in os.walk(os.getcwd()): 
    for file in filename:
        if file == 'hour_new.csv':
            path_hour = os.path.join(root, file)
            

In [45]:
df = pd.read_csv(path_hour)

### Feature Engineering

In [46]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,0,1,2011-01-01,springer,2011,January,0,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.24,0.2879,0.81,0.0,3,13,16
1,1,2,2011-01-01,springer,2011,January,1,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.22,0.2727,0.8,0.0,8,32,40
2,2,3,2011-01-01,springer,2011,January,2,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.22,0.2727,0.8,0.0,5,27,32


In [47]:
df.columns

Index(['Unnamed: 0', 'instant', 'dteday', 'season', 'yr', 'mnth', 'hr',
       'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
       'hum', 'windspeed', 'casual', 'registered', 'cnt'],
      dtype='object')

In [48]:
df = df.drop(columns=['Unnamed: 0', 'instant', 'dteday'])
df.head(4)

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,springer,2011,January,0,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.24,0.2879,0.81,0.0,3,13,16
1,springer,2011,January,1,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.22,0.2727,0.8,0.0,8,32,40
2,springer,2011,January,2,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.22,0.2727,0.8,0.0,5,27,32
3,springer,2011,January,3,yes,Saturday,holiday or weekend,"Clear, Few clouds, Partly cloudy, Partly cloudy",0.24,0.2879,0.75,0.0,3,10,13


In [49]:
cat_features = [x for x in df.columns if df[x].nunique() <= 25]
num_features = [x for x in df.columns if x not in cat_features]

In [50]:
cat_features

['season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'weekday',
 'workingday',
 'weathersit']

**Encoding Categorical Features**

In [None]:
nominal_cols = ['holiday', 'weathersit', 'workingday']
ordinal_cols = ['season', 'mnth', 'weekday']
ordinal_categories = [['springer', 'summer', 'fall', 'winter'],
                      ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'],
                      [ 'Sunday', 'Monday', 'Tueday', 'Wednesday', 'Thursday', 'Friday','Saturday']]
numerical_cols = ['temp', 'atemp', 'hum', 'windspeed']


In [None]:
trf1 = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse_output=False, drop='first'), nominal_cols),
    ('ord', OrdinalEncoder(categories=ordinal_categories), ordinal_cols)
],remainder='passthrough')

In [None]:
trf2 = ColumnTransformer(transformers=[
    ('scale',StandardScaler(), numerical_cols)
], remainder='passthrough')

In [63]:
trf3 =  RandomForestRegressor()

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      17379 non-null  object 
 1   yr          17379 non-null  int64  
 2   mnth        17379 non-null  object 
 3   hr          17379 non-null  int64  
 4   holiday     17379 non-null  object 
 5   weekday     17379 non-null  object 
 6   workingday  17379 non-null  object 
 7   weathersit  17379 non-null  object 
 8   temp        17379 non-null  float64
 9   atemp       17379 non-null  float64
 10  hum         17379 non-null  float64
 11  windspeed   17379 non-null  float64
 12  casual      17379 non-null  int64  
 13  registered  17379 non-null  int64  
 14  cnt         17379 non-null  int64  
dtypes: float64(4), int64(5), object(6)
memory usage: 2.0+ MB


In [66]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf3',trf3)
])

In [67]:
X = df.drop(columns=['casual','registered','cnt'])
y = df['cnt']

In [68]:
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                 test_size=0.2,
                                                random_state=42)

In [69]:
y_train

335       72
7035     518
8051       3
2133     172
8485       1
        ... 
11284    359
11964    812
5390     189
860      100
15795    779
Name: cnt, Length: 13903, dtype: int64

In [70]:
from sklearn import set_config
set_config(display='diagram')

In [71]:
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [73]:
y_pred = pipe.predict(X_test)

In [74]:
r2_score(y_pred, y_test)

0.9405813926990337

array([379.39,  91.55,  11.73, ...,  81.88, 373.79, 240.05])