In [1]:
# data analysis stack
import pandas as pd
import numpy as np

# data visualization stack
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # set seaborn as default style

# data pre-processing stack
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#machine learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


from sklearn.model_selection import GridSearchCV

# miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

In [3]:
bike = pd.read_csv("../week03/data/train.csv", index_col=0, parse_dates=True)
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
bike["log_casual"]=np.log1p(bike["casual"])
bike["log_registered"]=np.log1p(bike["registered"])

In [5]:
def addcolumns(df):
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["hour"] = df.index.hour
    df["weather"].replace([4], [3], inplace = True)
    return df

In [6]:
addcolumns(bike)
bike

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,year,month,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1.386294,2.639057,2011,1,0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2.197225,3.496508,2011,1,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1.791759,3.332205,2011,1,2
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1.386294,2.397895,2011,1,3
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,0.000000,0.693147,2011,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2.079442,5.799093,2012,12,19
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2.397895,5.446737,2012,12,20
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,1.609438,5.105945,2012,12,21
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2.564949,4.770685,2012,12,22


In [98]:
def monthindex(year, month):
    return (year-2011)*12 + month
    
bike['month_idx'] = monthindex(bike["year"], bike["month"])

In [10]:
bike['weekday'] = bike.index.weekday + 1
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,year,month,hour,month_idx,day_of_week,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1.386294,2.639057,2011,1,0,1,6,6
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2.197225,3.496508,2011,1,1,1,6,6
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1.791759,3.332205,2011,1,2,1,6,6
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1.386294,2.397895,2011,1,3,1,6,6
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,0.0,0.693147,2011,1,4,1,6,6


**CASUAL USERS**

In [11]:
train,test = train_test_split(bike, test_size=0.2, random_state=101)

In [12]:
# we need to delete either the "temp" or the "atemp" because they are highly corelated with each other.
numerical_features = [
     'temp', 
     'humidity',
     'windspeed',
     'month_idx',
     'hour' 
]

categorical_features = [
    'holiday',
    'workingday',
    'weekday',
    'weather'
]

features = numerical_features + categorical_features

target_variable = 'log_casual'


In [13]:
X_casual_train,y_casual_train = train[features], train[target_variable]

In [14]:
# scaling and polynomial features
numerical_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('polynomial', PolynomialFeatures())
    ]
)

In [15]:
# one-hot encoding
categorical_transformer = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(drop='first'))
    ]
)

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [17]:
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),   # preprocessing step
        ('lasso', Lasso()) # lasso regression
    ]
)

In [18]:
param_grid = {
    'preprocessor__num__polynomial__degree': [4],
    'preprocessor__num__polynomial__interaction_only': [False,True],
    'lasso__alpha': [100.,10.,1.,0.1,0.01],
    'lasso__max_iter': [5_000, 10_000,20_000]
}

In [19]:
gscv = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='r2',
    cv=5, 
    n_jobs=-1,
    verbose=1
)

In [20]:
# initial time
ti = time.time()

# grid-search cross-validation
gscv.fit(X_casual_train,y_casual_train)

# final time 
tf = time.time()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
time taken: 19.89 sec


In [21]:
gscv.cv_results_

{'mean_fit_time': array([0.23005977, 0.10099335, 0.17078781, 0.11367297, 0.19273062,
        0.09287019, 0.19516473, 0.08853636, 0.15862908, 0.0947823 ,
        0.1678926 , 0.09265676, 0.16960344, 0.08304114, 0.16183844,
        0.09263768, 0.16570301, 0.09843564, 0.2494791 , 0.09455628,
        0.26225605, 0.10310035, 0.24902687, 0.10128469, 0.73325715,
        0.09680586, 0.76633291, 0.10694547, 0.6834578 , 0.10204468]),
 'std_fit_time': array([0.02621363, 0.01841723, 0.00443732, 0.01729378, 0.02315191,
        0.01102349, 0.02225712, 0.00695561, 0.00790266, 0.00382429,
        0.00838159, 0.01025344, 0.01580267, 0.00502478, 0.01187724,
        0.00536708, 0.00645495, 0.01291908, 0.01050075, 0.01198723,
        0.00556076, 0.01713036, 0.02015297, 0.00795516, 0.0388491 ,
        0.00412789, 0.10750186, 0.01149417, 0.09846312, 0.02858834]),
 'mean_score_time': array([0.03977423, 0.03490372, 0.04125147, 0.0356648 , 0.03922033,
        0.0326232 , 0.04102464, 0.02820859, 0.03798842, 0.02

In [23]:
# list of columns to show
column_list = ['param_preprocessor__num__polynomial__degree',
               'param_preprocessor__num__polynomial__interaction_only',
               'param_lasso__alpha',
               'param_lasso__max_iter',
               'mean_test_score',
               'std_test_score',
               'rank_test_score'
              ]
# create result dataframe
result_df = pd.DataFrame(gscv.cv_results_)[column_list]

# rename columns
result_df.rename(
    columns=lambda name: name.split('__')[-1],inplace=True
)

# order by rank
result_df.sort_values(
    by='rank_test_score', ascending=True, inplace=True, ignore_index=True
)

result_df

Unnamed: 0,degree,interaction_only,alpha,max_iter,mean_test_score,std_test_score,rank_test_score
0,4,False,0.01,10000,0.817814,0.007092,1
1,4,False,0.01,5000,0.817814,0.007092,1
2,4,False,0.01,20000,0.817814,0.007092,1
3,4,False,0.1,20000,0.657246,0.006787,4
4,4,False,0.1,10000,0.657246,0.006787,4
5,4,False,0.1,5000,0.657246,0.006787,4
6,4,True,0.01,20000,0.618719,0.01054,7
7,4,True,0.01,10000,0.618719,0.01054,7
8,4,True,0.01,5000,0.618719,0.01054,7
9,4,True,0.1,20000,0.561331,0.007897,10


In [24]:
gscv.best_params_

{'lasso__alpha': 0.01,
 'lasso__max_iter': 5000,
 'preprocessor__num__polynomial__degree': 4,
 'preprocessor__num__polynomial__interaction_only': False}

In [25]:
round(gscv.best_score_,6)

0.817814

In [26]:
best_model_casual = gscv.best_estimator_
best_model_casual

In [27]:
best_model_casual.fit(X_casual_train,y_casual_train);

In [28]:
# training score
casual_training_score = best_model_casual.score(X_casual_train, y_casual_train)

print(f'Casual Train score: {round(casual_training_score,6)}')


Casual Train score: 0.821153


**Casual Test **

In [29]:
X_casual_test,y_casual_test = test[features], test[target_variable]

In [30]:
best_model_casual.fit(X_casual_test,y_casual_test);

In [31]:
# training score
casual_test_score = best_model_casual.score(X_casual_test, y_casual_test)

print(f'Casual Test score: {round(casual_test_score,6)}')

Casual Test score: 0.822295


**Casual total prediction**

In [32]:
X_casual_total, y_casual_total = bike[features], bike[target_variable]

In [33]:
y_casual_pred = best_model_casual.predict(X_casual_total)
y_casual_pred

array([1.48057208, 0.9511494 , 0.73362157, ..., 2.27855827, 2.25470619,
       2.07337434])

**REGISTERED USERS**

In [64]:
numerical_features = [
     'temp',
     'humidity',
     'windspeed',
     'month_idx',
     'hour' 
]

categorical_features = [
    'holiday',
    'workingday',
    'weekday',
    'weather'
]

features = numerical_features + categorical_features

target_variable = 'log_registered'


In [65]:
X_registered_train, y_registered_train = train[features], train[target_variable]

In [66]:
# scaling and polynomial features
numerical_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('polynomial', PolynomialFeatures())
    ]
)

In [67]:
# one-hot encoding
categorical_transformer = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(drop='first'))
    ]
)

In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [69]:
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),   # preprocessing step
        ('lasso', Lasso()) # lasso regression
    ]
)

In [70]:
param_grid = {
    'preprocessor__num__polynomial__degree': [5],
    'preprocessor__num__polynomial__interaction_only': [False,True],
    'lasso__alpha': [100.,10.,1.,0.1,0.01],
    'lasso__max_iter': [5_000, 10_000,20_000]
}

In [71]:
gscv = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='r2',
    cv=5, 
    n_jobs=-1,
    verbose=1
)

In [72]:
# initial time
ti = time.time()

# grid-search cross-validation
gscv.fit(X_registered_train,y_registered_train)

# final time 
tf = time.time()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
time taken: 34.33 sec


In [73]:
gscv.cv_results_

{'mean_fit_time': array([0.32619724, 0.06792297, 0.28758125, 0.15352683, 0.31421704,
        0.11384525, 0.22251372, 0.12746487, 0.22086339, 0.07080388,
        0.3650012 , 0.06737084, 0.2218605 , 0.07081261, 0.27296438,
        0.07316427, 0.21739836, 0.08799424, 0.52935176, 0.07598839,
        0.522544  , 0.07818527, 0.52098632, 0.07900157, 5.4818882 ,
        0.09982524, 5.35389366, 0.10034742, 4.57057757, 0.10624661]),
 'std_fit_time': array([0.10620876, 0.00428752, 0.06632667, 0.03252569, 0.0588974 ,
        0.01833889, 0.04737465, 0.01271577, 0.01846747, 0.00415445,
        0.1427331 , 0.00513048, 0.01608981, 0.00793252, 0.03068632,
        0.00857692, 0.01112403, 0.01673336, 0.04666297, 0.0105298 ,
        0.04005958, 0.01177234, 0.04469005, 0.01052762, 0.12419836,
        0.01629294, 0.26567383, 0.00926995, 0.79540119, 0.01484245]),
 'mean_score_time': array([0.04094191, 0.02272358, 0.08137317, 0.0383162 , 0.05090795,
        0.02523274, 0.06462517, 0.04155555, 0.03924241, 0.02

In [74]:
# list of columns to show
column_list = ['param_preprocessor__num__polynomial__degree',
               'param_preprocessor__num__polynomial__interaction_only',
               'param_lasso__alpha',
               'param_lasso__max_iter',
               'mean_test_score',
               'std_test_score',
               'rank_test_score'
              ]
# create result dataframe
result_df = pd.DataFrame(gscv.cv_results_)[column_list]

# rename columns
result_df.rename(
    columns=lambda name: name.split('__')[-1],inplace=True
)

# order by rank
result_df.sort_values(
    by='rank_test_score', ascending=True, inplace=True, ignore_index=True
)

result_df

Unnamed: 0,degree,interaction_only,alpha,max_iter,mean_test_score,std_test_score,rank_test_score
0,5,False,0.01,10000,0.741149,0.012144,1
1,5,False,0.01,5000,0.741149,0.012144,1
2,5,False,0.01,20000,0.741149,0.012144,1
3,5,False,0.1,20000,0.610024,0.014701,4
4,5,False,0.1,10000,0.610024,0.014701,4
5,5,False,0.1,5000,0.610024,0.014701,4
6,5,True,0.01,20000,0.465801,0.019683,7
7,5,True,0.01,10000,0.465801,0.019683,7
8,5,True,0.01,5000,0.465801,0.019683,7
9,5,True,0.1,20000,0.44179,0.017672,10


In [75]:
gscv.best_params_

{'lasso__alpha': 0.01,
 'lasso__max_iter': 5000,
 'preprocessor__num__polynomial__degree': 5,
 'preprocessor__num__polynomial__interaction_only': False}

In [76]:
round(gscv.best_score_,6)

0.741149

In [77]:
best_model_registered = gscv.best_estimator_
best_model_registered

In [78]:
best_model_registered.fit(X_registered_train,y_registered_train);

In [79]:
# training score
registered_train_score = best_model_registered.score(X_registered_train, y_registered_train)

print(f'Registered Train score: {round(registered_train_score,6)}')


Registered Train score: 0.746643


**Registered Test**

In [80]:
X_registered_test,y_registered_test = test[features], test[target_variable]

In [81]:
best_model_registered.fit(X_registered_test,y_registered_test);

In [82]:
# training score
registered_test_score = best_model_registered.score(X_registered_test, y_registered_test)

print(f'Registered Test score: {round(registered_test_score,6)}')


Registered Test score: 0.765034


**Registered total prediction**

In [83]:
X_registered_total, y_registered_total = bike[features], bike[target_variable]

In [84]:
y_registered_pred = best_model_registered.predict(X_registered_total)
y_registered_pred

array([2.20920215, 1.1780434 , 0.92263077, ..., 5.49387229, 5.14107308,
       4.23005365])

In [85]:
saturday = pd.DataFrame({
    "datetime": [pd.to_datetime("20230415-14:00:00")],
    "season": [1],
    "holiday": [0],
    "workingday": [0],
    "weather": [3],
    "temp": [23.3],
    "atemp": [24],
    "humidity": [76],
    "windspeed": [5],
    "casual": [""],
    "registered": [""],
    "count": [""],
}).set_index("datetime")

In [86]:
saturday = addcolumns(saturday)
saturday

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-04-15 14:00:00,1,0,0,3,23.3,24,76,5,,,,2023,4,14


In [87]:
saturday['month_idx'] =  saturday["month"]

In [88]:
saturday['weekday'] = saturday.index.weekday + 1
saturday.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour,month_idx,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2023-04-15 14:00:00,1,0,0,3,23.3,24,76,5,,,,2023,4,14,4,6


In [89]:
X_saturday_casual = saturday[features]
X_saturday_register = saturday[features]

In [90]:
y_pred_casual = best_model_casual.predict(X_saturday_casual)
y_pred_casual

array([4.26775084])

In [91]:
y_pred_register = best_model_registered.predict(X_saturday_casual)
y_pred_register

array([4.75945651])

In [92]:
y_pred_casual_nonlog = np.exp(y_pred_casual)-1
y_pred_casual_nonlog

array([70.36095266])

In [93]:
y_pred_register_nonlog = np.exp(y_pred_register)-1
y_pred_register_nonlog

array([115.68249315])

In [97]:
saturday_count = y_pred_casual_nonlog + y_pred_register_nonlog
saturday_count 

array([186.04344581])