In [1]:
# data analysis stack
import pandas as pd
import numpy as np

# data visualization stack
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # set seaborn as default style

# data pre-processing stack
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#machine learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


from sklearn.model_selection import GridSearchCV

# miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Training data: the first CSV column is used as the index and parsed as dates.
bike = pd.read_csv(
    "../week03/data/train.csv",
    parse_dates=True,
    index_col=0,
)
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# Kaggle test data (no target columns), loaded the same way as the training file.
bike_kag = pd.read_csv(
    '../week03/data/test.csv',
    parse_dates=True,
    index_col=0,
)
bike_kag

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014
...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981


In [4]:
# Model both rider groups on the log1p scale: one new column per raw count column.
for source_col, log_col in (("casual", "log_casual"), ("registered", "log_registered")):
    bike[log_col] = np.log1p(bike[source_col])

In [5]:
def addcolumns(df):
    """Add calendar features to ``df`` in place and merge weather category 4 into 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex`` and containing a ``weather`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``year``, ``month`` and
        ``hour`` columns derived from the index.
    """
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["hour"] = df.index.hour
    # Assign the result back instead of calling ``replace(..., inplace=True)`` on
    # the ``df["weather"]`` selection: that chained in-place form is deprecated
    # and does not update ``df`` when pandas Copy-on-Write is active.
    df["weather"] = df["weather"].replace(4, 3)
    return df

In [6]:
# Mutates `bike` in place: adds year/month/hour and folds weather 4 into 3.
addcolumns(bike)
bike

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,year,month,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1.386294,2.639057,2011,1,0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2.197225,3.496508,2011,1,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1.791759,3.332205,2011,1,2
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1.386294,2.397895,2011,1,3
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,0.000000,0.693147,2011,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2.079442,5.799093,2012,12,19
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2.397895,5.446737,2012,12,20
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,1.609438,5.105945,2012,12,21
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2.564949,4.770685,2012,12,22


In [7]:
def monthindex(year, month, base_year=2011):
    """Return a running month number counted from January of ``base_year``.

    January of ``base_year`` maps to 1, December to 12, January of the
    following year to 13, and so on. Works element-wise on anything that
    supports arithmetic (plain ints, NumPy arrays, pandas Series).

    Parameters
    ----------
    year : int or array-like
        Calendar year(s).
    month : int or array-like
        Calendar month(s), 1-12.
    base_year : int, default 2011
        Year whose January is month number 1. The default matches the first
        year of the bike-sharing data used in this notebook.

    Returns
    -------
    Same type as the inputs
        ``(year - base_year) * 12 + month``.
    """
    return (year - base_year) * 12 + month
    
# running month number for every training row (1 = January 2011)
bike['month_idx'] = monthindex(year=bike["year"], month=bike["month"])

In [8]:
# DatetimeIndex.weekday counts Monday as 0; the +1 shifts it to 1 (Mon) .. 7 (Sun).
bike['weekday'] = bike.index.weekday + 1
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,year,month,hour,month_idx,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1.386294,2.639057,2011,1,0,1,6
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2.197225,3.496508,2011,1,1,1,6
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1.791759,3.332205,2011,1,2,1,6
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1.386294,2.397895,2011,1,3,1,6
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,0.0,0.693147,2011,1,4,1,6


**CASUAL USERS**

In [86]:
# 'atemp' is deliberately left out: it is highly correlated with 'temp',
# so only one of the two is kept.
numerical_features = ['temp', 'humidity', 'windspeed', 'month_idx', 'hour']
categorical_features = ['holiday', 'workingday', 'weekday', 'weather']

# model input = numeric columns followed by categorical columns
features = [*numerical_features, *categorical_features]

# casual rentals are modelled on the log1p scale
target_variable = 'log_casual'


In [87]:
# design matrix and log-scale target for the casual-rider model
X_casual_bike = bike[features]
y_casual_bike = bike[target_variable]

In [88]:
# Numeric branch: standardize first, then expand into polynomial terms.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures()),
])

In [89]:
# Categorical branch: one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline(steps=[('ohe', OneHotEncoder(drop='first'))])

In [90]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [91]:
# Full model: preprocessing followed by a Lasso regression.
estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lasso', Lasso()),
])

In [92]:
# Search space: polynomial expansion of the numeric block and Lasso settings.
param_grid = {
    'preprocessor__num__polynomial__degree': [4],
    'preprocessor__num__polynomial__interaction_only': [False, True],
    'lasso__alpha': [100.0, 10.0, 1.0, 0.1, 0.01],
    'lasso__max_iter': [5000, 10000, 20000],
}

In [93]:
# 5-fold grid search over `param_grid`, ranked by R², run with n_jobs=-1.
gscv = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1,
)

In [94]:
# initial time
# perf_counter is a monotonic, high-resolution clock; unlike time.time() the
# measured interval cannot be distorted by system clock adjustments.
ti = time.perf_counter()

# grid-search cross-validation
gscv.fit(X_casual_bike,y_casual_bike)

# final time
tf = time.perf_counter()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
time taken: 19.93 sec


In [95]:
# raw per-candidate timing and score arrays from the grid search
gscv.cv_results_

{'mean_fit_time': array([0.21895185, 0.07560782, 0.14620843, 0.06773672, 0.15700879,
        0.11457787, 0.19261985, 0.08145967, 0.16703539, 0.08106875,
        0.15123081, 0.07995358, 0.17503839, 0.08916039, 0.1632236 ,
        0.08170733, 0.15551581, 0.07722259, 0.23648777, 0.08451147,
        0.22481322, 0.07160463, 0.23261342, 0.08342266, 0.81082454,
        0.09199719, 0.85910358, 0.09688702, 0.81046433, 0.10936279]),
 'std_fit_time': array([0.03333644, 0.00380578, 0.00806495, 0.00486233, 0.01537326,
        0.02572317, 0.02104414, 0.01334867, 0.00863188, 0.00742699,
        0.00478875, 0.00508058, 0.01776134, 0.01258582, 0.00707614,
        0.00522488, 0.0070145 , 0.00500506, 0.01846663, 0.00674894,
        0.01440732, 0.01488783, 0.03097473, 0.00853829, 0.20719703,
        0.00284768, 0.22641426, 0.0121176 , 0.2075928 , 0.02733821]),
 'mean_score_time': array([0.0393405 , 0.02549825, 0.03418341, 0.018786  , 0.03078055,
        0.0348701 , 0.03204722, 0.02329259, 0.03057542, 0.02

In [96]:
# Columns of interest in the grid-search report.
column_list = [
    'param_preprocessor__num__polynomial__degree',
    'param_preprocessor__num__polynomial__interaction_only',
    'param_lasso__alpha',
    'param_lasso__max_iter',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

# Build the report in one chain: select the columns, keep only the last
# '__'-separated part of each name, then sort best-ranked first.
result_df = (
    pd.DataFrame(gscv.cv_results_)[column_list]
    .rename(columns=lambda name: name.split('__')[-1])
    .sort_values(by='rank_test_score', ascending=True, ignore_index=True)
)

result_df

Unnamed: 0,degree,interaction_only,alpha,max_iter,mean_test_score,std_test_score,rank_test_score
0,4,False,0.01,10000,0.64319,0.124777,1
1,4,False,0.01,5000,0.64319,0.124777,1
2,4,False,0.01,20000,0.64319,0.124777,1
3,4,True,0.01,20000,0.510327,0.052416,4
4,4,True,0.01,10000,0.510327,0.052416,4
5,4,True,0.01,5000,0.510327,0.052416,4
6,4,False,0.1,20000,0.487723,0.081381,7
7,4,False,0.1,10000,0.487723,0.081381,7
8,4,False,0.1,5000,0.487723,0.081381,7
9,4,True,0.1,20000,0.456614,0.030571,10


In [97]:
# parameter combination with the best mean cross-validated score
gscv.best_params_

{'lasso__alpha': 0.01,
 'lasso__max_iter': 5000,
 'preprocessor__num__polynomial__degree': 4,
 'preprocessor__num__polynomial__interaction_only': False}

In [98]:
# mean cross-validated R² (scoring='r2') of the best parameter combination
round(gscv.best_score_,6)

0.64319

In [99]:
# Keep the winning pipeline under its own name: `gscv` is rebound later in the
# notebook when the registered-rider search is created.
best_model_casual = gscv.best_estimator_
best_model_casual

In [101]:
# NOTE(review): GridSearchCV is created without `refit=False`, and its default
# is to refit the best estimator on the full data, so this second fit looks
# redundant — confirm before removing.
best_model_casual.fit(X_casual_bike,y_casual_bike);

In [102]:
# training score
# Scored on the same rows the model was fitted on, so this is an in-sample
# figure and is expected to be higher than the cross-validated score.
casual_training_score = best_model_casual.score(X_casual_bike, y_casual_bike)

print(f'Casual Train score: {round(casual_training_score,6)}')


Casual Train score: 0.819281


In [104]:
# Apply the same calendar features / weather merge to the Kaggle frame.
# NOTE(review): the saved output of this cell already shows month_idx, weekday,
# log_casual and pred_casual, which are only created in later cells — the
# notebook was executed out of order; re-run with Restart & Run All.
addcolumns(bike_kag)
bike_kag.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,2.496787,11.143414
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,1,1,4,1.705331,4.503209
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,2,1,4,1.3112,2.710624
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,3,1,4,1.392509,3.024938
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,4,1,4,1.752258,4.76761


In [107]:
# Running month number (1 = January 2011) for the Kaggle frame, computed with
# the same helper as the training data so both share one scale.
bike_kag['month_idx'] = monthindex(bike_kag["year"], bike_kag["month"])
bike_kag

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,2.496787,11.143414
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,4,1.705331,4.503209
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,4,1.311200,2.710624
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,4,1.392509,3.024938
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,4,1.752258,4.767610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,1,5.623252,275.787948
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,1,5.595935,268.329223
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1,5.413028,223.309681
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1,4.972277,143.355158


In [108]:
# Same 1 (Mon) .. 7 (Sun) weekday encoding as used for the training frame.
bike_kag['weekday'] = bike_kag.index.weekday + 1
bike_kag.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,2.496787,11.143414
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,1,1,4,1.705331,4.503209
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,2,1,4,1.3112,2.710624
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,3,1,4,1.392509,3.024938
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,4,1,4,1.752258,4.76761


In [109]:
# model inputs for the Kaggle rows, in the same column order used for training
X_casual_kag = bike_kag[features]

In [110]:
# predictions are on the log1p scale because the model was trained on 'log_casual'
y_casual_kag = best_model_casual.predict(X_casual_kag)
y_casual_kag

array([0.81000208, 0.37307248, 0.09825154, ..., 1.91983025, 1.78246815,
       1.67903886])

In [111]:
# store the log-scale casual predictions alongside the inputs
bike_kag["log_casual"] = y_casual_kag
bike_kag

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,0.810002,11.143414
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,4,0.373072,4.503209
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,4,0.098252,2.710624
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,4,0.040242,3.024938
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,4,0.119095,4.767610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,1,2.382094,275.787948
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,1,2.137723,268.329223
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1,1.919830,223.309681
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1,1.782468,143.355158


**REGISTERED USERS**

In [115]:
# Same inputs as the casual model; only the target changes.
numerical_features = ['temp', 'humidity', 'windspeed', 'month_idx', 'hour']
categorical_features = ['holiday', 'workingday', 'weekday', 'weather']

# model input = numeric columns followed by categorical columns
features = [*numerical_features, *categorical_features]

# registered rentals are modelled on the log1p scale
target_variable = 'log_registered'


In [116]:
# design matrix and log-scale target for the registered-rider model
X_registered_bike = bike[features]
y_registered_bike = bike[target_variable]

In [117]:
# Numeric branch: standardize first, then expand into polynomial terms.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures()),
])

In [118]:
# Categorical branch: one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline(steps=[('ohe', OneHotEncoder(drop='first'))])

In [119]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [120]:
# Full model: preprocessing followed by a Lasso regression.
estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lasso', Lasso()),
])

In [121]:
# Search space for the registered model (degree-5 polynomial expansion).
param_grid = {
    'preprocessor__num__polynomial__degree': [5],
    'preprocessor__num__polynomial__interaction_only': [False, True],
    'lasso__alpha': [100.0, 10.0, 1.0, 0.1, 0.01],
    'lasso__max_iter': [5000, 10000, 20000],
}

In [122]:
# 5-fold grid search over `param_grid`, ranked by R², run with n_jobs=-1.
gscv = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1,
)

In [124]:
# initial time
# perf_counter is a monotonic, high-resolution clock; unlike time.time() the
# measured interval cannot be distorted by system clock adjustments.
ti = time.perf_counter()

# grid-search cross-validation
gscv.fit(X_registered_bike,y_registered_bike)

# final time
tf = time.perf_counter()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
time taken: 47.41 sec


In [125]:
# raw per-candidate timing and score arrays from the registered-rider search
gscv.cv_results_

{'mean_fit_time': array([0.56477385, 0.08373213, 0.25671744, 0.08581824, 0.25452385,
        0.07256036, 0.26632781, 0.07784777, 0.25552702, 0.08094769,
        0.32852883, 0.11319604, 0.31051235, 0.09723787, 0.30010352,
        0.08816166, 0.29005995, 0.08370066, 0.7308248 , 0.09574728,
        0.72419372, 0.10418243, 0.71906648, 0.10665812, 6.50493641,
        0.11624379, 6.37917781, 0.11899843, 5.51765976, 0.11238198]),
 'std_fit_time': array([0.1526223 , 0.00495448, 0.00453882, 0.0054737 , 0.02090133,
        0.00944653, 0.01582082, 0.00575866, 0.0100264 , 0.00474741,
        0.01485139, 0.03620531, 0.02028082, 0.01033758, 0.01589572,
        0.00646062, 0.01628359, 0.00578253, 0.09100021, 0.00616598,
        0.07535453, 0.0106125 , 0.08582533, 0.00739973, 0.22482099,
        0.01442713, 0.23280686, 0.02201891, 0.64834031, 0.00319263]),
 'mean_score_time': array([0.08337455, 0.02495294, 0.04452472, 0.02585773, 0.03940177,
        0.0221725 , 0.05128627, 0.02390785, 0.04219184, 0.02

In [126]:
# Columns of interest in the grid-search report.
column_list = [
    'param_preprocessor__num__polynomial__degree',
    'param_preprocessor__num__polynomial__interaction_only',
    'param_lasso__alpha',
    'param_lasso__max_iter',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

# Select the columns, shorten each name to its last '__'-separated part,
# and order the candidates from best to worst rank.
result_df = (
    pd.DataFrame(gscv.cv_results_)[column_list]
    .rename(columns=lambda name: name.split('__')[-1])
    .sort_values(by='rank_test_score', ascending=True, ignore_index=True)
)

result_df

Unnamed: 0,degree,interaction_only,alpha,max_iter,mean_test_score,std_test_score,rank_test_score
0,5,False,0.01,10000,0.566028,0.164594,1
1,5,False,0.01,5000,0.566028,0.164594,1
2,5,False,0.01,20000,0.566028,0.164594,1
3,5,True,0.01,20000,0.411172,0.029116,4
4,5,True,0.01,10000,0.411172,0.029116,4
5,5,True,0.01,5000,0.411172,0.029116,4
6,5,True,0.1,20000,0.370522,0.07418,7
7,5,True,0.1,10000,0.370522,0.07418,7
8,5,True,0.1,5000,0.370522,0.07418,7
9,5,False,1.0,20000,-0.050894,0.316696,10


In [127]:
# parameter combination with the best mean cross-validated score
gscv.best_params_

{'lasso__alpha': 0.01,
 'lasso__max_iter': 5000,
 'preprocessor__num__polynomial__degree': 5,
 'preprocessor__num__polynomial__interaction_only': False}

In [128]:
# mean cross-validated R² (scoring='r2') of the best parameter combination
round(gscv.best_score_,6)

0.566028

In [129]:
# keep the winning registered-rider pipeline under its own name
best_model_registered = gscv.best_estimator_
best_model_registered

In [131]:
# training score
# Evaluate on X_registered_bike / y_registered_bike, the data the grid search
# was fitted on (mirrors the casual section). These are the only registered
# X/y names defined in this notebook, so the cell runs on a fresh kernel.
registered_train_score = best_model_registered.score(X_registered_bike, y_registered_bike)

print(f'Registered Train score: {round(registered_train_score,6)}')


Registered Train score: 0.746643


In [136]:
# Kaggle rows: predict log1p(registered) with the tuned registered model
X_registered_kag = bike_kag[features]
y_registered_kag = best_model_registered.predict(X_registered_kag)

# store the log-scale predictions alongside the inputs
bike_kag["log_registered"] = y_registered_kag
bike_kag


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual,log_registered
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,2.496787,11.143414,1.677008
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,4,1.705331,4.503209,1.394647
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,4,1.311200,2.710624,1.124397
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,4,1.392509,3.024938,1.285158
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,4,1.752258,4.767610,1.686834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,1,5.623252,275.787948,5.576323
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,1,5.595935,268.329223,5.547816
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1,5.413028,223.309681,5.374494
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1,4.972277,143.355158,4.953973


In [137]:
# Undo the log1p transform applied to the training targets: expm1 is the exact
# inverse of log1p and avoids the precision loss of exp(x) - 1 for small x.
# Clip at zero because a rental count cannot be negative, while the linear
# model is free to predict a negative value on the log scale.
bike_kag["pred_casual"] = np.expm1(bike_kag["log_casual"]).clip(lower=0)
bike_kag["pred_registered"] = np.expm1(bike_kag["log_registered"]).clip(lower=0)
bike_kag

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual,log_registered,pred_registered
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,2.496787,11.143414,1.677008,4.349525
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,4,1.705331,4.503209,1.394647,3.033552
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,4,1.311200,2.710624,1.124397,2.078359
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,4,1.392509,3.024938,1.285158,2.615239
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,4,1.752258,4.767610,1.686834,4.402350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,1,5.623252,275.787948,5.576323,263.098621
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,1,5.595935,268.329223,5.547816,255.676265
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1,5.413028,223.309681,5.374494,214.830652
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1,4.972277,143.355158,4.953973,140.737027


In [138]:
# total demand = predicted casual rentals + predicted registered rentals
bike_kag["count"] = bike_kag["pred_casual"].add(bike_kag["pred_registered"])
bike_kag

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,weekday,log_casual,pred_casual,log_registered,pred_registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,4,2.496787,11.143414,1.677008,4.349525,15.492939
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,4,1.705331,4.503209,1.394647,3.033552,7.536761
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,4,1.311200,2.710624,1.124397,2.078359,4.788983
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,4,1.392509,3.024938,1.285158,2.615239,5.640177
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,4,1.752258,4.767610,1.686834,4.402350,9.169960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,1,5.623252,275.787948,5.576323,263.098621,538.886570
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,1,5.595935,268.329223,5.547816,255.676265,524.005488
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1,5.413028,223.309681,5.374494,214.830652,438.140333
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1,4.972277,143.355158,4.953973,140.737027,284.092185


In [139]:
# submission frame: datetime index plus the single 'count' column
bike_kag_sub = bike_kag.loc[:, ["count"]]
bike_kag_sub

Unnamed: 0_level_0,count
datetime,Unnamed: 1_level_1
2011-01-20 00:00:00,15.492939
2011-01-20 01:00:00,7.536761
2011-01-20 02:00:00,4.788983
2011-01-20 03:00:00,5.640177
2011-01-20 04:00:00,9.169960
...,...
2012-12-31 19:00:00,538.886570
2012-12-31 20:00:00,524.005488
2012-12-31 21:00:00,438.140333
2012-12-31 22:00:00,284.092185


In [140]:
# write the submission file; index=True writes the index as the first CSV column
bike_kag_sub.to_csv("bike_kag_sub.csv", index = True)

### Kaggle_score = 0.933