In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [7]:
train_df = pd.read_csv("bike-sharing-demand/train.csv",parse_dates=['datetime'])

In [8]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [9]:
test_df = pd.read_csv("bike-sharing-demand/test.csv",parse_dates=['datetime'])

test_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [10]:
columns = ['count', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']

cols_normalize = ['temp','atemp','humidity','windspeed']

In [11]:
def add_features(df):
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['dayofweek']=df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour

In [12]:
add_features(train_df)
add_features(test_df)

In [13]:
train_df['count']=train_df['count'].map(np.log1p)

In [14]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,dayofweek,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,2011,1,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,2011,1,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,3.496508,2011,1,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,2.639057,2011,1,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,0.693147,2011,1,1,1,4


In [15]:
scaler = StandardScaler()

In [16]:
scaler.fit(train_df[cols_normalize])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [17]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,dayofweek,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,2011,1,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,2011,1,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,3.496508,2011,1,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,2.639057,2011,1,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,0.693147,2011,1,1,1,4


In [18]:
pca = PCA(n_components=0.9)
pca.fit(train_df[cols_normalize])

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [21]:
print(pca.n_components) #variance
pca.n_components_

0.9


3

In [22]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,dayofweek,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,2011,1,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,2011,1,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,3.496508,2011,1,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,2.639057,2011,1,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,0.693147,2011,1,1,1,4


In [23]:
transformed_data = pca.transform(train_df[cols_normalize])
transformed_data

array([[-21.50215306,  12.13273921, -10.59877225],
       [-20.56621054,  13.27481689, -10.84606528],
       [-20.56621054,  13.27481689, -10.84606528],
       ...,
       [  0.77997275,  10.13067661,   1.28634827],
       [ -0.59582495,   8.28205298,  -7.50714907],
       [ -5.10643955,   9.4760491 ,  -3.84316822]])

In [34]:
cols = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']
cols2 = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

for i in range(pca.n_components_):
    cols.append("component_"+str(i))

In [35]:
cols

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1',
 'component_2']

In [36]:
final_df = pd.DataFrame(columns=cols)

In [39]:
int_df = pd.DataFrame(transformed_data,columns=['component_0','component_1','component_2'])
int_df.head()

Unnamed: 0,component_0,component_1,component_2
0,-21.502153,12.132739,-10.598772
1,-20.566211,13.274817,-10.846065
2,-20.566211,13.274817,-10.846065
3,-15.583428,12.325531,-11.563773
4,-15.583428,12.325531,-11.563773


In [37]:
for col in cols2:
    final_df[col] = train_df[col]

final_df.head()
    


Unnamed: 0,count,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1,component_2
0,2.833213,1,0,0,1,2011,1,1,1,0,,,
1,3.713572,1,0,0,1,2011,1,1,1,1,,,
2,3.496508,1,0,0,1,2011,1,1,1,2,,,
3,2.639057,1,0,0,1,2011,1,1,1,3,,,
4,0.693147,1,0,0,1,2011,1,1,1,4,,,


In [40]:
final_df['component_0'] = int_df['component_0']
final_df['component_1'] = int_df['component_1']
final_df['component_2'] = int_df['component_2']

final_df.head()

Unnamed: 0,count,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1,component_2
0,2.833213,1,0,0,1,2011,1,1,1,0,-21.502153,12.132739,-10.598772
1,3.713572,1,0,0,1,2011,1,1,1,1,-20.566211,13.274817,-10.846065
2,3.496508,1,0,0,1,2011,1,1,1,2,-20.566211,13.274817,-10.846065
3,2.639057,1,0,0,1,2011,1,1,1,3,-15.583428,12.325531,-11.563773
4,0.693147,1,0,0,1,2011,1,1,1,4,-15.583428,12.325531,-11.563773


In [41]:
import xgboost as xgb

x_train = final_df.iloc[:,1:]
y_train = final_df.iloc[:,0]

In [43]:
y_train

0        2.833213
1        3.713572
2        3.496508
3        2.639057
4        0.693147
           ...   
10881    5.820083
10882    5.488938
10883    5.129899
10884    4.867534
10885    4.488636
Name: count, Length: 10886, dtype: float64

In [44]:
regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)

regressor

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=5,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=150, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', random_state=None, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=None,
             tree_method=None, validate_parameters=None, verbosity=None)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.3, random_state=42)

In [49]:
regressor.fit(X_train,Y_train,eval_set=[(X_test,Y_test)])

[0]	validation_0-rmse:3.06297
[1]	validation_0-rmse:2.18092
[2]	validation_0-rmse:1.57263
[3]	validation_0-rmse:1.15376
[4]	validation_0-rmse:0.87201
[5]	validation_0-rmse:0.68623
[6]	validation_0-rmse:0.56749
[7]	validation_0-rmse:0.49212
[8]	validation_0-rmse:0.44829
[9]	validation_0-rmse:0.41754
[10]	validation_0-rmse:0.40202
[11]	validation_0-rmse:0.39322
[12]	validation_0-rmse:0.38897
[13]	validation_0-rmse:0.36836
[14]	validation_0-rmse:0.36003
[15]	validation_0-rmse:0.35727
[16]	validation_0-rmse:0.35608
[17]	validation_0-rmse:0.35532
[18]	validation_0-rmse:0.35343
[19]	validation_0-rmse:0.35288
[20]	validation_0-rmse:0.35205
[21]	validation_0-rmse:0.34400
[22]	validation_0-rmse:0.34275
[23]	validation_0-rmse:0.34238
[24]	validation_0-rmse:0.34211
[25]	validation_0-rmse:0.34207
[26]	validation_0-rmse:0.34074
[27]	validation_0-rmse:0.33627
[28]	validation_0-rmse:0.32834
[29]	validation_0-rmse:0.32812
[30]	validation_0-rmse:0.32840
[31]	validation_0-rmse:0.32808
[32]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=150, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [51]:
test_df = pd.read_csv("bike-sharing-demand/test.csv",parse_dates=['datetime'])

In [67]:
test_df.head()
add_features(test_df)


In [68]:
scaler = StandardScaler()
scaler.fit(test_df[cols_normalize])

pca = PCA(n_components=0.9)
pca.fit(test_df[cols_normalize])

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [69]:
transformed_data = pca.transform(test_df[cols_normalize])


In [70]:
inttest_df = pd.DataFrame(transformed_data,columns=['component_0','component_1','component_2'])

test_df['component_0'] = inttest_df['component_0']
test_df['component_1'] = inttest_df['component_1']
test_df['component_2'] = inttest_df['component_2']

In [71]:
final_test_df = test_df.iloc[:,1:]
final_test_df.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,component_0,component_1,component_2,year,month,day,dayofweek,hour
0,1,0,1,1,10.66,11.365,56,26.0027,8.573959,17.748251,10.713223,2011,1,20,20,0
1,1,0,1,1,10.66,13.635,56,0.0,5.504756,13.487307,-14.841451,2011,1,20,20,1
2,1,0,1,1,10.66,13.635,56,0.0,5.504756,13.487307,-14.841451,2011,1,20,20,2
3,1,0,1,1,10.66,12.88,56,11.0014,6.813134,15.1394,-4.019443,2011,1,20,20,3
4,1,0,1,1,10.66,12.88,56,11.0014,6.813134,15.1394,-4.019443,2011,1,20,20,4


In [72]:
final_test_df = final_test_df.drop(['temp','atemp','humidity','windspeed'],axis=1)

In [74]:
re_index = ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek', 'hour', 'component_0', 'component_1', 'component_2']
final_test_df=final_test_df.reindex(columns=re_index)

final_pred = regressor.predict(final_test_df)

In [75]:
final_pred

array([2.3796558, 1.4996266, 0.9611697, ..., 4.690411 , 4.7823687,
       4.525414 ], dtype=float32)

In [80]:
test_df['count'] = np.expm1(final_pred)

In [83]:
test_df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,component_0,component_1,component_2,year,month,day,dayofweek,hour,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,8.573959,17.748251,10.713223,2011,1,20,20,0,9.801185
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,5.504756,13.487307,-14.841451,2011,1,20,20,1,3.480016


In [84]:
test_df[test_df['count']<0]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,component_0,component_1,component_2,year,month,day,dayofweek,hour,count


In [85]:
test_df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,component_0,component_1,component_2,year,month,day,dayofweek,hour,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,8.573959,17.748251,10.713223,2011,1,20,20,0,9.801185
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,5.504756,13.487307,-14.841451,2011,1,20,20,1,3.480016


In [86]:
test_df[['datetime','count']].to_csv('predicted_count.csv',index=False)