# 06 자전거 대여 수요 예측
* 미션
    * [자전거 대여 수량 예측](https://www.kaggle.com/c/bike-sharing-demand)
* 평가지표
    * RMSLE
    * ![](../images/bike_metrix.PNG)
        * $n$ is the number of hours in the test set
        * $p_i$ is your predicted count
        * $a_i$ is the actual count
        * $\log(x)$ is the natural logarithm

In [51]:
import numpy as np
import pandas as pd

In [52]:
# Directory (relative to this notebook) that holds the competition CSV files.
data_path = '../data/06_bike/'

In [53]:
# Load the training set, the test set and the sample submission from data_path.
train_df = pd.read_csv(f'{data_path}train.csv')
test_df = pd.read_csv(f'{data_path}test.csv')
submission_df = pd.read_csv(f'{data_path}sampleSubmission.csv')

In [54]:
# Work on a copy so the frame loaded from train.csv is left untouched.
train = train_df.copy()

In [55]:
# Combine the data: stack the training rows and the test rows into a single
# frame (with a fresh 0..n-1 index) so the same feature engineering is applied
# to both.
all_data = pd.concat([train, test_df], ignore_index=True)
# Show the last rows; they come from the test set, so the target-related
# columns (casual, registered, count) are NaN there.
all_data.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
17374,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17375,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17376,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.0014,,,
17377,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,
17378,2012-12-31 23:00:00,1,0,1,1,10.66,13.635,65,8.9981,,,


In [56]:
# Add derived features: parse the timestamp column first, then expose its
# calendar components as separate columns (appended in the order listed).
all_data = all_data.assign(datetime=pd.to_datetime(all_data['datetime']))
all_data = all_data.assign(
    year=lambda frame: frame['datetime'].dt.year,
    month=lambda frame: frame['datetime'].dt.month,
    day=lambda frame: frame['datetime'].dt.day,
    hour=lambda frame: frame['datetime'].dt.hour,
    weekday=lambda frame: frame['datetime'].dt.weekday,
)

In [57]:
# Remove features that are not fed to the model.
removal_feature = ['casual', 'registered', 'datetime', 'month', 'day', 'windspeed']
all_data = all_data.drop(removal_feature, axis='columns')

In [58]:
# Split the data again: rows with a known count are training rows, rows whose
# count is NaN are the test rows.
X_train = all_data[all_data['count'].notna()]
X_test = all_data[all_data['count'].isna()]

# Target on the original scale and on the natural-log scale; the log-scale
# target is what gets passed to the model's fit().
y = X_train['count']
log_y = np.log(y)

# The target must not remain among the input features.
X_train = X_train.drop('count', axis='columns')
X_test = X_test.drop('count', axis='columns')

X_train.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,2011,0,5
1,1,0,0,1,9.02,13.635,80,2011,1,5
2,1,0,0,1,9.02,13.635,80,2011,2,5
3,1,0,0,1,9.84,14.395,75,2011,3,5
4,1,0,0,1,9.84,14.395,75,2011,4,5


## BaggingRegressor

In [59]:
def rmsle(y_true, y_pred, convertExp=True):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(p + 1) - log(a + 1)) ** 2))`` over all elements.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.
    convertExp : bool, default True
        If True, both inputs are treated as natural-log-scale values and are
        exponentiated back to the original scale before the error is computed.

    Returns
    -------
    numpy.float64
        The RMSLE of the two inputs.
    """
    # Coerce to float arrays so plain lists/tuples work in both modes
    # (``list + 1`` would otherwise raise when convertExp is False).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) == log(x + 1) but is more accurate for x close to zero.
    # nan_to_num replaces NaN with 0 and +/-inf with large finite values so a
    # few invalid entries do not turn the whole score into NaN.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

In [60]:

from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Create the model and train it in one step: a bagging ensemble of 140
# decision trees fitted on bootstrap samples, with the out-of-bag score enabled.
# The target is the log-transformed count.
bagging_model = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=140,
    bootstrap=True,
    oob_score=True,
    random_state=42,
).fit(X_train, log_y)

print(f'oob score : {bagging_model.oob_score_}')

oob score : 0.9483178766771876


In [61]:
# Predict on the training features; the predictions are on the log scale
# because the model was fitted on log_y.
preds = bagging_model.predict(X_train)
# Evaluate. Both arguments are log-scale, so rmsle() is asked to convert them
# back with exp (third argument True).
# NOTE: this is an in-sample score (same rows the model was trained on), so it
# is optimistic compared with the out-of-bag or leaderboard score.
print(f'bagging regressor RMSLE : {rmsle(log_y, preds, True):.4f}')

random forest RMSLE : 0.1125
