# 06 자전거 대여 수요 예측
* 미션
    * [자전거 대여 수량 예측](https://www.kaggle.com/c/bike-sharing-demand)
* 평가지표
    * RMSLE
    * ![](../images/bike_metrix.PNG)
        * $n$ is the number of hours in the test set
        * $p_i$ is your predicted count
        * $a_i$ is the actual count
        * $\log(x)$ is the natural logarithm

In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [40]:
# Directory that holds the competition CSV files. File names are appended
# directly to this string, so the trailing slash is required.
data_path = '../data/06_bike/'

In [41]:
# Load the training set, the test set and the sample submission file
# from the data directory.
train_df = pd.read_csv(f'{data_path}train.csv')
test_df = pd.read_csv(f'{data_path}test.csv')
submission_df = pd.read_csv(f'{data_path}sampleSubmission.csv')

In [42]:
# Show (rows, columns) for the three frames on one line.
print(*(frame.shape for frame in (train_df, test_df, submission_df)))

(10886, 12) (6493, 9) (6493, 2)


In [43]:
# Print column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [44]:
# Work on a copy so that train_df keeps the data exactly as loaded.
train = train_df.copy()

In [45]:
# Stack the training rows and the test rows into a single frame so that
# feature engineering is applied to both at once. The index is renumbered
# from 0; columns that exist only in the training data become NaN for the
# test rows.
all_data = pd.concat((train, test_df), axis=0, ignore_index=True)
all_data.tail(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
17374,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17375,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17376,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.0014,,,
17377,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,
17378,2012-12-31 23:00:00,1,0,1,1,10.66,13.635,65,8.9981,,,


In [46]:
# Add derived features
# Parse the timestamp strings first so the .dt accessor becomes available.
all_data['datetime'] = pd.to_datetime(all_data['datetime'])

# Append one column per calendar component, in this order.
all_data = all_data.assign(
    **{
        part: getattr(all_data['datetime'].dt, part)
        for part in ('year', 'month', 'day', 'hour', 'weekday')
    }
)

In [47]:
# Remove features
# Columns that are excluded from the model inputs.
removal_feature = [
    'casual',
    'registered',
    'datetime',
    'month',
    'day',
    'windspeed',
]
all_data = all_data.drop(removal_feature, axis=1)

In [48]:
# Split the data
# Rows with a known count are training rows; rows whose count is NaN are
# the ones that came from the test file.
X_train = all_data.loc[all_data['count'].notna()]
X_test = all_data.loc[all_data['count'].isna()]

# Separate the target from the features.
y = X_train['count']
X_train = X_train.drop('count', axis=1)
X_test = X_test.drop('count', axis=1)

X_train.head(5)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,2011,0,5
1,1,0,0,1,9.02,13.635,80,2011,1,5
2,1,0,0,1,9.02,13.635,80,2011,2,5
3,1,0,0,1,9.84,14.395,75,2011,3,5
4,1,0,0,1,9.84,14.395,75,2011,4,5


## 6.5 성능 개선 I : 릿지 회귀 모델
### 릿지 회귀 모델
* L2 규제를 적용한 선형 회귀 모델
* 규제(regularization)란?
    * 모델이 훈련 데이터에 과대적합되지 않도록 해주는 방법
### 모델 성능 개선 프로세스
* ![](../images/base_vs_enhanced.PNG)

In [49]:
# Evaluation metric: Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred, convertExp=True):
    """Return the RMSLE between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Lists, NumPy arrays and pandas Series
        are accepted; values are compared position by position.
    convertExp : bool, default True
        When True, both inputs are treated as natural-log values and are
        passed through ``np.exp`` before the metric is computed.

    Returns
    -------
    float
        ``sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2))``.
    """
    # Coerce up front so plain Python lists work on every code path
    # (``list + 1`` would otherwise raise TypeError when convertExp=False).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) == log(1 + x) but is more accurate for small x.
    # nan_to_num keeps the metric finite: NaN (log of a value below -1)
    # becomes 0 and infinities become very large finite numbers.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

In [50]:
# scale
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Alternative scaler: MinMaxScaler().fit(X_train)
# Fit the scaler on the training features only, then standardise them and
# restore the column names on the result.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)

In [51]:
# 패키지 import
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [52]:
# Create the model
ridge_model = Ridge()
# Natural log of the target; this log-scale series is what the model is
# fitted against.
log_y = np.log(y)

In [53]:
# Build the grid-search object
# Candidate hyperparameters: a single max_iter and a range of L2 strengths.
ridge_params = dict(
    max_iter=[3000],
    alpha=[0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000],
)

# RMSLE is an error, so greater_is_better=False makes the scorer negate it
# and the search then prefers the smallest error.
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)

gridsearch_ridge_model = GridSearchCV(
    ridge_model,           # estimator: the Ridge model
    ridge_params,          # param_grid: hyperparameter candidates
    cv=5,                  # number of cross-validation folds
    scoring=rmsle_scorer,  # custom regression metric (negated RMSLE)
)

In [54]:
# Run the grid search on the scaled training features against the
# log-transformed target, then report the winning hyperparameters.
gridsearch_ridge_model.fit(X=X_train_scale, y=log_y)
print(f'best parameter : {gridsearch_ridge_model.best_params_}')

best parameter : {'alpha': 100, 'max_iter': 3000}


In [55]:
# Predict
# GridSearchCV.predict delegates to the estimator refit with the best
# parameters.
preds = gridsearch_ridge_model.predict(X_train_scale)

# Evaluate
# Both arguments are on the log scale, so rmsle() applies exp() first.
# NOTE: this is scored on the same rows the model was fitted on, so it is a
# training error rather than a held-out estimate.
print(f'ridge RMSLE : {rmsle(log_y, preds, convertExp=True):.4f}')

ridge RMSLE : 1.0203


In [56]:
# Show each fitted coefficient (column 0) next to its feature name (column 1).
pd.DataFrame(
    [gridsearch_ridge_model.best_estimator_.coef_, X_train.columns]
).transpose()

Unnamed: 0,0,1
0,0.201083,season
1,0.00444,holiday
2,0.001998,workingday
3,-2.9e-05,weather
4,0.127986,temp
5,0.246131,atemp
6,-0.289081,humidity
7,0.217895,year
8,0.70953,hour
9,0.057848,weekday


In [57]:
### Predict on the test set and build the submission
# Scale the test features with the scaler fitted on the training data and
# keep the column names, mirroring how X_train_scale is built. Passing the
# bare ndarray from scaler.transform() to an estimator that was fitted on a
# DataFrame makes scikit-learn warn that X has no valid feature names.
X_test_scale = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
ridgereg_preds = gridsearch_ridge_model.best_estimator_.predict(X_test_scale)

# The model was fitted on log(count), so exp() maps predictions back to counts.
submission_df['count'] = np.exp(ridgereg_preds)
# Writing the file is intentionally left disabled:
# submission_df.to_csv('submission.csv', index=False)

