<font size=6><b>Bike Sharing Demand - ML

<img src="./logo.png">
* ref : https://www.kaggle.com/competitions/bike-sharing-demand/data <br>
* ref : https://dacon.io/competitions/official/235985/data

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


sns.set()

#-------------------- 차트 관련 속성 (한글처리, 그리드) -----------
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

#-------------------- 주피터 , 출력결과 넓이 늘리기 ---------------
# from IPython.core.display import display, HTML
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Data Load

In [3]:
train = pd.read_csv("./train.csv", parse_dates=['datetime'])
test  = pd.read_csv("./test.csv" , parse_dates=['datetime'])

In [4]:
df_list = [train, test]
for df in df_list:
    df.rename(columns = {'datetime' : 'regdate', 'count' : 'regcount'}, inplace = True)
    df.columns = df.columns.str.lower()
    print(df.info())    
    print("====="*10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   regdate     10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  regcount    10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dty

# Feature Engineering

## 파생피쳐
- regdate
- day_type

In [5]:
df_list = [train, test]
for df in df_list:
    df['y'] = df['regdate'].dt.year
    df['m'] = df['regdate'].dt.month
    df['d'] = df['regdate'].dt.day
    df['h'] = df['regdate'].dt.hour
    df['w'] = df['regdate'].dt.dayofweek
    
    df.set_index('regdate', inplace=True)
    
    df['day_type'] = 0
    df['day_type'] = np.where( (df['holiday']==0) & (df['workingday'] == 1),   1,  df['day_type'])
    df['day_type'] = np.where( (df['holiday']==1) & (df['workingday'] == 0),   2,  df['day_type'])

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6493 entries, 2011-01-20 00:00:00 to 2012-12-31 23:00:00
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      6493 non-null   int64  
 1   holiday     6493 non-null   int64  
 2   workingday  6493 non-null   int64  
 3   weather     6493 non-null   int64  
 4   temp        6493 non-null   float64
 5   atemp       6493 non-null   float64
 6   humidity    6493 non-null   int64  
 7   windspeed   6493 non-null   float64
 8   y           6493 non-null   int64  
 9   m           6493 non-null   int64  
 10  d           6493 non-null   int64  
 11  h           6493 non-null   int64  
 12  w           6493 non-null   int64  
 13  day_type    6493 non-null   int64  
dtypes: float64(3), int64(11)
memory usage: 760.9 KB


## windspeed 0 채우기

In [7]:
train.shape, test.shape

((10886, 17), (6493, 14))

In [8]:
def my_fill_windspeed(df):
    df = df[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
           'humidity','y', 'm','d', 'h', 'w', 'day_type','windspeed']]

    df1 = df[df['windspeed'] != 0]
    y_df1 = df1['windspeed']
    X_df1 = df1.drop('windspeed', axis=1)
    
    rf = RandomForestRegressor(random_state=11)
    X_df1_8, X_df1_2, y_df1_8, y_df1_2 = train_test_split(X_df1, y_df1, test_size=0.2)
    rf.fit(X_df1_8, y_df1_8)
    
    pred = rf.predict(X_df1_2)
    mse_score = mean_squared_error(y_df1_2, pred)
    print("RMSE : ", np.sqrt(mse_score) )

    df0 = df[df['windspeed'] == 0]
    y_df0 = df0['windspeed']
    X_df0 = df0.drop('windspeed', axis=1)
    pred = rf.predict(X_df0)
    widx = X_df0.index.values
    df.loc[widx, 'windspeed'] = pred
    return df

In [9]:
df = pd.concat([train, test], axis=0, ignore_index=True)
train_idx = df.iloc[:train.shape[0], :].index
test_idx  = df.iloc[train.shape[0]:, :].index
#---------------------------------
df = my_fill_windspeed(df)
#---------------------------------
train = df.loc[train_idx]
test  = df.loc[test_idx]
test = test.reset_index(drop=True)

RMSE :  4.479894330395955


## Outlier 처리
- <font color='red'><b>train에만 처리

In [10]:
train.shape, test.shape

((10886, 14), (6493, 14))

In [11]:
del_idx_list = []
idx = train[train['weather']==4].index
del_idx_list.extend(idx)
idx = train[train['temp']>40].index
del_idx_list.extend(idx)
idx = train[train['windspeed']>50].index
del_idx_list.extend(idx)
train.drop(del_idx_list, axis=0, inplace=True)

In [12]:
train.shape, test.shape

((10880, 14), (6493, 14))

# 학습

## 타켓 선정
* 답안지 : count 제출
* regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
* <font color=red><b>registered(A2패턴) + casual (B패턴)  --> 이 값을 답안으로 제출

## 최종피쳐 선정

In [None]:
train.columns

<pre>
타겟(1) : 'casual' + 'registered'
타겟(2) : 'regcount'


'd' 피쳐 삭제
('m', 'h'),  'y', 'd' -->'w'


(+)3개 : 'holiday', 'workingday', 'day_type'
(+)2개 : 'holiday', 'workingday'
(+)1개 : 'day_type'


'weather', 'humidity',  'season', 
(+)'windspeed' : 'h' 비례? / 'm' 반비례?


다중공선 : (상관계수가 높은 경우)
'regcount' 'registered' 'casual' : 0.97  --> 'regcount' vs. 'registered'+'casual'
'temp' 'atemp'                   : 0.98  --> 'atemp' drop

선택적으로 모델에 적용
(+)'w' 'day_type' : -0.78
(+)'season' 'm'   : 0.97


* ('m', 'h'),  'y', 'w', 'weather', 'humidity',  'season', 'temp'
* (+)'windspeed'
* (+)('holiday', 'workingday'), ('day_type')
* (+)('w' 'day_type') : -0.78
* (+)('m') 'season'    : 0.97


# 학습 & 평가
* ref : https://suboptimal.wiki/explanation/mse/

*  $ MSE = \frac{\mathrm{1} }{\mathrm{n}} \sum\nolimits_{i=1}^{n} (Y_i - \hat{Y_i})^{2}$

* $ MAE = $

*  $ RMSE = \sqrt{\frac{1}{n}\Sigma_{i=1}^{n}{\Big(\frac{Y_i - \hat{Y_i}}{N}\Big)^2}}$

*  $ RMSLE = \sqrt{\frac{1}{n}\Sigma_{i=1}^{n}{\Big(\frac{log(Y_i+1) - log(\hat{Y_i}+1)}{N}\Big)^2}}$

<pre>
 ---------------------------------------------------------------------
 1. X(문제지), y(답안지) 분리
 2. 위에것을 8:2 비율로 train, test용으로 분리
      문제지80,문제지20, 답안지80,답안지20 = train_test_split(문제지, 답안지)
 3. 모델선택
                     모델.fit(문제지80, 답안지80)
           예측답안20 = 모델.predict(문제지20)
 4. 점수확인 : *score(답안지20, 예측답안20)
 ---------------------------------------------------------------------
 데이터프레임 : object X, 결측 X
 X,y분리 --> 8:2 --> 모델 --> fit --> predict  --> score --> 평가검증
 ---------------------------------------------------------------------


In [None]:
# ! pip install xgboost
# ! pip install lightgbm 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# ---- 추가 모델
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
model_list = [ ("DTR"   , DecisionTreeRegressor(random_state=0)),
               ("LR"    , LinearRegression()                   ),
               ("ABOOST", AdaBoostRegressor()                  ),
               ("XGB"   , XGBRegressor()                       ),  
               ("LGBM"  , LGBMRegressor()                      ),
               ("VR"    , VotingRegressor([("XGB", XGBRegressor()), ("LGBM", LGBMRegressor())]) ) 
             ]
y_list = [ train['regcount'], train['casual'], train['registered'] ]
y_col  = [ 'regcount', 'casual', 'registered'] 
X = train.drop(['regcount','casual','registered'], axis=1)

score_list = []
for tpl in model_list :
    print( tpl[0] ) 
    for i, y in enumerate(y_list) :
        model = tpl[1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_score = mean_squared_error(y_test, y_pred)
        
        print( y_col[i] )
        print("MSE : ", mse_score) 
        print("RMSE: ", np.sqrt(mse_score)) 
        print("-"*30)
        score_list.append([tpl[0], y_col[i], mse_score, np.sqrt(mse_score) ] )

In [None]:
score_df = pd.DataFrame(score_list, columns=["model","col","mse","rmse"])
score_df

In [None]:
plt.figure(figsize=(8,3))
ax1 = plt.subplot(1,2,1)
ax1.set_title("RMSE")
sns.barplot(
    data= score_df,
    x= "model",
    y= "rmse" ,
    hue = "col",
    ax = ax1
    
)
ax2 = plt.subplot(1,2,2)
ax2.set_title("MSE")
sns.barplot(
    data= score_df,
    x= "model",
    y= "mse" ,
    hue = "col",
    ax = ax2
)
plt.show()

* by 규환

In [None]:
y = train[['regcount','casual','registered']]
X = train.drop(['regcount','casual','registered'], axis=1)


In [None]:
X.head(), y.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
print( y_test.shape )
y_test.head(3)

In [None]:
print( y_pred.shape )

In [None]:
y_pred[0]

In [None]:
print( y_pred.T[0].shape )
y_pred.T

In [None]:
y_test['casual']

In [None]:
# regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
score_regcount   = mean_squared_error(y_test['regcount']    , y_pred.T[0])
score_casual     = mean_squared_error(y_test['casual'], y_pred.T[1])
score_registered = mean_squared_error(y_test['registered']  , y_pred.T[2])

print(score_regcount, score_casual, score_registered)