<font size=6><b>Bike Sharing Demand - ML

<img src="./logo.png">
* ref : https://www.kaggle.com/competitions/bike-sharing-demand/data <br>
* ref : https://dacon.io/competitions/official/235985/data

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


sns.set()

#-------------------- 차트 관련 속성 (한글처리, 그리드) -----------
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

#-------------------- 주피터 , 출력결과 넓이 늘리기 ---------------
# from IPython.core.display import display, HTML
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings(action='ignore')

In [2]:

from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold, StratifiedKFold, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

from sklearn.ensemble     import RandomForestRegressor
from sklearn.tree         import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# ---- 추가 모델
from sklearn.ensemble     import AdaBoostRegressor, VotingRegressor
from xgboost              import XGBRegressor
from lightgbm             import LGBMRegressor



# Data Load

In [3]:
train = pd.read_csv("./train.csv", parse_dates=['datetime'])
test  = pd.read_csv("./test.csv" , parse_dates=['datetime'])

In [4]:
df_list = [train, test]
for df in df_list:
    df.rename(columns = {'datetime' : 'regdate', 'count' : 'regcount'}, inplace = True)
    df.columns = df.columns.str.lower()
    print(df.info())    
    print("====="*10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   regdate     10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  regcount    10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dty

## 점수보기

# Feature Engineering

In [5]:
train.columns

Index(['regdate', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'regcount'],
      dtype='object')

In [6]:
for col in train.columns:
    print(col, train[col].nunique())

regdate 10886
season 4
holiday 2
workingday 2
weather 4
temp 49
atemp 60
humidity 89
windspeed 28
casual 309
registered 731
regcount 822


## windspeed 0 채우기

In [7]:
train.shape, test.shape

((10886, 12), (6493, 9))

In [8]:
def my_fill_windspeed(df):
    target = df[['regcount','casual','registered','regdate']]
    df = df.drop(['regcount','casual','registered','regdate'], axis=1)

    df1   = df[df['windspeed'] != 0]
    y_df1 = df1['windspeed']
    X_df1 = df1.drop('windspeed', axis=1)
    
    rf = LGBMRegressor(random_state=0)
    X_df1_8, X_df1_2, y_df1_8, y_df1_2 = train_test_split(X_df1, y_df1, test_size=0.2, random_state=11)
    rf.fit(X_df1_8, y_df1_8)
    
    pred = rf.predict(X_df1_2)
    mse_score = mean_squared_error(y_df1_2, pred)
    print("RMSE : ", np.sqrt(mse_score) )

    df0 = df[df['windspeed'] == 0]
    y_df0 = df0['windspeed']
    X_df0 = df0.drop('windspeed', axis=1)
    pred = rf.predict(X_df0)
    widx = X_df0.index.values
    df.loc[widx, 'windspeed'] = pred
    
    
    return pd.concat([df, target], axis=1)

In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   regdate     6493 non-null   datetime64[ns]
 1   season      6493 non-null   int64         
 2   holiday     6493 non-null   int64         
 3   workingday  6493 non-null   int64         
 4   weather     6493 non-null   int64         
 5   temp        6493 non-null   float64       
 6   atemp       6493 non-null   float64       
 7   humidity    6493 non-null   int64         
 8   windspeed   6493 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(5)
memory usage: 456.7 KB


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   regdate     10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  regcount    10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


In [11]:
df = pd.concat([train, test], axis=0, ignore_index=True)
train_idx = df.iloc[:train.shape[0], :].index
test_idx  = df.iloc[train.shape[0]:, :].index
#---------------------------------
df = my_fill_windspeed(df)
#---------------------------------
train = df.loc[train_idx]
test  = df.loc[test_idx]
test = test.drop(['casual', 'registered', 'regcount'], axis=1)
test = test.reset_index(drop=True)

RMSE :  5.427882362943079


## 피쳐가공
* 파생 피쳐: regdate, day_type
* 휴무처리 : holiday, workingday
* 아웃라이어 : weather, temp, windspeed
* 로그스케일링(연속형) : 'temp', 'atemp','humidity', 'windspeed'
* 원핫인코딩(이산형) :  'season', 'weather','m', 'y', 'h', 'w', 'day_type'
* 다중공선 : ('temp', 'atemp') ('season', 'm') ('w', 'day_type')
* 불필요컬럼 삭제 :'d'

In [12]:
df_list = [train, test]
df_name = ['train', 'test']
for i, df in enumerate(df_list):
    df['y'] = df['regdate'].dt.year
    df['m'] = df['regdate'].dt.month
    df['d'] = df['regdate'].dt.day
    df['h'] = df['regdate'].dt.hour
    df['w'] = df['regdate'].dt.dayofweek
    #df['woy'] = df['regdate'].dt.weekofyear
    
    
    df['day_type'] = 0
    df['day_type'] = np.where( (df['holiday']==0) & (df['workingday'] == 1),   1,  df['day_type'])
    df['day_type'] = np.where( (df['holiday']==1) & (df['workingday'] == 0),   2,  df['day_type'])
    
    df['peak']   = df[['h', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['h'] == 8 or 17 <= x['h'] <= 18 or 12 <= x['h'] <= 12)) or (x['workingday'] == 0 and  10 <= x['h'] <= 19)], axis = 1)
    df['ideal']  = df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 27 and x['windspeed'] < 30], axis = 1)
    df['sticky'] = df[['humidity', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['humidity'] >= 60], axis = 1)
    
    #sandy
    df['holiday'] = df[['m', 'd', 'holiday', 'y']].apply(lambda x: (x['holiday'], 1)[x['y'] == 2012 and x['m'] == 10 and (x['m'] in [30])], axis = 1)
    #christmas day and others
    df['holiday'] = df[['m', 'd', 'holiday']].apply(lambda x: (x['holiday'], 1)[x['m'] == 12 and (x['m'] in [24, 26, 31])], axis = 1)
    df['workingday'] = df[['m', 'd', 'workingday']].apply(lambda x: (x['workingday'], 0)[x['m'] == 12 and x['m'] in [24, 31]], axis = 1)
    
    
    if i == 0 : # 아웃라이어 : train만 삭제
        #------------------------------------------------
        # windspeed는 0채우기 후 아웃라이어로 한꺼번에 이상치 삭제
        # 아웃라이어 삭제 : 주의주의주의 train만 지운다
        # df == train 넣고 호출
        #------------------------------------------------
        del_idx_list = []
        #idx = df[df['weather']==4].index
        #del_idx_list.extend(idx)
        idx = df[df['temp']>40].index
        del_idx_list.extend(idx)
        idx = df[df['windspeed']>50].index
        del_idx_list.extend(idx)
        df.drop(del_idx_list, axis=0, inplace=True)

    
    #---------(연속형피쳐:로그스케일링) ---------------
    # 'temp', 'atemp','humidity', 'windspeed'
    #------------------------------------------------
    df['temp']      = np.log1p( df['temp'] )
    df['atemp']     = np.log1p( df['atemp'] )
    df['humidity']  = np.log1p( df['humidity'] )
    df['windspeed'] = np.log1p( df['windspeed'] )
    
    #------------------------------------------------
    # 다중공선
    # 'temp', 'atemp'
    # 'season', 'm'
    # 'w', 'day_type'
    #------------------------------------------------
    df = df.drop(['atemp'], axis=1)
    # X = X.drop(['season'], axis=1)
    # X = X.drop(['day_type'], axis=1)
    
    
    #------------------------------------------------
    # 이산형 피쳐 원핫인코딩
    # ['season', 'weather','m', 'y', 'h', 'w', 'day_type']
    df = pd.get_dummies(df, columns=['season', 'weather','m', 'y', 'h', 'w', 'day_type'])
    
    #------------------------------------------------
    #X = df.drop(['regcount','casual','registered'], axis=1)
    df.drop('d', axis=1, inplace=True)
    df = df.set_index('regdate')
    
    #------------------------------------------------
    #print(df.info())
    print(df_name[i])
    globals()[df_name[i]] = df.copy()

train
test


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10881 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 67 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   holiday     10881 non-null  int64  
 1   workingday  10881 non-null  int64  
 2   temp        10881 non-null  float64
 3   humidity    10881 non-null  float64
 4   windspeed   10881 non-null  float64
 5   regcount    10881 non-null  float64
 6   casual      10881 non-null  float64
 7   registered  10881 non-null  float64
 8   peak        10881 non-null  int64  
 9   ideal       10881 non-null  int64  
 10  sticky      10881 non-null  int64  
 11  season_1    10881 non-null  uint8  
 12  season_2    10881 non-null  uint8  
 13  season_3    10881 non-null  uint8  
 14  season_4    10881 non-null  uint8  
 15  weather_1   10881 non-null  uint8  
 16  weather_2   10881 non-null  uint8  
 17  weather_3   10881 non-null  uint8  
 18  weather_4   10881 non-null  uin

In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6493 entries, 2011-01-20 00:00:00 to 2012-12-31 23:00:00
Data columns (total 64 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   holiday     6493 non-null   int64  
 1   workingday  6493 non-null   int64  
 2   temp        6493 non-null   float64
 3   humidity    6493 non-null   float64
 4   windspeed   6493 non-null   float64
 5   peak        6493 non-null   int64  
 6   ideal       6493 non-null   int64  
 7   sticky      6493 non-null   int64  
 8   season_1    6493 non-null   uint8  
 9   season_2    6493 non-null   uint8  
 10  season_3    6493 non-null   uint8  
 11  season_4    6493 non-null   uint8  
 12  weather_1   6493 non-null   uint8  
 13  weather_2   6493 non-null   uint8  
 14  weather_3   6493 non-null   uint8  
 15  weather_4   6493 non-null   uint8  
 16  m_1         6493 non-null   uint8  
 17  m_2         6493 non-null   uint8  
 18  m_3         6493 non-null   uint

## 타켓 선정
* 답안지 : count 제출
* regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
* <font color=red><b>registered(A2패턴) + casual (B패턴)  --> 이 값을 답안으로 제출

## 최종피쳐 선정

In [15]:
train.columns

Index(['holiday', 'workingday', 'temp', 'humidity', 'windspeed', 'regcount',
       'casual', 'registered', 'peak', 'ideal', 'sticky', 'season_1',
       'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2',
       'weather_3', 'weather_4', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6',
       'm_7', 'm_8', 'm_9', 'm_10', 'm_11', 'm_12', 'y_2011', 'y_2012', 'h_0',
       'h_1', 'h_2', 'h_3', 'h_4', 'h_5', 'h_6', 'h_7', 'h_8', 'h_9', 'h_10',
       'h_11', 'h_12', 'h_13', 'h_14', 'h_15', 'h_16', 'h_17', 'h_18', 'h_19',
       'h_20', 'h_21', 'h_22', 'h_23', 'w_0', 'w_1', 'w_2', 'w_3', 'w_4',
       'w_5', 'w_6', 'day_type_0', 'day_type_1', 'day_type_2'],
      dtype='object')

<pre>
타겟(1) : 'casual' + 'registered'
타겟(2) : 'regcount'


'd' 피쳐 삭제
('m', 'h'),  'y', 'd' -->'w'


(+)3개 : 'holiday', 'workingday', 'day_type'
(+)2개 : 'holiday', 'workingday'
(+)1개 : 'day_type'


'weather', 'humidity',  'season', 
(+)'windspeed' : 'h' 비례? / 'm' 반비례?


다중공선 : (상관계수가 높은 경우)
'regcount' 'registered' 'casual' : 0.97  --> 'regcount' vs. 'registered'+'casual'
'temp' 'atemp'                   : 0.98  --> 'atemp' drop

선택적으로 모델에 적용
(+)'w' 'day_type' : -0.78
(+)'season' 'm'   : 0.97


* ('m', 'h'),  'y', 'w', 'weather', 'humidity',  'season', 'temp'
* (+)'windspeed'
* (+)('holiday', 'workingday'), ('day_type')
* (+)('w' 'day_type') : -0.78
* (+)('m') 'season'    : 0.97


## 학습 & 평가
* ref : https://suboptimal.wiki/explanation/mse/

*  $ RMSLE = \sqrt{\frac{1}{n}\Sigma_{i=1}^{n}{\Big(\frac{log(Y_i+1) - log(\hat{Y_i}+1)}{N}\Big)^2}}$

## 공통함수

In [16]:
# ! pip install xgboost
# ! pip install lightgbm 

* class sklearn.linear_model.Ridge(alpha=1.0, *, fit_intercept=True, copy_X=True, max_iter=None, tol=0.0001, solver='auto', positive=False, random_state=None)
* class sklearn.linear_model.Lasso(alpha=1.0, *, fit_intercept=True, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')

* https://towardsdatascience.com/custom-implementation-of-feature-importance-for-your-voting-classifier-model-859b573ce0e0
* [bagging vs boosting] : http://egloos.zum.com/incredible/v/7478090
* [eval plot] https://hyemin-kim.github.io/2020/08/04/S-Python-sklearn4/#2-%ED%8F%89%EA%B0%80-%EC%A7%80%ED%91%9C-%EB%A7%8C%EB%93%A4%EA%B8%B0

In [17]:
modelsss = Ridge(alpha=1.0, random_state=0)
modelsss.__class__.__name__

'Ridge'

## 점수보기

In [18]:
# my_fit_score(train)

* https://towardsdatascience.com/custom-implementation-of-feature-importance-for-your-voting-classifier-model-859b573ce0e0
* [bagging vs boosting] : http://egloos.zum.com/incredible/v/7478090
* [eval plot] https://hyemin-kim.github.io/2020/08/04/S-Python-sklearn4/#2-%ED%8F%89%EA%B0%80-%EC%A7%80%ED%91%9C-%EB%A7%8C%EB%93%A4%EA%B8%B0

In [19]:
test.index.values

array(['2011-01-20T00:00:00.000000000', '2011-01-20T01:00:00.000000000',
       '2011-01-20T02:00:00.000000000', ...,
       '2012-12-31T21:00:00.000000000', '2012-12-31T22:00:00.000000000',
       '2012-12-31T23:00:00.000000000'], dtype='datetime64[ns]')

In [20]:
# """ Function to compute feature importance of Voting Classifier """    
# def my_voting_importance(voting_clf, weights):
#     imp_dict = dict()
#     for est in voting_clf.estimators_:
#         imp_dict[str(est)] = est.feature_importances_
    
#     fe_scores = [0]*len(list(imp_dict.values())[0])
#     for idx, imp_score in enumerate(imp_dict.values()):
#         imp_score_with_weight = imp_score*weights[idx]
#         fe_scores = list(np.add(fe_scores, list(imp_score_with_weight)))
#     return fe_scores

# coef_df = pd.DataFrame()
# coef_df['col']  = train.columns
# #---------------------------------------------------------
# y = df['regcount']
# X = df.drop(['regcount','casual','registered'], axis=1)
# X_train, X_test, y_train, y_r_test = train_test_split(X, y, random_state=0, test_size=0.2)
        
# vr = VotingRegressor([("DTR", RandomForestRegressor(random_state=0)), 
#                          ("LGBM", LGBMRegressor(random_state=0))
#                         ])
# vr.fit(X_train, y_train)
# for est in vr.estimators:
#     print(est)
# # coef_df['coef'] = my_voting_importance(model, [1, 1, 1])
# #---------------------------------------------------------
# # coef_df.sort_values('coef', ascending=False)

In [21]:
def my_view_chart(model_name, model, X_train):
    if model_name == "RIDGE" :
        ax1 = plt.subplot(4,1,1)
        s = pd.Series(model.coef_, index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax1)
        plt.show()
    elif model_name == "RF" :
        ax2 = plt.subplot(4,1,2)
        s = pd.Series(model.feature_importances_, index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax2)
        plt.show()
    elif model_name == "XGB" :
        ax3 = plt.subplot(4,1,3)
        # Weight, Gain, Cover
        s = pd.Series(model.get_booster().get_score(importance_type='weight'), index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax3)
        plt.show()
    elif model_name == "LGBM" :
        ax4 = plt.subplot(4,1,4)  
        # Gain, Split
        #s = pd.Series(model.booster_.faeture_importance(importance_type="gain"), index=X_train.columns).sort_values()
        s = pd.Series(model.feature_importances_, index=X_train.columns).sort_values()
        sns.barplot(x=s.values, y=s.index, ax=ax4)
        plt.show()
            

In [22]:
# def my_fit_score(df,chart_view=False) :
#     model_list = [ 
#                    # ("RIDGE"  , Ridge(alpha=1.0, random_state=0)),
#                    # ("LASSO"  , Lasso(alpha=1.0, random_state=0)),
#                    ("DTR"    , DecisionTreeRegressor(random_state=0)),
#                    # ("RF"     , RandomForestRegressor(random_state=0)),
#                    # ("LR"     , LinearRegression()                   ),
#                    # ("ABOOST" , AdaBoostRegressor(random_state=0)                  ),
#                    ("XGB"    , XGBRegressor(random_state=0)                       ),  
#                    ("LGBM"   , LGBMRegressor(random_state=0)                      ),
#                    # ("VR-XGB-LGBM"    , VotingRegressor([("XGB", XGBRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) ) ,
#                    # ("VR-RF-LGBM"    , VotingRegressor([("DTR", RandomForestRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) )
#                  ]
    


#     #---------(타켓피쳐:로그스케일링) -----------
#     # y_c = df['casual'] 
#     # y_r = df['registered'] 
#     y_c = np.log1p( df['casual'] )
#     y_r = np.log1p( df['registered'] )
#     # print(y_c[:5], y_r[:5])
    
#     X = df.drop(['regcount','casual','registered'], axis=1)
    

    
    
#     kf = KFold(n_splits=5, shuffle=True, random_state=11)
#     for i, (train_index, test_index) in enumerate(kf.split(X)):
    
#         X_train = X.iloc[train_index]
#         X_test = X.iloc[test_index]
#         y_rtrain = y_r.iloc[train_index]
#         y_rtest = y_r.iloc[test_index]

        
        
#     for i, (train_index, test_index) in enumerate(kf.split(X)):
    
#         X_train = X.iloc[train_index]
#         X_test = X.iloc[test_index]
#         y_ctrain = y_c.iloc[train_index]
#         y_ctest = y_c.iloc[test_index]
    
    
    
# #     mydic  = {'min_samples_split':[1,2,3],
# #                 'min_samples_leaf':[1,2,3]}
# #     mydic = {'max_depth':[1,2,3]}
    
#     for tpl in model_list :
#         # print( tpl[0] ) 
        
#         # model = tpl[1]
#         # score_list = cross_val_score(model,X,y_r,scoring = 'neg_mean_squared_log_error',cv=5)
#         model = tpl[1]
#         if tpl[0] == 'DTR' or tpl[0]=='RF':
#             mydic  = {'min_samples_split':[1,2,3],
#                 'min_samples_leaf':[1,2,3]}
#         elif tpl[0] == 'XGB' or tpl[0]=='LGBM':
#             mydic = {'max_depth':[1,2,3]}

#         models = GridSearchCV(model, scoring = 'neg_mean_squared_log_error', cv=kf, param_grid = mydic) #9개 케이스의 모델들 생성

        
        
#         models.fit(X_train, y_ctrain)
#         y_pred = models.predict(X_test)
        
#         print("y_c : ", tpl[0] )
#         print(models.best_score_*-1)
        
        
        
#         models.fit(X_train, y_rtrain)
#         y_pred2 = models.predict(X_test)
        
#         print("y_r : " , tpl[0])
#         print(models.best_score_*-1)
        

        
        
#         y_pred_comb = np.expm1(y_pred)+np.expm1(y_pred2)
#         y_real_comb = np.expm1(y_rtest)+np.expm1(y_ctest)

#         msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)
#         print("RMSLE: ", np.sqrt(msle_score*-1))
    

# #     kf = KFold(n_splits=5, shuffle=True, random_state=11)
# #     for i, (train_index, test_index) in enumerate(kf.split(X)):

# #         X_train = X.iloc[train_index]
# #         X_test = X.iloc[test_index]
# #         y_rtrain = y_r.iloc[train_index]
# #         y_rtest = y_r.iloc[test_index]



# #     for i, (train_index, test_index) in enumerate(kf.split(X)):

# #         X_train = X.iloc[train_index]
# #         X_test = X.iloc[test_index]
# #         y_ctrain = y_c.iloc[train_index]
# #         y_ctest = y_c.iloc[test_index]




# #     rf = RandomForestRegressor()

# #     rf.fit(X_train, y_rtrain)
# #     y_pred = rf.predict(X_test)

# #     rf.fit(X_train, y_ctrain)
# #     y_pred2 = rf.predict(X_test)

# #     y_pred_comb = np.expm1(y_pred)+np.expm1(y_pred2)
# #     y_real_comb = np.expm1(y_rtest)+np.expm1(y_ctest)

# #     msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)
    
    
# #     sub = pd.read_csv('./sampleSubmission.csv')
# #     sub['count'] = y_pred_comb
# #     sub.to_csv('Submission1.csv')

# #     for tpl in model_list :
# #         print( tpl[0] ) 
        
# #         model = tpl[1]
# # #         X_train, X_test, y_train, y_r_test = train_test_split(X, y_r, random_state=0, test_size=0.2)
# # #         model.fit(X_train, y_train)
# # #         y_r_pred = model.predict(X_test)
        
# #         model = tpl[1]
# #         # X_train, X_test, y_train, y_c_test = train_test_split(X, y_c, random_state=0, test_size=0.2)
#         # model.fit(X_train, y_train)
#         # y_c_pred = model.predict(X_test)
        
#         #---------(타켓피쳐:로그스케일링) 복원-----------
#     y_pred_comb = np.expm1(y_r_pred)+np.expm1(y_c_pred)
#     y_real_comb = np.expm1(y_r_test)+np.expm1(y_c_test)
#     y_pred_comb[y_pred_comb < 0] = 0
        
# #         # print("yrtest", y_r_test[:5])
# #         # print("yctest", y_c_test[:5])
# #         # print("ypred", y_pred_comb[:5])
# #         msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)

# #         print("RMSLE: ", np.sqrt(msle_score)) 
# #         print("-"*30)
        
# #         feature_names = X_train.columns
# #         importances = model.feature_importances_
# #         sns.barplot(y=feature_names, x=importances, estimator=np.mean)
# #         plt.title("Feature importances")
# #         plt.show()
        
# #         s = pd.Series(data=model.feature_importances_,
# #         index=train.columns)
# #         s = s.sort_values(ascending=False)
# #         sns.barplot(x=s, y=s.index)


In [23]:
def my_fit_score(df, chart_view=False) :
    model_list = [ 
                   #("RIDGE"  , Ridge(alpha=1.0, random_state=0)),
                   #("LASSO"  , Lasso(alpha=1.0, random_state=0)),
                   ("DTR"    , DecisionTreeRegressor(random_state=0)),
                   ("RF"     , RandomForestRegressor(random_state=0)),
                   #("LR"     , LinearRegression()                   ),
                   #("ABOOST" , AdaBoostRegressor(random_state=0)                  ),
                   ("XGB"    , XGBRegressor(random_state=0)                       ),   #booster=gblinear
                   ("LGBM"   , LGBMRegressor(random_state=0)                      ),
                   # model.booster().get_score(importance_type='weight')
                   ("VR-XGB-LGBM"  , VotingRegressor([("XGB", XGBRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) ) ,
                   ("VR-RF-LGBM"   , VotingRegressor([("DTR", RandomForestRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) )
                 ]
   
    
    #---------(타켓피쳐:로그스케일링) -----------
    # y_c = df['casual'] 
    # y_r = df['registered'] 
    y_c = np.log1p( df['casual'] )
    y_r = np.log1p( df['registered'] )
    # print(y_c[:5], y_r[:5])
    
    X = df.drop(['regcount','casual','registered'], axis=1)
    

    for tpl in model_list :
        print( tpl[0] )         
        model = tpl[1]
        X_train, X_test, y_train, y_r_test = train_test_split(X, y_r, random_state=0, test_size=0.2)
        model.fit(X_train, y_train)  
        #------------------------------------------------
        # feature_importance 차트 그리기
        if bool(chart_view) : 
            my_view_chart(tpl[0], model, X_train)
        #------------------------------------------------
        y_r_pred = model.predict(X_test)
        
        
        model = tpl[1]
        X_train, X_test, y_train, y_c_test = train_test_split(X, y_c, random_state=0, test_size=0.2)
        model.fit(X_train, y_train)
        y_c_pred = model.predict(X_test)
        
        
        #---------(타켓피쳐:로그스케일링) 복원-----------
        y_pred_comb = np.expm1(y_r_pred)+np.expm1(y_c_pred)
        y_real_comb = np.expm1(y_r_test)+np.expm1(y_c_test)
        y_pred_comb[y_pred_comb < 0] = 0
        
        # print("yrtest", y_r_test[:5])
        # print("yctest", y_c_test[:5])
        # print("ypred", y_pred_comb[:5])
        msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)

        print("RMSLE: ", np.sqrt(msle_score)) 
        print("-"*30)
        

In [24]:
my_fit_score(train)

DTR
RMSLE:  0.4023520513244359
------------------------------
RF
RMSLE:  0.3054113795509723
------------------------------
XGB
RMSLE:  0.30162506914820986
------------------------------
LGBM
RMSLE:  0.29414778468571606
------------------------------
VR-XGB-LGBM
RMSLE:  0.28915645477161256
------------------------------
VR-RF-LGBM
RMSLE:  0.28951492740380846
------------------------------


In [25]:
# kf = KFold(n_splits=5, shuffle=True, random_state=11)
# for i, (train_index, test_index) in enumerate(kf.split(X)):
    
#     X_train = X.iloc[train_index]
#     X_test = X.iloc[test_index]
#     y_rtrain = y_r.iloc[train_index]
#     y_rtest = y_r.iloc[test_index]


        
# for i, (train_index, test_index) in enumerate(kf.split(X)):

#     X_train = X.iloc[train_index]
#     X_test = X.iloc[test_index]
#     y_ctrain = y_c.iloc[train_index]
#     y_ctest = y_c.iloc[test_index]
        
        

        
# rf = RandomForestRegressor()

# rf.fit(X_train, y_rtrain)
# y_pred = rf.predict(X_test)

# rf.fit(X_train, y_ctrain)
# y_pred2 = rf.predict(X_test)

# y_pred_comb = np.expm1(y_pred)+np.expm1(y_pred2)
# y_real_comb = np.expm1(y_rtest)+np.expm1(y_ctest)

# msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)

In [26]:
train.head()

Unnamed: 0_level_0,holiday,workingday,temp,humidity,windspeed,regcount,casual,registered,peak,ideal,sticky,season_1,season_2,season_3,season_4,weather_1,weather_2,weather_3,weather_4,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12,y_2011,y_2012,h_0,h_1,h_2,h_3,h_4,h_5,h_6,h_7,h_8,h_9,h_10,h_11,h_12,h_13,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23,w_0,w_1,w_2,w_3,w_4,w_5,w_6,day_type_0,day_type_1,day_type_2
regdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1
2011-01-01 00:00:00,0,0,2.383243,4.406719,1.933883,16.0,3.0,13.0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2011-01-01 01:00:00,0,0,2.304583,4.394449,1.976391,40.0,8.0,32.0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2011-01-01 02:00:00,0,0,2.304583,4.394449,1.976391,32.0,5.0,27.0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2011-01-01 03:00:00,0,0,2.383243,4.330733,1.952949,13.0,3.0,10.0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2011-01-01 04:00:00,0,0,2.383243,4.330733,1.952949,1.0,0.0,1.0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [27]:
test.head()

Unnamed: 0_level_0,holiday,workingday,temp,humidity,windspeed,peak,ideal,sticky,season_1,season_2,season_3,season_4,weather_1,weather_2,weather_3,weather_4,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12,y_2011,y_2012,h_0,h_1,h_2,h_3,h_4,h_5,h_6,h_7,h_8,h_9,h_10,h_11,h_12,h_13,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23,w_0,w_1,w_2,w_3,w_4,w_5,w_6,day_type_0,day_type_1,day_type_2
regdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
2011-01-20 00:00:00,0,1,2.456164,4.043051,3.295937,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 01:00:00,0,1,2.456164,4.043051,2.220872,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 02:00:00,0,1,2.456164,4.043051,2.220872,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 03:00:00,0,1,2.456164,4.043051,2.485023,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 04:00:00,0,1,2.456164,4.043051,2.485023,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [28]:
df.head()

Unnamed: 0_level_0,holiday,workingday,temp,humidity,windspeed,peak,ideal,sticky,season_1,season_2,season_3,season_4,weather_1,weather_2,weather_3,weather_4,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12,y_2011,y_2012,h_0,h_1,h_2,h_3,h_4,h_5,h_6,h_7,h_8,h_9,h_10,h_11,h_12,h_13,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23,w_0,w_1,w_2,w_3,w_4,w_5,w_6,day_type_0,day_type_1,day_type_2
regdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
2011-01-20 00:00:00,0,1,2.456164,4.043051,3.295937,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 01:00:00,0,1,2.456164,4.043051,2.220872,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 02:00:00,0,1,2.456164,4.043051,2.220872,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 03:00:00,0,1,2.456164,4.043051,2.485023,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2011-01-20 04:00:00,0,1,2.456164,4.043051,2.485023,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


## 점수보기
* RF  
RMSLE:  0.3288546947603633  : 원본  
RMSLE:  0.327712532218228   : 풍속채우기  
RMSLE:  0.3029460779805034  : 스케일링  

* VR-XGB-LGBM  
RMSLE:  0.2839304212238602  : 타겟스케일링  
RMSLE:  0.28362762650351697 : 연속형스케일링

In [29]:
my_fit_score(train)

DTR
RMSLE:  0.4023520513244359
------------------------------
RF
RMSLE:  0.3054113795509723
------------------------------
XGB
RMSLE:  0.30162506914820986
------------------------------
LGBM
RMSLE:  0.29414778468571606
------------------------------
VR-XGB-LGBM
RMSLE:  0.28915645477161256
------------------------------
VR-RF-LGBM
RMSLE:  0.28951492740380846
------------------------------


In [30]:
# df_count.head()

In [31]:
# def __get_feature_importances(self, train_columns):
#     feature_imp = dict()
#     for est in self.model.estimators_:
#         if type(est) == catboost.core.CatBoostRegressor:
#             feature_imp['catboost'] = dict(zip(train_columns, est.feature_importances_))
#         elif type(est) == lightgbm.sklearn.LGBMRegressor:
#             feature_imp['lgbm'] = dict(zip(train_columns, est.feature_importances_))
#         elif type(est) == xgboost.sklearn.XGBRegressor:
#             feature_imp['xgboost'] = dict(zip(train_columns, est.feature_importances_))
#     return feature_imp


In [32]:
# df[df['windspeed']==0].value_counts()

# 최종 점수

## X, y 분리
    - 위의 df는 모델분석을 위한 것으로 최종 분석은 tr로 다시 x,y 분리한다.

In [33]:
y_tr_c = train['casual']
y_tr_r = train['registered']
X_tr = train.drop(['regcount', 'casual', 'registered'], axis=1)
X_tr.shape, y_tr_c.shape, y_tr_r.shape

((10881, 64), (10881,), (10881,))

In [34]:
from sklearn.metrics import make_scorer


## casual(y_tr_c) predict

In [35]:
kf = KFold(n_splits=5, shuffle=True, random_state=11)
model = XGBRegressor(random_state=0)  
mydic = {'max_depth':[6,7,8], 'learning_rate':[0.2,0.3,0.4]}
models_xg = GridSearchCV(model, scoring='neg_mean_squared_error', cv=kf, param_grid = mydic)
models_xg.fit(X_tr, np.log1p(y_tr_c))
print(models_xg.best_params_)
print(models_xg.best_score_)  

# 예측
best_model = models_xg.best_estimator_
count_pre_c = best_model.predict(test)
print(count_pre_c[:5])

{'learning_rate': 0.2, 'max_depth': 6}
-0.24678429759534412
[ 0.652473    0.5134295   0.89904267  0.25458243 -0.14036226]


## registerd(y_tr_r) predict

In [36]:
kf = KFold(n_splits=5, shuffle=True, random_state=11)
model = XGBRegressor(random_state=0)  
mydic = {'max_depth':[6,7,8], 'learning_rate':[0.2,0.3,0.4]}
models_xg = GridSearchCV(model, scoring='neg_mean_squared_error', cv=kf, param_grid = mydic)
models_xg.fit(X_tr, np.log1p(y_tr_r))
print(models_xg.best_params_)
print(models_xg.best_score_)  

# 예측
best_model = models_xg.best_estimator_
count_pre_r = best_model.predict(test)
print(count_pre_r[:5])

{'learning_rate': 0.2, 'max_depth': 7}
-0.09053344469015082
[2.3796496 1.5878246 1.3242908 1.2770926 1.0358442]


# 제출

In [37]:
correct = pd.read_csv("sampleSubmission.csv")

In [38]:
test_pred_xgb  = np.expm1(count_pre_c) + np.expm1(count_pre_r)

test_pred_xgb[test_pred_xgb < 0] = 0

In [39]:
correct['count'] = test_pred_xgb*0.2 + test_pred_xgb*0.8
correct.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,10.721401
1,2011-01-20 01:00:00,4.564105
2,2011-01-20 02:00:00,4.216767
3,2011-01-20 03:00:00,2.876121
4,2011-01-20 04:00:00,1.686527


## 답지 생성

In [40]:
correct.to_csv("c02.csv", index=False)
co = pd.read_csv("c02.csv")
co.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,10.721401
1,2011-01-20 01:00:00,4.564105
2,2011-01-20 02:00:00,4.216767
3,2011-01-20 03:00:00,2.876121
4,2011-01-20 04:00:00,1.686527


## Outlier 처리
* <font color=red size=4><b>train에만 처리

<pre>
RIDGE
RMSLE:  1.2381857382907928
------------------------------
LASSO
RMSLE:  1.225011162068078
------------------------------
DTR
RMSLE:  0.40846452875216704
------------------------------
RF
RMSLE:  0.33285798378220866
------------------------------
LR
RMSLE:  1.2382780216954419
------------------------------
ABOOST
RMSLE:  1.115326753305681
------------------------------
XGB
RMSLE:  0.4611753550135809
------------------------------
LGBM
RMSLE:  0.3999441627321239
------------------------------
VR-XGB-LGBM
RMSLE:  0.3877526497892151
------------------------------
VR-RF-LGBM
RMSLE:  0.34299001906970344
------------------------------

In [41]:
train.shape, test.shape

((10881, 67), (6493, 64))

In [42]:
# del_idx_list = []
# idx = train[train['weather']==4].index
# del_idx_list.extend(idx)
# idx = train[train['temp']>40].index
# del_idx_list.extend(idx)
# idx = train[train['windspeed']>50].index
# del_idx_list.extend(idx)
# train.drop(del_idx_list, axis=0, inplace=True)

In [43]:
train.shape, test.shape

((10881, 67), (6493, 64))

## 점수보기

In [44]:
# my_fit_score(train)

# 학습

## 타켓 선정
* 답안지 : count 제출
* regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
* <font color=red><b>registered(A2패턴) + casual (B패턴)  --> 이 값을 답안으로 제출

# 최종피쳐 선정

In [45]:
train.columns

Index(['holiday', 'workingday', 'temp', 'humidity', 'windspeed', 'regcount',
       'casual', 'registered', 'peak', 'ideal', 'sticky', 'season_1',
       'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2',
       'weather_3', 'weather_4', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6',
       'm_7', 'm_8', 'm_9', 'm_10', 'm_11', 'm_12', 'y_2011', 'y_2012', 'h_0',
       'h_1', 'h_2', 'h_3', 'h_4', 'h_5', 'h_6', 'h_7', 'h_8', 'h_9', 'h_10',
       'h_11', 'h_12', 'h_13', 'h_14', 'h_15', 'h_16', 'h_17', 'h_18', 'h_19',
       'h_20', 'h_21', 'h_22', 'h_23', 'w_0', 'w_1', 'w_2', 'w_3', 'w_4',
       'w_5', 'w_6', 'day_type_0', 'day_type_1', 'day_type_2'],
      dtype='object')

<pre>
타겟(1) : 'casual' + 'registered'
타겟(2) : 'regcount'


'd' 피쳐 삭제
('m', 'h'),  'y', 'd' -->'w'


(+)3개 : 'holiday', 'workingday', 'day_type'
(+)2개 : 'holiday', 'workingday'
(+)1개 : 'day_type'


'weather', 'humidity',  'season', 
(+)'windspeed' : 'h' 비례? / 'm' 반비례?


다중공선 : (상관계수가 높은 경우)
'regcount' 'registered' 'casual' : 0.97  --> 'regcount' vs. 'registered'+'casual'
'temp' 'atemp'                   : 0.98  --> 'atemp' drop

선택적으로 모델에 적용
(+)'w' 'day_type' : -0.78
(+)'season' 'm'   : 0.97


* ('m', 'h'),  'y', 'w', 'weather', 'humidity',  'season', 'temp'
* (+)'windspeed'
* (+)('holiday', 'workingday'), ('day_type')
* (+)('w' 'day_type') : -0.78
* (+)('m') 'season'    : 0.97


## 점수보기

# 학습 & 평가
* ref : https://suboptimal.wiki/explanation/mse/

*  $ RMSLE = \sqrt{\frac{1}{n}\Sigma_{i=1}^{n}{\Big(\frac{log(Y_i+1) - log(\hat{Y_i}+1)}{N}\Big)^2}}$

In [46]:
# ! pip install xgboost
# ! pip install lightgbm 

In [47]:
m
        score_list.append([tpl[0], y_col[i], mse_score, np.sqrt(mse_score) ] )

IndentationError: unexpected indent (4179191000.py, line 2)

In [None]:
score_df = pd.DataFrame(score_list, columns=["model","col","mse","rmse"])
score_df

In [None]:
plt.figure(figsize=(8,3))
ax1 = plt.subplot(1,2,1)
ax1.set_title("RMSE")
sns.barplot(
    data= score_df,
    x= "model",
    y= "rmse" ,
    hue = "col",
    ax = ax1
    
)
ax2 = plt.subplot(1,2,2)
ax2.set_title("MSE")
sns.barplot(
    data= score_df,
    x= "model",
    y= "mse" ,
    hue = "col",
    ax = ax2
)
plt.show()

* by 규환

In [None]:
y = train[['regcount','casual','registered']]
X = train.drop(['regcount','casual','registered'], axis=1)


In [None]:
X.head(), y.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
print( y_test.shape )
y_test.head(3)

In [None]:
print( y_pred.shape )

In [None]:
y_pred[0]

In [None]:
print( y_pred.T[0].shape )
y_pred.T

In [None]:
y_test['casual']

In [None]:
# regcount(A패턴)  = registered(A2패턴) + casual (B패턴) 
score_regcount   = mean_squared_error(y_test['regcount']    , y_pred.T[0])
score_casual     = mean_squared_error(y_test['casual'], y_pred.T[1])
score_registered = mean_squared_error(y_test['registered']  , y_pred.T[2])

print(score_regcount, score_casual, score_registered)