# [한양대 ERICA] 퇴근시간 버스승차인원 예측 경진대회
## Quisst

### 전처리

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride
0,0,2019-09-01,4270000,시외,344,제주썬호텔,0.0,1.0,2.0,5.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2019-09-01,4270000,시외,357,한라병원,1.0,4.0,4.0,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2,2019-09-01,4270000,시외,432,정존마을,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,3,2019-09-01,4270000,시내,1579,제주국제공항(600번),0.0,17.0,6.0,26.0,14.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
4,4,2019-09-01,4270000,시내,1646,중문관광단지입구,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [3]:
# 세부적인 탑승/하차 column 생성

train['6~8_ride'] = train['6~7_ride'] + train['7~8_ride']
train['8~10_ride'] = train['8~9_ride'] + train['9~10_ride']
train['10~12_ride'] = train['10~11_ride'] + train['11~12_ride']
train['6~12_ride'] = train['6~8_ride'] + train['8~10_ride'] + train['10~12_ride']

train['6~8_takeoff'] = train['6~7_takeoff'] + train['7~8_takeoff']
train['8~10_takeoff'] = train['8~9_takeoff'] + train['9~10_takeoff']
train['10~12_takeoff'] = train['10~11_takeoff'] + train['11~12_takeoff']
train['6~12_takeoff'] = train['6~8_takeoff'] + train['8~10_takeoff'] + train['10~12_takeoff']

In [4]:
# 요일 뽑아내고 원핫 인코딩

train['date'] = pd.to_datetime(train['date'])
train['weekday'] = train['date'].dt.weekday
train = pd.get_dummies(train,columns=['weekday'])
train['weekday'] = train['date'].dt.weekday

test['date'] = pd.to_datetime(test['date'])
test['weekday'] = test['date'].dt.weekday
test = pd.get_dummies(test,columns=['weekday'])
test['weekday'] = test['date'].dt.weekday

In [5]:
# 데이터 인코딩

from sklearn.preprocessing import LabelEncoder

bus_route_id_encoded = LabelEncoder().fit(pd.concat([train['bus_route_id'], test['bus_route_id']]))
station_name_encoded = LabelEncoder().fit(pd.concat([train['station_name'], test['station_name']]))

train['bus_route_id_encoded'] = bus_route_id_encoded.transform(train['bus_route_id'])
train['station_name_encoded'] = station_name_encoded.transform(train['station_name'])

test['bus_route_id_encoded'] = bus_route_id_encoded.transform(test['bus_route_id'])
test['station_name_encoded'] = station_name_encoded.transform(test['station_name'])

In [6]:
train['bus_route_id_station_name'] = train['bus_route_id'].astype(str) + train['station_name']
test['bus_route_id_station_name'] = test['bus_route_id'].astype(str) + test['station_name']

In [7]:
bus_route_id_station_name_encoded = LabelEncoder().fit(pd.concat([train['bus_route_id_station_name'], test['bus_route_id_station_name']]))

train['bus_route_id_station_name_encoded'] = bus_route_id_station_name_encoded.transform(train['bus_route_id_station_name'])
test['bus_route_id_station_name_encoded'] = bus_route_id_station_name_encoded.transform(test['bus_route_id_station_name'])

In [8]:
# 시내/시외 구분

train['in_out'] = train['in_out'].map({'시내':0,'시외':1})
test['in_out'] = test['in_out'].map({'시내':0,'시외':1})

In [9]:
# 시내/시외의 18~20시 탑승자 수의 평균 구하기

train['in_out_mean'], test['in_out_mean'] = 0, 0

in_out_0s_train = train[train['in_out'] == 0].index
in_out_1s_train = train[train['in_out'] == 1].index
in_out_0s_test = test[test['in_out'] == 0].index
in_out_1s_test = test[test['in_out'] == 1].index

train.loc[in_out_0s_train, 'in_out_mean'] = train[train['in_out'] == 0]['18~20_ride'].mean()
train.loc[in_out_1s_train, 'in_out_mean'] = train[train['in_out'] == 1]['18~20_ride'].mean()
test.loc[in_out_0s_test, 'in_out_mean'] = train[train['in_out'] == 0]['18~20_ride'].mean()
test.loc[in_out_1s_test, 'in_out_mean'] = train[train['in_out'] == 1]['18~20_ride'].mean()

In [10]:
# 정류장 이름에 '공항'을 포함하는 정류장 분류

a = train[train['station_name'].str.contains('공항')]['18~20_ride']
a

3         53.0
55        10.0
65        73.0
74         0.0
110       42.0
          ... 
414535     5.0
415091     4.0
415115     1.0
415116     5.0
415361     0.0
Name: 18~20_ride, Length: 2831, dtype: float64

In [11]:
def is_airport(x):
    if str(x).find("공항") != -1 :
        return 1
    return 0

In [12]:
train['is_airport'] = train['station_name'].apply(is_airport)
train['is_airport']
test['is_airport'] = test['station_name'].apply(is_airport)
test['is_airport']

0         0
1         0
2         0
3         1
4         0
         ..
228165    0
228166    0
228167    0
228168    0
228169    0
Name: is_airport, Length: 228170, dtype: int64

In [13]:
# 주말 분류하기

def is_weekend(x):
    if x == 1:
        return x
    else:
        return 0

In [14]:
train['is_weekend'] = train['weekday_5'].apply(is_weekend)
train['is_weekend'] += train['weekday_6'].apply(is_weekend)

test['is_weekend'] = test['weekday_5'].apply(is_weekend)
test['is_weekend'] += test['weekday_6'].apply(is_weekend)

In [15]:
# 주중 분류하기

def is_weekday(x):
    if x == 1:
        return 0
    else:
        return 1

In [16]:
train['is_weekday'] = train['is_weekend'].apply(is_weekday)
test['is_weekday'] = test['is_weekend'].apply(is_weekday)

In [17]:
# 각 요일의 18~20시 탑승자 수의 평균 구하기

train['day_mean'], test['day_mean'] = 0, 0

mon_train = train[train['weekday_0'] == 1].index
tues_train = train[train['weekday_1'] == 1].index
wednes_train = train[train['weekday_2'] == 1].index
thurs_train = train[train['weekday_3'] == 1].index
fri_train = train[train['weekday_4'] == 1].index
satur_train = train[train['weekday_5'] == 1].index
sun_train = train[train['weekday_6'] == 1].index

mon_test = test[test['weekday_0'] == 1].index
tues_test = test[test['weekday_1'] == 1].index
wednes_test = test[test['weekday_2'] == 1].index
thurs_test = test[test['weekday_3'] == 1].index
fri_test = test[test['weekday_4'] == 1].index
satur_test = test[test['weekday_5'] == 1].index
sun_test = test[test['weekday_6'] == 1].index

train.loc[mon_train, 'day_mean'] = train[train['weekday_0'] == 1]['18~20_ride'].mean()
train.loc[tues_train, 'day_mean'] = train[train['weekday_1'] == 1]['18~20_ride'].mean()
train.loc[wednes_train, 'day_mean'] = train[train['weekday_2'] == 1]['18~20_ride'].mean()
train.loc[thurs_train, 'day_mean'] = train[train['weekday_3'] == 1]['18~20_ride'].mean()
train.loc[fri_train, 'day_mean'] = train[train['weekday_4'] == 1]['18~20_ride'].mean()
train.loc[satur_train, 'day_mean'] = train[train['weekday_5'] == 1]['18~20_ride'].mean()
train.loc[sun_train, 'day_mean'] = train[train['weekday_6'] == 1]['18~20_ride'].mean()

test.loc[mon_test, 'day_mean'] = train[train['weekday_0'] == 1]['18~20_ride'].mean()
test.loc[tues_test, 'day_mean'] = train[train['weekday_1'] == 1]['18~20_ride'].mean()
test.loc[wednes_test, 'day_mean'] = train[train['weekday_2'] == 1]['18~20_ride'].mean()
test.loc[thurs_test, 'day_mean'] = train[train['weekday_3'] == 1]['18~20_ride'].mean()
test.loc[fri_test, 'day_mean'] = train[train['weekday_4'] == 1]['18~20_ride'].mean()
test.loc[satur_test, 'day_mean'] = train[train['weekday_5'] == 1]['18~20_ride'].mean()
test.loc[sun_test, 'day_mean'] = train[train['weekday_6'] == 1]['18~20_ride'].mean()

In [49]:
# 각 버스 노선의 18~20시 탑승자 수의 평균 구하기

def bus_route_id_ride_mean(x):
    x = train[train['bus_route_id'] == x]['18~20_ride'].mean()
    return x

train['bus_route_id_ride_mean'] = train['bus_route_id'].progress_apply(bus_route_id_ride_mean)
test['bus_route_id_ride_mean'] = test['bus_route_id'].progress_apply(bus_route_id_ride_mean)

100%|█████████████████████████████████████████████████████████████████████████| 415423/415423 [07:35<00:00, 911.48it/s]
100%|█████████████████████████████████████████████████████████████████████████| 228170/228170 [04:09<00:00, 914.87it/s]


In [50]:
# 각 정류장의 18~20시 탑승자 수의 평균 구하기

def station_name_ride_mean(x):
    x = train[train['station_name_encoded'] == x]['18~20_ride'].mean()
    return x

train['station_name_ride_mean'] = train['station_name_encoded'].progress_apply(station_name_ride_mean)
test['station_name_ride_mean'] = test['station_name_encoded'].progress_apply(station_name_ride_mean)

100%|█████████████████████████████████████████████████████████████████████████| 415423/415423 [08:16<00:00, 836.29it/s]
100%|█████████████████████████████████████████████████████████████████████████| 228170/228170 [04:31<00:00, 841.12it/s]


In [51]:
# 각 정류장 코드의 18~20시 탑승자 수의 평균 구하기

def station_code_ride_mean(x):
    x = train[train['station_code'] == x]['18~20_ride'].mean()
    return x

train['station_code_ride_mean'] = train['station_code'].progress_apply(station_code_ride_mean)
test['station_code_ride_mean'] = test['station_code'].progress_apply(station_code_ride_mean)

100%|█████████████████████████████████████████████████████████████████████████| 415423/415423 [07:30<00:00, 922.14it/s]
100%|█████████████████████████████████████████████████████████████████████████| 228170/228170 [04:09<00:00, 914.49it/s]


In [52]:
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,6~7_ride,7~8_ride,8~9_ride,9~10_ride,...,bus_route_id_station_name,bus_route_id_station_name_encoded,in_out_mean,is_airport,is_weekend,is_weekday,day_mean,bus_route_id_ride_mean,station_name_ride_mean,station_code_ride_mean
0,0,2019-09-01,4270000,1,344,제주썬호텔,0.0,1.0,2.0,5.0,...,4270000제주썬호텔,28464,2.044345,0,1,0,1.034282,3.104381,0.964286,1.466667
1,1,2019-09-01,4270000,1,357,한라병원,1.0,4.0,4.0,2.0,...,4270000한라병원,28476,2.044345,0,1,0,1.034282,3.104381,4.46208,4.178218
2,2,2019-09-01,4270000,1,432,정존마을,1.0,1.0,0.0,2.0,...,4270000정존마을,28459,2.044345,0,1,0,1.034282,3.104381,1.84381,2.169559
3,3,2019-09-01,4270000,0,1579,제주국제공항(600번),0.0,17.0,6.0,26.0,...,4270000제주국제공항(600번),28460,1.228499,1,1,0,1.034282,3.104381,52.032258,52.032258
4,4,2019-09-01,4270000,0,1646,중문관광단지입구,0.0,0.0,0.0,0.0,...,4270000중문관광단지입구,28467,1.228499,0,1,0,1.034282,3.104381,1.509294,0.732794


In [53]:
train.columns

Index(['id', 'date', 'bus_route_id', 'in_out', 'station_code', 'station_name',
       '6~7_ride', '7~8_ride', '8~9_ride', '9~10_ride', '10~11_ride',
       '11~12_ride', '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
       '9~10_takeoff', '10~11_takeoff', '11~12_takeoff', '18~20_ride',
       '6~8_ride', '8~10_ride', '10~12_ride', '6~12_ride', '6~8_takeoff',
       '8~10_takeoff', '10~12_takeoff', '6~12_takeoff', 'weekday_0',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday', 'bus_route_id_encoded', 'station_name_encoded',
       'bus_route_id_station_name', 'bus_route_id_station_name_encoded',
       'in_out_mean', 'is_airport', 'is_weekend', 'is_weekday', 'day_mean',
       'bus_route_id_ride_mean', 'station_name_ride_mean',
       'station_code_ride_mean'],
      dtype='object')

In [54]:
input_var=['in_out', 'station_code',
       '6~7_ride', '7~8_ride', '8~9_ride', '9~10_ride', '10~11_ride',
       '11~12_ride', '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
       '9~10_takeoff', '10~11_takeoff', '11~12_takeoff',
       '6~8_ride', '8~10_ride', '10~12_ride', '6~12_ride', '6~8_takeoff',
       '8~10_takeoff', '10~12_takeoff', '6~12_takeoff', 'weekday_0',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'bus_route_id_encoded', 'station_name_encoded',
       'bus_route_id_station_name_encoded',
       'in_out_mean', 'is_airport', 'is_weekend', 'is_weekday', 'day_mean',
       'bus_route_id_ride_mean', 'station_name_ride_mean',
       'station_code_ride_mean']
target=['18~20_ride']

In [55]:
X_train=train[input_var]
y_train=train[target]

X_test_pred=test[input_var]

### Hyper Parameter Tuning & Training

In [56]:
!pip install bayesian_optimization

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization



In [57]:
# hyper parameter tuning (Bayesian Optimization)

train_dataset = lgb.Dataset(data=X_train, label=y_train)
init_iter = 5
n_iters = 10

def hyp_lgbm(num_leaves, max_depth, feature_fraction, bagging_fraction, num_iterations):
      
    params = {
        'application':'regression',
        'early_stopping_round': 100,
        'metric':'rmse',
        'verbose': -1,
        'n_jobs': -1,
        'learning_rate': 0.003
    }
    params['num_iterations'] = int(round(num_iterations))
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['num_leaves'] = int(round(num_leaves))
    result = lgb.cv(params, train_dataset, nfold=5, stratified=False, verbose_eval=1000, metrics=['rmse'], seed=23)

    return -np.min(result['rmse-mean'])

In [58]:
# tuning할 hyper parameter들의 범위 지정

pds = {
    'max_depth': (4, 100),
    'num_leaves': (20, 1000),
    'feature_fraction': (0.7, 0.9),
    'bagging_fraction': (0.7, 0.9),
    'num_iterations': (1000, 8000)
}
optimizer = BayesianOptimization(hyp_lgbm, pds, random_state=23)
optimizer.maximize(init_points=init_iter, n_iter=n_iters, acq='ei', random_state=23)

|   iter    |  target   | baggin... | featur... | max_depth | num_it... | num_le... |
-------------------------------------------------------------------------------------




[1000]	cv_agg's rmse: 2.41655 + 0.0517794
[2000]	cv_agg's rmse: 2.3573 + 0.0461467
| [0m 1       [0m | [0m-2.354   [0m | [0m 0.8035  [0m | [0m 0.8894  [0m | [0m 77.48   [0m | [0m 2.977e+0[0m | [0m 236.6   [0m |
[1000]	cv_agg's rmse: 2.40017 + 0.0458181
[2000]	cv_agg's rmse: 2.34938 + 0.043844
| [95m 2       [0m | [95m-2.349   [0m | [95m 0.8372  [0m | [95m 0.7334  [0m | [95m 41.67   [0m | [95m 5.326e+0[0m | [95m 423.7   [0m |
[1000]	cv_agg's rmse: 2.39446 + 0.0495302
| [0m 3       [0m | [0m-2.358   [0m | [0m 0.7005  [0m | [0m 0.8768  [0m | [0m 88.95   [0m | [0m 3.103e+0[0m | [0m 597.8   [0m |
[1000]	cv_agg's rmse: 2.42767 + 0.0490506
| [0m 4       [0m | [0m-2.398   [0m | [0m 0.8957  [0m | [0m 0.869   [0m | [0m 10.25   [0m | [0m 3.063e+0[0m | [0m 302.2   [0m |
[1000]	cv_agg's rmse: 2.39295 + 0.0505962
| [0m 5       [0m | [0m-2.392   [0m | [0m 0.8645  [0m | [0m 0.8252  [0m | [0m 14.61   [0m | [0m 1.004e+0[0m | [0m 943.3 



[1000]	cv_agg's rmse: 2.39341 + 0.0455905
| [0m 6       [0m | [0m-2.351   [0m | [0m 0.8856  [0m | [0m 0.7834  [0m | [0m 88.82   [0m | [0m 3.092e+0[0m | [0m 598.5   [0m |




[1000]	cv_agg's rmse: 2.39263 + 0.0457792
| [0m 7       [0m | [0m-2.351   [0m | [0m 0.7231  [0m | [0m 0.7744  [0m | [0m 84.38   [0m | [0m 3.121e+0[0m | [0m 608.1   [0m |




[1000]	cv_agg's rmse: 2.39275 + 0.0482449
| [0m 8       [0m | [0m-2.354   [0m | [0m 0.8188  [0m | [0m 0.8138  [0m | [0m 83.79   [0m | [0m 3.079e+0[0m | [0m 600.2   [0m |




[1000]	cv_agg's rmse: 2.3946 + 0.0454544
[2000]	cv_agg's rmse: 2.34698 + 0.0450368
| [95m 9       [0m | [95m-2.347   [0m | [95m 0.8893  [0m | [95m 0.7343  [0m | [95m 95.7    [0m | [95m 3.078e+0[0m | [95m 611.3   [0m |




[1000]	cv_agg's rmse: 2.39214 + 0.0467906
[2000]	cv_agg's rmse: 2.34916 + 0.0456074
| [0m 10      [0m | [0m-2.349   [0m | [0m 0.7871  [0m | [0m 0.7837  [0m | [0m 84.2    [0m | [0m 3.087e+0[0m | [0m 589.0   [0m |




[1000]	cv_agg's rmse: 2.39449 + 0.047093
| [0m 11      [0m | [0m-2.349   [0m | [0m 0.8787  [0m | [0m 0.758   [0m | [0m 76.26   [0m | [0m 3.121e+0[0m | [0m 619.5   [0m |




[1000]	cv_agg's rmse: 2.39311 + 0.0463491
| [95m 12      [0m | [95m-2.347   [0m | [95m 0.8584  [0m | [95m 0.7225  [0m | [95m 72.29   [0m | [95m 3.079e+0[0m | [95m 601.6   [0m |




[1000]	cv_agg's rmse: 2.41732 + 0.0504164
[2000]	cv_agg's rmse: 2.35502 + 0.0458013
| [0m 13      [0m | [0m-2.352   [0m | [0m 0.868   [0m | [0m 0.7309  [0m | [0m 89.69   [0m | [0m 2.974e+0[0m | [0m 234.6   [0m |




[1000]	cv_agg's rmse: 2.39187 + 0.0486199
| [0m 14      [0m | [0m-2.354   [0m | [0m 0.7281  [0m | [0m 0.8851  [0m | [0m 96.41   [0m | [0m 3.105e+0[0m | [0m 585.4   [0m |




[1000]	cv_agg's rmse: 2.39234 + 0.0456008
[2000]	cv_agg's rmse: 2.34491 + 0.0445507
| [95m 15      [0m | [95m-2.345   [0m | [95m 0.822   [0m | [95m 0.7343  [0m | [95m 65.67   [0m | [95m 3.116e+0[0m | [95m 605.2   [0m |


In [59]:
# tuning된 hyper parameter 확인

optimizer.max['params']

{'bagging_fraction': 0.8219650047349388,
 'feature_fraction': 0.7343197162276959,
 'max_depth': 65.66801926212803,
 'num_iterations': 3115.7913990928723,
 'num_leaves': 605.1839699481119}

In [60]:
# LGBM으로 학습

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=23)
train_dataset = lgb.Dataset(X_train, label=y_train)
test_dataset = lgb.Dataset(X_test, label=y_test)

params = {
    'bagging_fraction': 0.8219650047349388,
    'feature_fraction': 0.7343197162276959,
    'max_depth': int(round(65.66801926212803)),
    'num_iterations': int(round(3115.7913990928723)),
    'num_leaves': int(round(605.1839699481119)),
    'learning_rate': 0.003,
    #'boosting': 'gbdt',
    'boosting': 'dart',
    'metric': 'rmse',
    'objective': 'regression',
    'verbose': -1,
    'n_jobs': -1
}

model = lgb.train(params, train_dataset, 3116, test_dataset, verbose_eval=500, early_stopping_rounds=100)



[500]	valid_0's rmse: 3.57779
[1000]	valid_0's rmse: 3.12015
[1500]	valid_0's rmse: 2.87398
[2000]	valid_0's rmse: 2.68873
[2500]	valid_0's rmse: 2.56455
[3000]	valid_0's rmse: 2.52062


In [61]:
# 학습 결괏값 저장

y_pred = model.predict(X_test_pred)
y_pred

array([ 3.01399986e+00,  6.63235608e+00,  1.96617361e+00, ...,
        5.03494492e-02,  5.52224213e-02, -1.50027541e-03])

In [62]:
# 0 미만의 값은 0으로 처리

def return_0(x):
    if x < 0:
        return 0
    else:
        return x

In [63]:
# 저장

submission = pd.read_csv('data/submission_sample.csv')
submission['18~20_ride'] = y_pred
submission['18~20_ride'] = submission['18~20_ride'].apply(return_0)
submission.to_csv("last_model_v60.csv",index=False)