## 버스 운행시간 데이터를 활용한 데이터 분석
- 버스가 다음 정류장에 도착하기 위해서 걸릴 시간을 예측

## 1. 라이브러리 및 데이터
## Library & Data

In [1]:
## matplotlib 사용시 한국어 나오게 하는 코드

!apt -qq -y install fonts-nanum > /dev/null

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'

font = fm.FontProperties(fname=fontpath, size=9)

# 그래프에 retina display 적용

%config InlineBackend.figure_format = 'retina'

# Colab 의 한글 폰트 설정

plt.rc('font', family='NanumBarunGothic') 







In [2]:
import pandas as pd #판다스 패키지 불러오기
import lightgbm as lgb
import folium

In [3]:
# 데이터 불러오기
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
submission = pd.read_csv('/content/submission_제출양식.csv')

## 2. 탐색적 자료분석
## Exploratory Data Analysis (EDA)


### pd.DataFrame.head()
 - 데이터 프레임의 위에서 부터 n개 행을 보여주는 함수
 - n의 기본 값(default 값)은 5

In [9]:
train.head()

Unnamed: 0,id,date,route_id,vh_id,route_nm,now_latitude,now_longitude,now_station,now_arrive_time,distance,next_station,next_latitude,next_longitude,next_arrive_time
0,0,2019-10-15,405136001,7997025,360-1,33.456267,126.55175,제주대학교입구,06시,266.0,제대마을,33.457724,126.554014,24
1,1,2019-10-15,405136001,7997025,360-1,33.457724,126.554014,제대마을,06시,333.0,제대아파트,33.458783,126.557353,36
2,2,2019-10-15,405136001,7997025,360-1,33.458783,126.557353,제대아파트,06시,415.0,제주대학교,33.459893,126.561624,40
3,3,2019-10-15,405136001,7997025,360-1,33.479705,126.543811,남국원(아라방면),06시,578.0,제주여자중고등학교(아라방면),33.48486,126.542928,42
4,4,2019-10-15,405136001,7997025,360-1,33.485662,126.494923,도호동,07시,374.0,은남동,33.485822,126.490897,64


+ id : 고유 id
+ date : 버스 운행 날짜
+ route_id : 버스 노선 ID
+ vh_id : 버스 id
+ route_nm : 버스 노선 실제 번호
+ now_latitude : 현재 정류소의 위도
+ now_longitude : 현재 정류소의 경도
+ now_station : 현재 정류소 이름
+ now_arrive_time : 현재 정류장에 도착한 시간
+ distance : 현재 정류장에서 다음 정류장까지 실제 이동한 거리
+ next_station : 다음 정류소 이름
+ next_latitude : 다음 정류소의 위도
+ next_longitude : 다음 정류소의 경도
+ next_arrive_time : 다음 정류장에 도착할 때 까지 걸린 시간(단위:초)으로 답안 제출을 위해서 예측해야 되는 값



In [8]:
test.head()

Unnamed: 0,id,date,route_id,vh_id,route_nm,now_latitude,now_longitude,now_station,now_arrive_time,distance,next_station,next_latitude,next_longitude
0,210457,2019-10-29,405136001,7997025,360-1,33.457724,126.554014,제대마을,07시,333.0,제대아파트,33.458783,126.557353
1,210458,2019-10-29,405136001,7997025,360-1,33.458783,126.557353,제대아파트,07시,415.0,제주대학교,33.459893,126.561624
2,210459,2019-10-29,405136001,7997025,360-1,33.478867,126.483833,한라중학교/부영아파트,08시,417.0,대림2차아파트,33.47885,126.48835
3,210460,2019-10-29,405136001,7997025,360-1,33.47885,126.48835,대림2차아파트,08시,334.0,연동대림1차아파트,33.4807,126.489933
4,210461,2019-10-29,405136001,7997025,360-1,33.4807,126.489933,연동대림1차아파트,08시,550.0,케이티앤지,33.482077,126.485355


In [7]:
submission.head()

Unnamed: 0,id,next_arrive_time
0,210457,0
1,210458,0
2,210459,0
3,210460,0
4,210461,0


### pd.DataFrame.tail()
 - 데이터 프레임의 아래에서 부터 n개 행을 보여주는 함수
 - n의 기본 값(default 값)은 5

In [6]:
train.tail()

Unnamed: 0,id,date,route_id,vh_id,route_nm,now_latitude,now_longitude,now_station,now_arrive_time,distance,next_station,next_latitude,next_longitude,next_arrive_time
210452,210452,2019-10-28,405328102,7983486,281-2,33.255783,126.57745,비석거리,21시,528.0,삼아아파트,33.251896,126.574417,96
210453,210453,2019-10-28,405328102,7983486,281-2,33.248595,126.568527,동문로터리,21시,280.0,매일올레시장 7번입구,33.249753,126.565959,50
210454,210454,2019-10-28,405328102,7983486,281-2,33.251891,126.560303,서귀포시 구 버스터미널,21시,114.0,아랑조을거리 입구,33.251084,126.559551,16
210455,210455,2019-10-28,405328102,7983486,281-2,33.251084,126.559551,아랑조을거리 입구,21시,223.0,평생학습관,33.249504,126.558068,38
210456,210456,2019-10-28,405328102,7983486,281-2,33.248487,126.511195,대륜동주민센터,21시,189.0,서귀포버스터미널(가상정류소),33.249091,126.509224,24


In [5]:
print(train['date'].min())
print(train['date'].max())
print(test['date'].min())
print(test['date'].max())

2019-10-15
2019-10-28
2019-10-29
2019-11-05


### pd.DataFrame.shape
 - 데이터 프레임의 행의 개수와 열의 개수가 저장되어 있는 속성(attribute)

In [4]:
print(train.shape)
print(test.shape)
print(submission.shape)

(210457, 14)
(91774, 13)
(91774, 2)


### pd.DataFrame.info()
- 데이터셋의 column별 정보를 알려주는 함수
- 비어 있지 않은 값은 (non-null)은 몇개인지?
- column의 type은 무엇인지?
 - type의 종류 : int(정수), float(실수), object(문자열), 등등 (date, ...)

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210457 entries, 0 to 210456
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                210457 non-null  int64  
 1   date              210457 non-null  object 
 2   route_id          210457 non-null  int64  
 3   vh_id             210457 non-null  int64  
 4   route_nm          210457 non-null  object 
 5   now_latitude      210457 non-null  float64
 6   now_longitude     210457 non-null  float64
 7   now_station       210457 non-null  object 
 8   now_arrive_time   210457 non-null  object 
 9   distance          210457 non-null  float64
 10  next_station      210457 non-null  object 
 11  next_latitude     210457 non-null  float64
 12  next_longitude    210457 non-null  float64
 13  next_arrive_time  210457 non-null  int64  
dtypes: float64(5), int64(4), object(5)
memory usage: 22.5+ MB


In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91774 entries, 0 to 91773
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               91774 non-null  int64  
 1   date             91774 non-null  object 
 2   route_id         91774 non-null  int64  
 3   vh_id            91774 non-null  int64  
 4   route_nm         91774 non-null  object 
 5   now_latitude     91774 non-null  float64
 6   now_longitude    91774 non-null  float64
 7   now_station      91774 non-null  object 
 8   now_arrive_time  91774 non-null  object 
 9   distance         91774 non-null  float64
 10  next_station     91774 non-null  object 
 11  next_latitude    91774 non-null  float64
 12  next_longitude   91774 non-null  float64
dtypes: float64(5), int64(3), object(5)
memory usage: 9.1+ MB


### pd.DataFrame.describe()
- 숫자형 (int, float) column들의 기술 통계량을 보여주는 함수

- 기술통계량이란?
 - 해당 column을 대표할 수 있는 통계값들을 의미
 
 
- 기술통계량 종류
 - count: 해당 column에서 비어 있지 않은 값의 개수
 - mean: 평균
 - std: 표준편차
 - min: 최솟값 (이상치 포함)
 - 25% (Q1): 전체 데이터를 순서대로 정렬했을 때, 아래에서 부터 1/4번째 지점에 있는 값
 - 50% (Q2): 중앙값 (전체 데이터를 순서대로 정렬했을 때, 아래에서 부터 2/4번째 지점에 있는 값)
 - 75% (Q3): 전체 데이터를 순서대로 정렬했을 때, 아래에서 부터 3/4번째 지점에 있는 값
 - max: 최댓값 (이상치 포함) 
 
 
 
- 이상치: 울타리 밖에 있는 부분을 이상치라고 정의함
   - 아래쪽 울타리: $Q_1$ - $1.5 * IQR$
   - 위쪽 울타리: $Q_3$ + $1.5 * IQR$
   - $IQR$ = $Q_3 - Q_1$
 
 
<img src="https://miro.medium.com/max/10125/1*NRlqiZGQdsIyAu0KzP7LaQ.png" width="700" height="500">

In [12]:
train.describe()

Unnamed: 0,id,route_id,vh_id,now_latitude,now_longitude,distance,next_latitude,next_longitude,next_arrive_time
count,210457.0,210457.0,210457.0,210457.0,210457.0,210457.0,210457.0,210457.0,210457.0
mean,105228.0,405249100.0,7988694.0,33.434528,126.603451,490.2561,33.434711,126.603687,85.380824
std,60753.847139,91324.04,6774.077,0.10235,0.123961,520.563932,0.102224,0.123838,85.05117
min,0.0,405136000.0,7983000.0,33.244382,126.4733,97.0,33.244382,126.4733,6.0
25%,52614.0,405136500.0,7983093.0,33.325283,126.5239,291.0,33.325283,126.52455,44.0
50%,105228.0,405320100.0,7983431.0,33.484667,126.55105,384.0,33.48486,126.55105,66.0
75%,157842.0,405320100.0,7997041.0,33.500197,126.650322,542.0,33.500228,126.650322,102.0
max,210456.0,405328100.0,7997124.0,33.556167,126.935188,7461.0,33.556167,126.935188,2996.0


### pd.DataFrame.groupby()
 - 집단에 대한 통계량 확인 
 
<img src="https://s3.amazonaws.com/files.dezyre.com/images/Tutorials/Split+Apply+Combine+Strategy.png" width="700" height="500">

In [13]:
len(train['now_station'].unique())

348

In [14]:
train.groupby('route_id')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f56d1735310>

In [15]:
train[['route_id', 'vh_id']].drop_duplicates().groupby('route_id').count()
# drop_duplicates : 중복되는 행을 제거
# DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# subset : 중복값을 검사할 열, keep : 중복제거를 할 때 남길 행, inplace : 원본 변경 여부, ignore_index : 원래의 index 무시 여부부

Unnamed: 0_level_0,vh_id
route_id,Unnamed: 1_level_1
405136001,10
405136002,5
405136007,1
405136012,3
405136521,6
405136522,15
405320111,3
405320112,5
405320113,1
405320114,5


In [16]:
train[['route_id', 'next_arrive_time']].groupby('route_id').mean()

Unnamed: 0_level_0,next_arrive_time
route_id,Unnamed: 1_level_1
405136001,92.063688
405136002,108.270852
405136007,94.172947
405136012,111.322633
405136521,114.673939
405136522,101.339188
405320111,68.90171
405320112,67.602392
405320113,68.403649
405320114,66.634818


## folium
- python에서 제공하는 지도를 다루는 대표적인 라이브러리
- folium.Map(location=[위도, 경도], zoom_start=숫자)
- folium.Marker([위도, 경도], popup = "마우스 클릭시 표기되는 문구", tooltip = "마우스 오버시 표기되는 문구", icon = folium.Icon(color="색상", icon="아이콘 모양").add_to(지도)
- folium.CircleMarker([위도, 경도], radius = "반경의 범위", color = "선의 색깔", fill_color = "채워질 원의 색깔").add_to(지도)

### folium.Map()
 - location 옵션에 있는 좌표를 중심으로 지도 시각화

In [17]:
coord = [35.1559361, 129.1384361]
map_osm = folium.Map(location = coord)
map_osm

In [18]:
map_osm = folium.Map(location=coord, zoom_start=12)
map_osm

In [19]:
map_osm = folium.Map(location = coord, tiles = 'Stamen Terrain')
map_osm

In [20]:
map_osm = folium.Map(location = coord, tiles = 'Stamen Toner')
map_osm

In [21]:
coord = [35.2335123, 129.0810047]

map_osm = folium.Map(location=coord, zoom_start=12)

folium.Marker(coord, popup='부산대학교', tooltip = '부산대학교 tooltip').add_to(map_osm) # 마커 생성 뒤 지도에 추가해주는 기능

map_osm.save('index.html')

map_osm

default images: https://getbootstrap.com/docs/3.3/components/

- 검색 기능이 없다

fontawesome:https://fontawesome.com/icons?d=gallery

- 해당링크의 무료인것만 쓸 수 있음

In [22]:
map_osm = folium.Map(location=coord, zoom_start=12)
folium.Marker(coord, popup='부산대학교', icon=folium.Icon(icon='home')).add_to(map_osm)
map_osm

In [23]:
map_osm = folium.Map(location=[37.541, 126.986], zoom_start=11)
folium.Marker([37.566345, 126.977893], popup='서울특별시청', icon=folium.Icon(color='red',icon='info-sign')).add_to(map_osm)
folium.Circle([37.4600, 126.9519], tooltip='서울대학교', radius=3000, color="green").add_to(map_osm)
folium.CircleMarker([37.5662, 126.9386], radius=10, popup='연세대학교',color="black", fill_color="crimson").add_to(map_osm)
map_osm

In [24]:
map_osm = folium.Map(location=[37.541, 126.986], zoom_start=11)
folium.RegularPolygonMarker([37.566345, 126.977893], popup='서울특별시청', fill_color='red',color='red', number_of_sides=3, radius=30, fill_opacity=0.2).add_to(map_osm)
folium.RegularPolygonMarker([37.4600, 126.9519], popup='서울대학교', fill_color='green',color='green', number_of_sides=4, radius=10, fill_opacity=0.2).add_to(map_osm)
folium.RegularPolygonMarker([37.5662, 126.9386], popup='연세대학교',fill_color='blue',color='blue', number_of_sides=6, radius=10).add_to(map_osm)
map_osm

In [25]:
cd = train.loc[0, 'now_latitude':'now_longitude'].tolist()
map_osm = folium.Map(location=cd, zoom_start=12)
folium.Marker(cd, icon=folium.Icon(icon='nome')).add_to(map_osm)
map_osm

In [26]:
train

Unnamed: 0,id,date,route_id,vh_id,route_nm,now_latitude,now_longitude,now_station,now_arrive_time,distance,next_station,next_latitude,next_longitude,next_arrive_time
0,0,2019-10-15,405136001,7997025,360-1,33.456267,126.551750,제주대학교입구,06시,266.0,제대마을,33.457724,126.554014,24
1,1,2019-10-15,405136001,7997025,360-1,33.457724,126.554014,제대마을,06시,333.0,제대아파트,33.458783,126.557353,36
2,2,2019-10-15,405136001,7997025,360-1,33.458783,126.557353,제대아파트,06시,415.0,제주대학교,33.459893,126.561624,40
3,3,2019-10-15,405136001,7997025,360-1,33.479705,126.543811,남국원(아라방면),06시,578.0,제주여자중고등학교(아라방면),33.484860,126.542928,42
4,4,2019-10-15,405136001,7997025,360-1,33.485662,126.494923,도호동,07시,374.0,은남동,33.485822,126.490897,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210452,210452,2019-10-28,405328102,7983486,281-2,33.255783,126.577450,비석거리,21시,528.0,삼아아파트,33.251896,126.574417,96
210453,210453,2019-10-28,405328102,7983486,281-2,33.248595,126.568527,동문로터리,21시,280.0,매일올레시장 7번입구,33.249753,126.565959,50
210454,210454,2019-10-28,405328102,7983486,281-2,33.251891,126.560303,서귀포시 구 버스터미널,21시,114.0,아랑조을거리 입구,33.251084,126.559551,16
210455,210455,2019-10-28,405328102,7983486,281-2,33.251084,126.559551,아랑조을거리 입구,21시,223.0,평생학습관,33.249504,126.558068,38


In [27]:
map_data = train[['route_id','now_latitude','now_longitude','now_station']]
map_bus_route = map_data[map_data['route_id'] == 405320122].drop_duplicates("now_station")


map_osm = folium.Map(location=[33.4134, 126.5190], zoom_start = 10.5)


for item in map_bus_route.index:
    lat = map_bus_route.loc[item,'now_latitude']
    long = map_bus_route.loc[item,'now_longitude']
    
    folium.Marker([lat,long], popup = map_bus_route.loc[item,'now_station'],
                      icon = folium.Icon(color = 'red', icon = 'info-sign')
                      ).add_to(map_osm)

map_osm.save('index.html')

map_osm

## 3. 데이터 전처리
## Data Cleansing & Pre-Processing  

### pd.Series.str.slice()
- Series 내에 문자열을 일괄적으로 slicing (자르기)

In [30]:
train['now_arrive_time'] = train['now_arrive_time'].str.slice(stop = -1).astype('int')
# astype : 데이터프레임의 타입 변경경

In [32]:
train.head()

Unnamed: 0,id,date,route_id,vh_id,route_nm,now_latitude,now_longitude,now_station,now_arrive_time,distance,next_station,next_latitude,next_longitude,next_arrive_time
0,0,2019-10-15,405136001,7997025,360-1,33.456267,126.55175,제주대학교입구,6,266.0,제대마을,33.457724,126.554014,24
1,1,2019-10-15,405136001,7997025,360-1,33.457724,126.554014,제대마을,6,333.0,제대아파트,33.458783,126.557353,36
2,2,2019-10-15,405136001,7997025,360-1,33.458783,126.557353,제대아파트,6,415.0,제주대학교,33.459893,126.561624,40
3,3,2019-10-15,405136001,7997025,360-1,33.479705,126.543811,남국원(아라방면),6,578.0,제주여자중고등학교(아라방면),33.48486,126.542928,42
4,4,2019-10-15,405136001,7997025,360-1,33.485662,126.494923,도호동,7,374.0,은남동,33.485822,126.490897,64


In [33]:
test['now_arrive_time'] = test['now_arrive_time'].str.slice(stop = -1).astype('int')

In [34]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91774 entries, 0 to 91773
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               91774 non-null  int64  
 1   date             91774 non-null  object 
 2   route_id         91774 non-null  int64  
 3   vh_id            91774 non-null  int64  
 4   route_nm         91774 non-null  object 
 5   now_latitude     91774 non-null  float64
 6   now_longitude    91774 non-null  float64
 7   now_station      91774 non-null  object 
 8   now_arrive_time  91774 non-null  int64  
 9   distance         91774 non-null  float64
 10  next_station     91774 non-null  object 
 11  next_latitude    91774 non-null  float64
 12  next_longitude   91774 non-null  float64
dtypes: float64(5), int64(4), object(4)
memory usage: 9.1+ MB


## 4. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling  

In [35]:
train.head()

Unnamed: 0,id,date,route_id,vh_id,route_nm,now_latitude,now_longitude,now_station,now_arrive_time,distance,next_station,next_latitude,next_longitude,next_arrive_time
0,0,2019-10-15,405136001,7997025,360-1,33.456267,126.55175,제주대학교입구,6,266.0,제대마을,33.457724,126.554014,24
1,1,2019-10-15,405136001,7997025,360-1,33.457724,126.554014,제대마을,6,333.0,제대아파트,33.458783,126.557353,36
2,2,2019-10-15,405136001,7997025,360-1,33.458783,126.557353,제대아파트,6,415.0,제주대학교,33.459893,126.561624,40
3,3,2019-10-15,405136001,7997025,360-1,33.479705,126.543811,남국원(아라방면),6,578.0,제주여자중고등학교(아라방면),33.48486,126.542928,42
4,4,2019-10-15,405136001,7997025,360-1,33.485662,126.494923,도호동,7,374.0,은남동,33.485822,126.490897,64


In [36]:
features = ['now_latitude', 'now_longitude', 'now_arrive_time', 'distance']
target = ['next_arrive_time']

In [37]:
X_train, X_test, y_train = train[features], test[features], train[target]

## 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [42]:
model_dict = {
    'linear':LinearRegression(),
    'rf':RandomForestRegressor(random_state=0, n_jobs=-1),
    'lgbm':lgb.LGBMRegressor(random_state=0)
}

In [43]:
model_dict.keys()

dict_keys(['linear', 'rf', 'lgbm'])

In [44]:
model_dict['rf']

In [48]:
model_result = {}

for key in model_dict.keys():
  print('#### 훈련 중 ####')
  model_dict[key].fit(X_train, y_train)
  print('#### 예측 중 ####')
  model_result[key] = model_dict[key].predict(X_test)

#### 훈련 중 ####
#### 예측 중 ####
#### 훈련 중 ####


  model_dict[key].fit(X_train, y_train)


#### 예측 중 ####
#### 훈련 중 ####
#### 예측 중 ####


In [51]:
model_result['rf']

array([ 35.36650507,  45.12237491, 135.88134374, ..., 126.37972036,
        31.84467172,  88.46184286])

In [52]:
lr_submit = submission.copy()
rf_submit = submission.copy()
lgbm_submit = submission.copy()

In [53]:
lr_submit['next_arrive_time'] = model_result['linear']
rf_submit['next_arrive_time'] = model_result['rf']
lgbm_submit['next_arrive_time'] = model_result['lgbm']

In [54]:
lgbm_submit.head()

Unnamed: 0,id,next_arrive_time
0,210457,42.124006
1,210458,60.248783
2,210459,120.812412
3,210460,61.146493
4,210461,150.454103


In [55]:
lr_submit.to_csv('lr_submit.csv', index = False)
rf_submit.to_csv('rf_submit.csv', index = False)
lgbm_submit.to_csv('lgbm_submit.csv', index = False)

## 6. 결과 및 결언
## Conclusion & Discussion