In [1]:
import requests
from requests_toolbelt import MultipartEncoder
import os
import gzip
import pandas as pd
import numpy as np
from datetime import datetime, date, time

In [2]:
def download_ex_data(file_name: str, fields: dict) -> bytes:
    if os.path.isfile(file_name):
        print(f"already '{file_name}' has been downloaded.")
    else:
        print(f"start crawling '{file_name}'")

        print(" -> POST ...")
        multipart = MultipartEncoder(fields=fields)
        headers = {
            'Content-Type': multipart.content_type,
        }
        res = requests.post('http://data.ex.co.kr/portal/fdwn/log', headers=headers, data=multipart)

        print(" -> decompessing ...")
        data = gzip.decompress(res.content)

        print(" -> saving ...")
        with open(file_name, 'wb') as f:
            f.write(data)

        print(f"complete to download '{file_name}'")

In [3]:
arg_date = '20190102'

In [4]:
vds_data_post_fields = {
    'dataSupplyDate': f'{arg_date}',
    'collectType': 'VDS',
    'dataType': '16',
    'collectCycle': '04',
    'supplyCycle': '01',
    'outFileName': f'vds_data_{arg_date}.gz',
}
vds_data_file_name = f"vds_data_{arg_date}.csv"
download_ex_data(vds_data_file_name, vds_data_post_fields)

con_zone_post_fields = {
    'dataSupplyDate': f'{arg_date}',
    'collectType': 'ETC',
    'dataType': '78',
    'collectCycle': '04',
    'supplyCycle': '01',
    'outFileName': f'con_zone_{arg_date}.gz',
}
con_zone_file_name = f"con_zone_{arg_date}.csv"
download_ex_data(con_zone_file_name, con_zone_post_fields)

vds_zone_post_fields = {
    'dataSupplyDate': f'{arg_date}',
    'collectType': 'ETC',
    'dataType': '79',
    'collectCycle': '04',
    'supplyCycle': '01',
    'outFileName': f'vds_zone_{arg_date}.gz',
}
vds_zone_file_name = f"vds_zone_{arg_date}.csv"
download_ex_data(vds_zone_file_name, vds_zone_post_fields)

vds_point_post_fields = {
    'dataSupplyDate': f'{arg_date}',
    'collectType': 'VDS',
    'dataType': '84',
    'collectCycle': '04',
    'supplyCycle': '01',
    'outFileName': f'vds_point_{arg_date}.gz',
}
vds_point_file_name = f"vds_point_{arg_date}.csv"
download_ex_data(vds_point_file_name, vds_point_post_fields)

already 'vds_data_20190102.csv' has been downloaded.
already 'con_zone_20190102.csv' has been downloaded.
already 'vds_zone_20190102.csv' has been downloaded.
already 'vds_point_20190102.csv' has been downloaded.


In [5]:
def check_delimiter(path):
    with open(path, 'r', encoding='euc-kr') as f:
        if '|' in f.readline():
            return '|'
        else:
            return ','

In [6]:
# 콘존 자료에서 필요한 열만 불러오기
# (콘존ID, 콘존길이, 기점종점방향구분코드, 시작노드ID, 종료노드ID, 차로수, 노선번호, 제한속도, 노선구성순번, 콘존명, 버스전용차로유무, 도로등급구분코드)
con_zone_df = pd.read_csv(
    con_zone_file_name,
    encoding='euc-kr',
    sep=check_delimiter(con_zone_file_name),
    usecols=[0, 1, 2, 9],
    dtype={
        '콘존ID': str,
        '콘존길이': float,
        '기점종점방향구분코드': str,
        '콘존명': str,
    }
)
con_zone_df.head()

Unnamed: 0,콘존ID,콘존길이,기점종점방향구분코드,콘존명
0,0010CZE005,200.0,E,경부고속국도시점-구서IC
1,0010CZE010,1820.0,E,구서IC-영락IC
2,0010CZE011,1990.0,E,영락IC-부산TG
3,0010CZE020,1070.0,E,부산TG-노포
4,0010CZE030,7780.0,E,노포IC-노포JC


In [7]:
# VDS 존 자료에서 필요한 열만 불러오기
# (VDS_ID, 지점이정, VDS존시작이정, VDS존종료이정, 노선번호, VDS존유형구분코드, 노선구성순번, 기점종점방향구분코드, VDS존길이, 도로등급구분코드, 콘존ID)
vds_zone_df = pd.read_csv(
    vds_zone_file_name,
    encoding='euc-kr',
    sep=check_delimiter(vds_zone_file_name),
    usecols=[0, 2, 3, 4, 10],
    dtype={
        'VDS_ID': str,
        '노선번호': int,
        '콘존ID': str,
        'VDS존시작이정': float,
        'VDS존종료이정': float,
    }
)
vds_zone_df.tail()

Unnamed: 0,VDS_ID,VDS존시작이정,VDS존종료이정,노선번호,콘존ID
7596,6000VDS03150,38.98,37.88,6000,6000CZS070
7597,6000VDS03200,43.22,38.98,6000,6000CZS080
7598,6000VDS03300,44.28,43.22,6000,6000CZS090
7599,6000VDS03400,46.29,44.28,6000,6000CZS090
7600,6000VDS03500,48.8,46.29,6000,6000CZS090


In [8]:
road_name_dict = dict([(key, value) for _, key, value in pd.read_csv(
    vds_point_file_name,
    sep=check_delimiter(vds_point_file_name),
    encoding='euc-kr',
    usecols=[8, 9]
).drop_duplicates().itertuples()])

print(f"10: {road_name_dict[10]}, total: {len(road_name_dict)}")

10: 경부선, total: 52


In [9]:
vds_zone_df['도로명'] = [road_name_dict.get(num) for num in vds_zone_df['노선번호']]
vds_zone_df.head()

Unnamed: 0,VDS_ID,VDS존시작이정,VDS존종료이정,노선번호,콘존ID,도로명
0,0010VDE00100,0.2,2.02,10,0010CZE010,경부선
1,0010VDE00200,2.02,2.85,10,0010CZE011,경부선
2,0010VDE00300,2.85,4.01,10,0010CZE011,경부선
3,0010VDE00400,4.01,5.08,10,0010CZE020,경부선
4,0010VDE00500,5.08,5.58,10,0010CZE030,경부선


In [10]:
# VDS 원시 자료에서 필요한 열만 불러오기
# (수집일자, 수집시분초, VDS_ID, 지점이정, 도로이정, 점유율, 평균속도, 교통량, 차로번호, 콘존ID, 콘존명, 콘존길이)
vds_data_df = pd.read_csv(
    vds_data_file_name, encoding='euc-kr',
    sep=check_delimiter(vds_data_file_name),
    usecols=[1, 2, 5, 6, 7, 8],
    dtype={
        '수집시분초': int, # query의 편의를 위해
        'VDS_ID': str,
        '점유율': float,
        '평균속도': float,
        '교통량': int,
        '차로번호': int,
    },
)

vds_data_df.head()

Unnamed: 0,수집시분초,VDS_ID,점유율,평균속도,교통량,차로번호
0,13430,0010VDS04145,0.0,0.0,0,3
1,13430,0010VDS04150,0.0,0.0,0,1
2,13430,0010VDS04150,0.78,89.0,1,2
3,13430,0010VDS04150,0.0,0.0,0,3
4,13430,0010VDS04155,0.0,0.0,0,1


In [11]:
vds_data_df = vds_data_df.merge(vds_zone_df, how='inner', on='VDS_ID')
vds_data_df = vds_data_df.merge(con_zone_df, how='inner', on='콘존ID')
vds_data_df

Unnamed: 0,수집시분초,VDS_ID,점유율,평균속도,교통량,차로번호,VDS존시작이정,VDS존종료이정,노선번호,콘존ID,도로명,콘존길이,기점종점방향구분코드,콘존명
0,13430,0010VDS04145,0.00,0.0,0,3,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC
1,1,0010VDS04145,0.00,0.0,0,1,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC
2,1,0010VDS04145,0.78,100.0,1,2,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC
3,1,0010VDS04145,0.00,0.0,0,3,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC
4,13730,0010VDS04145,0.00,0.0,0,1,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40420795,235900,1000VDE06200,-1.00,-1.0,-1,4,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC
40420796,235930,1000VDE06200,-1.00,-1.0,-1,1,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC
40420797,235930,1000VDE06200,-1.00,-1.0,-1,2,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC
40420798,235930,1000VDE06200,-1.00,-1.0,-1,3,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC


| 구분                                                    | 통행구분                      | 시간               | 구간                                                           |
|---------------------------------------------------------|-------------------------------|--------------------|----------------------------------------------------------------|
| 평일                                                    | 경부고속도로 서울·부산 양방향 | 07:00 ~ 21:00      | 오산IC(376.4km)부터 한남대교남단(423.0km)까지 총연장 46.6km    |
| 토요일, 일요일, 공휴일                                  | 경부고속도로 서울·부산 양방향 | 07:00 ~ 21:00      | 신탄진IC(282.0km)부터 한남대교남단(423.0km)까지 총연장 141.0km |
| 토요일, 일요일, 공휴일                                  | 영동고속도로 인천·강릉 양방향 | 07:00 ~ 21:00      | 신갈JCT(43.6km)부터 호법JCT(70.5km)까지총연장 26.9km           |
| 설날·추석연휴(공휴일이 이어지는 경우 포함) 및 연휴 전날 | 경부고속도로 서울·부산 양방향 | 07:00~다음날 01:00 | 신탄진IC(282.0km)부터 한남대교남단(423.0km)까지 총연장 141.0km |
| 설날·추석연휴(공휴일이 이어지는 경우 포함) 및 연휴 전날 | 영동고속도로 인천·강릉 양방향 | 07:00~다음날 01:00 | 신갈JCT(43.6km)부터 호법JCT(70.5km)까지 총연장 26.9km          |

In [12]:
# 분류 번호 0 : 평일
# 분류 번호 1 : 공휴일(토요일, 일요일, 개천절, 신정 등)
# 분류 번호 2 : 설날/추석연휴 전날
# 분류 번호 3 : 설날/추석연휴
# 분류 번호 4 : 설날/추석연휴 다음날

year = datetime.strptime(arg_date, '%Y%m%d').year
holidays_df = pd.read_csv(f'holidays_{year}.csv')

# 평일인 경우
bus_road_query = '(차로번호 == 1) and (노선번호 == 10 and VDS존종료이정 >= 376.4 and VDS존시작이정 <= 423.0) and (수집시분초 >= 70000 and 수집시분초 <= 210000)'

if len(holiday := holidays_df[holidays_df['날짜'] == int(arg_date)]):
    holiday_code = holiday.iloc[0]['분류코드']
    bus_road_query = '(차로번호 == 1) and ((노선번호 == 10 and VDS존종료이정 >= 282.0 and VDS존시작이정 <= 423.0) or (노선번호 == 500 and VDS존종료이정 >= 43.6 and VDS존시작이정 <= 70.5))'

    if holiday_code == 1:
        bus_road_query += 'and (수집시분초 >= 70000 and 수집시분초 <= 210000)'
        # 공휴일인 경우
        pass
    elif holiday_code == 2:
        # 설날/추석연휴 전날
        bus_road_query += 'and (수집시분초 >= 70000 and 수집시분초 < 240000)'
        pass
    elif holiday_code == 3:
        # 설날/추석연휴
        bus_road_query += 'and ((수집시분초 > 0 and 수집시분초 <= 10000) or (수집시분초 >= 70000 and 수집시분초 < 240000))'
        pass
    elif holiday_code == 4:
        # 설날/추석연휴 다음날
        bus_road_query += 'and (수집시분초 > 0 and 수집시분초 <= 10000)'
        pass

bus_road_query

'(차로번호 == 1) and (노선번호 == 10 and VDS존종료이정 >= 376.4 and VDS존시작이정 <= 423.0) and (수집시분초 >= 70000 and 수집시분초 <= 210000)'

In [13]:
bus_road_indexes = vds_data_df.query(bus_road_query).index

road_type_column = np.ones(len(vds_data_df), dtype=int)
road_type_column[bus_road_indexes] = 2

vds_data_df['차로유형구분코드'] = road_type_column

vds_data_df.query(bus_road_query)

Unnamed: 0,수집시분초,VDS_ID,점유율,평균속도,교통량,차로번호,VDS존시작이정,VDS존종료이정,노선번호,콘존ID,도로명,콘존길이,기점종점방향구분코드,콘존명,차로유형구분코드
9682715,102000,0010VDS29200,3.32,83.0,3,1,378.21,377.15,10,0010CZS400,경부선,13270.0,S,오산IC-안성JC,2
9682720,81300,0010VDS29200,3.54,79.0,3,1,378.21,377.15,10,0010CZS400,경부선,13270.0,S,오산IC-안성JC,2
9682735,102300,0010VDS29200,3.14,86.0,2,1,378.21,377.15,10,0010CZS400,경부선,13270.0,S,오산IC-안성JC,2
9682760,81600,0010VDS29200,7.03,87.0,5,1,378.21,377.15,10,0010CZS400,경부선,13270.0,S,오산IC-안성JC,2
9682765,102600,0010VDS29200,0.00,0.0,0,1,378.21,377.15,10,0010CZS400,경부선,13270.0,S,오산IC-안성JC,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40164010,190030,0010VDS32503,-1.00,-1.0,-1,1,423.00,421.70,10,0010CZS522,경부선,1300.0,S,한남IC-잠원IC,2
40164045,190100,0010VDS32503,-1.00,-1.0,-1,1,423.00,421.70,10,0010CZS522,경부선,1300.0,S,한남IC-잠원IC,2
40164070,190130,0010VDS32503,-1.00,-1.0,-1,1,423.00,421.70,10,0010CZS522,경부선,1300.0,S,한남IC-잠원IC,2
40164085,190200,0010VDS32503,-1.00,-1.0,-1,1,423.00,421.70,10,0010CZS522,경부선,1300.0,S,한남IC-잠원IC,2


In [14]:
datetime_column = []
date_base: date = datetime.strptime(arg_date, '%Y%m%d').date()

for t in vds_data_df['수집시분초']:
    _t = t
    second = _t % 100
    _t //= 100
    minute = _t % 100
    _t //= 100
    hour = _t

    datetime_column.append(datetime.combine(date_base, time(hour, minute, second)))

vds_data_df['집계일시분초'] = datetime_column
vds_data_df

Unnamed: 0,수집시분초,VDS_ID,점유율,평균속도,교통량,차로번호,VDS존시작이정,VDS존종료이정,노선번호,콘존ID,도로명,콘존길이,기점종점방향구분코드,콘존명,차로유형구분코드,집계일시분초
0,13430,0010VDS04145,0.00,0.0,0,3,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 01:34:30
1,1,0010VDS04145,0.00,0.0,0,1,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 00:00:01
2,1,0010VDS04145,0.78,100.0,1,2,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 00:00:01
3,1,0010VDS04145,0.00,0.0,0,3,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 00:00:01
4,13730,0010VDS04145,0.00,0.0,0,1,49.7,48.70,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 01:37:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40420795,235900,1000VDE06200,-1.00,-1.0,-1,4,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:00
40420796,235930,1000VDE06200,-1.00,-1.0,-1,1,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:30
40420797,235930,1000VDE06200,-1.00,-1.0,-1,2,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:30
40420798,235930,1000VDE06200,-1.00,-1.0,-1,3,105.7,106.94,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:30


In [15]:
vds_data_df.drop(['수집시분초', 'VDS존시작이정', 'VDS존종료이정', '차로번호'], axis=1, inplace=True, errors='ignore')
vds_data_df

Unnamed: 0,VDS_ID,점유율,평균속도,교통량,노선번호,콘존ID,도로명,콘존길이,기점종점방향구분코드,콘존명,차로유형구분코드,집계일시분초
0,0010VDS04145,0.00,0.0,0,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 01:34:30
1,0010VDS04145,0.00,0.0,0,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 00:00:01
2,0010VDS04145,0.78,100.0,1,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 00:00:01
3,0010VDS04145,0.00,0.0,0,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 00:00:01
4,0010VDS04145,0.00,0.0,0,10,0010CZS080,경부선,17090.0,S,활천IC-언양JC,1,2019-01-02 01:37:30
...,...,...,...,...,...,...,...,...,...,...,...,...
40420795,1000VDE06200,-1.00,-1.0,-1,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:00
40420796,1000VDE06200,-1.00,-1.0,-1,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:30
40420797,1000VDE06200,-1.00,-1.0,-1,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:30
40420798,1000VDE06200,-1.00,-1.0,-1,1000,1000CZE340,서울외곽순환선,3640.0,E,도리JC-조남JC,1,2019-01-02 23:59:30


In [17]:
vds_data_grouping = vds_data_df.groupby(['VDS_ID', '차로유형구분코드', pd.Grouper(key='집계일시분초', freq='5min')])
vds_data_grouping

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc67a9ab970>

In [35]:
def aggregate_speed(data: pd.DataFrame):
    if len(data) == 0:
        return -1

    speed_sum = np.inner(data['교통량'], data['평균속도']) # 각 데이터에 대한 속도의 누적합을 구한다.
    count = np.sum(data['교통량']) # 교통량의 합을 구해 검지기를 지난 차량의 수를 구한다.

    if count == 0:
        return 0

    return speed_sum / count

#   - 교통량: 분석단위시간당 측정된 차량의 합
#   - 속도: 분석단위시간당 측정된 차량의 속도를 산술평균한 값
#   - 여기서 각 행마다 속도와 교통량을 곱하면 분석단위시간당 측정된 차량의 속도의 누적합을 구할 수 있으며, 30초 단위 데이터마다 이를 수행하여 5분 분석단위시간당 측정된 차량의 속도 누적합을 구함
#   - 이렇게 한 누적합된 속도를 5분 분석단위시간동안 측정된 차량의 합으로 나누어, 5분 동안에 측정된 차량의 속도를 산술평균한 값을 구할 수 있음

# grouped dataframe을 속도와 교통량으로 묶어서 평균 속도를 구하는 함수

In [36]:
def aggregate_share(data: pd.DataFrame):
    if len(data) == 0:
        return -1
    
    return data[data['점유율'] != 0]['점유율'].mean()

In [37]:
def aggregate_traffic(data: pd.DataFrame):
    if len(data) == 0:
        return -1

    return np.sum(data['교통량']) # 단순 누적합

In [42]:
def aggregate_vds(group: pd.DataFrame):
    print(group.size)
    # 위에 있는 aggregate_speed, aggregate_share_2, aggregate_traffic 함수를 참조하여 dataframe을 apply하는 함수
    # en) apply dataframe using aggregate_speed, aggregate_share_2, aggregate_traffic functions above
    return pd.DataFrame({
        '교통량': [aggregate_traffic(group)],
        '점유율': [aggregate_share(group)],
        '평균속도': [aggregate_speed(group)],
        '노선번호': [group.iloc[0]['노선번호']],
        '콘존ID': [group.iloc[0]['콘존ID']],
        '도로명': [group.iloc[0]['도로명']],
        '콘존길이': [group.iloc[0]['콘존길이']],
        '기점종점방향구분코드': [group.iloc[0]['기점종점방향구분코드']],
        '콘존명': [group.iloc[0]['콘존명']],
    })

In [43]:
output = vds_data_grouping.apply(aggregate_vds)

360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360


KeyboardInterrupt: 