# Import and Parsing XML from OPEN API (PUBLIC)
## 차로별 차종 구성 및 위반율
* http://data.ex.co.kr/openapi/basicinfo/openApiInfoM?apiId=0141&pn=-1
* 고속도로 공공데이터포털 인증키 : 6416260179

# Imports

In [1]:
from urllib.request import urlopen
from urllib.parse import urlencode, unquote, quote_plus
import urllib
import requests
import json
import xml.etree.ElementTree as ET
import xmltodict
from tqdm import tqdm

import pandas as pd

In [2]:
route_info = pd.read_csv('data/info_routeNo.csv', encoding = 'cp949').loc[:, ['노선번호', '도로단축명']]

In [3]:
# 사고일자 중 2016년 12월 이전 것은 모두 제거해야 함 --- 제거된 데이터를 사용
accident_info = pd.read_csv('data/accident_merged_20161201-.csv', encoding = 'cp949')
accident_info = pd.merge(accident_info, route_info, how = 'left', on = '도로단축명')
accident_info.head(3)

Unnamed: 0.1,Unnamed: 0,사고일자,사고일자.1,월별구분,사고시간,시간단위_3시간,주야구분,노선명,이정,방향,...,원인차_차종구분,도로명,도로단축명,도로표출명,기점종점방향구분코드,시점명,종점명,conzoneID,conzoneName,노선번호
0,1183,2016-12-02,20161202,12월,3:50:00,3 ~6,야간,영동선,2.3,강릉,...,SUV형,영동선,영동선,영동,E,인천,강릉,0500CZE010,서창JC-월곶JC,500
1,1184,2016-12-02,20161202,12월,3:50:00,3 ~6,야간,영동선,2.3,강릉,...,SUV형,영동선,영동선,영동,S,인천,강릉,0500CZS010,월곶JC-서창JC,500
2,1186,2016-12-03,20161203,12월,1:00:00,0 ~3,야간,경부선,290.9,서울,...,소형,경부선,경부선,경부,E,부산,서울,0010CZE505,금토JC-양재IC,10


# 사고데이터 -- 위반데이터
* 사고일자.1 == 시작일자, 종료일자
* 노선번호 == routeNo
* conzoneName == 콘존명

In [4]:
req_list = accident_info.loc[:, ['사고일자.1', '노선번호', 'conzoneID', 'conzoneName', '기점종점방향구분코드']].drop_duplicates()

req_list['routeNo'] = None

for i in range(len(req_list)):
    i_no = str(req_list['노선번호'].iloc[i])
    
    if len(i_no) == 2:
        req_list['routeNo'].iloc[i] = '00' + str(i_no)
    
    elif len(i_no) == 3:
        req_list['routeNo'].iloc[i] = '0' + str(i_no)
    
    elif len(i_no) == 4:
        req_list['routeNo'].iloc[i] = str(i_no)
    
    else:
        pass
    
req_list = req_list[::-1]
    
req_list2 = req_list.loc[:, ['사고일자.1', 'routeNo', '기점종점방향구분코드']].drop_duplicates()
req_list2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,사고일자.1,routeNo,기점종점방향구분코드
3475,20201231,0010,S
3474,20201231,0010,E
3473,20201230,0150,S
3472,20201230,0150,E
3471,20201228,0450,S
...,...,...,...
4,20161203,0100,E
3,20161203,0010,S
2,20161203,0010,E
1,20161202,0500,S


# XML Parsing

In [5]:
# Service URL
xmlUrl = 'http://data.ex.co.kr/openapi/trafficOprgPrcd/laneTypeViolation'
My_API_Key = '6416260179' # 사용자 인증키

## 필수 요청인자를 파라미터로 정리하기

* 반드시 OPEN API의 페이지를 확인해야 합니다

In [7]:
dates = list(req_list2['사고일자.1'])
routes = list(req_list2['routeNo'])
directions = list(req_list2['기점종점방향구분코드'])

In [22]:
for date, routeNo, direction in tqdm(zip(dates[2155:], routes[2155:], directions[2155:])):
    
    conzone_list = list(req_list[(req_list['사고일자.1'] == date) & (req_list['routeNo'] == routeNo)]['conzoneID'])
    
    queryParams = '?' + urlencode(    # get 방식으로 쿼리를 분리하기 위해 '?'를 넣은 것이다. 메타코드 아님.
        {
            quote_plus('request') : 'GetFeature',
            quote_plus('key') : My_API_Key, # 발급받은 인증키
            quote_plus('type') : 'xml', # 검색결과 포맷 = xml
            quote_plus('routeNo') : routeNo, # 노선번호
            quote_plus('sDate') : date, # 시작일자
            quote_plus('eDate') : date, # 종료일자
            #quote_plus('iLaneNum') : lane_num, # 차로번호
            #quote_plus('iAvcCarType') : car_type, # AVC 차종구분코드
            quote_plus('updownType') : direction # 기점종점방향구분코드
         }
    )
    response = urllib.request.Request(xmlUrl + queryParams)
    response_body = urlopen(response).read() #get bytes data

    decode_data = response_body.decode('utf-8')
    #print(type(decode_data))

    xml_parse = xmltodict.parse(decode_data)     # string인 xml 파싱
    xml_dict = json.loads(json.dumps(xml_parse))
    
    if xml_dict['data']['count'] != '0':
        
        df = pd.json_normalize(xml_dict['data'], record_path = ['lcsTrafficLists'])
    
        tczdf = pd.DataFrame()
    
        for cz in conzone_list:
            czdf = df[df['conzoneId'] == cz]
            tczdf = pd.concat([tczdf, czdf])
    
        tczdf.to_csv(f'data/violations/{date}_{routeNo}_{direction}.csv', encoding = 'cp949')
    
    else:
        pass

1031it [1:23:59,  4.89s/it]
