# ACCIDENT + VIOLATION + DANGER INDEX 데이터셋 가공
* 사고 데이터
* 고속도로 위험지수 데이터
* 위반 데이터

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from math import radians, cos, sin, asin, sqrt

# 데이터 가공

## DANGER INDEX
* 길이에 따른 위험지수의 가중평균으로 Section(Conzone)별 위험지수를 산정하기
* Vehicle Type별로 pivot_table을 사용하여 분리하기

In [2]:
# Load the raw freeway danger-index records; the CSV is decoded as CP949.
DangerIndex = pd.read_csv(
    'data/Freeway_Danger_Index.csv',
    encoding='cp949',
)

In [3]:
# Preview the first three rows.
DangerIndex.head(n=3)

Unnamed: 0.1,Unnamed: 0,index,start_x,start_y,end_x,end_y,anals_value,anals_grd,Freeway,Section,vehicle_type
0,0,1,128.425357,35.767675,128.427532,35.767962,0.5,1,광주대구고속도로,고령JC-옥포JC,1
1,1,2,128.45551,35.785579,128.456676,35.787111,0.68,1,광주대구고속도로,고령JC-옥포JC,1
2,2,3,128.433061,35.771705,128.434729,35.772889,2.91,1,광주대구고속도로,고령JC-옥포JC,1


In [4]:
# Reshape so that each road segment (coordinates + Freeway + Section) is one
# row and the remaining value columns are spread across vehicle_type.
# The first two columns of the loaded frame are skipped by position.
DangerIndex = pd.pivot_table(
    DangerIndex.iloc[:, 2:],
    index=['start_x', 'start_y', 'end_x', 'end_y', 'Freeway', 'Section'],
    columns=['vehicle_type'],
).reset_index()

# Replace the pivoted column labels with 14 flat names. The assignment is
# positional: it presumes four grade columns followed by four value columns,
# one per vehicle type 1-4 -- TODO confirm if the input file ever changes.
DangerIndex.columns = (
    ['start_x', 'start_y', 'end_x', 'end_y', 'Freeway', 'Section']
    + ['grade_{}'.format(k) for k in range(1, 5)]
    + ['value_{}'.format(k) for k in range(1, 5)]
)

In [5]:
def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; it determines the unit of the result.
        Defaults to 6371 (kilometers). Use 3956 for miles.

    Returns
    -------
    float
        Distance along the sphere, in the same unit as ``radius``.
        NaN inputs yield NaN.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # A plain comparison is used (not min/max) so that NaN stays NaN.
    if a > 1.0:
        a = 1.0

    return 2 * asin(sqrt(a)) * radius

In [6]:
# Length of every segment: haversine distance between its start and end
# coordinates (x is passed as longitude, y as latitude).
DangerIndex['length'] = DangerIndex.apply(
    lambda row: haversine(row['start_x'], row['start_y'], row['end_x'], row['end_y']),
    axis=1,
)

# value and grade are the same for every vehicle type, so only the
# type-1 columns are kept.
DangerIndex = DangerIndex.drop(
    columns=['grade_2', 'grade_3', 'grade_4', 'value_2', 'value_3', 'value_4'],
)

In [7]:
# Value multiplied by segment length: the numerator term of a
# length-weighted average.
DangerIndex['valueLen'] = DangerIndex['value_1'].mul(DangerIndex['length'])
DangerIndex.head(n=3)

Unnamed: 0,start_x,start_y,end_x,end_y,Freeway,Section,grade_1,value_1,length,valueLen
0,126.491911,35.030704,126.494023,35.031186,무안광주고속도로,함평JC-동함평IC,1,0.0,0.199678,0.0
1,126.493972,35.031373,126.49186,35.030891,무안광주고속도로,동함평IC-함평JC,1,0.0,0.199678,0.0
2,126.494023,35.031186,126.496137,35.031665,무안광주고속도로,함평JC-동함평IC,1,0.5,0.19968,0.09984


In [8]:
# Per (Freeway, Section): total segment length and total value * length.
pv = pd.pivot_table(
    DangerIndex,
    index=['Freeway', 'Section'],
    values=['length', 'valueLen'],
    aggfunc='sum',
).reset_index()

# Length-weighted average index = sum(value * length) / sum(length).
pv['avg_idx'] = pv['valueLen'].div(pv['length'])

# The two sums were only needed to form the ratio.
pv = pv.drop(columns=['length', 'valueLen'])

pv.head(n=3)

Unnamed: 0,Freeway,Section,avg_idx
0,경부고속도로,금토JC-신양재IC,4.171787
1,고창담양고속도로,담양JC-대덕JC,0.199993
2,고창담양고속도로,대덕JC-담양JC,0.19585


In [9]:
# Save the per-section average index as a CP949-encoded CSV.
pv.to_csv(
    'data/Freeway_Danger_Index_edited.csv',
    encoding='cp949',
)

## VIOLATION
* 3시간 단위 시간대(0~3, 3~6, …, 21~24)별로 구분하여 합쳐서 지수를 재생성하기 (`time` 컬럼의 라벨은 `00-03`, `03-06`, …, `21-24` 형식)

In [10]:
# Load the merged violation records (CP949-encoded CSV) and preview them.
violation = pd.read_csv(
    'data/violations_merged_02.csv',
    encoding='cp949',
)
violation.head(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,routeNo,direction,conzoneId,conzoneNm,laneNum,oTime,busLaneGubun,totalTF_1,violationTF_1,totalTF_HV,violationTF_HV,totalTF,violationTF,total_ViolationRate,HV_ViolationRate,time
0,0,0,20170730,450,E,0450CZE240,여주JC-남여주IC,1,07:00,전용,15,0,0,0,15,0,0.0,0.0,06-09
1,1,1,20170730,450,E,0450CZE240,여주JC-남여주IC,1,07:15,전용,7,0,0,0,7,0,0.0,0.0,06-09
2,2,2,20170730,450,E,0450CZE240,여주JC-남여주IC,1,07:30,전용,9,0,4,0,13,0,0.0,0.0,06-09


In [11]:
def timefunc(x):
    """
    Map a clock-time string to the label of its 3-hour bucket.

    Parameters
    ----------
    x : str
        Time text that starts with the hour, e.g. '07:15' or '7:15'.

    Returns
    -------
    str or None
        One of '00-03', '03-06', ..., '21-24' for hours 0-23;
        None when the hour falls outside that range.

    Raises
    ------
    ValueError
        If the leading characters cannot be parsed as an integer hour.
    """
    # Only the first two characters are read. Stripping a trailing ':' lets a
    # non-zero-padded hour ('7:15' -> '7:') parse instead of raising.
    t = int(x[0:2].rstrip(':'))

    if 0 <= t < 24:
        # Round the hour down to the start of its 3-hour window.
        start = t // 3 * 3
        return '{:02d}-{:02d}'.format(start, start + 3)

    # Hours outside 0-23 have no bucket.
    return None

In [12]:
# Label every record with the 3-hour bucket of its oTime value.
violation['time'] = violation['oTime'].map(timefunc)
violation.head(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,routeNo,direction,conzoneId,conzoneNm,laneNum,oTime,busLaneGubun,totalTF_1,violationTF_1,totalTF_HV,violationTF_HV,totalTF,violationTF,total_ViolationRate,HV_ViolationRate,time
0,0,0,20170730,450,E,0450CZE240,여주JC-남여주IC,1,07:00,전용,15,0,0,0,15,0,0.0,0.0,06-09
1,1,1,20170730,450,E,0450CZE240,여주JC-남여주IC,1,07:15,전용,7,0,0,0,7,0,0.0,0.0,06-09
2,2,2,20170730,450,E,0450CZE240,여주JC-남여주IC,1,07:30,전용,9,0,4,0,13,0,0.0,0.0,06-09


In [13]:
# Write the labelled records back to the CSV they were loaded from.
# index=False keeps the row index out of the file: because this path is also
# the input file, writing the index would add one more 'Unnamed: N' column
# every time the notebook is run.
violation.to_csv('data/violations_merged_02.csv', encoding = 'cp949', index = False)

## ACCIDENT

In [17]:
# Load the merged accident records (CP949-encoded CSV) and preview them.
accident = pd.read_csv(
    'data/accident_merged_20170730-.csv',
    encoding='cp949',
)
accident.head(n=3)

Unnamed: 0.1,Unnamed: 0,사고일자,사고일자.1,월별구분,사고시간,시간단위_3시간,주야구분,노선명,이정,방향,...,원인차차종,원인차_차종구분,도로명,도로단축명,도로표출명,기점종점방향구분코드,시점명,종점명,conzoneID,conzoneName
0,2064,2017-07-30,20170730,07월,1:30:00,0~3,야간,중부내륙선,228.4,양평,...,화물,대형,중부내륙선,중부내륙선,중부내륙,E,창원,양평,0450CZE240,여주JC-남여주IC
1,2065,2017-07-30,20170730,07월,1:30:00,0~3,야간,중부내륙선,228.4,양평,...,화물,대형,중부내륙선,중부내륙선,중부내륙,S,창원,양평,0450CZS240,남여주IC-여주JC
2,2067,2017-07-31,20170731,07월,10:00:00,9~12,주간,영동선,80.8,강릉,...,화물,중형,영동선,영동선,영동,E,인천,강릉,0500CZE225,원주JC-원주IC


In [20]:
# Convert the 시간단위_3시간 column to the '00-03', '12-15', ... label format
# in one pass before saving.
def timefunc2(x):
    """
    Translate an 'H~H' 3-hour range label into the zero-padded 'HH-HH' form.

    For example '9~12' becomes '09-12'. Any value that is not one of the
    eight known range labels yields None.
    """
    labels = {
        '0~3': '00-03',
        '3~6': '03-06',
        '6~9': '06-09',
        '9~12': '09-12',
        '12~15': '12-15',
        '15~18': '15-18',
        '18~21': '18-21',
        '21~24': '21-24',
    }
    return labels.get(x)

In [21]:
# Rewrite the 3-hour range labels in place using timefunc2.
accident['시간단위_3시간'] = accident['시간단위_3시간'].map(timefunc2)

In [22]:
# Preview the first three rows after relabelling.
accident.head(n=3)

Unnamed: 0.1,Unnamed: 0,사고일자,사고일자.1,월별구분,사고시간,시간단위_3시간,주야구분,노선명,이정,방향,...,원인차차종,원인차_차종구분,도로명,도로단축명,도로표출명,기점종점방향구분코드,시점명,종점명,conzoneID,conzoneName
0,2064,2017-07-30,20170730,07월,1:30:00,00-03,야간,중부내륙선,228.4,양평,...,화물,대형,중부내륙선,중부내륙선,중부내륙,E,창원,양평,0450CZE240,여주JC-남여주IC
1,2065,2017-07-30,20170730,07월,1:30:00,00-03,야간,중부내륙선,228.4,양평,...,화물,대형,중부내륙선,중부내륙선,중부내륙,S,창원,양평,0450CZS240,남여주IC-여주JC
2,2067,2017-07-31,20170731,07월,10:00:00,09-12,주간,영동선,80.8,강릉,...,화물,중형,영동선,영동선,영동,E,인천,강릉,0500CZE225,원주JC-원주IC


In [23]:
# Save the relabelled accident records to a new CP949-encoded CSV.
accident.to_csv(
    'data/accident_merged_20170730-_time-edited.csv',
    encoding='cp949',
)