# 데이터 정제, 더미변수 생성
- 실제 RStudio 분석에 쓸 수 있도록 데이터를 정제

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
# Location of the merged uphill-congestion CSV: directory and file name are
# kept separate because the directory is reused when building the save path.
uphill_dir, uphill_file = 'data', '202111_uphill_congestion_merged.csv'
uphill_path = os.path.join(uphill_dir, uphill_file)

In [3]:
# Read the cp949-encoded CSV and discard three columns in one chained step.
# NOTE(review): the two 'Unnamed' columns are presumably index columns left
# over from earlier CSV exports — confirm against the upstream notebook.
uphill = (
    pd.read_csv(uphill_path, encoding='cp949')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', '터널까지거리'])
)
# Preview the first three rows.
uphill.head(3)

Unnamed: 0,집계시분,VDS_ID,기점종점방향구분코드,노선번호,지점이정,도로이정,평균교통량,평균속도,혼잡빈도수,차로번호,...,차로수,길어깨폭원(m),설치형식,현재운영_설치형식,중방향계수,중차량구성비,가속차로_연장(m),감속차로_연장(m),감속차로_폭(m),종점후터널유무
0,00:00,0010VDS00700,S,10,7.3,12.86,4,116.46,0,1,...,4,2.5,2,포켓형,0.49,0.1887,90.0,55.0,3.8,0
1,00:00,0010VDS00700,S,10,7.3,12.86,10,104.19,0,2,...,4,2.5,2,포켓형,0.49,0.1887,90.0,55.0,3.8,0
2,00:00,0010VDS00700,S,10,7.3,12.86,7,83.34,0,3,...,4,2.5,2,포켓형,0.49,0.1887,90.0,55.0,3.8,0


In [4]:
# Display the DataFrame's column labels as the cell output.
uphill.columns

Index(['집계시분', 'VDS_ID', '기점종점방향구분코드', '노선번호', '지점이정', '도로이정', '평균교통량', '평균속도',
       '혼잡빈도수', '차로번호', '콘존명', '콘존길이', '노선', 'R(m)', 'I(%)', '차로수', '길어깨폭원(m)',
       '설치형식', '현재운영_설치형식', '중방향계수', '중차량구성비', '가속차로_연장(m)', '감속차로_연장(m)',
       '감속차로_폭(m)', '종점후터널유무'],
      dtype='object')

## 더미변수 생성을 위한 함수 정의
* 집계시분 : 'HH:MM' 형식의 시각을 3시간 단위 시간대(00-03, 03-06, …, 21-24)로 변환하여 `time` 변수 생성
* 차로번호 : lane_1, lane_2, lane_3, lane_4
* 현재운영_설치형식 : 포켓형-uphilltype_2, 추월차로형-uphilltype_3, (기본:폐쇄형)

In [5]:
def timefunc(x):
    """Map a time string to the label of its 3-hour time band.

    The hour is obtained by dropping the last three characters of ``x`` and
    converting the remainder to an integer (e.g. ``'07:30'`` -> ``7``).

    Args:
        x: Time string such as ``'HH:MM'``.

    Returns:
        One of ``'00-03'``, ``'03-06'``, ..., ``'21-24'``, or ``None`` when
        the parsed hour falls outside the range 0-23.

    Raises:
        ValueError: If the leading part of ``x`` is not an integer literal.
    """
    band_labels = ('00-03', '03-06', '06-09', '09-12',
                   '12-15', '15-18', '18-21', '21-24')
    hour = int(x[:-3])

    # Hours outside a single day belong to no band.
    if hour < 0 or hour >= 24:
        return None

    # Every band covers three hours, so integer division picks the label.
    return band_labels[hour // 3]

## 더미변수 만들기

In [6]:
# Label every row with its 3-hour time band, derived element-wise from the
# '집계시분' column.
uphill['time'] = uphill['집계시분'].map(timefunc)

In [7]:
# Preview the first three rows to check the newly added 'time' column.
uphill.head(3)

Unnamed: 0,집계시분,VDS_ID,기점종점방향구분코드,노선번호,지점이정,도로이정,평균교통량,평균속도,혼잡빈도수,차로번호,...,길어깨폭원(m),설치형식,현재운영_설치형식,중방향계수,중차량구성비,가속차로_연장(m),감속차로_연장(m),감속차로_폭(m),종점후터널유무,time
0,00:00,0010VDS00700,S,10,7.3,12.86,4,116.46,0,1,...,2.5,2,포켓형,0.49,0.1887,90.0,55.0,3.8,0,00-03
1,00:00,0010VDS00700,S,10,7.3,12.86,10,104.19,0,2,...,2.5,2,포켓형,0.49,0.1887,90.0,55.0,3.8,0,00-03
2,00:00,0010VDS00700,S,10,7.3,12.86,7,83.34,0,3,...,2.5,2,포켓형,0.49,0.1887,90.0,55.0,3.8,0,00-03


In [8]:
# Output file name; the result is saved in the same directory the input was
# read from (uphill_dir).
save_file = 'FIN_uphill_merged_ver3_202111.csv'
save_path = os.path.join(uphill_dir, save_file)

In [9]:
# Write the cleaned table as a cp949-encoded CSV.
# NOTE(review): `index` is left at its default, so the row index is written
# as an extra unnamed first column; pass index=False if the downstream
# analysis does not need it — confirm before changing.
uphill.to_csv(path_or_buf=save_path, encoding='cp949')