# 데이터 정제, 더미변수 생성
- 실제 RStudio 분석에 쓸 수 있도록 데이터를 정제

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
# Location of the merged uphill-congestion CSV used as input.
uphill_dir, uphill_file = 'data', '202111_uphill_congestion_merged.csv'
uphill_path = os.path.join(uphill_dir, uphill_file)

In [3]:
# Read the cp949-encoded CSV, then drop the two 'Unnamed' columns and
# '터널까지거리' in one chained expression.
uphill = pd.read_csv(uphill_path, encoding='cp949').drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', '터널까지거리']
)
uphill.head(3)

Unnamed: 0,집계년월,집계시분,VDS_ID,기점종점방향구분코드,노선번호,지점이정,도로이정,평균교통량,평균속도,혼잡빈도수,...,차로수,길어깨폭원(m),설치형식,현재운영_설치형식,중방향계수,중차량구성비,가속차로_연장(m),감속차로_연장(m),감속차로_폭(m),종점후터널유무
0,202111,01:50,0100VDE03700,E,100,38.4,35.6,2,94.99,0,...,3,1.5,3,추월차로형,0.5,0.146,150.0,140.0,3.6,0
1,202111,01:50,0100VDE03700,E,100,38.4,35.6,3,76.62,0,...,3,1.5,3,추월차로형,0.5,0.146,150.0,140.0,3.6,0
2,202111,01:50,0100VDS04100,S,100,42.8,45.7,1,93.64,0,...,3,1.5,3,추월차로형,0.5,0.156,240.0,200.0,3.6,0


In [4]:
# Display the column labels of the loaded DataFrame.
uphill.columns

Index(['집계년월', '집계시분', 'VDS_ID', '기점종점방향구분코드', '노선번호', '지점이정', '도로이정', '평균교통량',
       '평균속도', '혼잡빈도수', '차로번호', '콘존명', '콘존길이', '노선', 'R(m)', 'I(%)', '차로수',
       '길어깨폭원(m)', '설치형식', '현재운영_설치형식', '중방향계수', '중차량구성비', '가속차로_연장(m)',
       '감속차로_연장(m)', '감속차로_폭(m)', '종점후터널유무'],
      dtype='object')

## 더미변수 생성을 위한 함수 정의
* 집계시분 : 3시간 단위의 '시간대'로 변경
* 차로번호 : lanenum_2, lanenum_3, lanenum_4 (세 더미가 모두 0이면 기준 범주)
* 현재운영_설치형식 : 포켓형-lanetype_01, 추월차로형-lanetype_02 (기준: 폐쇄형)

In [5]:
def timefunc(x):
    """Return the 3-hour time-slot label for a time string.

    The hour is parsed from ``x`` with its last three characters removed
    (e.g. ``'01:50'`` -> hour 1).

    Returns:
        One of ``'00-03'``, ``'03-06'``, ..., ``'21-24'`` for hours 0-23,
        or ``None`` for any other hour.

    Raises:
        ValueError: if the leading part of ``x`` is not an integer literal.
    """
    hour = int(x[:-3])

    # Hours outside a single day fall through without a label.
    if hour < 0 or hour >= 24:
        return None

    slots = (
        '00-03', '03-06', '06-09', '09-12',
        '12-15', '15-18', '18-21', '21-24',
    )
    return slots[hour // 3]

In [6]:
## Time-slot dummy variables: '00-03' is the reference category (it has no dummy of its own)

def timefunc_0306(x):
    """Time-slot dummy: 1 if ``x`` equals '03-06', otherwise 0."""
    return 1 if x == '03-06' else 0
    
def timefunc_0609(x):
    """Time-slot dummy: 1 if ``x`` equals '06-09', otherwise 0."""
    return 1 if x == '06-09' else 0
    
def timefunc_0912(x):
    """Time-slot dummy: 1 if ``x`` equals '09-12', otherwise 0."""
    return 1 if x == '09-12' else 0

def timefunc_1215(x):
    """Time-slot dummy: 1 if ``x`` equals '12-15', otherwise 0."""
    return 1 if x == '12-15' else 0

def timefunc_1518(x):
    """Time-slot dummy: 1 if ``x`` equals '15-18', otherwise 0."""
    return 1 if x == '15-18' else 0

def timefunc_1821(x):
    """Time-slot dummy: 1 if ``x`` equals '18-21', otherwise 0."""
    return 1 if x == '18-21' else 0

def timefunc_2124(x):
    """Time-slot dummy: 1 if ``x`` equals '21-24', otherwise 0."""
    return 1 if x == '21-24' else 0

In [7]:
def lanenum_2(x):
    """Lane-number dummy: 1 if ``x`` equals 2, otherwise 0."""
    return 1 if x == 2 else 0
    
def lanenum_3(x):
    """Lane-number dummy: 1 if ``x`` equals 3, otherwise 0."""
    return 1 if x == 3 else 0
    
def lanenum_4(x):
    """Lane-number dummy: 1 if ``x`` equals 4, otherwise 0."""
    return 1 if x == 4 else 0

In [8]:
# Reference category: 폐쇄형 (closed type, code 1) -- it gets 0 from both functions below

def lanetype_02(x):
    """Installation-type dummy: 1 if ``x`` is '포켓형' (pocket type, 2), otherwise 0."""
    return 1 if x == '포켓형' else 0
    
def lanetype_03(x):
    """Installation-type dummy: 1 if ``x`` is '추월차로형' (overtaking-lane type, 3), otherwise 0."""
    return 1 if x == '추월차로형' else 0

## 더미변수 만들기

In [9]:
# Label every row with its 3-hour slot, then add one 0/1 column per slot
# other than '00-03' (which gets no column of its own).
uphill['time'] = uphill['집계시분'].apply(timefunc)
for _column, _indicator in (
    ('time_0306', timefunc_0306),
    ('time_0609', timefunc_0609),
    ('time_0912', timefunc_0912),
    ('time_1215', timefunc_1215),
    ('time_1518', timefunc_1518),
    ('time_1821', timefunc_1821),
    ('time_2124', timefunc_2124),
):
    uphill[_column] = uphill['time'].apply(_indicator)

In [10]:
# 0/1 indicator columns for the operating installation type.
# NOTE(review): the column suffixes are offset from the helper names --
# 'lanetype_01' is filled by lanetype_02 ('포켓형') and 'lanetype_02' by
# lanetype_03 ('추월차로형'). The column names are kept unchanged because they
# are part of the CSV written at the end of this notebook.
for _column, _indicator in (
    ('lanetype_01', lanetype_02),
    ('lanetype_02', lanetype_03),
):
    uphill[_column] = uphill['현재운영_설치형식'].apply(_indicator)

In [11]:
# 0/1 indicator columns for lane numbers 2, 3 and 4.
for _column, _indicator in (
    ('lanenum_2', lanenum_2),
    ('lanenum_3', lanenum_3),
    ('lanenum_4', lanenum_4),
):
    uphill[_column] = uphill['차로번호'].apply(_indicator)

In [12]:
# Preview the first three rows, including the newly added dummy columns.
uphill.head(3)

Unnamed: 0,집계년월,집계시분,VDS_ID,기점종점방향구분코드,노선번호,지점이정,도로이정,평균교통량,평균속도,혼잡빈도수,...,time_0912,time_1215,time_1518,time_1821,time_2124,lanetype_01,lanetype_02,lanenum_2,lanenum_3,lanenum_4
0,202111,01:50,0100VDE03700,E,100,38.4,35.6,2,94.99,0,...,0,0,0,0,0,0,1,0,0,0
1,202111,01:50,0100VDE03700,E,100,38.4,35.6,3,76.62,0,...,0,0,0,0,0,0,1,1,0,0
2,202111,01:50,0100VDS04100,S,100,42.8,45.7,1,93.64,0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
# Output file for the cleaned table; it is placed in uphill_dir.
save_file = 'FIN_uphill_merged_ver4_202111.csv'
save_path = os.path.join(uphill_dir, save_file)

In [14]:
# Write the cleaned table as a cp949-encoded CSV.
# index=False keeps the row index out of the file; without it the index is
# saved as an unnamed first column, which comes back as 'Unnamed: 0' when the
# CSV is read again (the kind of column this notebook has to drop after loading).
uphill.to_csv(save_path, encoding='cp949', index=False)