# 강남구 (1080)
- 작성자 : 황다은
- 작성일자 : 2020년 9월 6일
- 편집자 : 이준석
- 편집일자 : 2020년 9월 6일

In [4]:
import datetime as dt
import re
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Today's date as an ISO 'YYYY-MM-DD' string. The loading cell builds folder
# and file names with dat.replace('-', ''), which needs a str, not a date object.
dat = dt.date.today().isoformat()

### 1. Raw File 불러오기

In [5]:
# Date whose raw tables are loaded. Its yymmdd form names both the folder under
# data/raw_tables and the '<yymmdd>_type1.csv' file inside that folder.
dat = '2020-09-06'
stamp = dat.replace('-', '')[2:]
PATH = Path.cwd().parent / 'data' / 'raw_tables' / stamp
df = pd.read_csv(PATH / (stamp + "_type1.csv"))

# Parse the stored HTML of the first row whose num equals 1080.
page_html = df.loc[df.num == 1080, 'html'].iloc[0]
soup = BeautifulSoup(page_html, 'html.parser')

### 2. Regex Parsing 하기

In [6]:
# The pattern is long, so it is written as two pieces for readability.
# Raw strings hand every backslash to the regex engine unchanged; in a normal
# string literal, sequences such as \> or \s are invalid Python escapes and
# trigger a SyntaxWarning on recent interpreters.
# Named groups: num, rsd, dat, pth, sym, and the optional atn.
regex = r'tle"\>\#(?P<num>.+?)\s확진자\t\((?P<rsd>.+?)\).+?\/\s?(?P<dat>.+?)\<\/div.+?<li>감염경로\s\:\s?'
regex += r'(?P<pth>.+?)\<\/li\>\s\<li\>(증상발현일\s\:\s)?(?P<sym>.+?)\<\/li>(.+?조치\s\:\s(?P<atn>.+?)\<\/li\>)?'

# Every element matching '#tab-1 .acodList' in the parsed page.
rows = soup.select('#tab-1 .acodList')

# Search each element's serialized HTML with the pattern built above;
# an element the pattern does not match yields None.
pattern = re.compile(regex)
infos = [pattern.search(str(item)) for item in rows]

### 3. 저장할 DataFrame 미리 만들기

In [7]:
# Column names, in output order, for the patient table and the route table.
pat_col = [
    'num', 'dat', 'pth', 'syy', 'syd',
    'cnd', 'rsd', 'atn', 'dch', 'adt',
]
rou_col = [
    'num', 'ord', 'dat', 'rgl', 'rgt',
    'frm', 'exd', 'msk', 'ste', 'mob',
]

# Both tables start empty and are filled by the preprocessing loop.
patient, route = (pd.DataFrame(columns=cols) for cols in (pat_col, rou_col))

### 4. 귀찮은 전처리해주기

In [10]:
# Per-patient frames are collected in lists and concatenated once after the
# loop; calling pd.concat inside the loop copies the growing tables every pass.
new_patients = []
new_routes = []

# infos was built element-by-element from rows, so the two are walked together.
# total= lets tqdm show progress against the known number of entries.
for row, info in tqdm(zip(rows, infos), total=len(infos)):

    # Entries the regex could not match are None and carry no information.
    if info is None:
        continue

    # Patient id: '1080' followed by the case number left-padded with zeros
    # to five characters.
    num = '1080' + info.group('num').rjust(5, '0')
    sym = info.group('sym')
    syy = sym != '증상 없음'    # False only when the text is exactly '증상 없음'
    syd = sym if syy else ''   # keep the captured text only when syy is True
    dch = ''                   # no information available: store empty strings
    adt = ''
    rsd, cnd, pth, atn = info.group('rsd'), info.group('dat'), info.group('pth'), info.group('atn')
    new_patients.append(
        pd.DataFrame([(num, dat, pth, syy, syd, cnd, rsd, atn, dch, adt)], columns=pat_col)
    )

    # read_html is given a file-like object: passing literal HTML text is
    # deprecated in recent pandas releases.
    table_html = str(row.select('table')[0])
    new_route = pd.read_html(StringIO(table_html))[0].replace('※ 관내 동선 없음', '')
    new_route['rgl'] = new_route.지역 + new_route.위치
    new_route['frm'] = ''
    new_route['msk'] = ''
    new_route['num'] = num
    new_route['ord'] = range(1, len(new_route) + 1)
    new_route['dat'] = dat
    new_route['소독여부'] = [int(r == '완료') for r in new_route.소독여부]
    new_route = new_route.drop(['지역', '위치'], axis=1)
    # Positional rename: the first four names go to whatever table columns
    # remain after the drop, in their original order, and the last six to the
    # columns added above.
    # NOTE(review): relies on the source table's column order - confirm if the page layout changes.
    new_route.columns = ['rgt', 'exd', 'mob', 'ste', 'rgl', 'frm', 'msk', 'num', 'ord', 'dat']
    new_route = new_route.loc[:, rou_col]
    new_routes.append(new_route)

# Append everything gathered in this run to the existing tables in one step.
patient = pd.concat([patient] + new_patients)
route = pd.concat([route] + new_routes)

143it [00:00, 255.61it/s]


### 5. 결과 잘 나왔나 확인하기

In [11]:
# Display the accumulated patient table as the cell output.
patient

Unnamed: 0,num,dat,pth,syy,syd,cnd,rsd,atn,dch,adt
0,108000213,2020-09-06,확진자 접촉,True,9.4,2020-09-06,세곡동,,,
0,108000213,2020-09-06,확진자 접촉,True,9.4,2020-09-06,세곡동,,,
0,108000212,2020-09-06,확진자 접촉,False,,2020-09-05,개포동,,,
0,108000211,2020-09-06,확진자 접촉,True,8.29,2020-09-05,역삼동,,,
0,108000210,2020-09-06,확진자 접촉,True,8.31,2020-09-05,개포동,,,
0,108000209,2020-09-06,미상,True,9.3,2020-09-04,개포동,,,
0,108000208,2020-09-06,미상,False,,2020-09-04,개포동,,,
0,108000207,2020-09-06,확진자 접촉,False,,2020-09-04,압구정동,,,
0,108000206,2020-09-06,확진자 접촉,False,,2020-09-04,압구정동,,,
0,108000205,2020-09-06,확진자 접촉,False,,2020-09-04,압구정동,,,


In [12]:
# Display the accumulated route table as the cell output.
route

Unnamed: 0,num,ord,dat,rgl,rgt,frm,exd,msk,ste,mob
0,108000213,1,2020-09-06,,,,,,0,
0,108000212,1,2020-09-06,대치동대치역 3번 출구,마트,,- 8.29(토) 16:11,,1,자차
1,108000212,2,2020-09-06,개포동개포동역 6번 출구,카페,,- 8.29(토) 16:21,,1,자차
2,108000212,3,2020-09-06,대치동선릉역 1번 출구,음식점,,- 8.30(일) 19:00~20:00,,1,자차
3,108000212,4,2020-09-06,대치동대치역 3번 출구,마트,,- 8.31(월) 15:39,,1,자차
...,...,...,...,...,...,...,...,...,...,...
6,108000175,7,2020-09-06,논현동언주역 3번 출구,헬스장,,- 8.28(금) 19:30~21:30,,1,도보
7,108000175,8,2020-09-06,논현동언주역 2번 출구,병의원,,- 8.29(토) 10:30,,1,도보
8,108000175,9,2020-09-06,논현동언주역 2번 출구,약국,,- 8.29(토) 10:35,,1,도보
9,108000175,10,2020-09-06,논현동학동역 4번 출구,마트,,- 8.29(토) 10:39,,1,도보
