# 강북구 (1082)
- 작성자 : 이준석
- 작성일자 : 2020년 9월 7일

In [2]:
from bs4 import BeautifulSoup
import datetime as dt
import pandas as pd
from pathlib import Path
import numpy as np
import re
from tqdm import tqdm

### 1. Raw File 불러오기

In [3]:
# Crawl date in ISO form. The raw-table folder and the CSV file name both use
# its compact YYMMDD form, so derive that once.
dat = '2020-09-06'
yymmdd = dat.replace('-', '')[2:]

PATH = Path.cwd().parent / 'data' / 'raw_tables' / yymmdd
df = pd.read_csv(PATH / f"{yymmdd}_type1.csv")

# Parse the stored HTML of the first CSV row whose `num` is 1082 (the id shown
# next to 강북구 in this notebook's title).
district_rows = df[df.num == 1082]
soup = BeautifulSoup(district_rows.html.iloc[0], 'html.parser')

### 2. Regex Parsing 하기

In [4]:
# Regex for one route entry inside a row's text. It is written with raw
# strings so that `\s`, `\(`, `\)` and `\|` reach the regex engine as-is
# instead of relying on Python's deprecated pass-through of unknown string
# escapes. The resulting pattern text is identical to the previous one.
# Entry layout being matched:
#   "\n  <rgt>(<frm>) | <rgl> | <exd> | <msk>, <ignored> | <ste> "
regex = (
    r"\n\s+?(?P<rgt>.+?)\((?P<frm>.+?)\)\s\|\s(?P<rgl>.+?)\s\|\s(?P<exd>.+?)"
    r"\s\|\s(?P<msk>.+?),\s.+?\s?\|\s(?P<ste>.+?)\s"
)
# Compile once; the same pattern is applied to every row below.
route_pattern = re.compile(regex)

# CSS path down to the <ul> inside the item box; its direct <li> children are
# the rows processed later.
selector = (
    '#corona_container > main > div.contain_wrap.corona_wrap > strong > '
    'strong > div > div.sectionbox > div > div > div.itembox > ul'
)
rows = soup.select(selector + '> li')

# One list per row holding (rgt, frm, rgl, exd, msk, ste) tuples. A row with
# no matching entry yields an empty list, never None.
infos = [route_pattern.findall(r.text) for r in rows]

### 3. 저장할 DataFrame 미리 만들기

In [5]:
# Column layout of the patient table (one record per confirmed case).
pat_col = [
    'num', 'dat', 'pth', 'syy', 'syd',
    'cnd', 'rsd', 'atn', 'dch', 'adt',
]
# Column layout of the route table (one record per visited place).
rou_col = [
    'num', 'ord', 'dat', 'rgl', 'rgt',
    'frm', 'exd', 'msk', 'ste', 'mob',
]

# Empty frames with the final column order; parsed records are appended later.
patient, route = (pd.DataFrame(columns=cols) for cols in (pat_col, rou_col))

### 4. 귀찮은 전처리해주기

In [6]:
# Build one patient record and its route records for every scraped row, then
# append them to `patient` / `route`.
#
# - `re.findall` always returns a list (never None), so a row without any
#   regex match arrives here as an empty list; it still produces a patient
#   record plus a single blank placeholder route record.
# - The per-row frames are collected in lists and concatenated once after the
#   loop, instead of re-copying the growing tables on every iteration.
# - Row indexes are intentionally not reset, so the resulting tables keep the
#   same index layout as before (0 for each patient, 0..k-1 per route group).
pat_frames = []
rou_frames = []

# zip() has no length, so pass `total` to let tqdm draw a real progress bar.
for row, info in tqdm(zip(rows, infos), total=len(rows)):

    # Patient number: the title1 text with its first 4 characters dropped.
    num_r = int(row.select('li.title1 > p')[0].text[4:])
    # Id = '1082' followed by the patient number left-padded with zeros to 5 digits.
    num = '1082' + str(num_r).rjust(5, '0')

    # These fields are not extracted from this page; keep them blank.
    syy = ''
    syd = ''
    rsd = ''

    # NOTE(review): the year is hard-coded to 2020; the page text only
    # supplies the rest of the date.
    cnd = pd.to_datetime('2020/' + row.select('li.title2 > p')[0].text)
    pth = row.select('li.title3 > p')[0].text.strip()
    adt = row.select('li.title4 > p')[0].text.strip()
    atn = row.select('li.title5 > p')[0].text.strip()
    # 1 only when the status text is exactly '퇴원(완치)', otherwise 0.
    dch = int(atn == '퇴원(완치)')

    pat_frames.append(
        pd.DataFrame(
            [(num, dat, pth, syy, syd, cnd, rsd, atn, dch, adt)],
            columns=pat_col,
        )
    )

    if not info:
        # No route entry was parsed: emit one placeholder row so that every
        # patient appears in the route table.
        new_route = pd.DataFrame(
            columns=rou_col,
            data=[[num, 1, dat, '', '', '', '', '', '', '']],
        )
    else:
        # Tuple order follows the regex group order.
        new_route = pd.DataFrame(
            info, columns=['rgt', 'frm', 'rgl', 'exd', 'msk', 'ste']
        )
        # Turn the two status texts into 0/1 flags (exact-match only).
        new_route['msk'] = [int(v == '마스크 착용') for v in new_route.msk]
        new_route['ste'] = [int(v == '소독완료') for v in new_route.ste]
        new_route['num'] = num
        new_route['ord'] = range(1, len(new_route) + 1)
        new_route['dat'] = dat
        new_route['mob'] = ''
        new_route = new_route.loc[:, rou_col]

    rou_frames.append(new_route)

# Single concat per table; the existing (possibly empty) tables stay first.
patient = pd.concat([patient, *pat_frames])
route = pd.concat([route, *rou_frames])

125it [00:01, 119.74it/s]


### 5. 결과 잘 나왔나 확인하기

In [7]:
# Display the patient table to eyeball the parsed result.
patient

Unnamed: 0,num,dat,pth,syy,syd,cnd,rsd,atn,dch,adt
0,108200125,2020-09-06,강북#79 가족,,,2020-09-06,,-,0,없음
0,108200124,2020-09-06,특정 불가,,,2020-09-05,,-,0,1명
0,108200123,2020-09-06,타구 확진자,,,2020-09-05,,-,0,2명(그외 2)
0,108200122,2020-09-06,특정 불가,,,2020-09-03,,코이카생활치료센터,0,특정 불가
0,108200121,2020-09-06,확인 중,,,2020-09-03,,서울적십자병원,0,확인 중
...,...,...,...,...,...,...,...,...,...,...
0,108200005,2020-09-06,해외접촉(필리핀),,,2020-03-12,,퇴원(완치),1,"가족 2명(음성, 격리해제)"
0,108200004,2020-09-06,강북 #1 확진자,,,2020-03-06,,퇴원(완치),1,없음
0,108200003,2020-09-06,강북 #1 확진자,,,2020-03-06,,퇴원(완치),1,"4명(음성, 격리해제)"
0,108200002,2020-09-06,확인 중,,,2020-03-04,,퇴원(완치),1,"9명(음성, 격리해제)"


In [8]:
# Display the route table to eyeball the parsed result.
route

Unnamed: 0,num,ord,dat,rgl,rgt,frm,exd,msk,ste,mob
0,108200125,1,2020-09-06,,,,,,,
0,108200124,1,2020-09-06,강북구 삼양로,의원,상호 비공개,8.27.(목) 10:05 - 10:15,1,1,
1,108200124,2,2020-09-06,강북구 삼양로,의원,상호 비공개,8.31.(월) 09:40 - 11:50,1,1,
2,108200124,3,2020-09-06,강북구 삼양로,약국,상호 비공개,8.27.(목) 10:15 - 10:20,1,1,
3,108200124,4,2020-09-06,강북구 삼양로,약국,상호 비공개,9.4.(금) 10:35 - 10:40,1,1,
...,...,...,...,...,...,...,...,...,...,...
0,108200005,1,2020-09-06,,,,,,,
0,108200004,1,2020-09-06,,,,,,,
0,108200003,1,2020-09-06,,,,,,,
0,108200002,1,2020-09-06,,,,,,,
