In [2]:
import pandas as pd
import numpy as np

In [3]:
import os
from os import fdopen, remove, walk
import glob
from tempfile import mkstemp
import shutil
from shutil import move, copymode

In [4]:
from datetime import date

In [5]:
# Folder holding the Seoul COVID-19 CSV snapshots. The location can be
# overridden with the COVID19_DATASET_PATH environment variable so the notebook
# is not tied to one machine; without the variable the original path is used.
DATASET_PATH = os.environ.get(
    "COVID19_DATASET_PATH", "/Users/noopy/covid19_unknown_spread/dataset"
)
datasets = glob.glob(f"{DATASET_PATH}/*.csv")
datasets[:5]

['/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_31_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_9_07_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_30_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_17_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_21_.csv']

In [6]:
# Find the most recently modified file(s) below a folder.
import heapq
import os


def newst_files_in_tree(rootfolder, count=1, extension=".csv"):
    """Return the most recently modified files found under ``rootfolder``.

    The whole directory tree is walked. Only file names ending with
    ``extension`` are considered, and files are ranked by modification time
    (``st_mtime``).

    Args:
        rootfolder: Directory whose tree is searched.
        count: Maximum number of paths to return.
        extension: Required file-name suffix.

    Returns:
        A list of at most ``count`` paths, newest first. The list is empty
        when no file matches.
    """
    candidates = []
    for dirname, _subdirs, filenames in os.walk(rootfolder):
        for filename in filenames:
            if filename.endswith(extension):
                candidates.append(os.path.join(dirname, filename))

    def modified_at(path):
        return os.stat(path).st_mtime

    return heapq.nlargest(count, candidates, key=modified_at)

In [7]:
# Pick the most recently modified CSV snapshot. Fail with a clear message when
# the dataset folder holds no CSV at all instead of an opaque IndexError.
newest_candidates = newst_files_in_tree(DATASET_PATH)
if not newest_candidates:
    raise FileNotFoundError(f"No .csv files found under {DATASET_PATH}")
newst_csv = newest_candidates[0]

In [8]:
# Load the most recent snapshot and show 15 randomly chosen rows as a spot check.
df_temp = pd.read_csv(newst_csv, encoding="utf-8")
df_temp.sample(15)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
810,3716,19555,8.29.,영등포구,-,여의도 순복음교회 관련,퇴원
2474,2052,15126,8.16.,성북구,-,성북구 사랑제일교회 관련,퇴원
3202,1324,12856,6.30.,서초구,-,강남역삼동 금융회사,퇴원
2065,2461,16327,8.19.,은평구,-,광화문집회 관련,퇴원
4125,401,9516,3.28.,강남구,영국,해외 접촉 추정,퇴원
1838,2688,16834,8.21.,은평구,-,확인 중,퇴원
2934,1592,14264,7.29.,구로구,-,확인 중,퇴원
40,4486,21643,9.09.,서초구,-,확인 중,
4334,192,7800,3.10.,관악구,-,콜센터직원 접촉,퇴원
4070,456,9929,3.31.,종로구,미국 뉴욕,해외 접촉 추정,퇴원


In [9]:
# Collapse every contact-history value that mentions "확인" into the single
# canonical label "확인 중".
# - regex=False: match the text literally.
# - na=False: missing values count as "no match", so the boolean mask never
#   contains NaN (which .loc would reject).
is_unconfirmed = df_temp["접촉력"].str.contains("확인", regex=False, na=False)
df_temp.loc[is_unconfirmed, "접촉력"] = "확인 중"

# Sanity check: the matching rows should now carry exactly one distinct label.
df_temp.loc[is_unconfirmed, "접촉력"].unique()

array(['확인 중'], dtype=object)

In [10]:
# Remove the filler word "관련" from the contact-history labels, then trim the
# whitespace left at either end.
df_temp["접촉력"] = (
    df_temp["접촉력"]
    .str.replace("관련", "")
    .str.strip()
)

In [11]:
# Collect the distinct contact-history labels left after cleaning, print how
# many there are, and display them.
infection_paths = df_temp["접촉력"].unique()
print(len(infection_paths))
infection_paths

234


array(['서대문구 세브란스 병원', '타시도 확진자 접촉', '강동구 BF모바일 콜센터', '기타 확진자 접촉', '확인 중',
       '영등포 지인모임', '8.15서울도심집회', '영등포구 일련정종 서울포교소', '종로구청 근로자',
       '해외 접촉 추정', '송파구 쿠팡 물류센터', 'KT가좌지사', '도봉구 운동시설', '성북구 사랑제일교회',
       '노원구 기도모임', '관악구 가족모임', '동작구 JH글로벌', '노원구 빛가온교회', '영등포구 국회출입기자',
       '은평구 수색성당', '광진구 혜민병원', '은평구 헤어콕', '송파구 소재병원', '극단 산', '영등포구 권능교회',
       '서초구 장애인교육시설', '강동구 소재 병원', '다래경매', '성북구 체대입시', '성북구 요양시설',
       '중구소재 은행', '중랑구 소재 체육시설', '관악구 에바다', '동작구 소재 서울신학교', '용인시 우리제일교회',
       '노원구 손해보험', '구로구 보성운수', '강서구 서울대효요양병원', '서대문구 지인모임', '강북구 일가족',
       '8.15도심집회', '강서구 보안회사', '롯데리아 종사자 모임', '동작구 카드 발급업체', '영등포구 큰권능교회',
       '구로구 아파트', '강남구 소재 아파트', '제주 게스트하우스', '성북구 벧엘장로교회', '관악구 김혜근의원',
       '동대문구 sk탁구클럽', '동작구 스터디카페', '군인권센터', '동작구 요양시설', '여의도 순복음교회',
       '중구 보험회사(현대해상)', '강서구 병원', '8.15도심집회(순복음 강북교회)', '중앙보훈병원',
       '종로구 혜화경찰서', '양천구 되새김교회', '현대커머셜', '8.15도심집회(녹색병원)',
       '영등포 IFC몰 오케스트로', '고양시 반석교회(케네디상가)', '롯데 자산개발', '골드트레인', '한양대병원',
       '순복음 강북교회'

In [12]:
# Order the cases by their serial number ("연번"), highest first.
df_date = df_temp.sort_values(by="연번", ascending=False)
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4526,21737,9.09.,은평구,-,서대문구 세브란스 병원,
1,4525,21719,9.09.,타시도,-,타시도 확진자 접촉,
2,4524,21709,9.09.,강동구,-,강동구 BF모바일 콜센터,
3,4523,21704,9.09.,은평구,-,기타 확진자 접촉,
4,4522,21702,9.09.,강동구,-,강동구 BF모바일 콜센터,


In [13]:
# Turn "M.DD." style dates into "M-DD".
# - rstrip(".") removes only a trailing dot, so running the cell a second time
#   cannot chop off a digit the way a blind [:-1] slice would.
# - regex=False treats "." as a literal dot on every pandas version instead of
#   depending on the version-specific default of the `regex` argument.
df_date["확진일"] = (
    df_date["확진일"]
    .str.rstrip(".")
    .str.replace(".", "-", regex=False)
)
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4526,21737,9-09,은평구,-,서대문구 세브란스 병원,
1,4525,21719,9-09,타시도,-,타시도 확진자 접촉,
2,4524,21709,9-09,강동구,-,강동구 BF모바일 콜센터,
3,4523,21704,9-09,은평구,-,기타 확진자 접촉,
4,4522,21702,9-09,강동구,-,강동구 BF모바일 콜센터,


In [14]:
# Build "2020-MM-DD" strings from the "M-D" values.
# Month and day are zero-padded separately, so two-digit months (October to
# December) come out right; blindly prefixing "2020-0" would turn "10-05" into
# "2020-010-05". The optional "2020-" in the pattern makes the cell safe to
# re-run on values that were already converted.
# The year is fixed to 2020 because the source values carry no year.
month_day = df_date["확진일"].str.extract(r"^(?:2020-)?(\d{1,2})-(\d{1,2})$")
iso_dates = "2020-" + month_day[0].str.zfill(2) + "-" + month_day[1].str.zfill(2)

# Values that do not look like "M-D" keep the previous "2020-0" prefix so that
# nothing is silently dropped.
df_date["확진일"] = iso_dates.fillna("2020-0" + df_date["확진일"])
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4526,21737,2020-09-09,은평구,-,서대문구 세브란스 병원,
1,4525,21719,2020-09-09,타시도,-,타시도 확진자 접촉,
2,4524,21709,2020-09-09,강동구,-,강동구 BF모바일 콜센터,
3,4523,21704,2020-09-09,은평구,-,기타 확진자 접촉,
4,4522,21702,2020-09-09,강동구,-,강동구 BF모바일 콜센터,


In [15]:
df_date.sample(5)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
2758,1768,14876,2020-08-13,은평구,-,기타 확진자 접촉,퇴원
84,4442,21471,2020-09-07,서초구,-,기타 확진자 접촉,
3546,980,11785,2020-06-06,성동구,-,이태원 클럽,퇴원
1538,2988,17691,2020-08-24,마포구,-,은평구 헤어콕,
4323,203,7842,2020-03-11,강서구,-,콜센터직원 접촉,퇴원


In [16]:
# Continue on an independent copy so later edits to df do not modify df_date.
df = df_date.copy()
df.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4526,21737,2020-09-09,은평구,-,서대문구 세브란스 병원,
1,4525,21719,2020-09-09,타시도,-,타시도 확진자 접촉,
2,4524,21709,2020-09-09,강동구,-,강동구 BF모바일 콜센터,
3,4523,21704,2020-09-09,은평구,-,기타 확진자 접촉,
4,4522,21702,2020-09-09,강동구,-,강동구 BF모바일 콜센터,


In [17]:
# Dates shorter than the 10 characters of "YYYY-MM-DD" are missing a zero pad.
# For those rows only: first drop the zero after every dash, then put a zero
# after every dash, e.g. "2020-03-1" -> "2020-3-1" -> "2020-03-01".
# The first step only shortens the strings, so the same rows stay selected and
# one mask serves both steps.
short_dates = df["확진일"].str.len() < 10
df.loc[short_dates, "확진일"] = (
    df.loc[short_dates, "확진일"]
    .str.replace("-0", "-")
    .str.replace("-", "-0")
)
df.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4526,21737,2020-09-09,은평구,-,서대문구 세브란스 병원,
1,4525,21719,2020-09-09,타시도,-,타시도 확진자 접촉,
2,4524,21709,2020-09-09,강동구,-,강동구 BF모바일 콜센터,
3,4523,21704,2020-09-09,은평구,-,기타 확진자 접촉,
4,4522,21702,2020-09-09,강동구,-,강동구 BF모바일 콜센터,


In [18]:
df.sample(15)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
357,4169,20679,2020-09-03,노원구,-,도봉구 운동시설,
3063,1463,13688,2020-07-17,송파구,-,관악구 사무실,퇴원
815,3711,19576,2020-08-29,기타,-,영등포구 큰권능교회,
425,4101,20545,2020-09-02,노원구,-,기타 확진자 접촉,
3749,777,11220,2020-05-25,강서구,-,#11196 접촉,퇴원
2313,2213,15790,2020-08-18,마포구,-,성북구 사랑제일교회,퇴원
1819,2707,16796,2020-08-21,양천구,-,확인 중,
1100,3426,19022,2020-08-27,관악구,-,확인 중,
1051,3475,19015,2020-08-27,송파구,-,기타 확진자 접촉,퇴원
1062,3464,19060,2020-08-27,은평구,-,기타 확진자 접촉,퇴원


In [19]:
# Persist the cleaned case-level table. The output folder is created first
# because to_csv does not create missing directories.
os.makedirs("./dataset_predict", exist_ok=True)
df.to_csv("./dataset_predict/df_wrangle.csv", index=False)

In [20]:
# Count the confirmed cases per confirmation date and expose the result as a
# two-column frame: date, new_confirmed.
temp = (
    df["확진일"]
    .value_counts()
    .rename_axis("date")
    .reset_index(name="new_confirmed")
)
temp.head()

Unnamed: 0,date,new_confirmed
0,2020-08-29,167
1,2020-08-27,160
2,2020-08-26,157
3,2020-08-18,151
4,2020-08-15,146


In [21]:
# value_counts orders by frequency; put the rows back in date order.
df_predict = temp.sort_values("date")
df_predict.head()

Unnamed: 0,date,new_confirmed
198,2020-01-24,1
166,2020-01-30,3
173,2020-01-31,3
189,2020-02-02,1
181,2020-02-05,2


In [22]:
# Parse the date strings into datetime values.
df_predict["date"] = pd.to_datetime(df_predict["date"])

In [23]:
# Index the frame by date; the "date" column itself is kept as well.
df_predict.index = pd.DatetimeIndex(df_predict["date"])
df_predict.head()

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1
2020-01-30,2020-01-30,3
2020-01-31,2020-01-31,3
2020-02-02,2020-02-02,1
2020-02-05,2020-02-05,2


In [24]:
# Resample to one row per calendar day. Days without any confirmed case appear
# as rows with missing values (NaT / NaN).
df_predict = df_predict.resample('D').max()
df_predict.head()

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,NaT,
2020-01-26,NaT,
2020-01-27,NaT,
2020-01-28,NaT,


In [25]:
# Rebuild the "date" column from the index so the inserted days get a date too.
df_predict["date"] = df_predict.index
df_predict.head()

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,2020-01-25,
2020-01-26,2020-01-26,
2020-01-27,2020-01-27,
2020-01-28,2020-01-28,


In [26]:
# Replace the remaining missing values with 0 (days without confirmed cases).
df_predict = df_predict.fillna(0)
df_predict

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,2020-01-25,0.0
2020-01-26,2020-01-26,0.0
2020-01-27,2020-01-27,0.0
2020-01-28,2020-01-28,0.0
...,...,...
2020-09-05,2020-09-05,58.0
2020-09-06,2020-09-06,44.0
2020-09-07,2020-09-07,76.0
2020-09-08,2020-09-08,30.0


In [27]:
# Write the daily counts to disk; index=False leaves the DatetimeIndex out of the file.
df_predict.to_csv("dataset_predict/df_predict.csv", index=False)

In [28]:
# Load the file back with a default integer index.
# NOTE(review): without parse_dates the "date" column comes back as plain
# strings — presumably intended, since it is later merged with string dates.
df_predict =  pd.read_csv("dataset_predict/df_predict.csv", encoding="utf-8")

In [29]:
# Keep only the cases whose contact history is exactly the label "확인 중".
is_untracked = df["접촉력"] == "확인 중"
df_untracked = df[is_untracked]
df_untracked

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
7,4519,21642,2020-09-09,도봉구,-,확인 중,
16,4510,21691,2020-09-09,종로구,-,확인 중,
24,4502,21721,2020-09-09,관악구,-,확인 중,
25,4501,21720,2020-09-09,관악구,-,확인 중,
28,4498,21649,2020-09-09,은평구,-,확인 중,
...,...,...,...,...,...,...,...
4470,56,1768,2020-02-25,고양시,-,확인 중,퇴원
4484,42,1370,2020-02-25,노원구,-,확인 중,퇴원
4487,39,924,2020-02-25,평택,-,확인 중,퇴원
4488,38,907,2020-02-25,관악구,-,확인 중,퇴원


In [30]:
# Count the untracked cases per confirmation date and expose the result as a
# two-column frame: date, new_untracked.
df_untracked_temp = (
    df_untracked
    .groupby(by=["확진일"])
    .size()
    .rename_axis("date")
    .reset_index(name="new_untracked")
)
df_untracked_temp

Unnamed: 0,date,new_untracked
0,2020-02-25,6
1,2020-02-26,1
2,2020-02-27,2
3,2020-02-28,2
4,2020-03-02,2
...,...,...
124,2020-09-05,19
125,2020-09-06,13
126,2020-09-07,19
127,2020-09-08,11


In [31]:
# Second name for the same frame (no copy is made).
df_untracked_no = df_untracked_temp

In [32]:
# Attach the untracked counts to the daily series. Days without a matching
# untracked row are filled with 0 after the left join.
df_merged = (
    df_predict
    .merge(df_untracked_no, how="left", on="date")
    .fillna(0)
)
df_merged.head(15)

Unnamed: 0,date,new_confirmed,new_untracked
0,2020-01-24,1.0,0.0
1,2020-01-25,0.0,0.0
2,2020-01-26,0.0,0.0
3,2020-01-27,0.0,0.0
4,2020-01-28,0.0,0.0
5,2020-01-29,0.0,0.0
6,2020-01-30,3.0,0.0
7,2020-01-31,3.0,0.0
8,2020-02-01,0.0,0.0
9,2020-02-02,1.0,0.0


In [33]:
df_merged.tail(15)

Unnamed: 0,date,new_confirmed,new_untracked
215,2020-08-26,157.0,44.0
216,2020-08-27,160.0,38.0
217,2020-08-28,118.0,34.0
218,2020-08-29,167.0,23.0
219,2020-08-30,70.0,17.0
220,2020-08-31,104.0,18.0
221,2020-09-01,87.0,20.0
222,2020-09-02,64.0,12.0
223,2020-09-03,46.0,6.0
224,2020-09-04,55.0,6.0


In [34]:
# Divisor applied to the number of generic "contact with another confirmed
# case" entries; the original remark calls it R0.
# NOTE(review): confirm where the value 1.5 comes from.
R0_ESTIMATE = 1.5

# Loop-invariant: which cases carry the generic contact label. Computed once
# instead of on every iteration; na=False keeps NaN out of the mask.
is_generic_contact = df["접촉력"].str.contains("기타 확진자 접촉", regex=False, na=False)

list_infection_paths_no = []
for filter_end_date in df_merged["date"]:
    # Cases confirmed on or before this day. Both sides are "YYYY-MM-DD"
    # strings here, which compare in chronological order.
    on_or_before = df["확진일"] <= filter_end_date

    # Number of distinct contact-history labels seen so far.
    # NOTE(review): every label counts as one path, including "확인 중" —
    # confirm that this is intended.
    n_distinct_labels = df.loc[on_or_before, "접촉력"].nunique(dropna=False)

    # Generic contact cases so far, scaled down by the assumed R0.
    contact_no = (on_or_before & is_generic_contact).sum() / R0_ESTIMATE

    list_infection_paths_no.append(int(n_distinct_labels + contact_no))
print(list_infection_paths_no[:10])
print(list_infection_paths_no[-10:])

[1, 1, 1, 1, 1, 1, 2, 4, 4, 4]
[495, 509, 528, 536, 552, 563, 572, 585, 594, 603]


In [35]:
# Add the per-day path estimate, then turn the float counts (floats only
# because of the earlier fillna) back into integers with a vectorized cast.
df_merged["no_paths"] = list_infection_paths_no
for count_column in ("new_confirmed", "new_untracked"):
    df_merged[count_column] = df_merged[count_column].astype("int64")
df_merged.head()

Unnamed: 0,date,new_confirmed,new_untracked,no_paths
0,2020-01-24,1,0,1
1,2020-01-25,0,0,1
2,2020-01-26,0,0,1
3,2020-01-27,0,0,1
4,2020-01-28,0,0,1


In [36]:
df_merged.tail()

Unnamed: 0,date,new_confirmed,new_untracked,no_paths
225,2020-09-05,58,19,563
226,2020-09-06,44,13,572
227,2020-09-07,76,19,585
228,2020-09-08,30,11,594
229,2020-09-09,47,9,603


In [37]:
# Write the merged daily table.
# NOTE(review): this is the same path df_predict was written to and read from
# earlier, so the file is overwritten with a wider schema; re-running the
# earlier read cell afterwards would load these columns instead.
df_merged.to_csv("dataset_predict/df_predict.csv", index=False)

### Check holidays and weekends

In [38]:
import json

calendar_file = "./korean-calendar/korean-calendar.json"

# Number of leading characters dropped from every line before parsing.
# NOTE(review): presumably this skips a fixed-width per-record prefix written
# by the export tool — confirm against the calendar file.
RECORD_PREFIX_LENGTH = 43

# The calendar holds Korean text (holiday names), so read it explicitly as
# UTF-8 instead of relying on the platform default encoding.
with open(calendar_file, 'r', encoding="utf-8") as myfile:
    data = [line.strip() for line in myfile]

list_calendar = []
for i in data:
    if not i:
        # A blank line (e.g. a trailing newline) carries no record.
        continue
    # Re-open the JSON object after cutting off the prefix, then parse it.
    calendar_item = "{" + i[RECORD_PREFIX_LENGTH:]
    list_calendar.append(json.loads(calendar_item))
list_calendar[:10]

[{'sc': '2000-01-01', 'lc': '1999-11-25', 'w': 7, 'h': True, 'ht': '신정'},
 {'sc': '2000-01-02', 'lc': '1999-11-26', 'w': 1, 'h': False, 'ht': ''},
 {'sc': '2000-01-03', 'lc': '1999-11-27', 'w': 2, 'h': False, 'ht': ''},
 {'sc': '2000-01-04', 'lc': '1999-11-28', 'w': 3, 'h': False, 'ht': ''},
 {'sc': '2000-01-05', 'lc': '1999-11-29', 'w': 4, 'h': False, 'ht': ''},
 {'sc': '2000-01-06', 'lc': '1999-11-30', 'w': 5, 'h': False, 'ht': ''},
 {'sc': '2000-01-07', 'lc': '1999-12-01', 'w': 6, 'h': False, 'ht': ''},
 {'sc': '2000-01-08', 'lc': '1999-12-02', 'w': 7, 'h': False, 'ht': ''},
 {'sc': '2000-01-09', 'lc': '1999-12-03', 'w': 1, 'h': False, 'ht': ''},
 {'sc': '2000-01-10', 'lc': '1999-12-04', 'w': 2, 'h': False, 'ht': ''}]

In [39]:
# Tabulate the calendar records and keep only the date ("sc"), the
# day-of-week code ("w") and the holiday flag ("h").
calendar_columns = ["sc", "w", "h"]
df_calendar = pd.DataFrame(list_calendar)[calendar_columns]
df_calendar.head()

Unnamed: 0,sc,w,h
0,2000-01-01,7,True
1,2000-01-02,1,False
2,2000-01-03,2,False
3,2000-01-04,3,False
4,2000-01-05,4,False


In [40]:
def label_weekend(row):
    """Tell whether a calendar row counts as a weekend day.

    The day-of-week code ``row['w']`` is compared against 1 (Sunday),
    6 (Friday) and 7 (Saturday).

    NOTE(review): Friday is treated as part of the weekend here — confirm
    that this is intended.

    Args:
        row: Mapping-like object exposing the day-of-week code under ``'w'``.

    Returns:
        True for codes 1, 6 and 7, otherwise False.
    """
    weekend_codes = (1, 6, 7)  # sunday, friday, saturday
    return any(row['w'] == code for code in weekend_codes)

In [41]:
# Flag every calendar row as weekend / not weekend, one row at a time.
df_calendar["is_weekend"] = df_calendar.apply(label_weekend, axis=1)
df_calendar.head(10)

Unnamed: 0,sc,w,h,is_weekend
0,2000-01-01,7,True,True
1,2000-01-02,1,False,True
2,2000-01-03,2,False,False
3,2000-01-04,3,False,False
4,2000-01-05,4,False,False
5,2000-01-06,5,False,False
6,2000-01-07,6,False,True
7,2000-01-08,7,False,True
8,2000-01-09,1,False,True
9,2000-01-10,2,False,False


In [42]:
# Give the calendar columns descriptive names: sc -> date, h -> is_holiday.
calendar_column_names = {'sc': 'date', 'h': 'is_holiday'}
df_calendar = df_calendar.rename(columns=calendar_column_names)

In [43]:
# Save the full calendar table (without the index) for later reuse.
df_calendar.to_csv("dataset_predict/df_calendar.csv", index=False)

In [44]:
# Keep only the join key and the two flags used as features.
df_calendar_subset = df_calendar[["date", "is_holiday", "is_weekend"]]

In [45]:
# Enrich the daily series with the holiday / weekend flags of the same date.
df_train = df_merged.merge(df_calendar_subset, how="left", on="date")
df_train.tail()

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend
225,2020-09-05,58,19,563,False,True
226,2020-09-06,44,13,572,False,True
227,2020-09-07,76,19,585,False,False
228,2020-09-08,30,11,594,False,False
229,2020-09-09,47,9,603,False,False


In [46]:
df_train[-25:-15]

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend
205,2020-08-16,91,4,188,False,True
206,2020-08-17,132,7,192,False,False
207,2020-08-18,151,18,198,False,False
208,2020-08-19,135,19,217,False,False
209,2020-08-20,126,28,229,False,False
210,2020-08-21,128,32,247,False,True
211,2020-08-22,143,44,270,False,True
212,2020-08-23,98,25,291,False,True
213,2020-08-24,136,45,319,False,False
214,2020-08-25,116,35,342,False,False


In [47]:
df_train.tail(10)

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend
220,2020-08-31,104,18,495,False,False
221,2020-09-01,87,20,509,False,False
222,2020-09-02,64,12,528,False,False
223,2020-09-03,46,6,536,False,False
224,2020-09-04,55,6,552,False,True
225,2020-09-05,58,19,563,False,True
226,2020-09-06,44,13,572,False,True
227,2020-09-07,76,19,585,False,False
228,2020-09-08,30,11,594,False,False
229,2020-09-09,47,9,603,False,False


### Add Government Social Distancing Policy

In [51]:
# Start with policy level 0 on every day; specific periods are raised afterwards.
df_train["gov_policy"] = 0
df_train

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend,gov_policy
0,2020-01-24,1,0,1,True,True,0
1,2020-01-25,0,0,1,True,True,0
2,2020-01-26,0,0,1,True,True,0
3,2020-01-27,0,0,1,False,False,0
4,2020-01-28,0,0,1,False,False,0
...,...,...,...,...,...,...,...
225,2020-09-05,58,19,563,False,True,0
226,2020-09-06,44,13,572,False,True,0
227,2020-09-07,76,19,585,False,False,0
228,2020-09-08,30,11,594,False,False,0


In [77]:
# Seoul entered social-distancing level 2 on August 16.
# Additional measures took effect in Seoul on August 19 at 00:00, commonly
# called "level 2.5"; after an extension, level 2.5 applied from Sep 7 (Mon)
# 00:00 through Sep 13 (Sun) 24:00.
# Both date ranges below include their end points.

in_level_2 = df_train["date"].between("2020-08-16", "2020-08-18")
in_level_2_5 = df_train["date"].between("2020-08-19", "2020-09-13")

df_train.loc[in_level_2, "gov_policy"] = 2
df_train.loc[in_level_2_5, "gov_policy"] = 3
df_train.tail(30)

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend,gov_policy
200,2020-08-11,15,4,159,False,False,0
201,2020-08-12,26,5,163,False,False,0
202,2020-08-13,33,1,169,False,False,0
203,2020-08-14,73,4,175,False,True,0
204,2020-08-15,146,6,181,True,True,0
205,2020-08-16,91,4,188,False,True,2
206,2020-08-17,132,7,192,False,False,2
207,2020-08-18,151,18,198,False,False,2
208,2020-08-19,135,19,217,False,False,3
209,2020-08-20,126,28,229,False,False,3


In [78]:
# Save the final training table (without the index).
df_train.to_csv("./dataset_predict/df_train.csv", index=False)