In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from os import fdopen, remove, walk
import glob
from tempfile import mkstemp
import shutil
from shutil import move, copymode

In [3]:
from datetime import date

In [4]:
DATASET_PATH = "/Users/noopy/covid19_unknown_spread/dataset"
datasets = glob.glob(f"{DATASET_PATH}/*.csv")
datasets[:5]

['/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_31_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_9_07_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_30_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_17_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_21_.csv']

In [5]:
# get oldest file in the dataset folder
import os, heapq
def newst_files_in_tree(rootfolder, count=1, extension=".csv"):
    return heapq.nlargest(count,
        (os.path.join(dirname, filename)
        for dirname, dirnames, filenames in os.walk(rootfolder)
        for filename in filenames
        if filename.endswith(extension)),
        key=lambda fn: os.stat(fn).st_mtime)

In [6]:
newst_csv = newst_files_in_tree(DATASET_PATH)[0]

In [7]:
df_temp = pd.read_csv(newst_csv, encoding="utf-8")
df_temp.sample(15)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
3095,1334,12919,7.01.,동대문구,-,확인 중,퇴원
2990,1439,13542,7.14.,종로구,-,강남구 사무실 관련,퇴원
1442,2987,17756,8.24.,기타,-,다래경매 관련,
987,3442,18740,8.27.,성북구,-,성북구 사랑제일교회 관련,
3835,594,10442,4.09.,성동구,-,#10407 접촉,퇴원
2140,2289,15876,8.18.,송파구,-,골드트레인,
1808,2621,16421,8.20.,중랑구,-,해외 접촉 추정,
4164,265,8408,3.16.,관악구,-,확인중,퇴원
3534,895,11585,6.02.,강서구,-,삼성화재 관련,퇴원
3802,627,10693,4.21.,마포구,일본,해외 접촉 추정,퇴원


In [8]:
# overwrite whatever cell value that contains "확인" as "확인 중"
df_temp.loc[df_temp["접촉력"].str.contains("확인"),"접촉력"] = "확인 중"

# check whether those two arrays are combined
df_temp.loc[df_temp["접촉력"].str.contains("확인"),"접촉력"].unique()

array(['확인 중'], dtype=object)

In [9]:
df_temp["접촉력"] = df_temp["접촉력"].str.replace("관련", "")
df_temp["접촉력"] = df_temp["접촉력"].str.strip()

In [10]:
infection_paths = df_temp["접촉력"].unique()
print(len(infection_paths))
infection_paths

230


array(['확인 중', '영등포구 일련정종 포교소', '기타 확진자 접촉', '타시도 확진자 접촉',
       '강동구 BF모바일 콜센터', '영등포 지인모임', '노원구 빛가온교회', '송파구 쿠팡 물류센터',
       '영등포구 국회출입기자', '은평구 수색성당', '광진구 혜민병원', '은평구 헤어콕', '8.15도심집회',
       '서울 아산병원', '관악구 가족모임', '해외 접촉 추정', '극단 산', '동작구 카드 발급업체',
       '영등포구 큰권능교회', '서초구 장애인교육시설', '노원구 기도모임', '강동구 소재 병원', '성북구 사랑제일교회',
       '도봉구 운동시설', '다래경매', '성북구 체대입시', '성북구 요양시설', '중구소재 은행',
       '중랑구 소재 체육시설', '관악구 에바다', '동작구 소재 서울신학교', '용인시 우리제일교회', '노원구 손해보험',
       '구로구 보성운수', '강서구 서울대효요양병원', '서대문구 지인모임', '강북구 일가족', '강서구 보안회사',
       '롯데리아 종사자 모임', '구로구 아파트', '강남구 소재 아파트', '제주 게스트하우스', '성북구 벧엘장로교회',
       '관악구 김혜근의원', '동대문구 sk탁구클럽', '동작구 스터디카페', 'KT가좌지사', '군인권센터',
       '동작구 요양시설', '영등포구 권능교회', '여의도 순복음교회', '중구 보험회사(현대해상)', '강서구 병원',
       '8.15도심집회(순복음 강북교회)', '중앙보훈병원', '종로구 혜화경찰서', '양천구 되새김교회', '현대커머셜',
       '8.15도심집회(녹색병원)', '영등포 IFC몰 오케스트로', '고양시 반석교회(케네디상가)', '롯데 자산개발',
       '골드트레인', '한양대병원', '순복음 강북교회', '서대문구 지인 모임', '강동구 어린이집', '광화문집회',
       '은평구 성경공부모임', '관악구 요양

In [11]:
df_date = df_temp.sort_values(["연번"], ascending=False)
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4429,21428,9.07.,기타,-,확인 중,
1,4428,21429,9.07.,송파구,-,확인 중,
2,4427,21425,9.07.,노원구,-,확인 중,
3,4426,21420,9.07.,마포구,-,영등포구 일련정종 포교소,
4,4425,21414,9.07.,서초구,-,기타 확진자 접촉,


In [12]:
df_date["확진일"] = df_date["확진일"].str.replace(".", "-")
df_date["확진일"] = df_date["확진일"].str[:-1]
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4429,21428,9-07,기타,-,확인 중,
1,4428,21429,9-07,송파구,-,확인 중,
2,4427,21425,9-07,노원구,-,확인 중,
3,4426,21420,9-07,마포구,-,영등포구 일련정종 포교소,
4,4425,21414,9-07,서초구,-,기타 확진자 접촉,


In [13]:
df_date["확진일"] = "2020-0" + df_date["확진일"]
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4429,21428,2020-09-07,기타,-,확인 중,
1,4428,21429,2020-09-07,송파구,-,확인 중,
2,4427,21425,2020-09-07,노원구,-,확인 중,
3,4426,21420,2020-09-07,마포구,-,영등포구 일련정종 포교소,
4,4425,21414,2020-09-07,서초구,-,기타 확진자 접촉,


In [14]:
df_date.sample(5)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
2167,2262,16275,2020-08-18,동작구,-,성북구 사랑제일교회,퇴원
3695,734,11042,2020-05-16,송파구,-,이태원 클럽,퇴원
1426,3003,18038,2020-08-24,강북구,-,확인 중,
3592,837,11383,2020-05-28,강서구,-,부천시 쿠팡,퇴원
450,3979,20242,2020-08-31,양천구,-,동작구 카드 발급업체,


In [15]:
df = df_date.copy()
df.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4429,21428,2020-09-07,기타,-,확인 중,
1,4428,21429,2020-09-07,송파구,-,확인 중,
2,4427,21425,2020-09-07,노원구,-,확인 중,
3,4426,21420,2020-09-07,마포구,-,영등포구 일련정종 포교소,
4,4425,21414,2020-09-07,서초구,-,기타 확진자 접촉,


In [16]:
df.loc[(df["확진일"].str.len() < 10), '확진일'] = df.loc[(df["확진일"].str.len() < 10), '확진일'].str.replace("-0", "-")
df.loc[(df["확진일"].str.len() < 10), '확진일'] = df.loc[(df["확진일"].str.len() < 10), '확진일'].str.replace("-", "-0")
df.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4429,21428,2020-09-07,기타,-,확인 중,
1,4428,21429,2020-09-07,송파구,-,확인 중,
2,4427,21425,2020-09-07,노원구,-,확인 중,
3,4426,21420,2020-09-07,마포구,-,영등포구 일련정종 포교소,
4,4425,21414,2020-09-07,서초구,-,기타 확진자 접촉,


In [17]:
df.sample(15)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
2316,2113,15592,2020-08-17,송파구,-,용인시 우리제일교회,퇴원
1846,2583,16553,2020-08-20,송파구,-,광화문집회,
1343,3086,17686,2020-08-24,강북구,-,확인 중,
1290,3139,18237,2020-08-25,강서구,-,기타 확진자 접촉,
856,3573,19222,2020-08-27,강동구,-,기타 확진자 접촉,
3659,770,11195,2020-05-24,성동구,-,이태원 클럽,퇴원
617,3812,19862,2020-08-29,양천구,-,확인 중,
3621,808,11285,2020-05-27,남양주시,-,강남구 동인교회,퇴원
263,4166,20738,2020-09-03,서초구,-,중구소재 은행,
1118,3311,18530,2020-08-26,양천구,-,여의도 순복음교회,


In [18]:
df.to_csv("./dataset_predict/df_wrangle.csv", index=False)

In [19]:
temp = df["확진일"].value_counts().rename_axis('date').reset_index(name='new_confirmed')
temp.head()

Unnamed: 0,date,new_confirmed
0,2020-08-29,167
1,2020-08-27,159
2,2020-08-26,157
3,2020-08-18,151
4,2020-08-15,146


In [20]:
df_predict = temp.sort_values(by="date")
df_predict.head()

Unnamed: 0,date,new_confirmed
189,2020-01-24,1
169,2020-01-30,3
173,2020-01-31,3
185,2020-02-02,1
175,2020-02-05,2


In [21]:
df_predict["date"] = pd.to_datetime(df_predict["date"])

In [22]:
df_predict.index = pd.DatetimeIndex(df_predict["date"])
df_predict.head()

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1
2020-01-30,2020-01-30,3
2020-01-31,2020-01-31,3
2020-02-02,2020-02-02,1
2020-02-05,2020-02-05,2


In [23]:
df_predict = df_predict.resample('D').max()
df_predict.head()

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,NaT,
2020-01-26,NaT,
2020-01-27,NaT,
2020-01-28,NaT,


In [24]:
df_predict["date"] = df_predict.index
df_predict.head()

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,2020-01-25,
2020-01-26,2020-01-26,
2020-01-27,2020-01-27,
2020-01-28,2020-01-28,


In [25]:
df_predict = df_predict.fillna(0)
df_predict

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,2020-01-25,0.0
2020-01-26,2020-01-26,0.0
2020-01-27,2020-01-27,0.0
2020-01-28,2020-01-28,0.0
...,...,...
2020-09-03,2020-09-03,46.0
2020-09-04,2020-09-04,55.0
2020-09-05,2020-09-05,58.0
2020-09-06,2020-09-06,43.0


In [26]:
df_predict.to_csv("dataset_predict/df_predict.csv", index=False)

In [27]:
df_predict =  pd.read_csv("dataset_predict/df_predict.csv", encoding="utf-8")

In [28]:
df_untracked = df[df["접촉력"]=="확인 중"]
df_untracked

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4429,21428,2020-09-07,기타,-,확인 중,
1,4428,21429,2020-09-07,송파구,-,확인 중,
2,4427,21425,2020-09-07,노원구,-,확인 중,
5,4424,21419,2020-09-07,도봉구,-,확인 중,
8,4421,21409,2020-09-07,송파구,-,확인 중,
...,...,...,...,...,...,...,...
4373,56,1768,2020-02-25,고양시,-,확인 중,퇴원
4387,42,1370,2020-02-25,노원구,-,확인 중,퇴원
4390,39,924,2020-02-25,평택,-,확인 중,퇴원
4391,38,907,2020-02-25,관악구,-,확인 중,퇴원


In [29]:
df_untracked_temp = df_untracked.groupby(by=["확진일"]).size()
df_untracked_temp = df_untracked_temp.rename_axis('date').reset_index(name='new_untracked')
df_untracked_temp

Unnamed: 0,date,new_untracked
0,2020-02-25,6
1,2020-02-26,1
2,2020-02-27,2
3,2020-02-28,2
4,2020-03-02,2
...,...,...
122,2020-09-03,6
123,2020-09-04,6
124,2020-09-05,19
125,2020-09-06,13


In [30]:
df_untracked_no = df_untracked_temp

In [31]:
df_merged = pd.merge(df_predict, df_untracked_no, how="left", on="date")
df_merged = df_merged.fillna(0)
df_merged.head(15)

Unnamed: 0,date,new_confirmed,new_untracked
0,2020-01-24,1.0,0.0
1,2020-01-25,0.0,0.0
2,2020-01-26,0.0,0.0
3,2020-01-27,0.0,0.0
4,2020-01-28,0.0,0.0
5,2020-01-29,0.0,0.0
6,2020-01-30,3.0,0.0
7,2020-01-31,3.0,0.0
8,2020-02-01,0.0,0.0
9,2020-02-02,1.0,0.0


In [32]:
df_merged.tail(15)

Unnamed: 0,date,new_confirmed,new_untracked
213,2020-08-24,135.0,45.0
214,2020-08-25,115.0,35.0
215,2020-08-26,157.0,45.0
216,2020-08-27,159.0,38.0
217,2020-08-28,118.0,34.0
218,2020-08-29,167.0,23.0
219,2020-08-30,70.0,17.0
220,2020-08-31,103.0,17.0
221,2020-09-01,86.0,20.0
222,2020-09-02,64.0,13.0


In [49]:
list_infection_paths_no = []
for index, row in df_merged.iterrows():
    filter_end_date = row["date"]
    # print(filter_end_date)
    df_filtered = df.loc[df['확진일'] <= filter_end_date]
    
    # 확진자 접촉력이 밝혀진 경우
    infection_paths_known = df_filtered["접촉력"].unique()
    
    # "기타 확진자 접촉" 항목 485건 / R0
    contact_no = len(df_filtered[df_filtered["접촉력"].str.contains("기타 확진자 접촉")])/1.5
    
    infection_paths_no = len(infection_paths_known) + contact_no
    list_infection_paths_no.append(int(infection_paths_no))
print(list_infection_paths_no[:10])
print(list_infection_paths_no[-10:])

[1, 1, 1, 1, 1, 1, 2, 4, 4, 4]
[463, 475, 495, 506, 524, 533, 549, 559, 569, 578]


In [50]:
df_merged["no_paths"] = list_infection_paths_no
df_merged["new_confirmed"] = df_merged["new_confirmed"].apply(int)
df_merged["new_untracked"] = df_merged["new_untracked"].apply(int)
df_merged.head()

Unnamed: 0,date,new_confirmed,new_untracked,no_paths
0,2020-01-24,1,0,1
1,2020-01-25,0,0,1
2,2020-01-26,0,0,1
3,2020-01-27,0,0,1
4,2020-01-28,0,0,1


In [51]:
df_merged.tail()

Unnamed: 0,date,new_confirmed,new_untracked,no_paths
223,2020-09-03,46,6,533
224,2020-09-04,55,6,549
225,2020-09-05,58,19,559
226,2020-09-06,43,13,569
227,2020-09-07,62,15,578


In [52]:
df_merged.to_csv("dataset_predict/df_predict.csv", index=False)

### check holiday

In [53]:
import json

list_calendar=[]

calendar_file = "./korean-calendar/korean-calendar.json"
# read file
with open(calendar_file, 'r') as myfile:
    data=myfile.readlines()

data = [x.strip() for x in data] 

for i in data:
    calendar_data = i[43:]
    calendar_item = "{"+calendar_data
    calendar_item_json = json.loads(calendar_item)
    list_calendar.append(calendar_item_json)
list_calendar[:10]

[{'sc': '2000-01-01', 'lc': '1999-11-25', 'w': 7, 'h': True, 'ht': '신정'},
 {'sc': '2000-01-02', 'lc': '1999-11-26', 'w': 1, 'h': False, 'ht': ''},
 {'sc': '2000-01-03', 'lc': '1999-11-27', 'w': 2, 'h': False, 'ht': ''},
 {'sc': '2000-01-04', 'lc': '1999-11-28', 'w': 3, 'h': False, 'ht': ''},
 {'sc': '2000-01-05', 'lc': '1999-11-29', 'w': 4, 'h': False, 'ht': ''},
 {'sc': '2000-01-06', 'lc': '1999-11-30', 'w': 5, 'h': False, 'ht': ''},
 {'sc': '2000-01-07', 'lc': '1999-12-01', 'w': 6, 'h': False, 'ht': ''},
 {'sc': '2000-01-08', 'lc': '1999-12-02', 'w': 7, 'h': False, 'ht': ''},
 {'sc': '2000-01-09', 'lc': '1999-12-03', 'w': 1, 'h': False, 'ht': ''},
 {'sc': '2000-01-10', 'lc': '1999-12-04', 'w': 2, 'h': False, 'ht': ''}]

In [54]:
df_calendar = pd.DataFrame(list_calendar)
df_calendar = df_calendar[["sc", "w", "h"]]
df_calendar.head()

Unnamed: 0,sc,w,h
0,2000-01-01,7,True
1,2000-01-02,1,False
2,2000-01-03,2,False
3,2000-01-04,3,False
4,2000-01-05,4,False


In [55]:
def label_weekend(row):
    # sunday
    if row['w'] == 1:
        return True
    # friday
    if row['w'] == 6:
        return True
    # saturday
    if row['w'] == 7:
        return True
    else:
        return False

In [56]:
df_calendar["is_weekend"] = df_calendar.apply(lambda row: label_weekend(row), axis=1)
df_calendar.head(10)

Unnamed: 0,sc,w,h,is_weekend
0,2000-01-01,7,True,True
1,2000-01-02,1,False,True
2,2000-01-03,2,False,False
3,2000-01-04,3,False,False
4,2000-01-05,4,False,False
5,2000-01-06,5,False,False
6,2000-01-07,6,False,True
7,2000-01-08,7,False,True
8,2000-01-09,1,False,True
9,2000-01-10,2,False,False


In [57]:
# rename dataframe name
df_calendar.rename(columns = {'sc':'date', 'h':'is_holiday'}, inplace = True)

In [58]:
df_calendar.to_csv("dataset_predict/df_calendar.csv", index=False)

In [59]:
df_calendar_subset = df_calendar[["date", "is_holiday", "is_weekend"]]

In [60]:
df_train = pd.merge(df_merged, df_calendar_subset, how="left", on="date")
df_train.tail()

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend
223,2020-09-03,46,6,533,False,False
224,2020-09-04,55,6,549,False,True
225,2020-09-05,58,19,559,False,True
226,2020-09-06,43,13,569,False,True
227,2020-09-07,62,15,578,False,False


In [61]:
df_train[-25:-15]

Unnamed: 0,date,new_confirmed,new_untracked,no_paths,is_holiday,is_weekend
203,2020-08-14,73,4,175,False,True
204,2020-08-15,146,6,181,True,True
205,2020-08-16,91,4,188,False,True
206,2020-08-17,132,7,192,False,False
207,2020-08-18,151,18,198,False,False
208,2020-08-19,135,19,217,False,False
209,2020-08-20,126,28,229,False,False
210,2020-08-21,128,32,247,False,True
211,2020-08-22,143,44,270,False,True
212,2020-08-23,98,25,291,False,True


In [62]:
df_train.to_csv("./dataset_predict/df_train.csv", index=False)

### Add Government Social Distancing Policy