In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from os import fdopen, remove, walk
import glob
from tempfile import mkstemp
import shutil
from shutil import move, copymode

In [3]:
from datetime import date

In [4]:
DATASET_PATH = "/Users/noopy/covid19_unknown_spread/dataset"
datasets = glob.glob(f"{DATASET_PATH}/*.csv")
datasets[:5]

['/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_31_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_30_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_17_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_8_21_.csv',
 '/Users/noopy/covid19_unknown_spread/dataset/seoul_covid_7_04_.csv']

In [5]:
# get oldest file in the dataset folder
import os, heapq
def newst_files_in_tree(rootfolder, count=1, extension=".csv"):
    return heapq.nlargest(count,
        (os.path.join(dirname, filename)
        for dirname, dirnames, filenames in os.walk(rootfolder)
        for filename in filenames
        if filename.endswith(extension)),
        key=lambda fn: os.stat(fn).st_mtime)

In [6]:
newst_csv = newst_files_in_tree(DATASET_PATH)[0]

In [7]:
df_temp = pd.read_csv(newst_csv, encoding="utf-8")
df_temp.sample(15)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
1935,2316,15949,8.18.,관악구,-,현대커머셜,
1993,2258,15950,8.18.,종로구,-,성북구 사랑제일교회 관련,
3684,567,10329,4.06.,강남구,미국,해외 접촉 추정,퇴원
1474,2777,17086,8.22.,양천구,-,확인 중,
2051,2200,15699,8.17.,노원구,-,성북구 사랑제일교회 관련,퇴원
2187,2064,15515,8.16.,강동구,-,성북구 사랑제일교회 관련,퇴원
3412,839,11388,5.28.,동작구,-,kb 생명보험 관련,퇴원
3813,438,9751,3.30.,관악구,-,구로구 교회 관련,퇴원
1228,3023,17765,8.24.,서초구,-,기타 확진자 접촉,
1371,2880,17194,8.22.,동대문구,-,기타 확진자 접촉,


In [8]:
# overwrite whatever cell value that contains "확인" as "확인 중"
df_temp.loc[df_temp["접촉력"].str.contains("확인"),"접촉력"] = "확인 중"

# check whether those two arrays are combined
df_temp.loc[df_temp["접촉력"].str.contains("확인"),"접촉력"].unique()

array(['확인 중'], dtype=object)

In [9]:
df_temp["접촉력"] = df_temp["접촉력"].str.replace("관련", "")
df_temp["접촉력"] = df_temp["접촉력"].str.strip()

In [10]:
infection_paths = df_temp["접촉력"].unique()
print(len(infection_paths))
infection_paths

223


array(['송파구 소재 병원', '노원구 빛가온교회', '강동구 소재 병원', '성북구 사랑제일교회', '8.15도심집회',
       '확인 중', '도봉구 운동시설', '타시도 확진자 접촉', '기타 확진자 접촉', '다래경매', '노원구 기도모임',
       '동작구 카드 발급업체', '광진구 소재병원', '성북구 체대입시', '성북구 요양시설', '중구 소재 은행',
       '중랑구 소재 체육시설', '관악구 판매업소', '동작구 소재 서울신학교', '용인시 우리제일교회',
       '노원구 손해보험', '구로구 보성운수', '강서구 서울대효요양병원', '서초구 장애인교육시설', '서대문구 지인모임',
       '강북구 일가족', '중구 하나은행본점', '강서구 보안회사', '롯데리아 종사자 모임', '해외 접촉 추정',
       '8.15도심집회(순복음 강북교회)', '영등포구 권능교회', '구로구 아파트', '강남구 소재 아파트',
       '제주 게스트하우스', '성북구 벧엘장로교회', '중앙보훈병원', '관악구 김혜근의원', '동대문구 sk탁구클럽',
       '동작구 스터디카페', '극단 산', 'KT가좌지사', '군인권센터', '동작구 요양시설', '여의도 순복음교회',
       '중구 보험회사(현대해상)', '강서구 병원', '종로구 혜화경찰서', '양천구 되새김교회', '현대커머셜',
       '8.15도심집회(녹색병원)', '영등포 IFC몰 오케스트로', '고양시 반석교회(케네디상가)', '롯데 자산개발',
       '골드트레인', '한양대병원', '은평구 헤어콕', '순복음 강북교회', '서대문구 지인 모임', '강동구 어린이집',
       '광화문집회', '은평구 성경공부모임', '관악구 요양병원', '고대 안암병원', '마포구 푸본생명콜센터',
       '성동구 가족', '양천구 되새김 교회', '중구 통일상가', '고양시 반석교회  (케네디상가)',
       '강남구 판매업소(

In [11]:
df_date = df_temp.sort_values(["연번"], ascending=False)
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4251,미부여,9.04.,타시,-,송파구 소재 병원,
1,4250,20979,9.04.,노원구,-,노원구 빛가온교회,
2,4249,20974,9.04.,성동구,-,강동구 소재 병원,
3,4248,20962,9.04.,성북구,-,성북구 사랑제일교회,
4,4247,미부여,9.03.,성북구,-,8.15도심집회,


In [24]:
df_date["확진일"].unique()

array(['9.04.', '9.03.', '8.30.', '9.02.', '9.01.', '8.31.', '8.29.',
       '8.28.', '8.27.', '8.26.', '8.25.', '8.24.', '8.23.', '8.22.',
       '8.21.', '8.20.', '8.19.', '8.18.', '8.17.', '8.16.', '8.15.',
       '8.14.', '8.13.', '8.12.', '8.11.', '8.10.', '8.9.', '8.8.',
       '8.7.', '8.6.', '8.5.', '8.4.', '8.3.', '8.2.', '8.1.', '7.31.',
       '7.30.', '7.29.', '7.28.', '7.27.', '7.26.', '7.25.', '7.24.',
       '7.23.', '7.22.', '7.21.', '7.20.', '7.19.', '7.18.', '7.17.',
       '7.16.', '7.15.', '7.14.', '7.13.', '7.12.', '7.11.', '7.10.',
       '7.09.', '7.08.', '7.07.', '7.06.', '7.05.', '7.04.', '7.03.',
       '7.02.', '7.01.', '6.30.', '6.29.', '6.28.', '6.27.', '6.26.',
       '6.25.', '6.24.', '6.23.', '6.22.', '6.21.', '6.20.', '6.19.',
       '6.18.', '6.17.', '6.16.', '6.15.', '6.14.', '6.13.', '6.12.',
       '6.11.', '6.10.', '6.09.', '6.08.', '6.05.', '6.07.', '6.06.',
       '6.04.', '6.03.', '6.02.', '6.01.', '5.31.', '5.30.', '5.29.',
       '5.27.', '5.2

In [12]:
df_date["확진일"] = df_date["확진일"].str.replace(".", "-")
df_date["확진일"] = df_date["확진일"].str[:-1]
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4251,미부여,9-04,타시,-,송파구 소재 병원,
1,4250,20979,9-04,노원구,-,노원구 빛가온교회,
2,4249,20974,9-04,성동구,-,강동구 소재 병원,
3,4248,20962,9-04,성북구,-,성북구 사랑제일교회,
4,4247,미부여,9-03,성북구,-,8.15도심집회,


In [13]:
df_date["확진일"] = "2020-0" + df_date["확진일"]
df_date.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4251,미부여,2020-09-04,타시,-,송파구 소재 병원,
1,4250,20979,2020-09-04,노원구,-,노원구 빛가온교회,
2,4249,20974,2020-09-04,성동구,-,강동구 소재 병원,
3,4248,20962,2020-09-04,성북구,-,성북구 사랑제일교회,
4,4247,미부여,2020-09-03,성북구,-,8.15도심집회,


In [14]:
df_date.sample(5)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
3141,1110,12067,2020-06-13,관악구,-,리치웨이,퇴원
2889,1362,13065,2020-07-04,송파구,-,타시도 확진자 접촉,퇴원
2382,1869,15106,2020-08-15,강북구,-,성북구 사랑제일교회,퇴원
2620,1631,14472,2020-08-5,성동구,-,성동구 가족,퇴원
3849,402,9553,2020-03-28,서초구,미국,해외 접촉 추정,퇴원


In [32]:
df = df_date.copy()
df.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4251,미부여,2020-09-04,타시,-,송파구 소재 병원,
1,4250,20979,2020-09-04,노원구,-,노원구 빛가온교회,
2,4249,20974,2020-09-04,성동구,-,강동구 소재 병원,
3,4248,20962,2020-09-04,성북구,-,성북구 사랑제일교회,
4,4247,미부여,2020-09-03,성북구,-,8.15도심집회,


In [35]:
df.loc[(df["확진일"].str.len() < 10), '확진일'] = df.loc[(df["확진일"].str.len() < 10), '확진일'].str.replace("-0", "-")
df.loc[(df["확진일"].str.len() < 10), '확진일'] = df.loc[(df["확진일"].str.len() < 10), '확진일'].str.replace("-", "-0")
df.head()

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
0,4251,미부여,2020-09-04,타시,-,송파구 소재 병원,
1,4250,20979,2020-09-04,노원구,-,노원구 빛가온교회,
2,4249,20974,2020-09-04,성동구,-,강동구 소재 병원,
3,4248,20962,2020-09-04,성북구,-,성북구 사랑제일교회,
4,4247,미부여,2020-09-03,성북구,-,8.15도심집회,


In [36]:
df.sample(15)

Unnamed: 0,연번,환자,확진일,거주지,여행력,접촉력,퇴원현황
225,4026,20326,2020-09-01,강남구,인도네시아,해외 접촉 추정,
2672,1579,14202,2020-07-27,용인시,멕시코,해외 접촉 추정,퇴원
739,3512,19046,2020-08-27,노원구,-,성북구 사랑제일교회,
2969,1282,12645,2020-06-26,중구,키르기스스탄,해외 접촉 추정,퇴원
2108,2143,15590,2020-08-17,성북구,-,성북구 사랑제일교회,퇴원
2182,2069,15462,2020-08-16,노원구,-,성북구 사랑제일교회,
3965,286,8627,2020-03-19,마포구,-,콜센터직원 접촉,퇴원
534,3717,20125,2020-08-29,기타,-,영등포구 권능교회,
162,4089,20578,2020-09-02,송파구,-,서초구 장애인교육시설,
875,3376,18534,2020-08-26,기타,-,확인 중,


In [37]:
df.to_csv("./dataset_predict/df_wrangle.csv", index=False)

In [38]:
temp = df["확진일"].value_counts().rename_axis('date').reset_index(name='new_confirmed')
temp

Unnamed: 0,date,new_confirmed
0,2020-08-26,153
1,2020-08-18,151
2,2020-08-15,146
3,2020-08-27,146
4,2020-08-22,140
...,...,...
191,2020-04-15,1
192,2020-05-17,1
193,2020-02-02,1
194,2020-04-24,1


In [63]:
df_predict = temp.sort_values(by="date")
df_predict.head()

Unnamed: 0,date,new_confirmed
183,2020-01-24,1
170,2020-01-30,3
163,2020-01-31,3
193,2020-02-02,1
171,2020-02-05,2


In [64]:
df_predict["date"] = pd.to_datetime(df_predict["date"])

In [69]:
df_predict.index = pd.DatetimeIndex(df_predict["date"])
df_predict

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1
2020-01-30,2020-01-30,3
2020-01-31,2020-01-31,3
2020-02-02,2020-02-02,1
2020-02-05,2020-02-05,2
...,...,...
2020-08-31,2020-08-31,94
2020-09-01,2020-09-01,101
2020-09-02,2020-09-02,81
2020-09-03,2020-09-03,62


In [75]:
df_predict = df_predict.resample('D').max()
df_predict

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,NaT,
2020-01-26,NaT,
2020-01-27,NaT,
2020-01-28,NaT,
...,...,...
2020-08-31,2020-08-31,94.0
2020-09-01,2020-09-01,101.0
2020-09-02,2020-09-02,81.0
2020-09-03,2020-09-03,62.0


In [77]:
df_predict["date"] = df_predict.index
df_predict

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,2020-01-25,
2020-01-26,2020-01-26,
2020-01-27,2020-01-27,
2020-01-28,2020-01-28,
...,...,...
2020-08-31,2020-08-31,94.0
2020-09-01,2020-09-01,101.0
2020-09-02,2020-09-02,81.0
2020-09-03,2020-09-03,62.0


In [81]:
df_predict = df_predict.fillna(0)
df_predict

Unnamed: 0_level_0,date,new_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,2020-01-24,1.0
2020-01-25,2020-01-25,0.0
2020-01-26,2020-01-26,0.0
2020-01-27,2020-01-27,0.0
2020-01-28,2020-01-28,0.0
...,...,...
2020-08-31,2020-08-31,94.0
2020-09-01,2020-09-01,101.0
2020-09-02,2020-09-02,81.0
2020-09-03,2020-09-03,62.0


In [82]:
df_predict.to_csv("dataset_predict/df_predict.csv", index=False)