# Exploratory Data Analysis

In [1]:
DATA_PATH = '../data'

In [88]:
import pandas as pd
import os

case_df = pd.read_csv(os.path.join(DATA_PATH, 'Case.csv'))
patient_info_df = pd.read_csv(os.path.join(DATA_PATH, 'PatientInfo.csv'))
policy_df = pd.read_csv(os.path.join(DATA_PATH, 'Policy.csv'))
region_df = pd.read_csv(os.path.join(DATA_PATH, 'Region.csv'))
time_df = pd.read_csv(os.path.join(DATA_PATH, 'Time.csv'))

## Task 1.1

Show policy type and duration average in days unit, then sort it descending

In [48]:
policy_df

Unnamed: 0,policy_id,country,type,gov_policy,detail,start_date,end_date
0,1,Korea,Alert,Infectious Disease Alert Level,Level 1 (Blue),2020-01-03,2020-01-19
1,2,Korea,Alert,Infectious Disease Alert Level,Level 2 (Yellow),2020-01-20,2020-01-27
2,3,Korea,Alert,Infectious Disease Alert Level,Level 3 (Orange),2020-01-28,2020-02-22
3,4,Korea,Alert,Infectious Disease Alert Level,Level 4 (Red),2020-02-23,
4,5,Korea,Immigration,Special Immigration Procedure,from China,2020-02-04,
...,...,...,...,...,...,...,...
56,57,Korea,Transformation,Logistics center,On-site inspection of major logistics faciliti...,2020-05-29,2020-06-11
57,58,Korea,Transformation,Wearing of masks,"Mandatory wearing of passenger mask domestic, ...",2020-05-27,
58,59,Korea,Transformation,Wearing of masks,Drivers such as buses and taxis can refuse to ...,2020-05-26,
59,60,Korea,Technology,KI-Pass: Korea Internet - Pass,new quick response (QR) code system this week ...,2020-06-10,


In [49]:
policy_df['start_date'].isna().sum()

0

In [50]:
policy_df['end_date'].isna().sum()

37

Assume missing values are today date

In [53]:
from datetime import datetime

policy_df['end_date'].fillna(value=datetime.today().strftime('%Y-%m-%d'), inplace=True)

In [54]:
policy_df['end_date'].isna().sum()

0

In [55]:
policy_df['start_date'] = pd.to_datetime(policy_df['start_date'])
policy_df['start_date']

0    2020-01-03
1    2020-01-20
2    2020-01-28
3    2020-02-23
4    2020-02-04
        ...    
56   2020-05-29
57   2020-05-27
58   2020-05-26
59   2020-06-10
60   2020-05-28
Name: start_date, Length: 61, dtype: datetime64[ns]

In [56]:
policy_df['end_date'] = pd.to_datetime(policy_df['end_date'])
policy_df['end_date']

0    2020-01-19
1    2020-01-27
2    2020-02-22
3    2022-06-17
4    2022-06-17
        ...    
56   2020-06-11
57   2022-06-17
58   2022-06-17
59   2022-06-17
60   2020-06-14
Name: end_date, Length: 61, dtype: datetime64[ns]

Apply mapping to get duration

In [63]:
policy_df['duration'] = (policy_df['end_date'] - policy_df['start_date']).map(lambda d: d.days)
policy_df['duration']

0      16
1       7
2      25
3     845
4     864
     ... 
56     13
57    751
58    752
59    737
60     17
Name: duration, Length: 61, dtype: int64

In [67]:
policy_mean_dur_df = policy_df[['type', 'duration']].groupby(by='type', as_index=False).mean().sort_values(by='duration', ascending=False)
policy_mean_dur_df

Unnamed: 0,type,duration
4,Immigration,828.333333
6,Technology,819.333333
3,Health,759.3
7,Transformation,505.333333
5,Social,325.6
0,Administrative,268.333333
1,Alert,223.25
2,Education,95.866667


## Task 1.2

Get patient data count that lives in Seoul, then group it based on age and sex, finally sort them based on patient count ascending

In [89]:
patient_info_df

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001,17,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002,2,,2020-01-31,2020-02-24,,released
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5160,7000000015,female,30s,Korea,Jeju-do,Jeju-do,overseas inflow,,25,,2020-05-30,2020-06-13,,released
5161,7000000016,,,Korea,Jeju-do,Jeju-do,overseas inflow,,,,2020-06-16,2020-06-24,,released
5162,7000000017,,,Bangladesh,Jeju-do,Jeju-do,overseas inflow,,72,,2020-06-18,,,isolated
5163,7000000018,,,Bangladesh,Jeju-do,Jeju-do,overseas inflow,,,,2020-06-18,,,isolated


In [92]:
seoul_patient_df = patient_info_df[patient_info_df['province'] == 'Seoul']
seoul_patient_df

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001,17,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002,2,,2020-01-31,2020-02-24,,released
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,1000001308,,,Korea,Seoul,Mapo-gu,Richway,1000001290,,,2020-06-29,,,isolated
1308,1000001309,,,Korea,Seoul,Mapo-gu,Richway,1000001290,,,2020-06-29,,,isolated
1309,1000001310,,,Korea,Seoul,Geumcheon-gu,contact with patient,,,,2020-06-29,,,isolated
1310,1000001311,,,Korea,Seoul,etc,,,,,2020-06-29,,,isolated


Check whether missing value of age and sex is the same data

In [93]:
(seoul_patient_df['age'].isna() == seoul_patient_df['sex'].isna()).unique()

array([ True])

Missing value of age and sex is the same data. Make it unknown, rather than NaN

In [95]:
seoul_patient_df['age'] = seoul_patient_df['age'].fillna('unknown')
seoul_patient_df['age'].isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seoul_patient_df['age'] = seoul_patient_df['age'].fillna('unknown')


0

In [96]:
seoul_patient_df['sex'] = seoul_patient_df['sex'].fillna('unknown')
seoul_patient_df['sex'].isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seoul_patient_df['sex'] = seoul_patient_df['sex'].fillna('unknown')


0

Make sure `patient_id` have no missing values and no duplicates.

In [105]:
seoul_patient_df['patient_id'].isna().sum(), seoul_patient_df['patient_id'].duplicated().sum()

(0, 0)

In [107]:
grouped_age_sex_df = seoul_patient_df[['age', 'sex', 'patient_id']].groupby(by=['age', 'sex'], as_index=False).count().sort_values(by='patient_id')
grouped_age_sex_df

Unnamed: 0,age,sex,patient_id
18,90s,male,2
1,0s,male,3
16,80s,female,4
0,0s,female,5
17,80s,male,6
15,70s,male,10
3,10s,male,13
2,10s,female,14
14,70s,female,16
13,60s,male,23


Rename `patient_id` to `count`

In [108]:
grouped_age_sex_df.rename(columns=dict(patient_id='count'), inplace=True)
grouped_age_sex_df

Unnamed: 0,age,sex,count
18,90s,male,2
1,0s,male,3
16,80s,female,4
0,0s,female,5
17,80s,male,6
15,70s,male,10
3,10s,male,13
2,10s,female,14
14,70s,female,16
13,60s,male,23


## Task 1.3