In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

## Load data
- I'm using the version of the files as updated on **April 21st**
- ref : https://github.com/jihoo-kim/Data-Science-for-COVID-19

In [2]:
# Patient-level records; this is the snapshot of the file updated on April 21st.
PatientInfo_df = pd.read_csv(filepath_or_buffer='../dataset/Patient/PatientInfo.csv')

# Report (rows, columns), then preview the first three records.
print(PatientInfo_df.shape)
PatientInfo_df.head(n=3)

(3326, 18)


Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,2.0,male,1964.0,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1.0,,75.0,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,5.0,male,1987.0,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1.0,,31.0,,2020-01-30,2020-03-02,,released
2,1000000003,6.0,male,1964.0,50s,Korea,Seoul,Jongno-gu,,contact with patient,2.0,2002000000.0,17.0,,2020-01-30,2020-02-19,,released


In [3]:
# Keep only the rows whose state is 'released' or 'deceased';
# these two values are the labels used for the split below.
has_final_state = PatientInfo_df['state'].isin(['released', 'deceased'])
PatientInfo_df = PatientInfo_df.loc[has_final_state]

# Report the reduced (rows, columns), then preview the first three records.
print(PatientInfo_df.shape)
PatientInfo_df.head(n=3)

(1704, 18)


Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,2.0,male,1964.0,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1.0,,75.0,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,5.0,male,1987.0,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1.0,,31.0,,2020-01-30,2020-03-02,,released
2,1000000003,6.0,male,1964.0,50s,Korea,Seoul,Jongno-gu,,contact with patient,2.0,2002000000.0,17.0,,2020-01-30,2020-02-19,,released


- Note that the **labels are highly imbalanced** (only about 4% of the patients are deceased)

In [4]:
# Share of each outcome label, expressed in percent.
print('< Percentage of each label >')
state_share = PatientInfo_df['state'].value_counts(normalize=True)
display(state_share * 100)

< Percentage of each label >


released    96.068075
deceased     3.931925
Name: state, dtype: float64

## Split data into train, val, test
- Since the dataset is quite small (1704 records), I will split the data into train:val:test = **7:2:1** for now (more test data can be added after the next file update)
- Since the labels are highly imbalanced, it's better to use **stratified random sampling**.

In [5]:
# Separate the label from the features by column name rather than by position,
# so the split keeps working even if the column order of the CSV changes.
# NOTE(review): X_features still contains released_date / deceased_date, which
# reveal the label directly -- drop them before fitting any model.
X_features = PatientInfo_df.drop(columns='state')
y_target = PatientInfo_df['state']

# Step 1: hold out 30% of the rows (val + test); the other 70% is the train set.
# stratify keeps the released/deceased ratio the same on both sides.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X_features, y_target, test_size=0.3, random_state=0, stratify=y_target
)

# Step 2: about one third (0.33) of the hold-out goes to test and the rest to
# validation, which gives roughly 7:2:1 overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.33, random_state=0, stratify=y_val_test
)

# Every record must land in exactly one of the three partitions.
assert len(X_train) + len(X_val) + len(X_test) == len(PatientInfo_df)

In [6]:
# Check the labels of each dataset: print its size and the percentage of each label.
# The same report is produced for all three splits, so it is driven by one loop
# instead of three copy-pasted stanzas (headers are kept verbatim, including the
# leading blank line before the second and third report).
split_reports = (
    ('< Percentage of each label (Train dataset) >', y_train),
    ('\n< Percentage of each label (Validation dataset) >', y_val),
    ('\n< Percentage of each label (Test dataset) >', y_test),
)

for report_header, split_labels in split_reports:
    print(report_header)
    print('size of dataset :', split_labels.shape[0])
    display(split_labels.value_counts(normalize=True) * 100)

< Percentage of each label (Train dataset) >
size of dataset : 1192


released    96.057047
deceased     3.942953
Name: state, dtype: float64


< Percentage of each label (Validation dataset) >
size of dataset : 343


released    96.209913
deceased     3.790087
Name: state, dtype: float64


< Percentage of each label (Test dataset) >
size of dataset : 169


released    95.857988
deceased     4.142012
Name: state, dtype: float64

## Save patient_id list
- Data released after April 21st will be added to the test dataset. The present dataset (train/val/test) must stay separated from that future data, so I save the list of patient_id values of the present dataset here.

In [7]:
# Collect the ids of every patient in the present (filtered) dataset as strings.
present_patients = PatientInfo_df['patient_id'].astype(str).tolist()

# Write one id per line (no trailing newline after the last id).
id_lines = '\n'.join(present_patients)
with open('patients_id_0421.txt', mode='w') as id_file:
    id_file.write(id_lines)