# 1. Data Preparation

In [165]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib
import os
import ruamel.yaml as yaml

from sklearn.model_selection import train_test_split

In [166]:
params_dir = '../config/params.yaml'

In [167]:
def load_params(params_dir):
  with open(params_dir, 'r') as file:
    params = yaml.safe_load(file)

    return params

In [168]:
params = load_params(params_dir)

## 1. Load Dataset

In [169]:
def read_dataset(dataset_dir):
    dataset = pd.DataFrame()

    for i in tqdm(os.listdir(dataset_dir)):
        dataset = pd.concat([pd.read_csv(dataset_dir + i), dataset])
    
    return dataset

In [170]:
dataset = read_dataset(params["dataset_dir"])

100%|██████████| 1/1 [00:01<00:00,  1.77s/it]


In [171]:
print(dataset.head())

print(dataset.describe())

   USMER  MEDICAL_UNIT  SEX  PATIENT_TYPE   DATE_DIED  INTUBED  PNEUMONIA  \
0      2             1    1             1  03/05/2020       97          1   
1      2             1    2             1  03/06/2020       97          1   
2      2             1    2             2  09/06/2020        1          2   
3      2             1    1             1  12/06/2020       97          2   
4      2             1    2             1  21/06/2020       97          2   

   AGE  PREGNANT  DIABETES  ...  ASTHMA  INMSUPR  HIPERTENSION  OTHER_DISEASE  \
0   65         2         2  ...       2        2             1              2   
1   72        97         2  ...       2        2             1              2   
2   55        97         1  ...       2        2             2              2   
3   53         2         2  ...       2        2             2              2   
4   68        97         1  ...       2        2             1              2   

   CARDIOVASCULAR  OBESITY  RENAL_CHRONIC  TOBACCO

## Data Definition

```
DEATH:
  [integer]
  [1-2]
  1 berarti meninggal
  2 berarti selamat

SEX:
  [integer]
  [1-2]
  jenis kelamin wanita atau pria

AGE:
  [integer]
  [1-121]
  umur pasien

CLASIFFICATION_FINAL:
  [integer]
  [1-7]
  hasil test covid
  positif covid bernilai 1-3
  negatif covid bernilai diatas 4

PATIENT_TYPE:
  [integer]
  [1-2]
  di rumah sakit atau isolasi mandiri

PNEUMONIA:
  [integer]
  [1-2]
  apakah pasien memiliki air sacs inflammation

PREGNANT:
  [integer]
  [1-2]
  apakah pasien hamil atau tidak

DIABETES:
  [integer]
  [1-2]
  apakah pasien diabetes atau tidak

COPD:
  [integer]
  [1-2]
  apakah pasien punya penyakit Chronic obstructive pulmonary atau tidak

ASTHMA:
  [integer]
  [1-2]
  apakah pasien punya asma atau tidak

INMSUPR:
  [integer]
  [1-2]
  apakah pasien immunosuppressed atau tidak

HIPERTENSION:
  [integer]
  [1-2]
  apakah pasient hipertensi atau tidak

CARDIOVASCULAR:
  [integer]
  [1-2]
  apakah pasien memiliki penyakit jantung atau pembuluh darah

RENAL_CHRONIC:
  [integer]
  [1-2]
  apakah pasien memiliki chronic renal disease atau tidak

OTHER_DISEASE:
  [integer]
  [1-2]
  apakah pasien memiliki penyakit lain atau tidak

OBESITY:
  [integer]
  [1-2]
  apakah pasien obesitas

TOBACCO:
  [integer]
  [1-2]
  apakah pasien merokok atau tidak

USMR:
  [integer]
  [1-2]
  apakah pasien dirawat pada kelas 1, 2, atau 3

MEDICAL_UNIT:
  [integer]
  [1-13]
  tipe institusi kesehatan yang menyediakan jasa perawatan

INTUBED:
  [integer]
  [1-2]
  apakah pasien terhubung ke ventilator

ICU:
  [integer]
  [1-2]
  apakah pasien masuk ke ICU
```

## Data Validation

### 1. Cek tipe data

In [172]:
dataset.dtypes

USMER                    int64
MEDICAL_UNIT             int64
SEX                      int64
PATIENT_TYPE             int64
DATE_DIED               object
INTUBED                  int64
PNEUMONIA                int64
AGE                      int64
PREGNANT                 int64
DIABETES                 int64
COPD                     int64
ASTHMA                   int64
INMSUPR                  int64
HIPERTENSION             int64
OTHER_DISEASE            int64
CARDIOVASCULAR           int64
OBESITY                  int64
RENAL_CHRONIC            int64
TOBACCO                  int64
CLASIFFICATION_FINAL     int64
ICU                      int64
dtype: object

Dari hasil pengecekan, semua data sesuai

### 2. Range

In [173]:
dataset.describe()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,1.632194,8.980565,1.499259,1.190765,79.52288,3.346831,41.7941,49.76558,2.186404,2.260569,2.242626,2.298132,2.128989,2.435143,2.26181,2.125176,2.25718,2.214333,5.305653,79.55397
std,0.4822084,3.723278,0.4999997,0.3929041,36.86889,11.91288,16.90739,47.51073,5.424242,5.132258,5.114089,5.462843,5.236397,6.646676,5.19485,5.175445,5.135354,5.323097,1.881165,36.82307
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,1.0,1.0,97.0,2.0,30.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0
50%,2.0,12.0,1.0,1.0,97.0,2.0,40.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6.0,97.0
75%,2.0,12.0,2.0,1.0,97.0,2.0,53.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0
max,2.0,13.0,2.0,2.0,99.0,99.0,121.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,7.0,99.0


### 3. Dimensi Data

In [174]:
dataset.shape

(1048575, 21)

### 4. Handling Column Error

### 4.1. Remove null

Pada boolean, 1 = yes 2 = no
Nilai 97..99 = null

In [199]:
dataset.drop(index = dataset[dataset.USMER > 2].index, inplace = True)

In [200]:
dataset.drop(index = dataset[dataset.SEX > 2].index, inplace = True)

In [201]:
# handle patient_type
dataset.PATIENT_TYPE.unique()

array([2])

In [202]:
# handle intubed
dataset.INTUBED.unique()
dataset.drop(index = dataset[dataset.INTUBED > 2].index, inplace = True)

In [219]:
# handle pneumonia
print(dataset.PNEUMONIA.unique())

[2 1]


In [220]:
# handle pregnant
print(dataset.PREGNANT.unique())
dataset.drop(index = dataset[dataset.PREGNANT > 2].index, inplace = True)

[97  2 98  1]


In [203]:
# handle diabetes
print(dataset.DIABETES.unique())
dataset.drop(index = dataset[dataset.DIABETES > 2].index, inplace = True)

[ 1  2 98]


In [204]:
# handle COPD
print(dataset.COPD.unique())
dataset.drop(index = dataset[dataset.COPD > 2].index, inplace = True)

[ 2  1 98]


In [205]:
# handle ASTHMA
print(dataset.ASTHMA.unique())
dataset.drop(index = dataset[dataset.ASTHMA > 2].index, inplace = True)

[ 2  1 98]


In [206]:
# handle INMSUPR
print(dataset.INMSUPR.unique())
dataset.drop(index = dataset[dataset.INMSUPR > 2].index, inplace = True)

[ 2  1 98]


In [207]:
# handle HIPERTENSION
print(dataset.HIPERTENSION.unique())
dataset.drop(index = dataset[dataset.HIPERTENSION > 2].index, inplace = True)

[ 2  1 98]


In [208]:
# handle OTHER_DISEASE
print(dataset.OTHER_DISEASE.unique())
dataset.drop(index = dataset[dataset.OTHER_DISEASE > 2].index, inplace = True)

[ 2  1 98]


In [210]:
# handle CARDIOVASCULAR
print(dataset.CARDIOVASCULAR.unique())
dataset.drop(index = dataset[dataset.CARDIOVASCULAR > 2].index, inplace = True)

[ 2  1 98]


In [209]:
# handle OBESITY
print(dataset.OBESITY.unique())
dataset.drop(index = dataset[dataset.OBESITY > 2].index, inplace = True)

[ 2  1 98]


In [211]:
# handle RENAL_CHRONIC
print(dataset.RENAL_CHRONIC.unique())
dataset.drop(index = dataset[dataset.RENAL_CHRONIC > 2].index, inplace = True)

[ 2  1 98]


In [212]:
# handle TOBACCO
print(dataset.TOBACCO.unique())
dataset.drop(index = dataset[dataset.TOBACCO > 2].index, inplace = True)

[ 2  1 98]


In [213]:
# handle ICU
print(dataset.ICU.unique())
dataset.drop(index = dataset[dataset.ICU > 2].index, inplace = True)

[ 2  1 99]


### 4.2. Handling column DEATH

In [242]:
dataset['DEATH'] = np.where(dataset.DATE_DIED == '9999-99-99', 2, 1)
dataset['DEATH'] = dataset['DEATH'].astype(int)

In [243]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76832 entries, 5 to 1048569
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   USMER                 76832 non-null  int64 
 1   MEDICAL_UNIT          76832 non-null  int64 
 2   SEX                   76832 non-null  int64 
 3   PATIENT_TYPE          76832 non-null  int64 
 4   DATE_DIED             76832 non-null  object
 5   INTUBED               76832 non-null  int64 
 6   PNEUMONIA             76832 non-null  int64 
 7   AGE                   76832 non-null  int64 
 8   PREGNANT              76832 non-null  int64 
 9   DIABETES              76832 non-null  int64 
 10  COPD                  76832 non-null  int64 
 11  ASTHMA                76832 non-null  int64 
 12  INMSUPR               76832 non-null  int64 
 13  HIPERTENSION          76832 non-null  int64 
 14  OTHER_DISEASE         76832 non-null  int64 
 15  CARDIOVASCULAR        76832 non-nu

In [244]:
joblib.dump(dataset, params['dataset_cleaned_path'])

['../data/processed/covid_dataset.pkl']

## 4. Data Defense

In [258]:
def check_data(input_data, params):
    # check data types
    assert input_data.select_dtypes("int").columns.to_list() == params["int_columns"], "an error occurs in int column(s)."

    # check range of data
    assert input_data.USMER.between(params["range_usmr"][0], params["range_usmr"][1]).sum() == len(input_data), "an error occurs in USMER range."
    assert input_data.MEDICAL_UNIT.between(params["range_medical_unit"][0], params["range_medical_unit"][1]).sum() == len(input_data), "an error occurs in MEDICAL_UNIT range."
    assert input_data.SEX.between(params["range_sex"][0], params["range_sex"][1]).sum() == len(input_data), "an error occurs in SEX range."
    assert input_data.PATIENT_TYPE.between(params["range_patient_type"][0], params["range_patient_type"][1]).sum() == len(input_data), "an error occurs in PATIENT_TYPE range."
    assert input_data.INTUBED.between(params["range_intubed"][0], params["range_intubed"][1]).sum() == len(input_data), "an error occurs in INTUBED range."
    assert input_data.PNEUMONIA.between(params["range_pneumonia"][0], params["range_pneumonia"][1]).sum() == len(input_data), "an error occurs in PNEUMONIA range."
    assert input_data.AGE.between(params["range_age"][0], params["range_age"][1]).sum() == len(input_data), "an error occurs in AGE range."
    assert input_data.PREGNANT.between(params["range_pregnant"][0], params["range_pregnant"][1]).sum() == len(input_data), "an error occurs in PREGNANT range."
    assert input_data.DIABETES.between(params["range_diabetes"][0], params["range_diabetes"][1]).sum() == len(input_data), "an error occurs in DIABETES range."
    assert input_data.COPD.between(params["range_copd"][0], params["range_copd"][1]).sum() == len(input_data), "an error occurs in COPD range."
    assert input_data.ASTHMA.between(params["range_asthma"][0], params["range_asthma"][1]).sum() == len(input_data), "an error occurs in ASTHMA range."
    assert input_data.INMSUPR.between(params["range_inmsupr"][0], params["range_inmsupr"][1]).sum() == len(input_data), "an error occurs in INMSUPR range."
    assert input_data.HIPERTENSION.between(params["range_hipertension"][0], params["range_hipertension"][1]).sum() == len(input_data), "an error occurs in HIPERTENSION range."
    assert input_data.OTHER_DISEASE.between(params["range_other_disease"][0], params["range_other_disease"][1]).sum() == len(input_data), "an error occurs in OTHER_DISEASE range."
    assert input_data.CARDIOVASCULAR.between(params["range_cardiovascular"][0], params["range_cardiovascular"][1]).sum() == len(input_data), "an error occurs in CARDIOVASCULAR range."
    assert input_data.OBESITY.between(params["range_obesity"][0], params["range_obesity"][1]).sum() == len(input_data), "an error occurs in OBESITY range."
    assert input_data.RENAL_CHRONIC.between(params["range_renal_chronic"][0], params["range_renal_chronic"][1]).sum() == len(input_data), "an error occurs in RENAL_CHRONIC range."
    assert input_data.TOBACCO.between(params["range_tobacco"][0], params["range_tobacco"][1]).sum() == len(input_data), "an error occurs in TOBACCO range."
    assert input_data.CLASIFFICATION_FINAL.between(params["range_classification"][0], params["range_classification"][1]).sum() == len(input_data), "an error occurs in CLASIFFICATION_FINAL range."
    assert input_data.ICU.between(params["range_icu"][0], params["range_icu"][1]).sum() == len(input_data), "an error occurs in ICU range."

In [259]:
check_data(dataset, params)

data sudah sesuai design, tidak ada error

## Data Splitting

In [267]:
x = dataset[params["predictors"]].copy()
y = dataset["DEATH"].copy()

In [263]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76832 entries, 5 to 1048569
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   USMER                 76832 non-null  int64
 1   MEDICAL_UNIT          76832 non-null  int64
 2   SEX                   76832 non-null  int64
 3   PATIENT_TYPE          76832 non-null  int64
 4   INTUBED               76832 non-null  int64
 5   PNEUMONIA             76832 non-null  int64
 6   AGE                   76832 non-null  int64
 7   PREGNANT              76832 non-null  int64
 8   DIABETES              76832 non-null  int64
 9   COPD                  76832 non-null  int64
 10  ASTHMA                76832 non-null  int64
 11  INMSUPR               76832 non-null  int64
 12  HIPERTENSION          76832 non-null  int64
 13  OTHER_DISEASE         76832 non-null  int64
 14  CARDIOVASCULAR        76832 non-null  int64
 15  OBESITY               76832 non-null  int64
 16  RE

In [269]:
y.value_counts()

2    52978
1    23854
Name: DEATH, dtype: int64

In [272]:
from sklearn.model_selection import train_test_split

In [273]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [274]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [279]:
joblib.dump(x_train, params['train_set_path'][0])
joblib.dump(y_train, params['train_set_path'][1])
joblib.dump(x_valid, params['valid_set_path'][0])
joblib.dump(y_valid, params['valid_set_path'][1])
joblib.dump(x_test, params['test_set_path'][0])
joblib.dump(y_test, params['test_set_path'][1])

['../data/processed/y_test.pkl']