# 1. Data Preparation

In [287]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib
import os
import ruamel.yaml as yaml

from sklearn.model_selection import train_test_split

In [288]:
# Relative path of the YAML configuration file read by load_params().
params_dir = "../config/params.yaml"

In [289]:
def load_params(params_dir):
    """Read the YAML configuration file and return its parsed content.

    Parameters
    ----------
    params_dir : str
        Path of the YAML file. Despite the name, this must be a file path:
        it is passed straight to ``open``.

    Returns
    -------
    object
        Whatever the safe YAML loader builds for the document. The rest of
        the notebook indexes the result by string key.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The module-level ``safe_load`` helper is deprecated in ruamel.yaml and
    # raises in recent releases; a YAML(typ="safe") instance is the supported
    # way to get the same safe loading.
    loader = yaml.YAML(typ="safe", pure=True)

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(params_dir, "r", encoding="utf-8") as file:
        return loader.load(file)

In [283]:
# Parse the configuration once; later cells read paths, column lists and
# value ranges from this object.
params = load_params(params_dir)

## 1. Load Dataset

In [290]:
def read_dataset(dataset_dir):
    """Load every CSV file found directly inside ``dataset_dir`` into one frame.

    Parameters
    ----------
    dataset_dir : str
        Directory that holds the raw CSV files. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        Rows of all CSV files stacked in file-name order with a fresh
        0..n-1 index, or an empty DataFrame when the directory contains no
        CSV file.
    """
    # Keep regular *.csv files only: sub-directories (e.g. .ipynb_checkpoints)
    # and other files would make read_csv fail. Sorting makes the row order
    # independent of the arbitrary order returned by os.listdir.
    csv_names = sorted(
        name
        for name in os.listdir(dataset_dir)
        if name.lower().endswith(".csv")
        and os.path.isfile(os.path.join(dataset_dir, name))
    )

    # Read everything first and concatenate once; concatenating inside the
    # loop copies the accumulated frame on every iteration.
    frames = [
        pd.read_csv(os.path.join(dataset_dir, name))
        for name in tqdm(csv_names)
    ]

    # pd.concat rejects an empty list, so keep the "nothing to load" result
    # an empty frame.
    if not frames:
        return pd.DataFrame()

    # ignore_index avoids duplicated index labels when several files are merged.
    return pd.concat(frames, ignore_index=True)

In [291]:
# Load the raw data from the directory configured under "dataset_dir".
dataset = read_dataset(params["dataset_dir"])

100%|██████████| 1/1 [00:02<00:00,  2.18s/it]


In [292]:
# Plain-text preview of the first rows.
print(dataset.head())

# Plain-text summary statistics of the numeric columns.
print(dataset.describe())

   USMER  MEDICAL_UNIT  SEX  PATIENT_TYPE   DATE_DIED  INTUBED  PNEUMONIA  \
0      2             1    1             1  03/05/2020       97          1   
1      2             1    2             1  03/06/2020       97          1   
2      2             1    2             2  09/06/2020        1          2   
3      2             1    1             1  12/06/2020       97          2   
4      2             1    2             1  21/06/2020       97          2   

   AGE  PREGNANT  DIABETES  ...  ASTHMA  INMSUPR  HIPERTENSION  OTHER_DISEASE  \
0   65         2         2  ...       2        2             1              2   
1   72        97         2  ...       2        2             1              2   
2   55        97         1  ...       2        2             2              2   
3   53         2         2  ...       2        2             2              2   
4   68        97         1  ...       2        2             1              2   

   CARDIOVASCULAR  OBESITY  RENAL_CHRONIC  TOBACCO

## Data Definition

```
DATE_DIED:
  [object]
  [datetime] berarti meninggal
  9999-99-99 berarti selamat

SEX:
  [integer]
  [1-99]
  jenis kelamin wanita atau pria
  97-99 berarti null

AGE:
  [integer]
  [0-121]
  umur pasien
  97-99 berarti null

CLASIFFICATION_FINAL:
  [integer]
  [1-7]
  hasil test covid
  positif covid bernilai 1-3
  negatif covid bernilai 4 ke atas (4-7)

PATIENT_TYPE:
  [integer]
  [1-99]
  di rumah sakit atau isolasi mandiri
  97-99 berarti null

PNEUMONIA:
  [integer]
  [1-99]
  apakah pasien memiliki air sacs inflammation
  97-99 berarti null

PREGNANT:
  [integer]
  [1-99]
  apakah pasien hamil atau tidak
  97-99 berarti null

DIABETES:
  [integer]
  [1-99]
  apakah pasien diabetes atau tidak
  97-99 berarti null

COPD:
  [integer]
  [1-99]
  apakah pasien punya penyakit paru obstruktif kronis (Chronic Obstructive Pulmonary Disease) atau tidak
  97-99 berarti null

ASTHMA:
  [integer]
  [1-99]
  apakah pasien punya asma atau tidak
  97-99 berarti null

INMSUPR:
  [integer]
  [1-99]
  apakah pasien immunosuppressed atau tidak
  97-99 berarti null

HIPERTENSION:
  [integer]
  [1-99]
  apakah pasien hipertensi atau tidak
  97-99 berarti null

CARDIOVASCULAR:
  [integer]
  [1-99]
  apakah pasien memiliki penyakit jantung atau pembuluh darah
  97-99 berarti null

RENAL_CHRONIC:
  [integer]
  [1-99]
  apakah pasien memiliki chronic renal disease atau tidak
  97-99 berarti null

OTHER_DISEASE:
  [integer]
  [1-99]
  apakah pasien memiliki penyakit lain atau tidak
  97-99 berarti null

OBESITY:
  [integer]
  [1-99]
  apakah pasien obesitas
  97-99 berarti null

TOBACCO:
  [integer]
  [1-99]
  apakah pasien merokok atau tidak
  97-99 berarti null

USMER:
  [integer]
  [1-99]
  apakah pasien dirawat pada kelas 1, 2, atau 3
  97-99 berarti null

MEDICAL_UNIT:
  [integer]
  [1-13]
  tipe institusi kesehatan yang menyediakan jasa perawatan

INTUBED:
  [integer]
  [1-99]
  apakah pasien terhubung ke ventilator
  97-99 berarti null

ICU:
  [integer]
  [1-99]
  apakah pasien masuk ke ICU
  97-99 berarti null
```

## Data Validation

### 1. Cek tipe data

In [293]:
# Show each column's dtype so it can be compared with the data definition.
dataset.dtypes

USMER                    int64
MEDICAL_UNIT             int64
SEX                      int64
PATIENT_TYPE             int64
DATE_DIED               object
INTUBED                  int64
PNEUMONIA                int64
AGE                      int64
PREGNANT                 int64
DIABETES                 int64
COPD                     int64
ASTHMA                   int64
INMSUPR                  int64
HIPERTENSION             int64
OTHER_DISEASE            int64
CARDIOVASCULAR           int64
OBESITY                  int64
RENAL_CHRONIC            int64
TOBACCO                  int64
CLASIFFICATION_FINAL     int64
ICU                      int64
dtype: object

Dari hasil pengecekan, semua data sesuai

### 2. Range

In [294]:
# Summary statistics; the min/max rows give the observed range per numeric column.
dataset.describe()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,1.632194,8.980565,1.499259,1.190765,79.52288,3.346831,41.7941,49.76558,2.186404,2.260569,2.242626,2.298132,2.128989,2.435143,2.26181,2.125176,2.25718,2.214333,5.305653,79.55397
std,0.4822084,3.723278,0.4999997,0.3929041,36.86889,11.91288,16.90739,47.51073,5.424242,5.132258,5.114089,5.462843,5.236397,6.646676,5.19485,5.175445,5.135354,5.323097,1.881165,36.82307
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,1.0,1.0,97.0,2.0,30.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0
50%,2.0,12.0,1.0,1.0,97.0,2.0,40.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6.0,97.0
75%,2.0,12.0,2.0,1.0,97.0,2.0,53.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0
max,2.0,13.0,2.0,2.0,99.0,99.0,121.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,7.0,99.0


### 3. Dimensi Data

In [295]:
# (number of rows, number of columns) of the loaded dataset.
dataset.shape

(1048575, 21)

### 4. Handling Column Error

### 4.1. Remove null

Tidak ada nilai null (NaN) karena dataset sudah mengodekan nilai yang hilang secara eksplisit.
Pada kolom boolean, 1 = ya dan 2 = tidak.
Nilai 97-99 berarti null.

In [296]:
# Count NaN values per column.
dataset.isnull().sum()

USMER                   0
MEDICAL_UNIT            0
SEX                     0
PATIENT_TYPE            0
DATE_DIED               0
INTUBED                 0
PNEUMONIA               0
AGE                     0
PREGNANT                0
DIABETES                0
COPD                    0
ASTHMA                  0
INMSUPR                 0
HIPERTENSION            0
OTHER_DISEASE           0
CARDIOVASCULAR          0
OBESITY                 0
RENAL_CHRONIC           0
TOBACCO                 0
CLASIFFICATION_FINAL    0
ICU                     0
dtype: int64

In [297]:
# Persist the dataset to the path configured under 'dataset_cleaned_path'.
# No cell above modifies the frame after loading, so this is the data as read.
joblib.dump(dataset, params['dataset_cleaned_path'])

['../data/processed/covid_dataset.pkl']

## 4. Data Defense

In [308]:
def check_data(input_data, params):
    """Validate column dtypes and value ranges of ``input_data``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to validate; it must expose every column listed in the range
        table below.
    params : mapping
        Configuration holding ``object_columns`` and ``int_columns`` (expected
        column-name lists, order included) and one ``range_*`` entry per
        checked column whose first two items are the inclusive lower and
        upper bounds.

    Raises
    ------
    AssertionError
        At the first check that fails, with a message naming the check.
    """
    # Dtype checks: the selected column names must equal the configured lists.
    object_cols = input_data.select_dtypes("object").columns.to_list()
    assert object_cols == params["object_columns"], "an error occurs in object column(s)."

    int_cols = input_data.select_dtypes("int").columns.to_list()
    assert int_cols == params["int_columns"], "an error occurs in int column(s)."

    # (column name, params key of its range), in the order they are checked.
    # The keys are listed explicitly because they are not always derived from
    # the column name (e.g. USMER -> range_usmr).
    range_checks = (
        ("USMER", "range_usmr"),
        ("MEDICAL_UNIT", "range_medical_unit"),
        ("SEX", "range_sex"),
        ("PATIENT_TYPE", "range_patient_type"),
        ("INTUBED", "range_intubed"),
        ("PNEUMONIA", "range_pneumonia"),
        ("AGE", "range_age"),
        ("PREGNANT", "range_pregnant"),
        ("DIABETES", "range_diabetes"),
        ("COPD", "range_copd"),
        ("ASTHMA", "range_asthma"),
        ("INMSUPR", "range_inmsupr"),
        ("HIPERTENSION", "range_hipertension"),
        ("OTHER_DISEASE", "range_other_disease"),
        ("CARDIOVASCULAR", "range_cardiovascular"),
        ("OBESITY", "range_obesity"),
        ("RENAL_CHRONIC", "range_renal_chronic"),
        ("TOBACCO", "range_tobacco"),
        ("CLASIFFICATION_FINAL", "range_classification"),
        ("ICU", "range_icu"),
    )

    total_rows = len(input_data)
    for column, range_key in range_checks:
        # Attribute-style access mirrors ``input_data.<COLUMN>``.
        values = getattr(input_data, column)
        inside = values.between(params[range_key][0], params[range_key][1])
        # Every row has to fall inside the inclusive bounds.
        assert inside.sum() == total_rows, f"an error occurs in {column} range."

In [314]:
# Run the validation; an AssertionError is raised at the first failed check.
check_data(dataset, params)

data sudah sesuai design, tidak ada error

## Data Splitting

In [330]:
# Feature matrix: an independent copy of the configured predictor columns.
x = dataset.loc[:, params["predictors"]].copy()

# Target: an independent copy of the DATE_DIED column.
# NOTE(review): this keeps the raw strings (a date, or "9999-99-99"), not a
# binary label -- presumably encoded in a later stage; confirm.
y = dataset.loc[:, "DATE_DIED"].copy()

In [331]:
# Column names, non-null counts and dtypes of the predictor frame.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype
---  ------                --------------    -----
 0   USMER                 1048575 non-null  int64
 1   MEDICAL_UNIT          1048575 non-null  int64
 2   SEX                   1048575 non-null  int64
 3   PATIENT_TYPE          1048575 non-null  int64
 4   INTUBED               1048575 non-null  int64
 5   PNEUMONIA             1048575 non-null  int64
 6   AGE                   1048575 non-null  int64
 7   PREGNANT              1048575 non-null  int64
 8   DIABETES              1048575 non-null  int64
 9   COPD                  1048575 non-null  int64
 10  ASTHMA                1048575 non-null  int64
 11  INMSUPR               1048575 non-null  int64
 12  HIPERTENSION          1048575 non-null  int64
 13  OTHER_DISEASE         1048575 non-null  int64
 14  CARDIOVASCULAR        1048575 non-null  int64
 15  OBESITY        

In [332]:
# Frequency of each distinct DATE_DIED value in the target.
y.value_counts()

9999-99-99    971633
06/07/2020      1000
07/07/2020       996
13/07/2020       990
16/06/2020       979
               ...  
24/11/2020         1
17/12/2020         1
08/12/2020         1
16/03/2021         1
22/04/2021         1
Name: DATE_DIED, Length: 401, dtype: int64

In [337]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [338]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42)

In [339]:
# Persist every split. Each params entry holds two paths: position 0 for the
# features and position 1 for the target.
split_outputs = (
    (x_train, 'train_set_path', 0),
    (y_train, 'train_set_path', 1),
    (x_valid, 'valid_set_path', 0),
    (y_valid, 'valid_set_path', 1),
    (x_test, 'test_set_path', 0),
    (y_test, 'test_set_path', 1),
)
for split_obj, path_key, path_pos in split_outputs:
    last_written = joblib.dump(split_obj, params[path_key][path_pos])

# Show the return value of the final dump, as the cell did before.
last_written

['../data/processed/y_test.pkl']