---
#  1. Import Libraries, Upload Dataset dan Eksplorasi Data
---

In [None]:
# import libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load the train and test splits from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/av-healthcare2/train.csv",
        "../input/av-healthcare2/test.csv",
    )
)

In [None]:
# dimensions (rows, columns) of the training data
train.shape

In [None]:
# show the first 5 rows of the training data
train.head(5)

In [None]:
# inspect the column data types of the training data
train.info()

In [None]:
# count the missing values per column in the training data
train.isnull().sum()

In [None]:
# check the target distribution: number of rows per 'Stay' class
train.groupby('Stay').size()

In [None]:
# dimensions (rows, columns) of the test data
test.shape

In [None]:
# show the first 5 rows of the test data
test.head(5)

In [None]:
# inspect the column data types of the test data
test.info()

In [None]:
# count the missing values per column in the test data
test.isnull().sum()

---
# 2. Preprocessing
---

In [None]:
# encode the target labels ('Stay') as integer class ids;
# `le` keeps the fitted mapping between labels and ids
le = LabelEncoder()
le.fit(train['Stay'])
train['Stay'] = le.transform(train['Stay'])

In [None]:
# re-check the target distribution after label encoding
train.groupby('Stay').size()

In [None]:
# Fill missing values in the training data with a forward fill (each gap takes
# the value of the previous row). `Series.ffill()` replaces the deprecated
# `fillna(method="ffill")` spelling. The trailing `bfill()` only matters when
# the very first rows are missing: forward fill has nothing to copy there, and
# a leftover NaN would make the later conversion to int fail.
for column in ('Bed Grade', 'City_Code_Patient'):
    train[column] = train[column].ffill().bfill()

In [None]:
# re-check the missing values per column in the training data
train.isnull().sum()

In [None]:
# re-check the dimensions of the training data
train.shape

In [None]:
# Fill missing values in the test data with a forward fill (each gap takes
# the value of the previous row). `Series.ffill()` replaces the deprecated
# `fillna(method="ffill")` spelling. The trailing `bfill()` only matters when
# the very first rows are missing: forward fill has nothing to copy there, and
# a leftover NaN would make the later conversion to int fail.
for column in ('Bed Grade', 'City_Code_Patient'):
    test[column] = test[column].ffill().bfill()

In [None]:
# re-check the missing values per column in the test data
test.isnull().sum()

In [None]:
# re-check the dimensions of the test data
test.shape

In [None]:
# Combine train and test so both go through the same preprocessing.
# Each split is tagged with `train_flag` (1 = train, 0 = test) so the
# combined frame can be separated again afterwards.
for frame, flag in ((train, 1), (test, 0)):
    frame['train_flag'] = flag

# the test split gets a placeholder 'Stay' column so both frames share the same columns
test['Stay'] = None

print('dimensi train: ', train.shape)
print('dimensi test: ', test.shape)

# stack the two splits row-wise, training rows first
df_data = pd.concat([train, test])
print('dimensi data gabungan: ', df_data.shape)

In [None]:
# show the first 5 rows of the combined data
df_data.head(5)

In [None]:
# re-check the target class distribution on the combined data
# NOTE(review): test rows carry Stay=None; groupby is expected to leave
# missing keys out by default, so only training rows should be counted - confirm
df_data.groupby('Stay').size()

In [None]:
# re-check the column data types of the combined data
df_data.info()

In [None]:
# re-check the missing values per column in the combined data
# ('Stay' was set to None for every test row, so it is expected to show up here)
df_data.isnull().sum()

In [None]:
# convert the float columns to integer type
for float_column in ('Bed Grade', 'City_Code_Patient', 'Admission_Deposit'):
    df_data[float_column] = df_data[float_column].astype(int)

In [None]:
# re-check the data types after the integer conversion
df_data.info()

In [None]:
# find out which columns still have the object data type
object_frame = df_data.select_dtypes(include=['object'])
dataobject = object_frame.columns
dataobject

In [None]:
# Encode the categorical (object) feature columns as integer codes.
# A fresh LabelEncoder is fitted for every column and kept in
# `feature_encoders`, so each mapping stays available afterwards and the
# encoder `le` that was fitted on the target 'Stay' is no longer overwritten
# by the feature columns.
categorical_columns = [
    'Hospital_type_code',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',
    'Severity of Illness',
    'Age',
]
feature_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df_data[column] = encoder.fit_transform(df_data[column])
    feature_encoders[column] = encoder

In [None]:
# show the data again to see the result of the encoding
df_data.head(5)

In [None]:
# list the distinct patientid values
df_data['patientid'].unique()

In [None]:
# list the distinct Admission_Deposit values
df_data['Admission_Deposit'].unique()

In [None]:
# number of rows per patientid
df_data.groupby('patientid').size()

In [None]:
# number of rows per Admission_Deposit value
df_data.groupby('Admission_Deposit').size()

In [None]:
# re-check the data types after encoding
df_data.info()

In [None]:
# Split the combined frame back into the train and test sets.
# `drop(columns=...)` returns a new frame, so `train` and `test` are
# independent objects rather than filtered slices modified in place (the
# pattern behind SettingWithCopyWarning, which this notebook's global
# warnings filter would hide).
train = df_data[df_data['train_flag'] == 1].drop(columns=['train_flag'])
# the test set also loses the placeholder 'Stay' column added before combining
test = df_data[df_data['train_flag'] == 0].drop(columns=['train_flag', 'Stay'])

In [None]:
# print the dimensions of the train, test and combined data
for caption, frame in (
    ('dimensi train: ', train),
    ('dimensi test: ', test),
    ('dimensi data gabungan: ', df_data),
):
    print(caption, frame.shape)

In [None]:
# convert the target column from object back to integer type
stay_as_int = train['Stay'].astype(int)
train['Stay'] = stay_as_int

In [None]:
# re-check the data types to confirm the result
train.info()

In [None]:
# show the first 5 rows of the preprocessed training data
train.head(5)

In [None]:
# show the first 5 rows of the preprocessed test data
test.head(5)

---
# 3. EDA
---

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# plot the pairwise correlations of the training columns as an annotated heatmap
correlations = train.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlations, annot=True, cmap='Greens')

In [None]:
# Show the frequency tabulation as bars: histograms make it easier to grasp how the data is distributed
train.hist(figsize = (15,20))

---
# 4. Building Model Decision Tree
---

In [None]:
# Split the labelled data into 80% training and 20% testing.
# case_id, patientid and Admission_Deposit are left out of the features
# because they are not needed.
y = train['Stay']
X = train.drop(columns=['Stay', 'case_id', 'patientid', 'Admission_Deposit'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=y,  # stratify on the target column
)

In [None]:
# Pre-pruning experiment: train one decision tree for every combination of
# split criterion and maximum depth, print its metrics on the held-out data
# and collect them in `results` for a side-by-side comparison.
kriteria = ['entropy', 'gini']

# Class names used as row labels in the classification report.
# NOTE(review): assumed to be listed in the same order as the encoded class
# ids of 'Stay' - verify against the target encoder's classes_.
target = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', 'More than 100 Days']

rows = []
for a in kriteria:
    for i in range(4, 21, 4):
        # The loop variable has to be passed as the criterion; with a
        # hard-coded "entropy" the runs labelled "gini" would silently be
        # entropy trees. A fixed random_state keeps repeated runs comparable.
        dtree = DecisionTreeClassifier(criterion=a, max_depth=i, splitter='best', random_state=1)
        dtree.fit(X_train, y_train)
        y_predict = dtree.predict(X_test)
        y_proba = dtree.predict_proba(X_test)

        # compute every metric once and reuse it for printing and for the table
        accuracy = accuracy_score(y_test, y_predict)
        precision = precision_score(y_test, y_predict, average='macro')
        recall = recall_score(y_test, y_predict, average='macro')
        f1 = f1_score(y_test, y_predict, average='macro')
        auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

        print("Kriteria :", a)
        print("Max Depth: ", i)
        print("Accuracy: {:.0%}".format(accuracy))
        print("Precision: {:.0%}".format(precision))
        print("Recall: {:.0%}".format(recall))
        print("f1 Score: {:.0%}".format(f1))
        print("AUC Score: {:.0%}".format(auc))
        print(classification_report(y_test, y_predict, target_names=target, zero_division=1))
        print("\n")

        rows.append({
            "Kriteria": a,
            "Max_Depth": i,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "AUC": auc,
        })

# one row per (criterion, max depth) combination
results = pd.DataFrame(rows)
results