In [38]:
import pandas as pd
import numpy as np

## LOAD DATASET

In [39]:
# Read the kidney-check CSV; the path is relative to the current working directory.
df = pd.read_csv('kidney-check.csv')
# Show one randomly chosen row as a quick look at the available columns.
df.sample()

Unnamed: 0,no,rm,age,ureum,creatinin,classification
66,67,1107971,52.0,34.0,1.93,1


In [40]:
# Remove the 'no' and 'rm' columns from the frame.
# Re-running this cell without reloading df raises KeyError, since the columns are already gone.
df = df.drop(columns=['no', 'rm'])

In [41]:
# Show one random row to confirm which columns remain after the drop.
df.sample()

Unnamed: 0,age,ureum,creatinin,classification
56,56.0,78.0,4.4,1


In [42]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             121 non-null    float64
 1   ureum           121 non-null    float64
 2   creatinin       121 non-null    float64
 3   classification  121 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 3.9 KB


In [43]:
# Split the columns by dtype: float64 columns are treated as numerical,
# every other column (e.g. the int64 label) as categorical.
numerical = [col for col in df.columns if df[col].dtype == "float64"]
catgcols = [col for col in df.columns if col not in numerical]

# Impute missing values: median for numerical columns, most frequent value
# (mode) for the rest. The result is assigned back to the frame instead of
# calling `df[col].fillna(..., inplace=True)`: that form mutates an
# intermediate Series, emits a FutureWarning on pandas 2.x and silently leaves
# `df` unchanged once Copy-on-Write is enabled (the default in pandas 3).
for col in numerical:
    df[col] = df[col].fillna(df[col].median())

for col in catgcols:
    # mode() returns a Series sorted by value; [0] picks the first most-frequent entry.
    df[col] = df[col].fillna(df[col].mode()[0])

In [44]:
# Display the columns that were classified as numerical (float64).
numerical

['age', 'ureum', 'creatinin']

In [45]:
# Display the columns that were classified as non-float (treated as categorical).
catgcols

['classification']

In [46]:
# Count the rows per class label to check the class balance.
df['classification'].value_counts()

classification
1    67
0    54
Name: count, dtype: int64

In [47]:
# Name of the target (dependent) column.
dep_col = 'classification'
# Every other column of df is used as an independent (feature) column.
ind_col = [name for name in df.columns if name != dep_col]

In [48]:
# Count the rows per class again, this time addressing the target through dep_col.
df[dep_col].value_counts()

classification
1    67
0    54
Name: count, dtype: int64

## TRANSFORMASI DATASET

In [49]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is reused for every column: it is refit on each
# iteration, so after the loop `le` only holds the mapping of the last column.
le = LabelEncoder()

# Replace each categorical column with integer codes.
for col in catgcols:
    df[col] = le.fit_transform(df[col])

In [50]:
# NOTE(review): 'classification' is the only entry in catgcols, so the loop in
# the previous cell has already encoded it; this second fit_transform looks
# redundant -- confirm before removing.
df['classification'] = le.fit_transform(df['classification'])

In [51]:
# Feature matrix: the columns listed in ind_col.
x = df[ind_col] #feature
# Target vector: the dep_col column.
y = df[dep_col] #label

In [52]:
# Preview the first five rows after imputation and encoding.
df.head()

Unnamed: 0,age,ureum,creatinin,classification
0,72.0,31.0,1.48,1
1,48.0,238.0,12.3,1
2,50.0,97.0,9.22,1
3,32.0,43.3,3.39,1
4,56.0,84.0,5.3,1


In [53]:
# Write the processed frame to disk; index=False omits the row index column.
df.to_csv('kidney-check-clean.csv', index=False)

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# train and test sets are not guaranteed to match the full data set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [55]:
# Display the full feature matrix.
x

Unnamed: 0,age,ureum,creatinin
0,72.0,31.0,1.48
1,48.0,238.0,12.30
2,50.0,97.0,9.22
3,32.0,43.3,3.39
4,56.0,84.0,5.30
...,...,...,...
116,20.0,10.0,1.00
117,29.0,23.0,1.19
118,38.0,9.0,0.87
119,23.0,15.0,1.18


In [56]:
# Display the training features.
x_train

Unnamed: 0,age,ureum,creatinin
13,48.0,25.0,1.35
54,71.0,79.0,4.21
93,18.0,13.0,0.99
26,57.0,112.0,5.32
68,29.0,22.0,1.00
...,...,...,...
67,66.0,20.0,1.20
64,52.0,91.0,11.23
117,29.0,23.0,1.19
47,64.0,70.0,4.86


In [57]:
# Display the test features.
x_test

Unnamed: 0,age,ureum,creatinin
48,60.0,111.0,10.2
94,63.0,18.0,1.02
95,15.0,19.0,1.12
8,50.0,101.0,9.0
97,31.0,20.0,1.11
22,82.0,207.0,9.11
7,61.0,150.0,11.86
10,50.0,66.0,6.53
45,74.0,154.0,13.88
89,29.0,17.0,1.03


In [58]:
# Display the full target vector.
y

0      1
1      1
2      1
3      1
4      1
      ..
116    0
117    0
118    0
119    0
120    1
Name: classification, Length: 121, dtype: int64

In [59]:
# Display the training labels.
y_train

13     1
54     1
93     0
26     1
68     0
      ..
67     0
64     1
117    0
47     1
44     1
Name: classification, Length: 96, dtype: int64

In [60]:
# Display the test labels.
y_test

48     1
94     0
95     0
8      1
97     0
22     1
7      1
10     1
45     1
89     0
33     1
50     1
2      1
60     1
120    1
74     0
30     1
43     1
112    0
76     0
63     1
59     1
16     1
24     1
111    0
Name: classification, dtype: int64

## MEMBUAT MODEL DECISION TREE

In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Decision tree using the entropy criterion, limited to depth 4, with a fixed
# random_state for reproducibility. The remaining arguments match
# scikit-learn's documented defaults and are spelled out explicitly.
dtc = DecisionTreeClassifier(
    ccp_alpha=0.0, class_weight=None, criterion='entropy',
    max_depth=4, max_features=None, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    random_state=42, splitter='best'
)

# fit() returns the estimator itself, so `model` and `dtc` refer to the same object.
model = dtc.fit(x_train, y_train)

# Predict once per split and reuse the results, instead of calling
# dtc.predict(x_test) separately for every metric.
y_train_pred = dtc.predict(x_train)
y_test_pred = dtc.predict(x_test)

dtc_acc = accuracy_score(y_test, y_test_pred)

# Rows are true classes, columns are predicted classes. For a binary problem
# ravel() flattens the 2x2 matrix in the order tn, fp, fn, tp.
confusion = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusion.ravel()

print(f"Akurasi Data Training = {accuracy_score(y_train, y_train_pred)}")
print(f"Akurasi Data Testing = {dtc_acc} \n")

print(f"Confusion Matrix : \n{confusion}\n")
print(f"Classification Report : \n {classification_report(y_test, y_test_pred)}")

Akurasi Data Training = 1.0
Akurasi Data Testing = 0.96 

Confusion Matrix : 
[[ 8  0]
 [ 1 16]]

Classification Report : 
               precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.94      0.97        17

    accuracy                           0.96        25
   macro avg       0.94      0.97      0.96        25
weighted avg       0.96      0.96      0.96        25



## SIMULASI MODEL

In [62]:
# One patient's measurements, in the same order as the feature columns in ind_col.
input_data = (69.0,123.0,5.27)

input_data_as_numpy_array = np.array(input_data)

# Reshape to a single-row 2-D array: one sample, one column per feature.
input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

# The model was fitted on a DataFrame, so scikit-learn recorded the feature
# names. Wrapping the input in a DataFrame with the same column names avoids
# the "X does not have valid feature names" warning raised for bare arrays and
# lets scikit-learn check that the columns match those seen during fit.
input_frame = pd.DataFrame(input_data_reshape, columns=ind_col)

prediction = model.predict(input_frame)
print(prediction)

# Label 0 prints the "no kidney disease" message; any other label prints the
# "kidney disease" message.
if prediction[0] == 0:
    print ('Pasien Tidak Terkena Penyakit Ginjal')
else:
    print ('Pasien Terkena Penyakit Ginjal')

[1]
Pasien Terkena Penyakit Ginjal




## VISUALISASI POHON KEPUTUSAN