# Importing Libraries

Tahap ini mengimpor library-library yang dibutuhkan ke dalam lingkungan analisis yang digunakan. Tanpa library-library ini, proses analisis data akan terhambat karena seluruh langkah analisis selanjutnya bergantung padanya.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA), atau dalam bahasa Indonesia disebut Analisis Data Eksploratif, adalah salah satu langkah yang dibutuhkan dalam menganalisis data. Langkah ini melibatkan pemeriksaan dataset yang bertujuan untuk memahami data yang sedang dianalisis, mengidentifikasi kesalahan dalam data, menemukan pola dan hubungan antarvariabel, dan mempersiapkan data untuk analisis lanjutan.

In [2]:
# Read the diabetes dataset from the project-relative CSV file into a DataFrame.
dt = pd.read_csv(filepath_or_buffer="dataset/diabetes_dataset.csv")
# Leave the frame as the cell's last expression so the notebook renders it.
dt

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dt.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [4]:
# Column dtypes: shows which columns are numeric and which are `object` (text).
dt.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

# Data Information

In [5]:
# Distinct values stored in the heart_disease column.
dt["heart_disease"].unique()

array([1, 0], dtype=int64)

In [6]:
# Distinct values stored in the hypertension column.
dt["hypertension"].unique()

array([0, 1], dtype=int64)

In [7]:
# Distinct values stored in the diabetes (target) column.
dt["diabetes"].unique()

array([0, 1], dtype=int64)

In [8]:
# Distinct categories present in the smoking_history column.
dt["smoking_history"].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [9]:
# Distinct categories present in the gender column.
dt["gender"].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [10]:
# Number of missing values per column (`isna` is the same check as `isnull`).
dt.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

# One Hot Encode

In [11]:
# One-hot encode the text columns; each dummy column is named
# "<original column>_<category>", which is the default prefix behaviour.
encode = pd.get_dummies(dt)
encode

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,1,0,0,1,0,0,0,0,0
99996,2.0,0,0,17.37,6.5,100,0,1,0,0,1,0,0,0,0,0
99997,66.0,0,0,27.83,5.7,155,0,0,1,0,0,0,0,1,0,0
99998,24.0,0,0,35.42,4.0,100,0,1,0,0,0,0,0,0,1,0


# Normalize Data

Mengapa kita perlu melakukan normalisasi data? Karena ketika melakukan pemodelan data, di antaranya dengan SVM, Regresi Linier, dan sejenisnya, perbedaan rentang nilai antarkolom pada data akan memengaruhi kinerja algoritma yang digunakan, sehingga menghilangkan perbedaan skala dan mengurangi efek outlier perlu dilakukan.

In [12]:
from sklearn import preprocessing

# Rescale every column of the one-hot encoded frame with Min-Max scaling.
scaler = preprocessing.MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows and on every column of `encode`
# (including the `diabetes` target) before the train/test split further down,
# so test-set statistics leak into the scaling. Consider fitting on the
# training split only and on the feature columns only.
d = scaler.fit_transform(encode)
# Wrap the scaled values in a DataFrame again, reusing encode's column names.
data_normalisasi = pd.DataFrame(d, columns=encode.columns)
data_normalisasi

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.000000,0.0,1.0,0.177171,0.563636,0.272727,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.674675,0.0,0.0,0.202031,0.563636,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.349349,0.0,0.0,0.202031,0.400000,0.354545,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.449449,0.0,0.0,0.156863,0.272727,0.340909,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.949950,1.0,1.0,0.118231,0.236364,0.340909,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.000000,0.0,0.0,0.202031,0.490909,0.045455,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99996,0.024024,0.0,0.0,0.085901,0.545455,0.090909,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99997,0.824825,0.0,0.0,0.207983,0.400000,0.340909,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99998,0.299299,0.0,0.0,0.296569,0.090909,0.090909,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Split Test

In [13]:
from sklearn.model_selection import train_test_split

# Features: every normalised column except the target and the two
# comorbidity flags (hypertension, heart_disease).
X = data_normalisasi.drop(['diabetes', 'hypertension', 'heart_disease'], axis=1)
# Target: the diabetes label taken from the normalised frame.
y = data_normalisasi['diabetes']

# Test = 20% and Train = 80%.
# random_state pins the shuffle so that Restart & Run All yields the same rows
# (and therefore the same metrics) on every run; stratify=y keeps the share of
# positive cases (about 8.5 % of the rows) equal in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
39185,0.624625,0.308123,0.563636,0.045455,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
55014,0.261762,0.202031,0.490909,0.363636,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16399,0.261762,0.202031,0.400000,0.209091,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7644,0.249249,0.165616,0.181818,0.363636,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
57028,0.787287,0.208800,0.490909,0.090909,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41649,0.712212,0.150210,0.236364,0.340909,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
42072,0.021522,0.083450,0.236364,0.545455,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
34147,0.199199,0.142624,0.236364,0.340909,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
65511,0.499499,0.374883,0.400000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
# Inspect the held-out test features produced by the split.
X_test

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
40334,0.149149,0.100023,0.090909,0.295455,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
60319,0.874875,0.289916,0.090909,0.363636,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
64312,0.874875,0.160948,0.090909,0.354545,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
35229,0.874875,0.154995,0.236364,0.272727,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26814,0.099099,0.080766,0.090909,0.090909,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47487,0.174174,0.268207,0.090909,0.363636,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
71643,0.962462,0.097106,0.090909,0.227273,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81006,0.662162,0.159897,0.490909,0.045455,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9779,1.000000,0.235411,0.272727,0.209091,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Inspect the training labels produced by the split.
y_train

39185    0.0
55014    0.0
16399    0.0
7644     0.0
57028    0.0
        ... 
41649    0.0
42072    0.0
34147    0.0
65511    0.0
51645    0.0
Name: diabetes, Length: 80000, dtype: float64

# Training Model - SVM

In [None]:
from sklearn import svm
# Support-vector classifier; no hyper-parameters are overridden here.
model_svm = svm.SVC()

In [None]:
# Fit the SVM on the training split. scikit-learn's `fit` returns the estimator,
# so `model` and `model_svm` refer to the same fitted object.
model = model_svm.fit(X_train, y_train)

# Evaluasi Model - SVM

In [None]:
# SVM score on the held-out test split (reported as accuracy further down).
accuracy_test_svm = model_svm.score(X_test,y_test)

In [None]:
# SVM score on the training split, kept for comparison with the test score.
accuracy_train_svm = model_svm.score(X_train,y_train)

In [None]:
# Report both SVM scores as percentages rounded to two decimals.
print(f"Akurasi Model (Train) : {np.round(accuracy_train_svm * 100,2)} %")
print(f"Akurasi Model (Test)  : {np.round(accuracy_test_svm * 100,2)} %")

# Prediction - SVM

In [None]:
# Two hand-written patients used to try the trained model on raw (unencoded,
# unscaled) inputs. One record per patient; key order fixes the column order.
df_test = pd.DataFrame(
    [
        {
            "age": 51,
            "gender": "Female",
            "smoking_history": "current",
            "bmi": 23.45,
            "HbA1c_level": 4.8,
            "blood_glucose_level": 158,
        },
        {
            "age": 28,
            "gender": "Male",
            "smoking_history": "never",
            "bmi": 21.67,
            "HbA1c_level": 5.4,
            "blood_glucose_level": 142,
        },
    ]
)

df_test

In [None]:
# One-hot encode the hand-written samples. Only categories that actually occur
# in df_test get a dummy column, so this frame has fewer columns than the
# encoded training data.
encode_test = pd.get_dummies(df_test)
encode_test

In [None]:
# Add the `gender_Other` dummy column to the encoded samples: a row is "Other"
# (1) exactly when neither gender_Female nor gender_Male is set, otherwise 0.
# Computed column-wise so it does not depend on the frame's index labels.
encode_test["gender_Other"] = (
    ~(encode_test["gender_Female"].eq(1) | encode_test["gender_Male"].eq(1))
).astype("int64")
encode_test

In [None]:
# The model was trained on Min-Max-scaled features in the column layout of `X`,
# so the hand-written samples must go through the same preparation first.
# 1) Give the samples every column the scaler was fitted on (same order);
#    dummy columns and flags that are absent from the samples become 0.
aligned_test = encode_test.reindex(columns=encode.columns, fill_value=0)
# 2) Apply the already-fitted scaler (transform only, never re-fit on new data).
scaled_test = pd.DataFrame(
    scaler.transform(aligned_test),
    columns=encode.columns,
    index=aligned_test.index,
)
# 3) Keep exactly the feature columns used for training, in training order.
pred_test = model.predict(scaled_test[X.columns])

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes among the 3 closest training samples.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the KNN classifier on the training split.
knn.fit(X_train, y_train)

# Evaluasi Model - K-Nearest Neighbors

In [None]:
# KNN score on the held-out test split (reported as accuracy further down).
accuracy_test_knn = knn.score(X_test, y_test)

In [None]:
# KNN score on the training split, kept for comparison with the test score.
accuracy_train_knn = knn.score(X_train, y_train)

In [None]:
# Report both KNN scores as percentages rounded to two decimals.
print(f"Akurasi Model (Train) : {np.round(accuracy_train_knn * 100,2)} %")
print(f"Akurasi Model (Test)  : {np.round(accuracy_test_knn * 100,2)} %")

In [None]:
# Three-colour seaborn palette built from explicit hex codes; shown as the
# cell output. NOTE(review): no cell visible here uses it yet — confirm it is
# consumed by a later plot.
knn_palette = sns.color_palette(['#000C1F', '#29757A', '#FF5050'])
knn_palette

# Training Model - Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed because the
# forest is built with randomness (bootstrap samples, feature sub-sampling);
# without it the reported accuracy changes from run to run.
rfc = RandomForestClassifier(random_state=42)

In [19]:
# Fit the random forest on the training split.
rfc.fit(X_train, y_train)

# Evaluasi Model - Random Forest

In [20]:
# Random-forest scores on the held-out test split and on the training split;
# the gap between the two is what the printout below lets the reader compare.
accuracy_test_rfc = rfc.score(X_test, y_test)
accuracy_train_rfc = rfc.score(X_train, y_train)

In [22]:
# Report both random-forest scores as percentages rounded to two decimals.
print(f"Akurasi Model (Train) : {np.round(accuracy_train_rfc * 100,2)} %")
print(f"Akurasi Model (Test)  : {np.round(accuracy_test_rfc * 100,2)} %")

Akurasi Model (Train) : 99.88 %
Akurasi Model (Test)  : 96.78 %


# Training Model - Decision Tree