Sumber dataset: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset

In [42]:
import pandas as pd

In [43]:
# Load the diabetes prediction dataset from a CSV file located one directory
# above the current working directory.
df = pd.read_csv('../diabetes_prediction_dataset.csv')

In [44]:
# Preview the first rows of the raw dataset.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


 ## **bagi dataset menjadi training set dan testing set dengan proporsi 70:30**

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Separate the target column from the predictor columns.
y = df['diabetes']  # target: the diabetes label
x = df.drop(columns='diabetes')  # features: every remaining column

In [47]:
# Hold out 30% of the rows for testing and keep 70% for training.
# A fixed random_state makes the shuffle deterministic, so the same rows land
# in each subset every time the notebook is restarted and re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [48]:
# Report the shape of each subset produced by the train/test split.
for caption, subset in [
    ("Dimensi x_train: ", x_train),
    ("Dimensi x_test: ", x_test),
    ("Dimensi y_train: ", y_train),
    ("Dimensi y_test: ", y_test),
]:
    print(caption, subset.shape)

Dimensi x_train:  (70000, 8)
Dimensi x_test:  (30000, 8)
Dimensi y_train:  (70000,)
Dimensi y_test:  (30000,)


## **lakukan normalisasi pada dataset**

In [49]:
from sklearn.preprocessing import MinMaxScaler

In [50]:
# Work on a copy so that min-max scaling does not modify the original df.
dfMM = df.copy()

In [51]:
# Min-max scaler with default settings (rescales each feature to the [0, 1] range).
mm = MinMaxScaler()

In [52]:
# Learn the per-column minimum and maximum of the two lab-value columns, then
# overwrite those columns in dfMM with their min-max scaled values.
mm.fit(dfMM[['HbA1c_level', 'blood_glucose_level']])
dfMM[['HbA1c_level', 'blood_glucose_level']] = mm.transform(
    dfMM[['HbA1c_level', 'blood_glucose_level']]
)

In [53]:
# Sanity check: print the largest and smallest value of the scaled HbA1c_level column.
for caption, value in (
    ("Nilai Max: ", dfMM['HbA1c_level'].max()),
    ("Nilai Min: ", dfMM['HbA1c_level'].min()),
):
    print(caption, value)

Nilai Max:  1.0
Nilai Min:  0.0


In [54]:
# Display the two min-max scaled columns.
dfMM[['HbA1c_level', 'blood_glucose_level']]

Unnamed: 0,HbA1c_level,blood_glucose_level
0,0.563636,0.272727
1,0.563636,0.000000
2,0.400000,0.354545
3,0.272727,0.340909
4,0.236364,0.340909
...,...,...
99995,0.490909,0.045455
99996,0.545455,0.090909
99997,0.400000,0.340909
99998,0.090909,0.090909


## **lakukan standarisasi pada dataset**

In [55]:
from sklearn.preprocessing import StandardScaler
import numpy as np

In [56]:
# Separate copy of df used as the input for standardization.
dfSS = df.copy()

In [57]:
# Fit a StandardScaler on the two lab-value columns, then transform the same
# columns; the result is kept as a separate array and dfSS itself is left unchanged.
ss = StandardScaler().fit(dfSS[['HbA1c_level', 'blood_glucose_level']])
scaled_data = ss.transform(dfSS[['HbA1c_level', 'blood_glucose_level']])

In [58]:
# Show the standardized array followed by its mean and standard deviation.
# Both statistics are computed over the whole array (both columns pooled),
# not separately per column.
print("Nilai sesudah scaling:")
print(scaled_data)
print("\nNilai rata-rata: ", scaled_data.mean())
print("Nilai standar deviasi: ", scaled_data.std())

Nilai sesudah scaling:
[[ 1.00170572  0.04770422]
 [ 1.00170572 -1.42620999]
 [ 0.16110802  0.48987848]
 ...
 [ 0.16110802  0.41618277]
 [-1.42668764 -0.93490525]
 [ 1.00170572 -1.18055762]]

Nilai rata-rata:  -1.411493144587439e-16
Nilai standar deviasi:  0.9999999999999999


## **lakukan data cleaning pada data dengan nilai null**

In [59]:
# Separate copy of df used for the data-cleaning steps.
dfDCNull = df.copy()

In [60]:
# Preview the first rows of the cleaning copy.
dfDCNull.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [61]:
# Report, per column, whether at least one value is missing.
dfDCNull.isna().any()

gender                 False
age                    False
hypertension           False
heart_disease          False
smoking_history        False
bmi                    False
HbA1c_level            False
blood_glucose_level    False
diabetes               False
dtype: bool

#### membuat data null dengan menimpa baris 0 menjadi null

In [62]:
# Overwrite every column of the first row with None to simulate missing data,
# then preview the result.
dfDCNull.iloc[0,:] = None
dfDCNull.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,,,,,,,,,
1,Female,54.0,0.0,0.0,No Info,27.32,6.6,80.0,0.0
2,Male,28.0,0.0,0.0,never,27.32,5.7,158.0,0.0
3,Female,36.0,0.0,0.0,current,23.45,5.0,155.0,0.0
4,Male,76.0,1.0,1.0,current,20.14,4.8,155.0,0.0


In [63]:
# Report, per column, whether at least one value is missing.
dfDCNull.isna().any()

gender                 True
age                    True
hypertension           True
heart_disease          True
smoking_history        True
bmi                    True
HbA1c_level            True
blood_glucose_level    True
diabetes               True
dtype: bool

#### mengganti nilai null dengan rata-rata pada kolom HbA1c_level

In [64]:
# Replace missing HbA1c_level values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# emits a FutureWarning in pandas 2.x and leaves dfDCNull unchanged once
# copy-on-write is enabled.
dfDCNull['HbA1c_level'] = dfDCNull['HbA1c_level'].fillna(dfDCNull['HbA1c_level'].mean())

In [65]:
# Report, per column, whether at least one value is missing.
dfDCNull.isna().any()

gender                  True
age                     True
hypertension            True
heart_disease           True
smoking_history         True
bmi                     True
HbA1c_level            False
blood_glucose_level     True
diabetes                True
dtype: bool

#### mengubah seluruh kolom bernilai null sesuai ketentuan

In [66]:
# Impute the remaining columns: mode for the categorical columns, median or
# mean for the numeric ones.
#
# Series.mode() returns a Series (there can be several modes), so its first
# element is taken explicitly. Passing the Series itself to fillna aligns on
# index labels and would only fill a missing value located at label 0.
#
# Each result is assigned back to its column rather than using
# fillna(inplace=True) on a selected column, which warns in pandas 2.x and has
# no effect once copy-on-write is enabled.
fill_values = {
    'smoking_history': dfDCNull['smoking_history'].mode().iloc[0],
    'gender': dfDCNull['gender'].mode().iloc[0],
    'age': dfDCNull['age'].median(),
    'hypertension': dfDCNull['hypertension'].median(),
    'heart_disease': dfDCNull['heart_disease'].median(),
    'bmi': dfDCNull['bmi'].mean(),
    'blood_glucose_level': dfDCNull['blood_glucose_level'].median(),
    'diabetes': dfDCNull['diabetes'].median(),
}
for column, fill_value in fill_values.items():
    dfDCNull[column] = dfDCNull[column].fillna(fill_value)

In [67]:
# Preview the first rows to inspect the imputed values.
dfDCNull.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,43.0,0.0,0.0,No Info,27.320788,5.527496,140.0,0.0
1,Female,54.0,0.0,0.0,No Info,27.32,6.6,80.0,0.0
2,Male,28.0,0.0,0.0,never,27.32,5.7,158.0,0.0
3,Female,36.0,0.0,0.0,current,23.45,5.0,155.0,0.0
4,Male,76.0,1.0,1.0,current,20.14,4.8,155.0,0.0


In [68]:
# Report, per column, whether at least one value is missing.
dfDCNull.isna().any()

gender                 False
age                    False
hypertension           False
heart_disease          False
smoking_history        False
bmi                    False
HbA1c_level            False
blood_glucose_level    False
diabetes               False
dtype: bool

## **lakukan data cleaning pada data duplikat**

In [69]:
# Count the rows that are exact duplicates of an earlier row.
dfDCNull.duplicated().sum()

3854

Banyak sekali data duplikatnya. Apakah semuanya perlu dihapus?

In [70]:
# Show the rows flagged as duplicates of an earlier row.
dfDCNull[dfDCNull.duplicated()]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
2756,Male,80.0,0.0,0.0,No Info,27.32,6.6,159.0,0.0
3272,Female,80.0,0.0,0.0,No Info,27.32,3.5,80.0,0.0
3418,Female,19.0,0.0,0.0,No Info,27.32,6.5,100.0,0.0
3939,Female,78.0,1.0,0.0,former,27.32,3.5,130.0,0.0
3960,Male,47.0,0.0,0.0,No Info,27.32,6.0,200.0,0.0
...,...,...,...,...,...,...,...,...,...
99980,Female,52.0,0.0,0.0,never,27.32,6.1,145.0,0.0
99985,Male,25.0,0.0,0.0,No Info,27.32,5.8,145.0,0.0
99989,Female,26.0,0.0,0.0,No Info,27.32,5.0,158.0,0.0
99990,Male,39.0,0.0,0.0,No Info,27.32,6.1,100.0,0.0


In [71]:
# Remove duplicated rows, keeping the first occurrence of each.
# The original row labels are kept (the index is not reset).
dfDCNull = dfDCNull.drop_duplicates()

In [72]:
# Re-check for duplicated rows; an empty result means none remain.
dfDCNull[dfDCNull.duplicated()]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes


Semua data duplikat berhasil dihapus. :D

## **ganti tipe data salah satu atribut angka**

In [73]:
# Summarize the column dtypes and non-null counts before the type conversion.
dfDCNull.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96146 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               96146 non-null  object 
 1   age                  96146 non-null  float64
 2   hypertension         96146 non-null  float64
 3   heart_disease        96146 non-null  float64
 4   smoking_history      96146 non-null  object 
 5   bmi                  96146 non-null  float64
 6   HbA1c_level          96146 non-null  float64
 7   blood_glucose_level  96146 non-null  float64
 8   diabetes             96146 non-null  float64
dtypes: float64(7), object(2)
memory usage: 7.3+ MB


In [74]:
# Convert the diabetes label column from float to integer; all other columns
# keep their current dtype.
dfDCNull = dfDCNull.astype({'diabetes': 'int'})

In [75]:
# Summarize the column dtypes again to confirm the diabetes column was converted.
dfDCNull.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96146 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               96146 non-null  object 
 1   age                  96146 non-null  float64
 2   hypertension         96146 non-null  float64
 3   heart_disease        96146 non-null  float64
 4   smoking_history      96146 non-null  object 
 5   bmi                  96146 non-null  float64
 6   HbA1c_level          96146 non-null  float64
 7   blood_glucose_level  96146 non-null  float64
 8   diabetes             96146 non-null  int32  
dtypes: float64(6), int32(1), object(2)
memory usage: 7.0+ MB


## **lakukan one hot encoding**

In [76]:
# One-hot encode the gender column into indicator columns named gender_<value>,
# then append them to the cleaned frame by matching row labels. The original
# gender column is kept alongside the indicators.
onehot = pd.get_dummies(dfDCNull['gender'], prefix='gender')
dfOH = dfDCNull.join(onehot, how='left')

In [77]:
# Display the frame with the one-hot encoded gender columns appended.
dfOH

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other
0,Female,43.0,0.0,0.0,No Info,27.320788,5.527496,140.0,0,1,0,0
1,Female,54.0,0.0,0.0,No Info,27.320000,6.600000,80.0,0,1,0,0
2,Male,28.0,0.0,0.0,never,27.320000,5.700000,158.0,0,0,1,0
3,Female,36.0,0.0,0.0,current,23.450000,5.000000,155.0,0,1,0,0
4,Male,76.0,1.0,1.0,current,20.140000,4.800000,155.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0.0,0.0,No Info,24.600000,4.800000,145.0,0,1,0,0
99996,Female,2.0,0.0,0.0,No Info,17.370000,6.500000,100.0,0,1,0,0
99997,Male,66.0,0.0,0.0,former,27.830000,5.700000,155.0,0,0,1,0
99998,Female,24.0,0.0,0.0,never,35.420000,4.000000,100.0,0,1,0,0
