In [16]:
# import library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, make_scorer, recall_score, f1_score
import pickle

In [17]:
# baca data
df = pd.read_csv("diabetes_prediction_dataset.csv")

In [18]:
# hapus data duplikat
df.drop_duplicates(inplace=True)

In [19]:
# lakukan encoding untuk data kategorical 
df = pd.get_dummies(df, columns=['gender'], prefix='gender')
df = pd.get_dummies(df, columns=['smoking_history'], prefix='smoking_history')

df[['gender_Female', 'gender_Male', 'gender_Other']] = df[['gender_Female', 'gender_Male', 'gender_Other']].astype(int)
df[['smoking_history_No Info', 'smoking_history_current', 'smoking_history_ever',
     'smoking_history_former', 'smoking_history_never', 'smoking_history_not current']] = df[[
    'smoking_history_No Info', 'smoking_history_current', 'smoking_history_ever',
    'smoking_history_former', 'smoking_history_never', 'smoking_history_not current']].astype(int)

In [20]:
# pisahkan fitur dan target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

In [21]:
# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [22]:
# deteksi dan tangani outlier 
columns_to_check = ['age', 'HbA1c_level', 'blood_glucose_level']
columns_to_modify = ['bmi']

In [23]:
# fungsi untuk mendeteksi adanya outlier menggunakan iqr
def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

In [24]:
# identifikasi outlier kecuali kolom BMI
outlier_indices = set()
for column in columns_to_check:
    outliers = detect_outliers_iqr(X_train, column)
    outlier_indices.update(outliers.index)

In [25]:
# hapus baris data yang punya outlier selain bmi 
X_train = X_train.drop(index=outlier_indices)
y_train = y_train.drop(index=outlier_indices)

In [26]:
# tangani data outlier di bmi
def replace_outliers_with_median(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median = df[column].median()
    df[column] = df[column].apply(lambda x: median if x < lower_bound or x > upper_bound else x)

replace_outliers_with_median(X_train, 'bmi')

In [27]:
# menangani ketidakseimbangan kelas menggunakan smote
print("Before SMOTE:")
print(y_train.value_counts())
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("After SMOTE:")
print(y_train.value_counts())

Before SMOTE:
diabetes
0    70130
1     4383
Name: count, dtype: int64
After SMOTE:
diabetes
0    70130
1    70130
Name: count, dtype: int64


In [28]:
# hapus spasi di fitur 
X_train.columns = X_train.columns.str.replace(' ', '_')
X_val.columns = X_val.columns.str.replace(' ', '_')
X_test.columns = X_test.columns.str.replace(' ', '_')

In [29]:
# Hyperparameter tuning untuk fokus menaikan f1 skor dan recal 
param_grid = {
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'class_weight': [None, 'balanced']
}

scorer = make_scorer(f1_score, pos_label=1)
model = LGBMClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
[LightGBM] [Info] Number of positive: 70130, number of negative: 70130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 810
[LightGBM] [Info] Number of data points in the train set: 140260, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [30]:
# Best parameters dan skor
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score (Train):", grid_search.best_score_)

Best Parameters: {'class_weight': None, 'learning_rate': 0.2, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 50}
Best F1 Score (Train): 0.9785774437993258


In [31]:
# latih model dengan best model
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 70130, number of negative: 70130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 810
[LightGBM] [Info] Number of data points in the train set: 140260, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [32]:
# simpan model dalam bentuk pickle 
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [33]:
# validasi model
val_preds = best_model.predict(X_val)
print("Validation Metrics:\n", classification_report(y_val, val_preds))
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, val_preds))

Validation Metrics:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      8767
           1       0.91      0.72      0.81       848

    accuracy                           0.97      9615
   macro avg       0.94      0.86      0.90      9615
weighted avg       0.97      0.97      0.97      9615

Validation Accuracy: 0.9696307852314092
Validation Confusion Matrix:
 [[8709   58]
 [ 234  614]]


In [34]:
# test model
test_preds = best_model.predict(X_test)
print("Test Metrics:\n", classification_report(y_test, test_preds))
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, test_preds))

Test Metrics:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      8767
           1       0.88      0.68      0.76       848

    accuracy                           0.96      9615
   macro avg       0.92      0.83      0.87      9615
weighted avg       0.96      0.96      0.96      9615

Test Accuracy: 0.9630785231409257
Test Confusion Matrix:
 [[8686   81]
 [ 274  574]]
