In [25]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

In [26]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

# Load the dataset from Kaggle
# https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset
df = pd.read_csv("./diabetes_prediction_dataset.csv")

# Display the first 5 rows of the dataset
print(df.head())
df["smoking_history"].value_counts()
# update not current and ever to former
df["smoking_history"] = df["smoking_history"].replace("ever", "former")
df["smoking_history"] = df["smoking_history"].replace("not current", "former")
df["smoking_history"].value_counts()


   gender   age  hypertension  heart_disease smoking_history   bmi  \
0  Female 80.00             0              1           never 25.19   
1  Female 54.00             0              0         No Info 27.32   
2    Male 28.00             0              0           never 27.32   
3  Female 36.00             0              0         current 23.45   
4    Male 76.00             1              1         current 20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0         6.60                  140         0  
1         6.60                   80         0  
2         5.70                  158         0  
3         5.00                  155         0  
4         4.80                  155         0  


smoking_history
No Info    35816
never      35095
former     19803
current     9286
Name: count, dtype: int64

In [27]:
# Drop the blood test features
df = df.drop(columns=["HbA1c_level", "blood_glucose_level"])
# Split the data into the features and the target
y = df["diabetes"]
X = df.drop(columns=["diabetes"])
# Encode the categorical columns
ohe = OneHotEncoder(sparse_output=False, drop="if_binary", dtype=int)
ct = ColumnTransformer(transformers=[("ohe", ohe, ['gender', 'smoking_history'])], remainder="passthrough")
X_encoded = ct.fit_transform(X)
X_encoded = pd.DataFrame(X_encoded, columns=ct.get_feature_names_out())
# Display the first 5 rows of the encoded features
print(X_encoded.head())



   ohe__gender_Female  ohe__gender_Male  ohe__gender_Other  \
0                1.00              0.00               0.00   
1                1.00              0.00               0.00   
2                0.00              1.00               0.00   
3                1.00              0.00               0.00   
4                0.00              1.00               0.00   

   ohe__smoking_history_No Info  ohe__smoking_history_current  \
0                          0.00                          0.00   
1                          1.00                          0.00   
2                          0.00                          0.00   
3                          0.00                          1.00   
4                          0.00                          1.00   

   ohe__smoking_history_former  ohe__smoking_history_never  remainder__age  \
0                         0.00                        1.00           80.00   
1                         0.00                        0.00           54.00   
2 

In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled

array([[-1.18685419,  1.18734314, -0.01414355, ..., -0.28630923,
        -0.20355869, -0.38647449],
       [-1.18685419,  1.18734314, -0.01414355, ..., -0.28630923,
        -0.20355869, -0.41217229],
       [ 0.84256348, -0.84221652, -0.01414355, ..., -0.28630923,
        -0.20355869, -0.45298645],
       ...,
       [-1.18685419,  1.18734314, -0.01414355, ..., -0.28630923,
        -0.20355869, -0.17938044],
       [ 0.84256348, -0.84221652, -0.01414355, ..., -0.28630923,
        -0.20355869, -0.35775342],
       [ 0.84256348, -0.84221652, -0.01414355, ..., -0.28630923,
        -0.20355869,  0.1002721 ]])

In [29]:
# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Save the encoder, scaler, and model to pickle files
with open("encoder.pkl", "wb") as f:
    pickle.dump(ct, f)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
with open("diabetes_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [32]:
# Make predictions
y_pred = model.predict(X_test_scaled)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     18292
           1       0.31      0.17      0.22      1708

    accuracy                           0.90     20000
   macro avg       0.62      0.57      0.58     20000
weighted avg       0.87      0.90      0.88     20000

[[17674   618]
 [ 1425   283]]


In [45]:
import pandas as pd

age = 35
gender = "Male"
bmi = 23.0
smoking_history = "never"
hypertension = 0
heart_disease = 0

gender = 'Female'
smoking_history = 'former'
age = 80
hypertension = 0
heart_disease = 0
bmi = 27.5
# gender   age  hypertension  heart_disease smoking_history   bmi
data = {"gender": gender, "age": age, "hypertension": hypertension, "heart_disease": heart_disease, "smoking_history": smoking_history, "bmi": bmi}
df = pd.DataFrame(data, index=[0])
with open("encoder.pkl", "rb") as f:
    ct = pickle.load(f)
df = ct.transform(df)
df = pd.DataFrame(df, columns=ct.get_feature_names_out())
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
df = scaler.transform(df)
with open("diabetes_model.pkl", "rb") as f:
    model = pickle.load(f)
prediction = model.predict(df)
if prediction[0] == 0:
    print("Prediction: No diabetes")
else:   
    print("Prediction: Diabetes")

Prediction: Diabetes
