In [48]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [49]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

# Load the dataset from Kaggle
# https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset
df = pd.read_csv("./diabetes_prediction_dataset.csv")
# Drop the blood test features
df = df.drop(columns=["HbA1c_level", "blood_glucose_level"])
# Split the data into the features and the target
y = df["diabetes"]
X = df.drop(columns=["diabetes"])
# Encode the categorical columns
ohe = OneHotEncoder(sparse_output=False, drop="if_binary", dtype=int)
X_encoded = ohe.fit_transform(X)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Save the encoder, scaler, and model to pickle files
joblib.dump(ohe, "encoder.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(model, "diabetes_model.pkl")

Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.92      1.00      0.95     18292
           1       0.42      0.04      0.07      1708

    accuracy                           0.91     20000
   macro avg       0.67      0.52      0.51     20000
weighted avg       0.87      0.91      0.88     20000

[[18202    90]
 [ 1643    65]]


['diabetes_model.pkl']

In [27]:
import pandas as pd

age = "37"
sex = "2"
bmi = "32"
ubp = "120"
lbp = "80"
avg = (int(ubp) + int(lbp)) / 2
bp = avg

data = {"age": age, "sex": sex, "bmi": bmi, "bp": bp}
df = pd.DataFrame(data, index=[0])
df = scaler.transform(df)

model = joblib.load("diabetes_model.pkl")
prediction = model.predict(df)
print(f"Prediction: {prediction[0]}")


Prediction: 201.82750818019082


In [50]:

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,diabetes
0,Female,80.0,0,1,never,25.19,0
1,Female,54.0,0,0,No Info,27.32,0
2,Male,28.0,0,0,never,27.32,0
3,Female,36.0,0,0,current,23.45,0
4,Male,76.0,1,1,current,20.14,0


In [51]:
df["smoking_history"].value_counts()


smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64