In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

In [4]:
! pip list

Package                   Version
------------------------- ---------------
anyio                     4.2.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.14.0
beautifulsoup4            4.12.3
bleach                    6.1.0
Bottleneck                1.3.5
Brotli                    1.0.9
cached-property           1.5.2
certifi                   2023.11.17
cffi                      1.16.0
charset-normalizer        3.3.2
colorama                  0.4.6
comm                      0.2.1
contourpy                 1.2.0
cycler                    0.11.0
debugpy                   1.6.7
decorator                 5.1.1
defusedxml                0.7.1
entrypoints               0.4
exceptiongroup            1.2.0
executing                 2.0.1
fastjsonschema            2.19.1
fonttools                 4.25.0
fqdn            

In [5]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

# Load the dataset from Kaggle
# https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset
df = pd.read_csv("./diabetes_prediction_dataset.csv")

# Display the first 5 rows of the dataset
print(df.head())
df["smoking_history"].value_counts()
# update not current and ever to former
df["smoking_history"] = df["smoking_history"].replace("ever", "former")
df["smoking_history"] = df["smoking_history"].replace("not current", "former")
df["smoking_history"].value_counts()


   gender   age  hypertension  heart_disease smoking_history   bmi  \
0  Female 80.00             0              1           never 25.19   
1  Female 54.00             0              0         No Info 27.32   
2    Male 28.00             0              0           never 27.32   
3  Female 36.00             0              0         current 23.45   
4    Male 76.00             1              1         current 20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0         6.60                  140         0  
1         6.60                   80         0  
2         5.70                  158         0  
3         5.00                  155         0  
4         4.80                  155         0  


smoking_history
No Info    35816
never      35095
former     19803
current     9286
Name: count, dtype: int64

In [6]:
# Drop the blood test features
df = df.drop(columns=["HbA1c_level", "blood_glucose_level"])
# Split the data into the features and the target
y = df["diabetes"]
X = df.drop(columns=["diabetes"])
# Encode the categorical columns
ohe = OneHotEncoder(sparse_output=False, drop="if_binary", dtype=int)
X_encoded = ohe.fit_transform(X)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Save the encoder, scaler, and model to pickle files
with open("encoder.pkl", "wb") as f:
    pickle.dump(ohe, f)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
with open("diabetes_model.pkl", "wb") as f:
    pickle.dump(model, f)

Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     18292
           1       0.40      0.04      0.07      1708

    accuracy                           0.91     20000
   macro avg       0.66      0.52      0.51     20000
weighted avg       0.87      0.91      0.88     20000

[[18195    97]
 [ 1643    65]]


In [7]:
import pandas as pd

age = 35
gender = "Male"
bmi = 23.0
smoking_history = "never"
hypertension = 0
heart_disease = 0
# gender   age  hypertension  heart_disease smoking_history   bmi
data = {"gender": gender, "age": age, "hypertension": hypertension, "heart_disease": heart_disease, "smoking_history": smoking_history, "bmi": bmi}
df = pd.DataFrame(data, index=[0])
with open("encoder.pkl", "rb") as f:
    ohe = pickle.load(f)
df = ohe.transform(df)
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
df = scaler.transform(df)
with open("diabetes_model.pkl", "rb") as f:
    model = pickle.load(f)
prediction = model.predict(df)
if prediction[0] == 0:
    print("Prediction: No diabetes")
else:   
    print("Prediction: Diabetes")

Prediction: No diabetes
