In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

In [27]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


In [28]:
df.shape

(10000, 20)

In [29]:
df.isnull().sum().sort_values(ascending=False)

Alcohol_Consumption              3320
Age                                 0
Ethnicity                           0
Sex                                 0
BMI                                 0
Waist_Circumference                 0
HbA1c                               0
Fasting_Blood_Glucose               0
Blood_Pressure_Diastolic            0
Cholesterol_Total                   0
Cholesterol_HDL                     0
Blood_Pressure_Systolic             0
Cholesterol_LDL                     0
GGT                                 0
Physical_Activity_Level             0
Serum_Urate                         0
Dietary_Intake_Calories             0
Smoking_Status                      0
Family_History_of_Diabetes          0
Previous_Gestational_Diabetes       0
dtype: int64

In [30]:
df.drop(columns=['Alcohol_Consumption'], axis=1, inplace=True)


In [None]:
df['Diabetes_Risk'] = 0  
df.loc[(df['Fasting_Blood_Glucose'] >= 126) | (df['HbA1c'] >= 6.5), 'Diabetes_Risk'] = 1


In [35]:
X = df.drop("Diabetes_Risk", axis=1)
y = df["Diabetes_Risk"]

categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(exclude=["object"]).columns

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ]
)


model = Pipeline([
    ("preprocess", preprocess),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(cm)

Accuracy: 1.0
Confusion Matrix:
[[ 181    0]
 [   0 1819]]


In [36]:
joblib.dump(model, "diabetes_model.pkl")
print("Model saved as diabetes_model.pkl")

Model saved as diabetes_model.pkl
