In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Load the raw table from disk; `data` is kept as the untouched source frame.
data = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")
# `df` is the working frame that the following cells transform.
df = pd.DataFrame(data=data)

In [3]:
# Remove the "Unnamed: 0" column (presumably a saved row index — confirm) and
# the "Ethnicity" column. The result is derived from the raw `data` frame
# rather than from `df` itself so that this cell is idempotent: re-running it
# no longer raises KeyError because the columns were already dropped.
df = data.drop(columns=["Unnamed: 0", "Ethnicity"])

In [4]:
# Inspect the working frame after the column drop; pandas shows only the
# first and last rows of a long frame, with "..." in between.
df

Unnamed: 0,Age,Sex,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45,Female,39.4,114.0,76.2,13.4,109,82,187.7,60.3,88.7,19.1,6.1,Low,3582,,Former,1,1
9996,41,Female,21.0,71.3,77.1,12.9,154,72,234.3,72.3,200.0,37.1,3.5,Low,2206,,Never,1,1
9997,50,Female,29.0,106.3,97.5,4.9,122,61,266.0,69.8,156.1,85.8,4.9,High,3175,Heavy,Former,1,1
9998,62,Female,27.3,119.9,89.0,11.5,99,115,172.3,74.2,110.9,25.3,5.2,High,3478,Moderate,Never,1,0


In [5]:
# Build a rule-based label: every condition that holds adds one point, and a
# row is labelled Diabetic (1) when it collects at least 4 points.

# Numeric measurements score a point when they meet or exceed their cutoff.
threshold_rules = {
    'HbA1c': 6.5,
    'Fasting_Blood_Glucose': 126,
    'BMI': 30,
    'Waist_Circumference': 100,
    'Blood_Pressure_Systolic': 130,
    'Blood_Pressure_Diastolic': 85,
}

# Seventh condition: female with a recorded previous gestational diabetes.
gestational_flag = (df['Sex'] == 'Female') & (df['Previous_Gestational_Diabetes'] == 1)

# Accumulate the per-row point total (an integer from 0 to 7).
risk_conditions = gestational_flag.astype(int)
for column, cutoff in threshold_rules.items():
    risk_conditions = risk_conditions + (df[column] >= cutoff).astype(int)

df['Diabetic'] = (risk_conditions >= 4).astype(int)

In [6]:
# Percentage of rows whose Alcohol_Consumption value is missing.
alcohol_is_missing = df['Alcohol_Consumption'].isna()
missing_percentage = 100 * alcohol_is_missing.mean()
print(f"Missing: {missing_percentage:.2f}%")

Missing: 33.20%


In [7]:
# Replace missing Alcohol_Consumption entries with an explicit 'Unknown'
# category so the column has no NaN values before it is encoded.
# NOTE(review): recent pandas versions parse the literal text "None" as NaN by
# default in read_csv. If the CSV records non-drinkers as "None", the values
# filled here are a real category rather than unknowns — confirm against the
# raw file (see read_csv's keep_default_na / na_values).
alcohol = df['Alcohol_Consumption']
df['Alcohol_Consumption'] = alcohol.fillna(value='Unknown')

In [8]:
# Separate the target vector from the feature matrix.
# NOTE(review): 'Diabetic' was computed in this notebook from HbA1c,
# Fasting_Blood_Glucose, BMI, Waist_Circumference, both blood-pressure columns,
# Sex and Previous_Gestational_Diabetes, and all of those columns remain in X.
# A model fitted on X can therefore reconstruct the labelling rule directly —
# confirm this is intended before interpreting any scores.
y = df['Diabetic']
X = df.drop('Diabetic', axis=1)

In [9]:
# Show the feature matrix as a bare last expression instead of print(X): the
# notebook then renders a rich table rather than column-wrapped plain text.
X

      Age     Sex   BMI  Waist_Circumference  Fasting_Blood_Glucose  HbA1c  \
0      58  Female  35.8                 83.4                  123.9   10.9   
1      48    Male  24.1                 71.4                  183.7   12.8   
2      34  Female  25.0                113.8                  142.0   14.5   
3      62    Male  32.7                100.4                  167.4    8.8   
4      27  Female  33.5                110.8                  146.4    7.1   
...   ...     ...   ...                  ...                    ...    ...   
9995   45  Female  39.4                114.0                   76.2   13.4   
9996   41  Female  21.0                 71.3                   77.1   12.9   
9997   50  Female  29.0                106.3                   97.5    4.9   
9998   62  Female  27.3                119.9                   89.0   11.5   
9999   29    Male  20.6                102.0                   70.8   14.5   

      Blood_Pressure_Systolic  Blood_Pressure_Diastolic  Choles

In [10]:
# One-hot encode the non-numeric feature columns; drop_first omits the first
# level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)

In [11]:
# Hold out 40% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.4,
)

In [12]:
# Fit a 100-tree random forest on the training split; random_state is fixed
# so repeated runs build the same forest.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X=X_train, y=y_train)

In [13]:
# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test)
# Per-class probabilities; column index 1 is kept as the score for ROC AUC.
# With 0/1 labels this is presumably the probability of class 1 (columns
# follow model.classes_) — confirm if the label set ever changes.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [14]:
# Report held-out performance: confusion matrix, per-class precision/recall,
# and ROC AUC computed from the index-1 probability column.
# NOTE(review): y is a threshold rule over columns that are also in X, so
# near-perfect scores mainly show the forest recovering that rule; they are
# not evidence of predictive power on an independent outcome — confirm intent.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC Score: {roc_auc:.4f}")

Confusion Matrix:
[[1812   10]
 [  52 2126]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1822
           1       1.00      0.98      0.99      2178

    accuracy                           0.98      4000
   macro avg       0.98      0.99      0.98      4000
weighted avg       0.98      0.98      0.98      4000

ROC AUC Score: 0.9993
