In [None]:
# Diabetes Risk Prediction

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('Diabetes_Dataset.csv')


In [5]:
# Display basic info about the datasets
df.head(), df.info(), df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies          

(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 0            6      148             72             35        0  33.6   
 1            1       85             66             29        0  26.6   
 2            8      183             64              0        0  23.3   
 3            1       89             66             23       94  28.1   
 4            0      137             40             35      168  43.1   
 
    DiabetesPedigreeFunction  Age  Outcome  
 0                     0.627   50        1  
 1                     0.351   31        0  
 2                     0.672   32        1  
 3                     0.167   21        0  
 4                     2.288   33        1  ,
 None,
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 0            6      148             72             35        0  33.6   
 1            1       85             66             29        0  26.6   
 2            8      183             64              0        0

In [7]:


# --- Diabetes Model ---

# Select features and target
diabetes_features = df[["Glucose", "BloodPressure", "BMI", "Age"]]
diabetes_target = df["Outcome"]

# Train-test split
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    diabetes_features, diabetes_target, test_size=0.2, random_state=42
)

# Scale the features
scaler_d = StandardScaler()
X_train_d_scaled = scaler_d.fit_transform(X_train_d)
X_test_d_scaled = scaler_d.transform(X_test_d)

# Train models
log_reg_d = LogisticRegression().fit(X_train_d_scaled, y_train_d)
knn_d = KNeighborsClassifier().fit(X_train_d_scaled, y_train_d)
xgb_d = GradientBoostingClassifier().fit(X_train_d_scaled, y_train_d)

# Evaluate
log_pred_d = log_reg_d.predict(X_test_d_scaled)
knn_pred_d = knn_d.predict(X_test_d_scaled)
xgb_pred_d = xgb_d.predict(X_test_d_scaled)

log_report = classification_report(y_test_d, log_pred_d, output_dict=True)
knn_report = classification_report(y_test_d, knn_pred_d, output_dict=True)
xgb_report = classification_report(y_test_d, xgb_pred_d, output_dict=True)

log_report["model"] = "Logistic Regression"
knn_report["model"] = "KNN"
xgb_report["model"] = "XGBoost"

# Summary reports
model_reports = [log_report, knn_report, xgb_report]
model_reports_summary = [
    {
        "Model": r["model"],
        "Accuracy": round(r["accuracy"], 3),
        "Precision (1)": round(r["1"]["precision"], 3),
        "Recall (1)": round(r["1"]["recall"], 3),
        "F1-score (1)": round(r["1"]["f1-score"], 3)
    }
    for r in model_reports
]

model_reports_summary


[{'Model': 'Logistic Regression',
  'Accuracy': 0.747,
  'Precision (1)': 0.648,
  'Recall (1)': 0.636,
  'F1-score (1)': 0.642},
 {'Model': 'KNN',
  'Accuracy': 0.721,
  'Precision (1)': 0.615,
  'Recall (1)': 0.582,
  'F1-score (1)': 0.598},
 {'Model': 'XGBoost',
  'Accuracy': 0.753,
  'Precision (1)': 0.644,
  'Recall (1)': 0.691,
  'F1-score (1)': 0.667}]

In [10]:
def interpret_risk(probability):
    """
    Converts predicted probability into risk category.
    """
    if probability >= 0.7:
        return "High Risk"
    elif probability >= 0.4:
        return "Medium Risk"
    else:
        return "Low Risk"


In [11]:
# Sample patient input
sample = [[130, 85, 32.0, 45]]  # [Glucose, BP, BMI, Age]
sample_scaled = scaler_d.transform(sample)

# Predict using best model (e.g., XGBoost)
prob = xgb_d.predict_proba(sample_scaled)[0][1]  # probability of class 1 (diabetic)
risk = interpret_risk(prob)

print(f"Predicted Probability of Diabetes: {round(prob, 2)}")
print(f"Risk Level: {risk}")


Predicted Probability of Diabetes: 0.74
Risk Level: High Risk


