In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
# Read the dataset from a CSV file located in the working directory.
DATA_FILE = "diabetes.csv"
df = pd.read_csv(DATA_FILE)

# Show the first five rows as a quick check that the columns parsed correctly.
df.head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Columns in which a value of 0 is treated as "not recorded" rather than as a
# real measurement.
cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Turn the 0 placeholders into NaN so they can be removed as missing values.
df[cols] = df[cols].replace(to_replace=0, value=np.nan)

# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')


In [4]:
# Predictor columns fed to every model, and the binary label column.
feature_columns = [
    'Glucose',
    'BMI',
    'Age',
    'Pregnancies',
    'Insulin',
    'BloodPressure',
]
target_column = 'Outcome'

X = df[feature_columns]
y = df[target_column]


In [5]:


# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [8]:

# --- Logistic regression on standardised features ---------------------------
# LogisticRegression, StandardScaler and accuracy_score come from the import
# cell at the top of the notebook; they are not re-imported here.

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_model = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',   # reweight classes inversely to their frequency
    C=0.5,                     # stronger regularization
    solver='liblinear'
)
lr_model.fit(X_train_scaled, y_train)

# Predictions on both splits; later cells reuse these arrays for the summary table.
lr_train_pred = lr_model.predict(X_train_scaled)
lr_test_pred = lr_model.predict(X_test_scaled)

print("Logistic Regression Train Accuracy:",
      accuracy_score(y_train, lr_train_pred))
print("Logistic Regression Test Accuracy:",
      accuracy_score(y_test, lr_test_pred))




Logistic Regression Train Accuracy: 0.7635782747603834
Logistic Regression Test Accuracy: 0.7974683544303798


In [9]:
# --- RBF-kernel support vector machine --------------------------------------
# SVC comes from the import cell at the top of the notebook.
# Trained on the standardised matrices (X_train_scaled / X_test_scaled) that
# the logistic-regression cell produced.
svm_model = SVC(
    kernel='rbf',
    C=1.5,
    gamma='scale',
    class_weight='balanced'    # reweight classes inversely to their frequency
)
svm_model.fit(X_train_scaled, y_train)

# Predictions on both splits; later cells reuse these arrays for the summary table.
svm_train_pred = svm_model.predict(X_train_scaled)
svm_test_pred = svm_model.predict(X_test_scaled)

print("SVM Train Accuracy:", accuracy_score(y_train, svm_train_pred))
print("SVM Test Accuracy:", accuracy_score(y_test, svm_test_pred))


SVM Train Accuracy: 0.8306709265175719
SVM Test Accuracy: 0.759493670886076


In [10]:
# --- Gradient boosting ------------------------------------------------------
# GradientBoostingClassifier comes from the import cell at the top of the notebook.
# Fitted on the unscaled feature frames (X_train / X_test).
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=2,
    min_samples_leaf=15,
    subsample=0.75,            # each tree sees a random 75% of the training rows
    random_state=42            # fixed seed so the subsampling is repeatable
)
gb_model.fit(X_train, y_train)

# Predictions on both splits; later cells reuse these arrays for the summary table.
gb_train_pred = gb_model.predict(X_train)
gb_test_pred = gb_model.predict(X_test)

print("Gradient Boosting Train Accuracy:",
      accuracy_score(y_train, gb_train_pred))
print("Gradient Boosting Test Accuracy:",
      accuracy_score(y_test, gb_test_pred))


Gradient Boosting Train Accuracy: 0.8594249201277955
Gradient Boosting Test Accuracy: 0.8354430379746836


In [11]:
# --- Random forest ----------------------------------------------------------
# RandomForestClassifier comes from the import cell at the top of the notebook.
# Fitted on the unscaled feature frames (X_train / X_test).
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=12,
    class_weight='balanced',   # reweight classes inversely to their frequency
    random_state=42            # fixed seed so the forest is repeatable
)
rf_model.fit(X_train, y_train)

# Predictions on both splits; later cells reuse these arrays for the summary table.
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)

print("Random Forest Train Accuracy:",
      accuracy_score(y_train, rf_train_pred))
print("Random Forest Test Accuracy:",
      accuracy_score(y_test, rf_test_pred))

Random Forest Train Accuracy: 0.8306709265175719
Random Forest Test Accuracy: 0.810126582278481


In [12]:
# One entry per fitted model: (display name, train predictions, test predictions).
model_predictions = [
    ("Logistic Regression", lr_train_pred, lr_test_pred),
    ("SVM", svm_train_pred, svm_test_pred),
    ("Gradient Boosting", gb_train_pred, gb_test_pred),
    ("Random Forest", rf_train_pred, rf_test_pred),
]

# Side-by-side train/test accuracy, one row per model in the order listed above.
results = pd.DataFrame({
    "Model": [label for label, _, _ in model_predictions],
    "Train Accuracy": [
        accuracy_score(y_train, train_pred)
        for _, train_pred, _ in model_predictions
    ],
    "Test Accuracy": [
        accuracy_score(y_test, test_pred)
        for _, _, test_pred in model_predictions
    ],
})

results


Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Logistic Regression,0.763578,0.797468
1,SVM,0.830671,0.759494
2,Gradient Boosting,0.859425,0.835443
3,Random Forest,0.830671,0.810127


In [13]:
# accuracy_score is already imported in the notebook's first cell, so it is
# not imported again here.

# Test-set predictions keyed by model display name.
test_predictions = {
    "Logistic Regression": lr_test_pred,
    "SVM": svm_test_pred,
    "Gradient Boosting": gb_test_pred,
    "Random Forest": rf_test_pred,
}

# Create dictionary of test accuracies
test_accuracy_dict = {
    model_name: accuracy_score(y_test, predictions)
    for model_name, predictions in test_predictions.items()
}

# Select best model based on highest test accuracy. On a tie, the model listed
# first wins, because max() returns the first maximal entry.
# NOTE(review): the winner is chosen on the same test split that is then
# reported as its score, so "Best Test Accuracy" is an optimistic estimate.
# Consider choosing the model by cross-validation on the training split instead.
best_model_name, best_model_accuracy = max(
    test_accuracy_dict.items(), key=lambda entry: entry[1]
)

print("Best Algorithm Selected:", best_model_name)
print("Best Test Accuracy:", best_model_accuracy)

Best Algorithm Selected: Gradient Boosting
Best Test Accuracy: 0.8354430379746836


In [14]:
def glucose_status(value):
    """Map a glucose reading to a status label.

    The bands are contiguous, so a fractional reading such as 125.5 is
    classified as "Pre-Diabetes" instead of falling through to "Normal".

    Args:
        value: Glucose reading as a number (presumably mg/dL -- confirm
            against the data source).

    Returns:
        "Diabetes (High)" for value >= 126, "Pre-Diabetes" for
        100 <= value < 126, otherwise "Normal". NaN fails every comparison
        and therefore also yields "Normal".
    """
    if value >= 126:
        return "Diabetes (High)"
    if value >= 100:
        return "Pre-Diabetes"
    return "Normal"


def bp_status(value):
    """Map a blood-pressure reading to a status label.

    The bands are contiguous, so a fractional reading such as 89.5 is
    classified as "Elevated BP" instead of falling through to "Normal BP".

    Args:
        value: Blood-pressure reading as a number (presumably diastolic
            mm Hg -- confirm against the data source).

    Returns:
        "High BP" for value >= 90, "Elevated BP" for 80 <= value < 90,
        otherwise "Normal BP". NaN fails every comparison and therefore
        also yields "Normal BP".
    """
    if value >= 90:
        return "High BP"
    if value >= 80:
        return "Elevated BP"
    return "Normal BP"


def bmi_status(value):
    """Map a body-mass-index value to a weight category.

    The bands are contiguous half-open intervals, so fractional values such
    as 24.95 or 29.95 land in the band below the next threshold instead of
    falling through to "Underweight".

    Args:
        value: BMI as a number (presumably kg/m^2 -- confirm against the
            data source).

    Returns:
        "Obese" for value >= 30, "Overweight" for 25 <= value < 30,
        "Normal" for 18.5 <= value < 25, otherwise "Underweight". NaN fails
        every comparison and therefore also yields "Underweight".
    """
    if value >= 30:
        return "Obese"
    if value >= 25:
        return "Overweight"
    if value >= 18.5:
        return "Normal"
    return "Underweight"


def insulin_status(value):
    """Map an insulin reading to "High", "Normal" or "Low".

    Args:
        value: Insulin reading as a number (units are not stated in this
            notebook -- confirm against the data source).

    Returns:
        "High" when value > 25, "Normal" when 2 <= value <= 25,
        and "Low" in every other case.
    """
    # Guard clause for the upper band; whatever remains is Normal or Low.
    if value > 25:
        return "High"
    return "Normal" if 2 <= value <= 25 else "Low"


def pregnancy_status(value):
    """Map a pregnancy count to a risk label.

    Args:
        value: Number of pregnancies.

    Returns:
        "High Risk" when value > 5, "Moderate Risk" when 3 <= value <= 5,
        and "Low Risk" in every other case.
    """
    # Guard clause for the top band; whatever remains is Moderate or Low.
    if value > 5:
        return "High Risk"
    return "Moderate Risk" if 3 <= value <= 5 else "Low Risk"


In [15]:
# New label column -> (source measurement column, classifier applied to each value).
status_rules = {
    'Glucose_Status': ('Glucose', glucose_status),
    'BP_Status': ('BloodPressure', bp_status),
    'BMI_Status': ('BMI', bmi_status),
    'Insulin_Status': ('Insulin', insulin_status),
    'Pregnancy_Status': ('Pregnancies', pregnancy_status),
}

# Add each label column to df, in the order listed above.
for status_column, (source_column, classify) in status_rules.items():
    df[status_column] = df[source_column].apply(classify)


In [16]:
# Show each measurement next to its derived label for the first five rows.
measurement_and_label = [
    'Glucose', 'Glucose_Status',
    'BloodPressure', 'BP_Status',
    'BMI', 'BMI_Status',
    'Insulin', 'Insulin_Status',
    'Pregnancies', 'Pregnancy_Status',
]
df[measurement_and_label].head()


Unnamed: 0,Glucose,Glucose_Status,BloodPressure,BP_Status,BMI,BMI_Status,Insulin,Insulin_Status,Pregnancies,Pregnancy_Status
3,89.0,Normal,66.0,Normal BP,28.1,Overweight,94.0,High,1,Low Risk
4,137.0,Diabetes (High),40.0,Normal BP,43.1,Obese,168.0,High,0,Low Risk
6,78.0,Normal,50.0,Normal BP,31.0,Obese,88.0,High,3,Moderate Risk
8,197.0,Diabetes (High),70.0,Normal BP,30.5,Obese,543.0,High,2,Low Risk
13,189.0,Diabetes (High),60.0,Normal BP,30.1,Obese,846.0,High,1,Low Risk
