In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Feature matrix: every column except the last one, as a NumPy array.
# NOTE(review): this array is extracted before the zero-value cleaning cells
# further down, so it reflects the frame as it was loaded.
X = df.iloc[:, :-1].to_numpy()

In [None]:
# Target vector: the last column of the frame, as a NumPy array.
y = df.iloc[:, -1].to_numpy()

In [None]:
# Count missing (NaN) entries per column in the frame as loaded.
df.isna().sum()

In [None]:
# Count how many entries in each column are exactly zero.
df.eq(0).sum()


In [None]:
# Columns in which a literal 0 is treated as a missing reading.
cols_with_zero = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
# Turn those zeros into NaN so they can be imputed afterwards.
df[cols_with_zero] = df[cols_with_zero].replace(to_replace=0, value=np.nan)

In [None]:
# Missing-value count per column after zeros were converted to NaN.
df.isna().sum()

In [None]:
# Fill the NaNs in each listed column with that column's own median.
# DataFrame.fillna aligns the Series of medians to the columns by name.
df[cols_with_zero] = df[cols_with_zero].fillna(df[cols_with_zero].median())

In [None]:
# Re-check the per-column missing-value count after imputation.
df.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the feature matrix and target from `df` *after* the zero -> NaN -> median
# cleaning cells above. The X / y arrays created near the top of the notebook were
# extracted before that cleaning ran, so splitting them would train and evaluate
# the model on the uncleaned values and the imputation would have no effect.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# NOTE(review): the medians used for imputation were computed over the whole frame,
# i.e. before this split, so test rows contributed to them.

# 80/20 split with a fixed seed; stratify=y is passed so the split is stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training partition only, then apply the fitted scaler
# to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights, a fixed seed and a raised
# iteration cap.
classifier = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Train on the scaled training partition; the fitted estimator is the cell output.
classifier.fit(X_train, y_train)

In [None]:
# Default-threshold class predictions for the test partition.
y_pred = classifier.predict(X_test)
# Print predicted and actual labels side by side: column 0 = predicted, column 1 = actual.
print(np.column_stack((y_pred, y_test)))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Confusion matrix (rows = actual, columns = predicted) for the default predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Overall accuracy is the cell's displayed value.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# lower threshold
threshold = 0.4

# Probability of the positive class (column 1) for every test row.
y_prob = classifier.predict_proba(X_test)[:, 1]

# Label a row positive when its probability reaches the threshold.
# This overwrites the default-threshold y_pred from the earlier cell.
y_pred = np.greater_equal(y_prob, threshold).astype(int)

In [None]:
from sklearn.metrics import recall_score

# Recall of the threshold-adjusted predictions.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Same two metrics as before, now computed for the threshold-adjusted y_pred.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import classification_report

# Text report (the output of classification_report) for the current y_pred.
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
import numpy as np

def predict_diabetes_with_confidence(model, scaler, user_input,
                                     low_threshold=0.4, high_threshold=0.7):
    """
    Classify a single record into a low / uncertain / high diabetes-risk band.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is read as the probability of the diabetic class.
    scaler : object
        Fitted transformer exposing ``transform``. It should be the scaler
        that was fitted on the training features.
    user_input : list or array
        Feature values for ONE record (8 features in this notebook); the
        order must match the training data.
    low_threshold : float, default 0.4
        Probabilities strictly below this value are reported as low risk.
    high_threshold : float, default 0.7
        Probabilities strictly above this value are reported as high risk.
        Values in ``[low_threshold, high_threshold]`` are reported as uncertain.

    Returns
    -------
    dict
        ``"prediction"`` (risk-band label), ``"confidence"`` and ``"message"``.
        Note that ``"confidence"`` is the predicted probability of the
        diabetic class formatted as a percentage, not the confidence in the
        returned label: a low-risk result therefore carries a low percentage.

    Raises
    ------
    ValueError
        If the thresholds are not ordered within [0, 1], or if the scaler
        exposes ``n_features_in_`` and the input has a different number of
        values.
    """
    if not 0.0 <= low_threshold <= high_threshold <= 1.0:
        raise ValueError(
            "Thresholds must satisfy 0 <= low_threshold <= high_threshold <= 1."
        )

    # Convert to a single-row 2-D array, the shape transform/predict_proba expect.
    features = np.array(user_input).reshape(1, -1)

    # Fail early with a readable message when the feature count is wrong.
    # Scalers without n_features_in_ are passed through unchecked.
    expected_count = getattr(scaler, "n_features_in_", None)
    if expected_count is not None and features.shape[1] != expected_count:
        raise ValueError(
            f"Expected {expected_count} feature values, got {features.shape[1]}."
        )

    # Scale input
    features_scaled = scaler.transform(features)

    # Predict probability of the positive (diabetic) class
    prob_diabetes = model.predict_proba(features_scaled)[0][1]
    confidence = prob_diabetes * 100

    # Decision logic: three bands delimited by the two thresholds
    if prob_diabetes < low_threshold:
        result = "Not Diabetic (Low Risk)"
        message = "The model predicts a low risk of diabetes."

    elif prob_diabetes <= high_threshold:
        result = "Uncertain Result"
        message = (
            "The prediction is uncertain. "
            "You may or may not have diabetes. "
            "Please consult a doctor for accurate diagnosis."
        )

    else:
        result = "Likely Diabetic (High Risk)"
        message = (
            "The model predicts a high risk of diabetes. "
            "Please consult a doctor for confirmation and treatment."
        )

    return {
        "prediction": result,
        "confidence": f"{confidence:.2f}%",
        "message": message
    }


In [None]:
# Hand-written example records, eight values each, in training-column order.
sample_input = [2, 130, 70, 25, 120, 30.5, 0.5, 45]    # intended as a moderate-risk example
sample_input2 = [6, 180, 90, 35, 250, 38.0, 1.2, 55]   # intended as a high-risk example
sample_input3 = [0, 95, 70, 20, 85, 22.5, 0.3, 25]     # intended as a low-risk example

# Score the third example with the fitted classifier and scaler.
output = predict_diabetes_with_confidence(classifier, sc, sample_input3)

print(output)


In [None]:
import seaborn as sns

# Confusion matrix of the current y_pred, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
heat_ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# Label the axes on the Axes object seaborn returns.
heat_ax.set_xlabel("Predicted")
heat_ax.set_ylabel("Actual")
plt.show()


In [None]:
# roc_auc_score was never imported in this notebook, so this cell raised
# NameError on a fresh kernel; import it here like the other metric cells do.
from sklearn.metrics import roc_auc_score

# Computed from the positive-class probabilities (y_prob), not the thresholded labels.
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", roc_auc)

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates from roc_curve; its third return value is discarded.
fpr, tpr, _ = roc_curve(y_test, y_prob)

roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal from (0, 0) to (1, 1) for comparison with the curve.
roc_ax.plot([0, 1], [0, 1], linestyle="--")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
plt.show()