In [None]:
from pathlib import Path
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, confusion_matrix

# Apply the "seaborn-v0_8" matplotlib style sheet to the figures drawn below.
plt.style.use("seaborn-v0_8")

In [None]:
# Locate the dataset relative to the parent of the current working directory,
# so no absolute, machine-specific path is hard-coded.
BASE_DIR = Path().resolve().parent
DATA_PATH = BASE_DIR / "data" / "diabetes.csv"
df = pd.read_csv(DATA_PATH)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be shown explicitly or it is silently discarded. `display` is injected
# by the IPython kernel and needs no import inside a notebook.
display(df.head())
df.describe()

In [None]:
# Columns in which a stored 0 is treated as a missing measurement
# (presumably because 0 is not a plausible reading for them -- confirm
# against the dataset documentation).
cols_with_zero = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
]

# Recode the placeholder zeros as NaN so they can be counted and imputed.
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)
missing_before = df.isnull().sum()

# Impute every affected column with its own median in one assignment.
# `df[col].fillna(..., inplace=True)` operates on a temporary column object:
# pandas 2.x emits a FutureWarning for it, and under Copy-on-Write it leaves
# `df` unchanged, so the NaNs would survive into model fitting.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values -- consider
# imputing with training-set medians only.
df[cols_with_zero] = df[cols_with_zero].fillna(df[cols_with_zero].median())

# A single table as the cell's last expression, so both counts are rendered.
pd.DataFrame(
    {
        "missing_before": missing_before,
        "missing_after": df.isnull().sum(),
    }
)

In [None]:
# Separate the predictors from the target column.
X = df.drop(columns="Outcome")
y = df["Outcome"]

# Hold out 30% of the rows for evaluation. Stratifying on the target keeps the
# class proportions the same in both partitions, so the per-class metrics
# computed on the test set are not distorted by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [None]:
# Learn the scaling parameters from the training rows only, then apply that
# same fitted transform to both partitions so the test set never influences
# the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Probability cutoff at or above which a sample is labelled as class 1.
threshold = 0.4

# Class-weighted logistic regression fitted on the scaled training data.
log_reg = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_train, y_train)

# Keep only the second probability column (index 1) for each test row.
class_probabilities = log_reg.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Turn probabilities into 0/1 labels using the custom cutoff.
above_cutoff = y_prob >= threshold
y_pred_thresh = above_cutoff.astype(int)

In [None]:
# ROC points and the area under the curve, both computed from the predicted
# probabilities on the test set.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})")
# Diagonal reference line drawn from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle="--", label="Random Guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

In [None]:
# Labels produced by the estimator's own `predict` rule. They are kept for
# reference; the report printed below does not use them.
y_pred = log_reg.predict(X_test)

# Per-class precision / recall / F1 for the custom-threshold predictions.
threshold_report = classification_report(y_test, y_pred_thresh)
print(threshold_report)

In [None]:
# Confusion matrix of the custom-threshold predictions against the true labels.
cm = confusion_matrix(y_test, y_pred_thresh)

predicted_names = ["Predicted No Diabetes", "Predicted Diabetes"]
actual_names = ["Actual No Diabetes", "Actual Diabetes"]

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=predicted_names,
    yticklabels=actual_names,
    ax=ax,
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()