In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset from diabetes.csv (resolved relative to the working directory)
# and preview its first 10 rows.
# NOTE(review): the preview shows 0 for BloodPressure, SkinThickness, Insulin and BMI
# in some rows — presumably placeholders for missing measurements; confirm before
# treating them as real values.
df = pd.read_csv("diabetes.csv")
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Pregnancies       768 non-null    int64  
 1   Glucose           768 non-null    int64  
 2   BloodPressure     768 non-null    int64  
 3   SkinThickness     768 non-null    int64  
 4   Insulin           768 non-null    int64  
 5   BMI               768 non-null    float64
 6   DiabetesPedigree  768 non-null    float64
 7   Age               768 non-null    int64  
 8   Outcome           768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Separate the label column from the predictors: X keeps every column except the
# label, y is the label column on its own.
target_column = "Outcome"
X = df.drop(columns=[target_column])
y = df[target_column]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Build a 50-tree random forest with a fixed seed and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

print("Model training completed.\n")

Model training completed.



In [7]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Per-class probabilities; column 1 is kept as the score used for ROC/AUC below.
# NOTE(review): this assumes column 1 corresponds to Outcome == 1 — confirm via model.classes_.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [8]:
# Side-by-side table of true labels and predictions, indexed like y_test.
y_test.to_frame(name="y_test").assign(y_pred=y_pred)

Unnamed: 0,y_test,y_pred
668,0,0
324,0,0
624,0,0
690,0,0
473,0,0
...,...,...
355,1,1
534,0,0
344,0,0
296,1,0


In [9]:
# Unpack the 2x2 confusion matrix into its four cells (flattened row by row).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Threshold-based metrics computed from the hard predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Specificity = TN / (TN + FP); computed by hand from the confusion-matrix cells.
specificity = tn / (tn + fp)

# ROC AUC is ranking-based, so it is computed from the probabilities, not the labels.
roc_auc = roc_auc_score(y_test, y_prob)

# One print call; sep="\n" puts each entry on its own line.
print(
    "Model Evaluation Metrics:",
    f"Accuracy      : {accuracy:.4f}",
    f"Precision     : {precision:.4f}",
    f"Recall        : {recall:.4f}",
    f"Specificity   : {specificity:.4f}",
    f"F1 Score      : {f1:.4f}",
    f"ROC AUC Score : {roc_auc:.4f}\n",
    sep="\n",
)

Model Evaluation Metrics:
Accuracy      : 0.7338
Precision     : 0.6250
Recall        : 0.6364
Specificity   : 0.7879
F1 Score      : 0.6306
ROC AUC Score : 0.8184



In [10]:
# Draw the confusion matrix as an annotated heat map and write it to confusion_matrix.png.
cm = confusion_matrix(y_test, y_pred)
labels = ['No Diabetes', 'Diabetes']
tick_marks = np.arange(len(labels))

plt.figure(figsize=(6, 5))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()

plt.xticks(tick_marks, labels, rotation=45)
plt.yticks(tick_marks, labels)

# Write each count into its cell (row i = actual, column j = predicted). Cells whose
# count exceeds half of the largest count get white text, the rest black.
for i, j in np.ndindex(2, 2):
    text_color = "white" if cm[i, j] > cm.max() / 2. else "black"
    plt.text(j, i, cm[i, j], horizontalalignment='center', color=text_color)

plt.ylabel("Actual")
plt.xlabel("Predicted")

plt.tight_layout()
plt.savefig("confusion_matrix.png")
# Close the figure so it is not rendered inline after being saved.
plt.close()

print("Saved: confusion_matrix.png")


# Plot the ROC curve for the held-out predictions and write it to roc_curve.png.
# The axes are created explicitly and handed to RocCurveDisplay: without `ax=`,
# from_predictions opens its own figure, so a figure made beforehand with
# plt.figure(figsize=...) stays empty, its size is ignored, and it is never closed.
fig, ax = plt.subplots(figsize=(6, 5))
RocCurveDisplay.from_predictions(y_test, y_prob, ax=ax)
ax.set_title("ROC Curve")
fig.savefig("roc_curve.png")
# Close exactly the figure that was drawn on so nothing is left open or rendered inline.
plt.close(fig)

print("Saved: roc_curve.png")


# Learning curve: accuracy as a function of the amount of training data.
# Ten training-set fractions from 10% to 100% are evaluated with 5-fold
# cross-validation on the full dataset, using all available cores.
train_sizes, train_scores, test_scores = learning_curve(
    model,
    X,
    y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Average over axis 1 (one column per cross-validation split) to get one value per size.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(train_sizes, train_mean, label="Training Accuracy")
ax.plot(train_sizes, test_mean, label="Validation Accuracy")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve (Random Forest)")
ax.legend()
ax.grid()
fig.savefig("learning_curve.png")
# Close the figure so it is not rendered inline after being saved.
plt.close(fig)

print("Saved: learning_curve.png")


# Validation curve: accuracy as a function of the number of trees (n_estimators).
# A fresh forest with the same fixed seed is scored with 5-fold cross-validation
# for every value in param_range.
param_range = [10, 50, 100, 150, 200, 300]

train_scores_v, test_scores_v = validation_curve(
    RandomForestClassifier(random_state=42),
    X,
    y,
    param_name="n_estimators",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
)

# Average over axis 1 (one column per cross-validation split) to get one value per setting.
train_mean_v = np.mean(train_scores_v, axis=1)
test_mean_v = np.mean(test_scores_v, axis=1)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(param_range, train_mean_v, label="Training Accuracy")
ax.plot(param_range, test_mean_v, label="Validation Accuracy")
ax.set_xlabel("n_estimators")
ax.set_ylabel("Accuracy")
ax.set_title("Validation Curve (Random Forest)")
ax.legend()
ax.grid()
fig.savefig("validation_curve.png")
# Close the figure so it is not rendered inline after being saved.
plt.close(fig)

print("Saved: validation_curve.png")

Saved: confusion_matrix.png
Saved: roc_curve.png
Saved: learning_curve.png
Saved: validation_curve.png


<Figure size 600x500 with 0 Axes>

In [11]:
# Persist the fitted forest to model.pkl in the working directory.
# Pickle files can execute arbitrary code when loaded, so only load this file
# back from a trusted location.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

print("\n Model saved as model.pkl")


 Model saved as model.pkl


In [18]:
# Show the held-out feature rows. An integer slice on a DataFrame is positional, not
# label-based, so ":296" means "the first 296 rows". The test split has only 154
# rows, so this returns all of X_test rather than "everything up to index label 296".
X_test.iloc[:296]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age
668,6,98,58,33,190,34.0,0.430,43
324,2,112,75,32,0,35.7,0.148,21
624,2,108,64,0,0,30.8,0.158,21
690,8,107,80,0,0,24.6,0.856,34
473,7,136,90,0,0,29.9,0.210,50
...,...,...,...,...,...,...,...,...
355,9,165,88,0,0,30.4,0.302,49
534,1,77,56,30,56,33.3,1.251,24
344,8,95,72,0,0,36.8,0.485,57
296,2,146,70,38,360,28.0,0.337,29


In [17]:
# Display the held-out labels (a Series indexed by the original row labels).
y_test

668    0
324    0
624    0
690    0
473    0
      ..
355    1
534    0
344    0
296    1
462    0
Name: Outcome, Length: 154, dtype: int64