In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [11]:
# Load the Pima Indians diabetes table; column labels are supplied here via
# `names`, and `header=None` makes explicit that row 0 of the file is data.
# NOTE(review): the head() preview shows 0 in SkinThickness / Insulin —
# presumably placeholders for missing measurements; confirm before modelling.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
column_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
    "BMI", "DiabetesPedigreeFunction", "Age", "Outcome",
]
data = pd.read_csv(url, header=None, names=column_names)

In [12]:
# Preview the first five rows to sanity-check the column labels.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
# Count rows per Outcome label to see how imbalanced the target is.
class_counts = data["Outcome"].value_counts()
print("\nClass Distribution:\n", class_counts)


Class Distribution:
 Outcome
0    500
1    268
Name: count, dtype: int64


In [14]:
# Target vector is the Outcome column; the feature matrix is every other column.
y = data["Outcome"]
X = data.drop(columns="Outcome")

In [15]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for repeatable splits.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

In [16]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits (the right-hand side is evaluated in
# full before either name is rebound).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [17]:
# Baseline classifier trained on the scaled, un-resampled training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [18]:
# Score the baseline classifier on the held-out split and print the
# confusion matrix followed by the per-class report.
y_pred = clf.predict(X_test)
print("\nBefore Oversampling:")
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


Before Oversampling:
[[85 15]
 [22 32]]
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       100
           1       0.68      0.59      0.63        54

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154



In [19]:
# Resample the training split only with SMOTE; the test split is left as is.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

In [20]:
# Same classifier configuration as the baseline, trained on the SMOTE-resampled data.
clf_smote = RandomForestClassifier(random_state=42)
clf_smote.fit(X=X_train_smote, y=y_train_smote)

In [21]:
# Score the SMOTE-trained classifier on the same held-out split and print
# the confusion matrix followed by the per-class report.
y_pred_smote = clf_smote.predict(X_test)
print("\nAfter Oversampling (SMOTE):")
print(
    confusion_matrix(y_test, y_pred_smote),
    classification_report(y_test, y_pred_smote),
    sep="\n",
)


After Oversampling (SMOTE):
[[74 26]
 [15 39]]
              precision    recall  f1-score   support

           0       0.83      0.74      0.78       100
           1       0.60      0.72      0.66        54

    accuracy                           0.73       154
   macro avg       0.72      0.73      0.72       154
weighted avg       0.75      0.73      0.74       154



In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def evaluate_model(y_true, y_pred):
    """Score predictions against ground truth with four summary metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict: Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1_score"``, each mapped to the result of the matching
        scikit-learn scorer called with its default arguments.
    """
    # (result key, scorer) pairs, evaluated in this order.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


# Predictions are recomputed here from the two fitted classifiers.
y_pred = clf.predict(X_test)
y_pred_smote = clf_smote.predict(X_test)

results_before = evaluate_model(y_test, y_pred)
results_after = evaluate_model(y_test, y_pred_smote)

# Side-by-side table: one left-aligned row per metric, four decimals each.
print("\nPerformance Metrics Comparison:")
print(f"{'Metric':<15}{'Before Oversampling':<25}{'After Oversampling':<25}")
for metric in results_before:
    row = f"{metric.capitalize():<15}{results_before[metric]:<25.4f}{results_after[metric]:<25.4f}"
    print(row)


Performance Metrics Comparison:
Metric         Before Oversampling      After Oversampling       
Accuracy       0.7597                   0.7338                   
Precision      0.6809                   0.6000                   
Recall         0.5926                   0.7222                   
F1_score       0.6337                   0.6555                   
