In [1]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv('data/online_data.csv')

In [None]:
X = df.drop(labels=['Outputs'], axis=1)
y = df[['Outputs']]

Unnamed: 0,Outputs
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


In [10]:
num_standard = ['Glucose', 'BloodPressure', 'BMI']
num_robust = ['Pregnancies', 'SkinThickness', 'Insulin', 'Age']


In [11]:
zero_invalid_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_invalid_cols] = df[zero_invalid_cols].replace(0, np.nan)


In [None]:
standard_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
robust_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('std', standard_pipeline, num_standard),
        ('rob', robust_pipeline, num_robust)
    ]
)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(
    X_train_processed, y_train
)

In [33]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_res, y_train_res)


  y = column_or_1d(y, warn=True)


In [34]:
y_test_pred = model.predict(X_test_processed)


In [35]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Accuracy: 0.7142857142857143

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.73      0.77       100
           1       0.58      0.69      0.63        54

    accuracy                           0.71       154
   macro avg       0.69      0.71      0.70       154
weighted avg       0.73      0.71      0.72       154


Confusion Matrix:
 [[73 27]
 [17 37]]


In [36]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

def evaluate_classifier(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')
    cm = confusion_matrix(y_true, y_pred)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm
    }

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [39]:
models = {
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        random_state=42
    ),
    "SVM": SVC(
        kernel="rbf",
        probability=True,
        class_weight="balanced",
        random_state=42
    )
}


In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_classifier(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred)
    }


In [41]:
model_list = []
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for model_name, model in models.items():
    
    # Train
    model.fit(X_train_res, y_train_res)

    # Predict on test data (NO SMOTE here)
    y_pred = model.predict(X_test_processed)

    # Evaluate
    metrics = evaluate_classifier(y_test, y_pred)

    print(model_name)
    print("Accuracy :", metrics["accuracy"])
    print("Precision:", metrics["precision"])
    print("Recall   :", metrics["recall"])
    print("F1-score :", metrics["f1"])
    print("=" * 40)

    model_list.append(model_name)
    accuracy_list.append(metrics["accuracy"])
    precision_list.append(metrics["precision"])
    recall_list.append(metrics["recall"])
    f1_list.append(metrics["f1"])


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


LogisticRegression
Accuracy : 0.7142857142857143
Precision: 0.578125
Recall   : 0.6851851851851852
F1-score : 0.6271186440677966
DecisionTree
Accuracy : 0.7207792207792207
Precision: 0.5901639344262295
Recall   : 0.6666666666666666
F1-score : 0.6260869565217392
RandomForest
Accuracy : 0.7597402597402597
Precision: 0.639344262295082
Recall   : 0.7222222222222222
F1-score : 0.6782608695652174
SVM
Accuracy : 0.7077922077922078
Precision: 0.5652173913043478
Recall   : 0.7222222222222222
F1-score : 0.6341463414634146


  y = column_or_1d(y, warn=True)


In [42]:
import pandas as pd

model_comparison = pd.DataFrame({
    "Model": model_list,
    "Accuracy": accuracy_list,
    "Precision": precision_list,
    "Recall": recall_list,
    "F1-score": f1_list
})

model_comparison.sort_values(by="Recall", ascending=False)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
2,RandomForest,0.75974,0.639344,0.722222,0.678261
3,SVM,0.707792,0.565217,0.722222,0.634146
0,LogisticRegression,0.714286,0.578125,0.685185,0.627119
1,DecisionTree,0.720779,0.590164,0.666667,0.626087
