In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
# Read the diabetes CSV and separate the predictors from the target column
df = pd.read_csv("dataset/diabetes.csv")

Y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Report how many rows fall into each outcome class
print("Class distribution:\n", Y.value_counts())


Class distribution:
 Outcome
0    500
1    268
Name: count, dtype: int64


In [4]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()
X, Y = tl.fit_resample(X, Y)  # Apply Tomek Links undersampling


In [5]:
# Split into 80% training and 20% testing, ensuring same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)


In [6]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [7]:
# Candidate classifiers, keyed by the label used in the printed report
models = dict([
    ("SVM", SVC(kernel="linear", C=1, random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
])

# Per-model metrics, filled in by the loop below
results = dict()

# Fit every candidate on the training split and score it on the test split
for name in models:
    model = models[name]
    print(f"\nTraining {name}...")

    # fit() returns the estimator itself, so fitting and predicting can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Metrics computed on the held-out predictions
    acc = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Keep everything needed later to pick and describe the best model
    results[name] = dict(zip(
        ("Accuracy", "Confusion Matrix", "Classification Report"),
        (acc, conf_matrix, report),
    ))

    # Short per-model summary
    print(f"{name} Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", conf_matrix)




Training SVM...


SVM Accuracy: 0.7832
Confusion Matrix:
 [[79 10]
 [21 33]]

Training Random Forest...
Random Forest Accuracy: 0.8182
Confusion Matrix:
 [[76 13]
 [13 41]]

Training Decision Tree...
Decision Tree Accuracy: 0.7483
Confusion Matrix:
 [[72 17]
 [19 35]]

Training Logistic Regression...
Logistic Regression Accuracy: 0.7972
Confusion Matrix:
 [[78 11]
 [18 36]]


In [8]:
# Pick the entry with the largest stored accuracy (first one wins on ties)
best_model, best_entry = max(results.items(), key=lambda item: item[1]["Accuracy"])
best_accuracy = best_entry["Accuracy"]

# Summarize the winner: name, accuracy, confusion matrix and per-class report
print(f"\n🚀 Best Model: {best_model}")
print(f"✅ Highest Accuracy: {best_accuracy:.4f}")
print(f"\n🔹 Confusion Matrix:\n{best_entry['Confusion Matrix']}")
print("\n🔹 Classification Report:")
report_table = pd.DataFrame(best_entry['Classification Report'])
print(report_table.transpose())



🚀 Best Model: Random Forest
✅ Highest Accuracy: 0.8182

🔹 Confusion Matrix:
[[76 13]
 [13 41]]

🔹 Classification Report:
              precision    recall  f1-score     support
0              0.853933  0.853933  0.853933   89.000000
1              0.759259  0.759259  0.759259   54.000000
accuracy       0.818182  0.818182  0.818182    0.818182
macro avg      0.806596  0.806596  0.806596  143.000000
weighted avg   0.818182  0.818182  0.818182  143.000000
