In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load dataset
df = pd.read_csv('dataset/parkinsons.csv')

X = df.drop(columns=['name' ,'status'], axis=1)
Y = df['status']

# Check class distribution
print("Class distribution:\n", Y.value_counts())

from imblearn.under_sampling import TomekLinks

tl = TomekLinks()
X, Y = tl.fit_resample(X, Y)  # Apply Tomek Links undersampling

# Split into 80% training and 20% testing, ensuring same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "SVM": SVC(kernel="linear", C=1, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42)
}

# Dictionary to store results
results = {}

# Train and Evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate accuracy and confusion matrix
    acc = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Store results
    results[name] = {
        "Accuracy": acc,
        "Confusion Matrix": conf_matrix,
        "Classification Report": classification_report(y_test, y_pred, output_dict=True)
    }
    
    # Print model results
    print(f"{name} Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", conf_matrix)

# Identify the model with the highest accuracy
best_model = max(results, key=lambda x: results[x]["Accuracy"])
best_accuracy = results[best_model]["Accuracy"]

# Print the best model details
print(f"\n🚀 Best Model: {best_model}")
print(f"✅ Highest Accuracy: {best_accuracy:.4f}")
print(f"\n🔹 Confusion Matrix:\n{results[best_model]['Confusion Matrix']}")
print("\n🔹 Classification Report:")
print(pd.DataFrame(results[best_model]['Classification Report']).transpose())






Class distribution:
 status
1    147
0     48
Name: count, dtype: int64

Training SVM...
SVM Accuracy: 0.9211
Confusion Matrix:
 [[ 8  2]
 [ 1 27]]

Training Random Forest...
Random Forest Accuracy: 0.9211
Confusion Matrix:
 [[ 9  1]
 [ 2 26]]

Training Decision Tree...
Decision Tree Accuracy: 0.8947
Confusion Matrix:
 [[ 9  1]
 [ 3 25]]

Training Logistic Regression...
Logistic Regression Accuracy: 0.9211
Confusion Matrix:
 [[ 8  2]
 [ 1 27]]

🚀 Best Model: SVM
✅ Highest Accuracy: 0.9211

🔹 Confusion Matrix:
[[ 8  2]
 [ 1 27]]

🔹 Classification Report:
              precision    recall  f1-score    support
0              0.888889  0.800000  0.842105  10.000000
1              0.931034  0.964286  0.947368  28.000000
accuracy       0.921053  0.921053  0.921053   0.921053
macro avg      0.909962  0.882143  0.894737  38.000000
weighted avg   0.919944  0.921053  0.919668  38.000000
