In [20]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [15]:
# Load dataset and add column for patient ID
df = pd.read_csv("../data_raw/parkinsons.csv")
df['patient_id'] = df['name'].str[:12]

# Define features and target
X = df.drop(columns=["patient_id", "status", "name"]).values  # Feature matrix
y = df["status"].values  # Labels
patient_ids = df["patient_id"].values  # Patient grouping

In [22]:
# Use Stratified Group K-Fold (for imbalanced data)
sgkf = StratifiedGroupKFold(n_splits=5)  # 5-fold stratified CV

# Define models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
}

# Iterate over models to find best one
# Note: deep learning is not being tested due to small data size
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    accuracies = []
    auc_scores = []
    all_y_true = []
    all_y_pred = []
    
    for train_idx, test_idx in sgkf.split(X, y, groups=patient_ids):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Predict
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X_test)
        
        # Store metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_prob))
        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)
    
    # Print results
    print(f"{model_name} - Average Accuracy: {np.mean(accuracies):.4f}")
    print(f"{model_name} - Average AUC: {np.mean(auc_scores):.4f}")
    print("\nFinal Classification Report:")
    print(classification_report(all_y_true, all_y_pred))


Training RandomForest...
RandomForest - Average Accuracy: 0.7951
RandomForest - Average AUC: 0.8715

Final Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.38      0.47        48
           1       0.82      0.93      0.87       147

    accuracy                           0.79       195
   macro avg       0.72      0.65      0.67       195
weighted avg       0.77      0.79      0.77       195


Training SVM...
SVM - Average Accuracy: 0.8176
SVM - Average AUC: 0.7715

Final Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.27      0.42        48
           1       0.81      0.99      0.89       147

    accuracy                           0.82       195
   macro avg       0.87      0.63      0.65       195
weighted avg       0.84      0.82      0.77       195


Training XGBoost...
XGBoost - Average Accuracy: 0.7992
XGBoost - Average AUC: 0.8182

Final Classification Rep

In [26]:
# logistic regression performs the best

# Tune hyperparameters
# Define Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, solver="liblinear", random_state=42)

# Define hyperparameter grid
param_grid = {
    "C": np.logspace(-4, 4, 20),  # Regularization strength
    "penalty": ["l1", "l2"],  # Type of regularization
    "class_weight": ["balanced", None]
}

# Perform Randomized Search CV
random_search = RandomizedSearchCV(log_reg, param_grid, n_iter=20, scoring="accuracy", 
                                   cv=5, random_state=42, n_jobs=-1)
random_search.fit(X, y)

In [27]:
# Best model from hyperparameter tuning
best_log_reg = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Evaluate best model using Stratified Group K-Fold
accuracies = []
auc_scores = []
all_y_true = []
all_y_pred = []

for train_idx, test_idx in sgkf.split(X, y, groups=patient_ids):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # Train best model
    best_log_reg.fit(X_train, y_train)
    
    # Predict
    y_pred = best_log_reg.predict(X_test)
    y_prob = best_log_reg.predict_proba(X_test)[:, 1]
    
    # Store metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_prob))
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

# Print final results
print(f"Logistic Regression - Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Logistic Regression - Average AUC: {np.mean(auc_scores):.4f}")
print("\nFinal Classification Report:")
print(classification_report(all_y_true, all_y_pred))

Best Parameters: {'penalty': 'l2', 'class_weight': None, 'C': np.float64(0.23357214690901212)}
Logistic Regression - Average Accuracy: 0.8447
Logistic Regression - Average AUC: 0.8209

Final Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.40      0.55        48
           1       0.83      0.99      0.90       147

    accuracy                           0.84       195
   macro avg       0.87      0.69      0.73       195
weighted avg       0.85      0.84      0.82       195



In [28]:
best_log_reg.fit(X, y)  # Best model