In [None]:
# -*- coding: utf-8 -*-
"""
Improved Parkinson's Disease Detection Model
XGBoost + 20 Features + SMOTE+ENN with Proper Aggregation
Eliminates data leakage through speaker-aware aggregation and splitting
"""

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, mean_squared_error
)
from collections import Counter
from imblearn.combine import SMOTEENN
import shap
import kagglehub

# Download latest version
# `path` holds whatever kagglehub.dataset_download returns for this dataset
# handle; it is printed below so the location of the files can be inspected.
path = kagglehub.dataset_download("dipayanbiswas/parkinsons-disease-speech-signal-features")

print("Path to dataset files:", path)

# Set random seed for reproducibility
# RANDOM_STATE is passed as `random_state` to the splitter, the SMOTEENN
# resampler and the XGBoost classifier defined in this file; it is also used
# to seed NumPy's global random number generator here.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def load_and_aggregate_data(file_path):
    """Read the CSV at *file_path* and collapse it to one row per speaker.

    Rows are grouped by the ``id`` column. Every column other than ``id`` and
    ``class`` is averaged within a speaker; ``class`` is reduced with ``max``,
    so a speaker is labelled 1 as soon as any of their rows is labelled 1.
    Dataset sizes and the resulting class counts are printed along the way.

    Returns:
        A DataFrame with ``id``, the averaged feature columns and ``class``.
    """
    raw = pd.read_csv(file_path)
    n_rows = len(raw)

    print(f"Original dataset shape: {raw.shape}")
    print(f"Number of unique speakers: {raw['id'].nunique()}")
    print(f"Total samples: {n_rows}")

    # Every non-identifier, non-label column is treated as a feature and averaged.
    non_feature = {'id', 'class'}
    aggregations = dict.fromkeys(
        (name for name in raw.columns if name not in non_feature),
        'mean',
    )
    # If any sample is PD (class == 1), the speaker is PD.
    aggregations['class'] = 'max'

    per_speaker = raw.groupby('id').agg(aggregations)
    per_speaker = per_speaker.reset_index()

    print(f"\nAggregated dataset shape: {per_speaker.shape}")
    print(f"Class distribution after aggregation: {Counter(per_speaker['class'])}")

    return per_speaker

def get_top_20_features(df, exclude_cols=('id', 'class')):
    """Return the 20 feature names most correlated (in absolute value) with ``class``.

    Args:
        df: DataFrame that contains a ``class`` column plus candidate feature
            columns.
        exclude_cols: Column names that must not be considered as features.
            Any iterable of names is accepted; the default is an immutable
            tuple so the default cannot be mutated between calls.

    Returns:
        A list of at most 20 column names, ordered from highest to lowest
        absolute correlation with ``class``. The ranking (with the signed
        correlation of each selected feature) is also printed.
    """
    excluded = set(exclude_cols)
    feature_cols = [col for col in df.columns if col not in excluded]

    # Compute each signed correlation exactly once; it is reused both for the
    # ranking (via its absolute value) and for the printed report.
    target = df['class']
    signed_corr = pd.Series(
        {col: df[col].corr(target) for col in feature_cols},
        dtype=float,
    )
    ranked = signed_corr.abs().sort_values(ascending=False)

    # Select top 20 features
    top_20_features = ranked.head(20).index.tolist()

    print(f"\nSelected top 20 features:")
    for i, feature in enumerate(top_20_features, 1):
        corr_val = signed_corr[feature]
        print(f"{i:2d}. {feature[:30]:30} (correlation: {corr_val:6.4f})")

    return top_20_features

def group_aware_train_test_split(df, test_size=0.25):
    """
    Perform a train-test split ensuring no speaker appears in both sets.

    The split is a single ``GroupShuffleSplit`` draw keyed on the ``id``
    column. NOTE: this splitter does not stratify, so the class proportions
    of the two sets are not guaranteed to match; both distributions are
    printed so the balance can be checked.

    Args:
        df: DataFrame with an ``id`` column, a ``class`` column and feature
            columns.
        test_size: Forwarded to ``GroupShuffleSplit`` as ``test_size``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` frames contain
        every column of *df* except ``id`` and ``class``.

    Raises:
        AssertionError: If any ``id`` ends up in both the train and test set.
    """
    # Separate features, target, and groups
    X = df.drop(['class', 'id'], axis=1)
    y = df['class']
    groups = df['id']

    # Exactly one split is requested, so take it directly from the generator
    # instead of relying on variables leaking out of a one-iteration loop.
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=RANDOM_STATE)
    train_idx, test_idx = next(gss.split(X, y, groups))

    X_train_initial, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train_initial, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Verify no overlap. Raised explicitly (rather than via `assert`) so the
    # guard is still active when Python runs with -O; the exception type is
    # kept as AssertionError for compatibility with existing callers.
    shared_ids = set(groups.iloc[train_idx]) & set(groups.iloc[test_idx])
    if shared_ids:
        raise AssertionError("Data leakage detected!")

    print(f"\nTrain set: {len(X_train_initial)} speakers")
    print(f"Test set: {len(X_test)} speakers")
    print(f"Train class distribution: {Counter(y_train_initial)}")
    print(f"Test class distribution: {Counter(y_test)}")

    return X_train_initial, X_test, y_train_initial, y_test

def apply_smote_enn_to_train(X_train, y_train):
    """Resample the training data with SMOTEENN and return the result.

    The class counts before and after resampling are printed. The resampler
    is seeded with ``RANDOM_STATE``. Only the data passed in is resampled, so
    callers should hand over training data only.

    Returns:
        ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    print(f"\nOriginal training class distribution: {Counter(y_train)}")

    resampler = SMOTEENN(random_state=RANDOM_STATE)
    balanced_X, balanced_y = resampler.fit_resample(X_train, y_train)

    print(f"After SMOTE + ENN on training: {Counter(balanced_y)}")

    return balanced_X, balanced_y

def create_best_model():
    """Build the untrained XGBoost classifier used by the pipeline.

    The hyperparameters are fixed here; the random seed comes from
    ``RANDOM_STATE``.
    """
    hyperparameters = dict(
        # Learning task and evaluation metric
        objective='binary:logistic',
        eval_metric='logloss',
        # Ensemble size and tree shape
        n_estimators=300,
        max_depth=4,
        learning_rate=0.1,
        # Row / column subsampling per tree
        subsample=0.8,
        colsample_bytree=0.8,
        # No extra weighting of the positive class
        scale_pos_weight=1,
        random_state=RANDOM_STATE,
    )
    return xgb.XGBClassifier(**hyperparameters)

def evaluate_model(model, X_test, y_test):
    """Score *model* on the given test data.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True labels (0 / 1).

    Returns:
        ``(metrics, y_pred)`` where ``metrics`` maps metric names to values
        (accuracy, per-class precision/recall/F1, ROC-AUC computed from the
        class-1 probability, and MSE of the predicted labels) and ``y_pred``
        holds the predicted labels.
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    metrics = {'Accuracy': accuracy_score(y_test, predicted_labels)}

    # Per-class metrics, inserted in the order: precision, recall, F1,
    # each for class 0 and then class 1.
    per_class_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 score', f1_score),
    )
    for label, scorer in per_class_scorers:
        for cls in (0, 1):
            metrics[f'{label} ({cls})'] = scorer(y_test, predicted_labels, pos_label=cls)

    metrics['ROC-AUC'] = roc_auc_score(y_test, positive_scores)
    metrics['MSE'] = mean_squared_error(y_test, predicted_labels)

    return metrics, predicted_labels

def train_best_model(file_path):
    """Run the full training pipeline on the CSV at *file_path*.

    Steps: aggregate rows per speaker, split speakers into train/test, select
    the top-20 correlated features using the training speakers only, scale
    (scaler fitted on real training data), resample the scaled training data
    with SMOTE+ENN, train XGBoost, then report metrics, the confusion matrix,
    feature importances and a SHAP bar plot for the held-out speakers.

    Feature selection and scaler fitting deliberately happen AFTER the split:
    doing either on the full dataset would let test-set labels/statistics
    influence training.

    Args:
        file_path: Path (or file-like object) readable by ``pandas.read_csv``.

    Returns:
        ``(model, scaler, top_20_features, metrics)``.
    """
    print("="*80)
    print("IMPROVED PARKINSON'S DISEASE DETECTION MODEL")
    print("XGBoost + 20 Features + SMOTE+ENN (No Data Leakage)")
    print("="*80)

    # 1. Load and aggregate data by speaker ID
    aggregated_df = load_and_aggregate_data(file_path)

    # 2. Group-aware train/test split (no speaker in both sets), performed on
    #    ALL features so that feature selection can be done on the train part.
    X_train_all, X_test_all, y_train, y_test = group_aware_train_test_split(aggregated_df)

    # 3. Feature selection - top 20 correlation-based features, computed from
    #    the training speakers only so test labels cannot influence the choice.
    train_df = pd.concat([X_train_all, y_train], axis=1)
    top_20_features = get_top_20_features(train_df)

    X_train = X_train_all[top_20_features]
    X_test = X_test_all[top_20_features]

    # 4. Feature scaling, fitted on the real (non-synthetic) training data.
    #    Scaling comes before resampling because SMOTE and ENN are
    #    nearest-neighbour based and would otherwise be dominated by the
    #    features with the largest numeric range.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  # No synthetic data in test set

    # 5. Apply SMOTE + ENN only to training data
    X_train_resampled, y_train_resampled = apply_smote_enn_to_train(X_train_scaled, y_train)

    # 6. Create and train model
    model = create_best_model()
    print(f"\nTraining XGBoost model...")
    model.fit(X_train_resampled, y_train_resampled)

    # 7. Evaluate model on real test data (no synthetic samples)
    metrics, y_pred = evaluate_model(model, X_test_scaled, y_test)

    # 8. Display results
    print("\n" + "="*60)
    print("FINAL MODEL PERFORMANCE (on real test data)")
    print("="*60)

    print(f"\nTest Set Results:")
    print("-" * 30)
    for metric, value in metrics.items():
        print(f"{metric:15}: {value:.4f}")

    # 9. Confusion Matrix
    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(f"True Negatives: {cm[0,0]}, False Positives: {cm[0,1]}")
    print(f"False Negatives: {cm[1,0]}, True Positives: {cm[1,1]}")

    # 10. Classification Report
    print(f"\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Non-PD', 'PD']))

    # 11. Feature Importance
    feature_importance = pd.DataFrame({
        'feature': top_20_features,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 Most Important Features:")
    print("-" * 40)
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
        print(f"{i:2d}. {row['feature'][:25]:25} ({row['importance']:.4f})")

    # 12. Compute SHAP values for the trained XGBoost model on the test set.
    #     The masker supplies the background sample (at most 100 rows).
    background = shap.maskers.Independent(X_test_scaled, max_samples=100)
    explainer = shap.Explainer(model, background)
    shap_values = explainer(X_test_scaled)
    # The scaled arrays carry no column names, so attach them for plotting.
    shap_values.feature_names = top_20_features

    shap.plots.bar(shap_values, max_display=len(top_20_features))

    print("\n✅ MODEL TRAINING COMPLETE (No Data Leakage!)")
    print(f"Model trained on aggregated speaker data")
    print(f"Test set contains no speakers from training set")
    print(f"SMOTE+ENN applied only to training data")

    return model, scaler, top_20_features, metrics

# Main execution
if __name__ == "__main__":
    import os

    # Resolve the dataset location. The original hard-coded Kaggle mount point
    # is tried first; if it does not exist, fall back to the directory that
    # kagglehub reported at the top of this file (`path`).
    # NOTE(review): assumes the downloaded dataset uses the same file name as
    # the Kaggle mount (pd_speech_features.csv) - confirm against the download.
    dataset_filename = "pd_speech_features.csv"
    kaggle_input_path = "/kaggle/input/parkinsons-disease-speech-signal-features/pd_speech_features.csv"
    downloaded_path = os.path.join(str(path), dataset_filename)
    file_path = next(
        (candidate for candidate in (kaggle_input_path, downloaded_path)
         if os.path.exists(candidate)),
        kaggle_input_path,  # nothing found: keep the original path for the error message
    )

    try:
        # Train the best model with proper data handling
        model, scaler, features, metrics = train_best_model(file_path)

        # Summary of configuration
        print("\n" + "="*80)
        print("MODEL CONFIGURATION SUMMARY")
        print("="*80)
        print(f"Algorithm: XGBoost")
        print(f"Features: 20 (correlation-based selection)")
        print(f"Data Handling: Aggregated by speaker ID")
        print(f"Train-Test Split: Group-aware (no speaker overlap)")
        print(f"Oversampling: SMOTE + ENN (training only)")
        print(f"Test Data: Real samples only (no synthetic)")
        print(f"Accuracy: {metrics['Accuracy']:.4f}")
        print(f"Recall (PD): {metrics['Recall (1)']:.4f}")
        print(f"ROC-AUC: {metrics['ROC-AUC']:.4f}")
        print(f"MSE: {metrics['MSE']:.4f}")

    except FileNotFoundError:
        print("Please update the file_path variable to point to your dataset")
        print("Current path:", file_path)
    except Exception as e:
        # Top-level boundary: report the failure with a full traceback
        # instead of letting the script die silently.
        print(f"An error occurred: {str(e)}")
        import traceback
        traceback.print_exc()

IMPROVED PARKINSON'S DISEASE DETECTION MODEL
XGBoost + 20 Features + SMOTE+ENN (No Data Leakage)
Please update the file_path variable to point to your dataset
Current path: /kaggle/input/parkinsons-disease-speech-signal-features/pd_speech_features.csv
