In [None]:
# parkinsons_knn_save.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Path the fitted KNN classifier is written to by train_evaluate_knn (via joblib.dump).
MODEL_FILENAME = "knn_parkinsons_model.pkl"

def load_data(path="../input/parkinsons-disease-data-set/parkinsons.data"):
    """Read the dataset file and separate features from the target.

    Args:
        path: Location of the comma-separated data file. It must contain
            a ``name`` column and a ``status`` column.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the frame with the ``name`` and
        ``status`` columns removed and ``y`` is the ``status`` column.
    """
    frame = pd.read_csv(path)
    # Everything except the identifier and the target is a feature.
    features = frame.drop(columns=["name", "status"])
    target = frame["status"]
    return features, target

def preprocess(X, y, test_size=0.2, random_state=42):
    """Split the data, min-max scale it, and persist the scaler and splits.

    The scaler is fitted on the training portion only and then applied to
    the held-out portion, so test-set statistics do not influence scaling.

    Side effects: writes ``knn_scaler.pkl``, ``scaled_data_train.csv`` and
    ``scaled_data_test.csv`` (relative paths).

    Args:
        X: Feature table.
        y: Target labels; also used to stratify the split.
        test_size: Fraction of samples held out for testing.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    """
    split = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = split

    # Learn the min/max from the training rows, then reuse them for both sets.
    scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Persist the fitted scaler so new samples can be scaled the same way.
    joblib.dump(scaler, "knn_scaler.pkl")

    # Persist both scaled partitions alongside their labels.
    outputs = (
        (X_train_scaled, y_train, "scaled_data_train.csv"),
        (X_test_scaled, y_test, "scaled_data_test.csv"),
    )
    for features, labels, target_file in outputs:
        save_scaled_data(features, labels, filename=target_file)

    return X_train_scaled, X_test_scaled, y_train, y_test

def train_evaluate_knn(X_train, X_test, y_train, y_test, n_neighbors=5):
    """Cross-validate, fit, evaluate and persist a KNN classifier.

    Steps, in order:
      1. 10-fold cross-validation on the training data (mean accuracy printed).
      2. Fit on the full training data and predict the test data.
      3. Print accuracy, precision, recall and F1 on the test data.
      4. Save predictions and metrics via ``save_predictions`` / ``save_metrics``.
      5. Show the confusion matrix as a heatmap.
      6. Dump the fitted model to ``MODEL_FILENAME``.

    Labels are expected to be binary ``0`` / ``1`` (the heatmap tick labels
    and the default positive class of the metric functions rely on this).

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        n_neighbors: Number of neighbours used by the classifier.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    cv_scores = cross_val_score(knn, X_train, y_train, cv=10)
    print(f"→ KNN (k={n_neighbors}) 10-fold CV accuracy: {cv_scores.mean():.2%}")

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"→ Test Accuracy : {acc:.2%}")
    print(f"→ Precision     : {prec:.2%}")
    print(f"→ Recall        : {rec:.2%}")
    print(f"→ F1-Score      : {f1:.2%}")

    # Save predictions
    save_predictions(y_test, y_pred)

    # Save evaluation metrics
    save_metrics(acc, prec, rec, f1)

    # Plot confusion matrix.
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # two hard-coded tick labels, even if a class is absent from y_test/y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    class_names = ["Healthy (0)", "PD (1)"]
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

    # Save model
    joblib.dump(knn, MODEL_FILENAME)
    print(f"\nModel saved to '{MODEL_FILENAME}'")
    return knn

def save_predictions(y_true, y_pred, filename="knn_predictions.csv"):
    """Write true and predicted labels side by side to a CSV file.

    Labels are paired by position. Both inputs may be any one-dimensional
    array-like (pandas Series, NumPy array, list, ...); a pandas index on
    either input is ignored so that differently indexed inputs cannot be
    misaligned or padded with NaN.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.
        filename: Destination CSV path.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Strip any index: rows must pair up by position, not by index label.
    true_labels = np.asarray(y_true)
    predicted_labels = np.asarray(y_pred)
    if len(true_labels) != len(predicted_labels):
        raise ValueError(
            f"y_true has {len(true_labels)} labels but y_pred has "
            f"{len(predicted_labels)}"
        )

    df = pd.DataFrame({
        "True_Label": true_labels,
        "Predicted_Label": predicted_labels,
    })
    df.to_csv(filename, index=False)
    print(f"Predictions saved to '{filename}'")

def save_metrics(acc, prec, rec, f1, filename="knn_metrics.csv"):
    """Store the four evaluation scores as a single-row CSV file.

    Args:
        acc: Accuracy score.
        prec: Precision score.
        rec: Recall score.
        f1: F1 score.
        filename: Destination CSV path.
    """
    # One column per metric, each holding exactly one value.
    single_row = {
        "Accuracy": [acc],
        "Precision": [prec],
        "Recall": [rec],
        "F1-Score": [f1],
    }
    pd.DataFrame(single_row).to_csv(filename, index=False)
    print(f"Evaluation metrics saved to '{filename}'")

def save_scaled_data(X_scaled, y, filename):
    """Write scaled features plus a ``Label`` column to a CSV file.

    Features and labels are paired by position. ``X_scaled`` may be a NumPy
    array, a nested list or a DataFrame; ``y`` may be any one-dimensional
    array-like. Any pandas index on either input is ignored, and the
    caller's objects are never modified. For a plain array the feature
    columns get pandas' default integer names.

    Args:
        X_scaled: Scaled feature matrix, one row per sample.
        y: Labels, in the same row order as ``X_scaled``.
        filename: Destination CSV path.

    Raises:
        ValueError: If ``y`` does not have one label per row of ``X_scaled``.
    """
    # reset_index returns a new frame, so adding "Label" below cannot leak
    # into a DataFrame passed in by the caller.
    df = pd.DataFrame(X_scaled).reset_index(drop=True)

    # Strip any index from the labels so they attach by position.
    labels = np.asarray(y)
    if len(labels) != len(df):
        raise ValueError(
            f"X_scaled has {len(df)} rows but y has {len(labels)} labels"
        )

    df["Label"] = labels
    df.to_csv(filename, index=False)
    print(f"Scaled data saved to '{filename}'")

if __name__ == "__main__":
    # Stage 1: read the raw features and target from disk.
    X, y = load_data()

    # Stage 2: split and scale (this also writes the scaler and scaled CSVs).
    splits = preprocess(X, y)
    X_train, X_test, y_train, y_test = splits

    # Stage 3: cross-validate, fit, evaluate, and save the model and reports.
    model = train_evaluate_knn(*splits, n_neighbors=5)

# ------------------------------
# Running this script produces the following output files:
# - knn_predictions.csv (true vs predicted labels)
# - knn_metrics.csv (accuracy, precision, recall, f1)
# - scaled_data_train.csv
# - scaled_data_test.csv
# - knn_parkinsons_model.pkl (saved model)
# - knn_scaler.pkl (saved scaler)
