In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import os


In [2]:
# Directory holding the pre-split 5-fold train/test files; every fold path
# used later in the notebook is built relative to this folder.
data_folder: str = './pima-5-fold'

In [3]:
# Function to parse KEEL format files
def load_keel_file(file_path):
    """Load a KEEL-format dataset file into a pandas DataFrame.

    The header is scanned line by line: every line starting with
    ``@attribute`` contributes one column name (the token following the
    keyword), and the first line starting with ``@data`` marks the end of
    the header. Everything after that marker is parsed as header-less CSV.

    Parameters
    ----------
    file_path : str or path-like
        Location of the KEEL ``.dat`` file.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with columns named after the ``@attribute``
        declarations in the order they appear.

    Raises
    ------
    ValueError
        If the file has no ``@data`` marker, if an ``@attribute`` line has no
        name, or if the number of declared attributes does not match the
        number of columns found in the data section.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    column_names = []
    data_start = None
    for index, line in enumerate(lines):
        stripped = line.strip()
        lowered = stripped.lower()
        if lowered.startswith("@attribute"):
            tokens = stripped.split()
            if len(tokens) < 2:
                raise ValueError(
                    f"Malformed @attribute line {index + 1} in {file_path}: {stripped!r}"
                )
            # Drop a value list glued to the name, e.g. "Class{a,b}" -> "Class".
            column_names.append(tokens[1].partition("{")[0])
        elif lowered.startswith("@data"):
            # Match the marker only at the start of a line so that a stray
            # "@data" substring elsewhere in the header cannot be mistaken for it.
            data_start = index
            break

    if data_start is None:
        raise ValueError(f"No @data section found in KEEL file: {file_path}")

    # skipinitialspace strips the blank after each comma so that string values
    # are not stored with a leading space (assumes ", "-separated rows, which
    # is harmless when the separator is a bare comma).
    data = pd.read_csv(
        file_path,
        skiprows=data_start + 1,
        header=None,
        skipinitialspace=True
    )

    if len(column_names) != data.shape[1]:
        raise ValueError(
            f"{file_path}: header declares {len(column_names)} attributes "
            f"but the data section has {data.shape[1]} columns"
        )

    data.columns = column_names
    return data

In [4]:
# Candidate neighbourhood sizes to evaluate: the odd values 1 through 11
k_values = list(range(1, 12, 2))

# Maps each evaluated k to its dictionary of fold-averaged metrics
results = dict()

In [6]:
# Load, encode and scale each of the 5 training-test pairs once. None of these
# steps depend on k, so preparing them up front avoids re-reading and
# re-scaling the same files for every candidate k.
folds = []
for fold in range(1, 6):
    # File paths for training and test data
    train_file = os.path.join(data_folder, f"pima-5-{fold}tra.dat")
    test_file = os.path.join(data_folder, f"pima-5-{fold}tst.dat")

    # Load data
    train_data = load_keel_file(train_file)
    test_data = load_keel_file(test_file)

    # Convert categorical target to numeric; the encoder is fitted on the
    # training labels only and then applied to the test labels.
    le = LabelEncoder()
    train_data['Class'] = le.fit_transform(train_data['Class'])
    test_data['Class'] = le.transform(test_data['Class'])

    # Split features and target (the target is taken from the last column)
    X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
    X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

    # Feature scaling: statistics come from the training split only, so no
    # information from the test split leaks into the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    folds.append((X_train, y_train, X_test, y_test))

# Iterate over each k value
for k in k_values:
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1_score': [],
        'roc_auc': [],
        'sensitivity': [],
        'specificity': [],
        'true_positive_rate': [],
        'false_alarm_rate': []
    }

    # Perform evaluation across the 5 training-test pairs
    for X_train, y_train, X_test, y_test in folds:
        # Train a fresh model for this fold
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)

        # Predict on the test set; column 1 of predict_proba is the
        # probability of the class encoded as 1.
        y_pred = knn.predict(X_test)
        y_prob = knn.predict_proba(X_test)[:, 1]

        # Compute confusion matrix. The label order is pinned so that ravel()
        # always yields exactly (tn, fp, fn, tp).
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

        # Rates derived from the confusion matrix; 0 is reported when the
        # denominator is empty.
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
        tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

        # Compute metrics (zero_division=0 is applied consistently to
        # precision, recall and F1)
        metrics['accuracy'].append(accuracy_score(y_test, y_pred))
        metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
        metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
        metrics['f1_score'].append(f1_score(y_test, y_pred, zero_division=0))
        metrics['roc_auc'].append(roc_auc_score(y_test, y_prob))
        metrics['sensitivity'].append(tpr)
        metrics['specificity'].append(tnr)
        metrics['true_positive_rate'].append(tpr)
        metrics['false_alarm_rate'].append(fpr)

    # Store average metrics for this k
    results[k] = {metric: np.mean(values) for metric, values in metrics.items()}

In [7]:
# Report the fold-averaged metrics for every candidate k
print("Hyperparameter tuning results:\n")
for k, metrics in results.items():
    _report_lines = [f"k = {k}"]
    for metric, value in metrics.items():
        _report_lines.append(f"  {metric}: {value:.4f}")
    # One print per k: the joined lines followed by a blank separator line
    print("\n".join(_report_lines), end="\n\n")

# Select the k with the highest mean accuracy (the first such k wins ties)
best_k = max(results.keys(), key=lambda candidate: results[candidate]['accuracy'])
print(f"Best k based on accuracy: {best_k}")
print(f"Metrics for best k ({best_k}): {results[best_k]}")

Hyperparameter tuning results:

k = 1
  accuracy: 0.7070
  precision: 0.5907
  recall: 0.5222
  f1_score: 0.5536
  roc_auc: 0.6641
  sensitivity: 0.5222
  specificity: 0.8060
  true_positive_rate: 0.5222
  false_alarm_rate: 0.1940

k = 3
  accuracy: 0.7448
  precision: 0.6549
  recall: 0.5706
  f1_score: 0.6083
  roc_auc: 0.7680
  sensitivity: 0.5706
  specificity: 0.8380
  true_positive_rate: 0.5706
  false_alarm_rate: 0.1620

k = 5
  accuracy: 0.7422
  precision: 0.6557
  recall: 0.5483
  f1_score: 0.5966
  roc_auc: 0.7751
  sensitivity: 0.5483
  specificity: 0.8460
  true_positive_rate: 0.5483
  false_alarm_rate: 0.1540

k = 7
  accuracy: 0.7461
  precision: 0.6601
  recall: 0.5519
  f1_score: 0.5997
  roc_auc: 0.7928
  sensitivity: 0.5519
  specificity: 0.8500
  true_positive_rate: 0.5519
  false_alarm_rate: 0.1500

k = 9
  accuracy: 0.7408
  precision: 0.6612
  recall: 0.5257
  f1_score: 0.5838
  roc_auc: 0.7943
  sensitivity: 0.5257
  specificity: 0.8560
  true_positive_rate: 0.5