In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [16]:
def knn_pipeline_for_multiple_datasets(datasets, target_column="Target", k_neighbors=5, test_size=0.3, random_state=42):
    """
    Train and evaluate a KNN classifier for every dataset / feature-selection pair.

    For each base path in ``datasets`` and each feature-selection suffix
    (std-mean, ANOVA, chi2) the file ``<base>_<method>_selected.csv`` is read,
    split into stratified train/test sets, standardized with a scaler fitted on
    the training split only, and classified with ``KNeighborsClassifier``.
    Accuracy, the classification report and the confusion matrix are printed
    and collected.

    Parameters
    - datasets: List of dataset file paths (base names without method suffix).
    - target_column: Name of the target column.
    - k_neighbors: Number of neighbors for KNN.
    - test_size: Proportion of data for testing.
    - random_state: Random state for reproducibility of the train/test split.

    Returns
    - A list with one dict per (dataset, method) run, in processing order, with
      the keys "Dataset", "Feature Method", "Accuracy",
      "Classification Report" and "Confusion Matrix".

    Raises
    - KeyError: if ``target_column`` is not a column of a loaded file.
    - Errors from reading a file or from scikit-learn are not caught and
      propagate to the caller.
    """
    feature_methods = [
        "_std_mean_selected.csv",
        "_anova_selected.csv",
        "_chi2_selected.csv"
    ]

    # Display label for each suffix, computed once. The leading underscore of
    # the suffix is stripped so labels read "STD_MEAN" rather than "_STD_MEAN".
    method_labels = {
        suffix: suffix.replace("_selected.csv", "").strip("_").upper()
        for suffix in feature_methods
    }

    results = []  # Store results for comparison

    for dataset in datasets:
        print(f"\nProcessing dataset: {dataset}")
        for method in feature_methods:
            method_label = method_labels[method]
            print(f"\n  Feature selection method: {method_label}")

            # Load the dataset
            selected_features_file = dataset + method
            data = pd.read_csv(selected_features_file)

            # Fail with a message that names the offending file instead of the
            # generic "not found in axis" error raised by DataFrame.drop.
            if target_column not in data.columns:
                raise KeyError(
                    f"Target column '{target_column}' not found in {selected_features_file}"
                )

            # Separate features (X) and target (y)
            X = data.drop(columns=[data.columns[0], target_column])  # Exclude Sample IDs and target
            y = data[target_column]

            # Split the data into training and testing sets (class proportions preserved)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, stratify=y, random_state=random_state
            )

            # Standardize the features for KNN; the scaler is fitted on the
            # training split only and then applied to the test split.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Initialize and train the KNN classifier
            knn = KNeighborsClassifier(n_neighbors=k_neighbors)
            knn.fit(X_train_scaled, y_train)

            # Make predictions
            y_pred = knn.predict(X_test_scaled)

            # Evaluate the model. zero_division=0 reports 0.0 for classes that
            # were never predicted (same value as the default) without emitting
            # an UndefinedMetricWarning for each such class.
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, zero_division=0)
            confusion = confusion_matrix(y_test, y_pred)

            print("\n    KNN Classifier Performance:")
            print(f"    Accuracy: {accuracy}")
            print(f"\n    Classification Report:\n{report}")
            print(f"\n    Confusion Matrix:\n{confusion}")

            # Store results
            results.append({
                "Dataset": dataset,
                "Feature Method": method_label,
                "Accuracy": accuracy,
                "Classification Report": report,
                "Confusion Matrix": confusion
            })

    return results

In [17]:

# Demo run: evaluate every dataset with each feature-selection variant
if __name__ == "__main__":
    # Base paths only; the pipeline appends the per-method file suffix itself
    datasets = [
        "preprocessed/filtered_preprocessed_GSE4290",
        "preprocessed/filtered_preprocessed_GSE19804",
        "preprocessed/filtered_preprocessed_GSE27562",
        "preprocessed/filtered_preprocessed_GSE33315",
        "preprocessed/filtered_preprocessed_GSE59856",
    ]

    target_column = "Target"  # Replace with the actual target column name

    print("Running KNN classification on multiple datasets with different feature selection methods...")
    results = knn_pipeline_for_multiple_datasets(
        datasets,
        target_column=target_column,
        k_neighbors=5,
    )

    # Short recap: one entry per (dataset, feature method) run
    for result in results:
        print("\nSummary for Dataset:", result["Dataset"])
        for field in ("Feature Method", "Accuracy"):
            print(f"{field}:", result[field])


Running KNN classification on multiple datasets with different feature selection methods...

Processing dataset: preprocessed/filtered_preprocessed_GSE4290

  Feature selection method: _STD_MEAN

    KNN Classifier Performance:
    Accuracy: 0.37735849056603776

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.19      0.38      0.25         8
     glioblastoma       0.59      0.57      0.58        23
        non-tumor       0.00      0.00      0.00         7
oligodendroglioma       0.57      0.27      0.36        15

         accuracy                           0.38        53
        macro avg       0.34      0.30      0.30        53
     weighted avg       0.45      0.38      0.39        53


    Confusion Matrix:
[[ 3  2  3  0]
 [ 6 13  1  3]
 [ 4  3  0  0]
 [ 3  4  4  4]]

  Feature selection method: _ANOVA

    KNN Classifier Performance:
    Accuracy: 0.6981132075471698

    Classification Report:
                   pr