In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


def knn_pipeline_for_selected_features(datasets, target_column="Target", k_neighbors=5, test_size=0.3, random_state=42, k_values=None, methods=None, output_dir="preprocessed/selected_features"):
    """
    Run a KNN classifier on every (dataset, feature-selection method, k) combination.

    For each combination the CSV file
    ``{output_dir}/{dataset_name}_{method}_k{k}_selected.csv`` is loaded, split
    into stratified train/test sets, standardized, and classified with KNN.
    Combinations whose file does not exist are reported and skipped.

    Parameters
    - datasets: Iterable of dataset paths (base names without method suffix).
      Only the last "/"-separated component is used, with the
      "filtered_preprocessed_" prefix and ".csv" removed.
    - target_column: Name of the target column in each selected-feature file.
    - k_neighbors: Number of neighbors for KNN.
    - test_size: Proportion of data held out for testing.
    - random_state: Random state for a reproducible split.
    - k_values: List of feature counts (k) used in feature selection.
      Defaults to [100, 200, 300, 400, 500].
    - methods: List of feature selection methods.
      Defaults to ["std_mean", "anova", "chi2"].
    - output_dir: Directory containing the selected feature files.

    Returns
    - List of dicts, one per evaluated file, with keys "Dataset",
      "Feature Method", "Feature Count (k)", "Accuracy",
      "Classification Report" and "Confusion Matrix". Empty when no file
      could be loaded.

    Raises
    - KeyError: if a loaded file does not contain ``target_column``; the
      message names the offending file.
    - Any other error raised by pandas or scikit-learn while reading,
      splitting or fitting is not caught and propagates to the caller.
    """
    if k_values is None:
        k_values = [100, 200, 300, 400, 500]  # Default k values
    if methods is None:
        methods = ["std_mean", "anova", "chi2"]  # Default feature selection methods

    results = []  # Store results for comparison

    for dataset in datasets:
        # Reduce e.g. "preprocessed/filtered_preprocessed_GSE4290" to "GSE4290"
        dataset_name = dataset.split("/")[-1].replace("filtered_preprocessed_", "").replace(".csv", "")
        print(f"\nProcessing dataset: {dataset_name}")

        for method in methods:
            for k in k_values:
                print(f"\n  Feature selection method: {method.upper()}, k={k}")

                # Construct file path for the selected feature file
                selected_features_file = f"{output_dir}/{dataset_name}_{method}_k{k}_selected.csv"
                try:
                    # Load the dataset
                    data = pd.read_csv(selected_features_file)
                except FileNotFoundError:
                    # Best effort: a missing combination must not abort the whole sweep
                    print(f"File not found: {selected_features_file}. Skipping...")
                    continue

                # Fail with a message that identifies the file; the generic pandas
                # "not found in axis" error does not say which file is malformed.
                if target_column not in data.columns:
                    raise KeyError(
                        f"Target column '{target_column}' not found in {selected_features_file}"
                    )

                # Separate features (X) and target (y)
                X = data.drop(columns=[target_column])
                y = data[target_column]

                # Split the data into training and testing sets (class proportions preserved)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)

                # Standardize the features for KNN; the scaler is fitted on the
                # training split only so no test-set statistics leak into training
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                # Initialize and train the KNN classifier
                knn = KNeighborsClassifier(n_neighbors=k_neighbors)
                knn.fit(X_train, y_train)

                # Make predictions
                y_pred = knn.predict(X_test)

                # Evaluate the model
                accuracy = accuracy_score(y_test, y_pred)
                # zero_division=0 reports 0.00 for classes that were never predicted
                # (same numbers as the default) without emitting a warning per report
                report = classification_report(y_test, y_pred, zero_division=0)
                confusion = confusion_matrix(y_test, y_pred)

                print("\n    KNN Classifier Performance:")
                print(f"    Accuracy: {accuracy}")
                print(f"\n    Classification Report:\n{report}")
                print(f"\n    Confusion Matrix:\n{confusion}")

                # Store results
                results.append({
                    "Dataset": dataset_name,
                    "Feature Method": method.upper(),
                    "Feature Count (k)": k,
                    "Accuracy": accuracy,
                    "Classification Report": report,
                    "Confusion Matrix": confusion
                })

    return results

In [2]:

# Example usage
if __name__ == "__main__":
    # Dataset base paths (no method suffix), built from the GEO series ids
    datasets = [
        f"preprocessed/filtered_preprocessed_{geo_id}"
        for geo_id in ("GSE4290", "GSE19804", "GSE27562", "GSE33315", "GSE59856")
    ]

    # Replace with the actual target column name
    target_column = "Target"

    print("Running KNN classification on multiple datasets with different feature selection methods and k values...")
    results = knn_pipeline_for_selected_features(
        datasets=datasets,
        target_column=target_column,
        k_neighbors=5,
    )

    # Print a compact summary: one header line plus one line per reported field
    for result in results:
        print("\nSummary for Dataset:", result["Dataset"])
        for field in ("Feature Method", "Feature Count (k)", "Accuracy"):
            print(f"{field}:", result[field])

Running KNN classification on multiple datasets with different feature selection methods and k values...

Processing dataset: GSE4290

  Feature selection method: STD_MEAN, k=100

    KNN Classifier Performance:
    Accuracy: 0.37735849056603776

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.19      0.38      0.25         8
     glioblastoma       0.59      0.57      0.58        23
        non-tumor       0.00      0.00      0.00         7
oligodendroglioma       0.57      0.27      0.36        15

         accuracy                           0.38        53
        macro avg       0.34      0.30      0.30        53
     weighted avg       0.45      0.38      0.39        53


    Confusion Matrix:
[[ 3  2  3  0]
 [ 6 13  1  3]
 [ 4  3  0  0]
 [ 3  4  4  4]]

  Feature selection method: STD_MEAN, k=200

    KNN Classifier Performance:
    Accuracy: 0.4528301886792453

    Classification Report:
                   precision

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    KNN Classifier Performance:
    Accuracy: 0.6140350877192983

    Classification Report:
                precision    recall  f1-score   support

1_Hyperdiploid       0.45      0.83      0.59        35
   2_TCF3-PBX1       1.00      0.83      0.91        12
  3_ETV6_RUNX1       0.59      0.73      0.66        30
         4_MLL       1.00      0.44      0.62         9
          5_Ph       0.50      0.29      0.36         7
        6_Hypo       0.00      0.00      0.00         7
       7_Other       0.65      0.43      0.52        46
       8_T-ALL       0.86      0.72      0.78        25

      accuracy                           0.61       171
     macro avg       0.63      0.54      0.55       171
  weighted avg       0.64      0.61      0.60       171


    Confusion Matrix:
[[29  0  3  0  0  0  2  1]
 [ 2 10  0  0  0  0  0  0]
 [ 6  0 22  0  0  0  2  0]
 [ 2  0  0  4  0  0  2  1]
 [ 3  0  0  0  2  0  2  0]
 [ 5  0  0  0  0  0  2  0]
 [14  0  9  0  2  0 20  1]
 [ 3  0  3  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    KNN Classifier Performance:
    Accuracy: 0.6432748538011696

    Classification Report:
                precision    recall  f1-score   support

1_Hyperdiploid       0.46      0.83      0.59        35
   2_TCF3-PBX1       1.00      0.67      0.80        12
  3_ETV6_RUNX1       0.87      0.90      0.89        30
         4_MLL       1.00      0.56      0.71         9
          5_Ph       0.57      0.57      0.57         7
        6_Hypo       0.00      0.00      0.00         7
       7_Other       0.51      0.41      0.46        46
       8_T-ALL       0.95      0.72      0.82        25

      accuracy                           0.64       171
     macro avg       0.67      0.58      0.60       171
  weighted avg       0.67      0.64      0.64       171


    Confusion Matrix:
[[29  0  0  0  1  0  5  0]
 [ 3  8  0  0  0  0  1  0]
 [ 0  0 27  0  0  0  3  0]
 [ 2  0  0  5  0  0  2  0]
 [ 2  0  0  0  4  0  1  0]
 [ 4  0  1  0  0  0  2  0]
 [21  0  3  0  2  0 19  1]
 [ 2  0  0  0  0  1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    KNN Classifier Performance:
    Accuracy: 0.18181818181818182

    Classification Report:
                      precision    recall  f1-score   support

biliary tract cancer       0.23      0.10      0.14        29
        colon cancer       0.00      0.00      0.00        15
    esophagus cancer       0.25      0.07      0.11        15
     healthy control       0.29      0.22      0.25        45
        liver cancer       0.11      0.62      0.19        16
   pancreatic cancer       0.28      0.17      0.21        30
      stomach cancer       1.00      0.07      0.12        15

            accuracy                           0.18       165
           macro avg       0.31      0.18      0.15       165
        weighted avg       0.30      0.18      0.17       165


    Confusion Matrix:
[[ 3  0  1  6 15  4  0]
 [ 3  0  0  3  7  2  0]
 [ 0  0  1  4  9  1  0]
 [ 2  1  0 10 28  4  0]
 [ 2  1  0  2 10  1  0]
 [ 3  1  2  6 13  5  0]
 [ 0  0  0  3 10  1  1]]

  Feature selection method:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    KNN Classifier Performance:
    Accuracy: 0.7393939393939394

    Classification Report:
                      precision    recall  f1-score   support

biliary tract cancer       0.57      0.69      0.62        29
        colon cancer       0.71      0.67      0.69        15
    esophagus cancer       0.75      0.80      0.77        15
     healthy control       0.98      1.00      0.99        45
        liver cancer       0.70      0.88      0.78        16
   pancreatic cancer       0.65      0.50      0.57        30
      stomach cancer       0.55      0.40      0.46        15

            accuracy                           0.74       165
           macro avg       0.70      0.70      0.70       165
        weighted avg       0.74      0.74      0.73       165


    Confusion Matrix:
[[20  0  0  1  0  8  0]
 [ 0 10  1  0  1  0  3]
 [ 0  0 12  0  2  0  1]
 [ 0  0  0 45  0  0  0]
 [ 0  1  0  0 14  0  1]
 [15  0  0  0  0 15  0]
 [ 0  3  3  0  3  0  6]]

  Feature selection method: 