In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [5]:
def naive_bayes_pipeline_for_selected_features(datasets, target_column="Target", test_size=0.3, random_state=42, k_values=None, methods=None, output_dir="preprocessed/selected_features"):
    """
    Train and evaluate a Gaussian Naive Bayes classifier on every
    (dataset, feature-selection method, k) combination.

    For each combination the file
    ``{output_dir}/{dataset_name}_{method}_k{k}_selected.csv`` is loaded, split
    into stratified train/test sets, standardized, fitted and scored. Progress
    and the full metrics are printed as the sweep runs.

    Combinations are skipped (with a printed message) rather than aborting the
    whole sweep when:
    - the selected-feature file does not exist, or
    - the file does not contain ``target_column``.

    Args:
        datasets: Iterable of dataset paths / base names. Only the part after
            the last "/" is used, with the "filtered_preprocessed_" prefix and
            any ".csv" text removed, to build the selected-feature file name.
        target_column: Name of the label column in each selected-feature file.
        test_size: Proportion of the rows held out for testing.
        random_state: Seed passed to ``train_test_split`` for reproducibility.
        k_values: Feature counts (k) to evaluate. Defaults to
            ``[100, 200, 300, 400, 500]``.
        methods: Feature selection method names as they appear in the file
            names. Defaults to ``["std_mean", "anova", "chi2"]``.
        output_dir: Directory containing the selected-feature CSV files.

    Returns:
        A list with one dict per successfully evaluated combination, with the
        keys "Dataset", "Feature Method", "Feature Count (k)", "Accuracy",
        "Classification Report" and "Confusion Matrix".
    """
    if k_values is None:
        k_values = [100, 200, 300, 400, 500]  # Default k values
    if methods is None:
        methods = ["std_mean", "anova", "chi2"]  # Default feature selection methods

    results = []  # One entry per evaluated (dataset, method, k) combination

    for dataset in datasets:
        # Reduce the path to the bare dataset identifier used in the file names.
        dataset_name = dataset.split("/")[-1].replace("filtered_preprocessed_", "").replace(".csv", "")
        print(f"\nProcessing dataset: {dataset_name}")

        for method in methods:
            for k in k_values:
                print(f"\n  Feature selection method: {method.upper()}, k={k}")

                # Construct file path for the selected feature file
                selected_features_file = f"{output_dir}/{dataset_name}_{method}_k{k}_selected.csv"
                try:
                    data = pd.read_csv(selected_features_file)
                except FileNotFoundError:
                    print(f"File not found: {selected_features_file}. Skipping...")
                    continue

                # A file without the label column cannot be evaluated. Skip it
                # instead of raising KeyError, which would abort the sweep and
                # lose every result gathered so far.
                if target_column not in data.columns:
                    print(f"Target column '{target_column}' not found in {selected_features_file}. Skipping...")
                    continue

                # Separate features (X) and target (y)
                X = data.drop(columns=[target_column])
                y = data[target_column]

                # Stratified split so train and test keep the class proportions of y
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, stratify=y, random_state=random_state
                )

                # Standardize the features (optional for Naive Bayes). The scaler
                # is fitted on the training split only and then applied to the
                # test split, so no test-set statistics leak into training.
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                # Initialize and train the Naive Bayes classifier
                nb = GaussianNB()
                nb.fit(X_train, y_train)

                # Make predictions on the held-out split
                y_pred = nb.predict(X_test)

                # Evaluate the model
                accuracy = accuracy_score(y_test, y_pred)
                report = classification_report(y_test, y_pred)
                confusion = confusion_matrix(y_test, y_pred)

                print("\n    Naive Bayes Classifier Performance:")
                print(f"    Accuracy: {accuracy}")
                print(f"\n    Classification Report:\n{report}")
                print(f"\n    Confusion Matrix:\n{confusion}")

                # Store results
                results.append({
                    "Dataset": dataset_name,
                    "Feature Method": method.upper(),
                    "Feature Count (k)": k,
                    "Accuracy": accuracy,
                    "Classification Report": report,
                    "Confusion Matrix": confusion
                })

    return results


In [6]:
# Example usage: sweep every dataset / feature-selection method / k combination
# with the pipeline defaults, then print a compact accuracy summary.
if __name__ == "__main__":
    # Base file names (without method suffix), one per GEO series
    datasets = [
        f"preprocessed/filtered_preprocessed_{gse_id}"
        for gse_id in ("GSE4290", "GSE19804", "GSE27562", "GSE33315", "GSE59856")
    ]

    # Replace with the actual target column name
    target_column = "Target"

    print("Running Naive Bayes classification on multiple datasets with different feature selection methods and k values...")
    results = naive_bayes_pipeline_for_selected_features(
        datasets=datasets,
        target_column=target_column,
    )

    # One four-line summary per evaluated combination
    for result in results:
        print(
            f"\nSummary for Dataset: {result['Dataset']!s}\n"
            f"Feature Method: {result['Feature Method']!s}\n"
            f"Feature Count (k): {result['Feature Count (k)']!s}\n"
            f"Accuracy: {result['Accuracy']!s}"
        )

Running Naive Bayes classification on multiple datasets with different feature selection methods and k values...

Processing dataset: GSE4290

  Feature selection method: STD_MEAN, k=100

    Naive Bayes Classifier Performance:
    Accuracy: 0.41509433962264153

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.14      0.12      0.13         8
     glioblastoma       0.67      0.26      0.38        23
        non-tumor       0.33      0.71      0.45         7
oligodendroglioma       0.45      0.67      0.54        15

         accuracy                           0.42        53
        macro avg       0.40      0.44      0.38        53
     weighted avg       0.48      0.42      0.40        53


    Confusion Matrix:
[[ 1  2  4  1]
 [ 6  6  1 10]
 [ 0  1  5  1]
 [ 0  0  5 10]]

  Feature selection method: STD_MEAN, k=200

    Naive Bayes Classifier Performance:
    Accuracy: 0.6037735849056604

    Classification Report:
    