In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
def naive_bayes_pipeline_for_multiple_datasets(datasets, target_column="Target", test_size=0.3, random_state=42):
    """
    Train and evaluate a Gaussian Naive Bayes classifier on every combination of
    dataset and feature-selection method.

    For each base path in ``datasets`` and each known feature-selection suffix,
    the CSV ``<base path><suffix>`` is loaded, split into stratified train/test
    sets, standardized, fitted with ``GaussianNB`` and scored on the test split.
    Metrics are printed as they are computed and also collected for the caller.

    Args:
        datasets: Iterable of dataset file paths (base names without the method
            suffix, e.g. "preprocessed/filtered_preprocessed_GSE4290").
        target_column: Name of the target (label) column in each CSV.
        test_size: Proportion of the data held out for testing.
        random_state: Seed passed to ``train_test_split`` for reproducibility.

    Returns:
        A list with one dict per (dataset, method) pair, in processing order,
        holding the keys "Dataset", "Feature Method", "Accuracy",
        "Classification Report" and "Confusion Matrix".

    Raises:
        FileNotFoundError: If one of the expected CSV files does not exist
            (raised by ``pd.read_csv``; processing stops at the first one).
        KeyError: If ``target_column`` is not a column of a loaded CSV.
    """
    feature_methods = [
        "_std_mean_selected.csv",
        "_anova_selected.csv",
        "_chi2_selected.csv"
    ]

    results = []  # Store results for comparison

    for dataset in datasets:
        print(f"\nProcessing dataset: {dataset}")
        for method in feature_methods:
            # Human-readable label, computed once per method. The suffix starts
            # with "_", so strip it to get "STD_MEAN" rather than "_STD_MEAN".
            method_label = method.replace("_selected.csv", "").lstrip("_").upper()
            print(f"\n  Feature selection method: {method_label}")

            # Load the dataset
            selected_features_file = dataset + method
            data = pd.read_csv(selected_features_file)

            # Separate features (X) and target (y).
            # Assumes the first column holds sample IDs, so it is excluded along with the target.
            X = data.drop(columns=[data.columns[0], target_column])
            y = data[target_column]

            # Stratified split keeps the class proportions of y in both subsets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, stratify=y, random_state=random_state
            )

            # Standardize the features (Naive Bayes does not require scaling but it's helpful
            # for numerical stability). The scaler is fitted on the training split only and
            # then applied to the test split, so no test statistics leak into training.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Initialize and train the Naive Bayes classifier
            nb = GaussianNB()
            nb.fit(X_train, y_train)

            # Make predictions
            y_pred = nb.predict(X_test)

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred)
            confusion = confusion_matrix(y_test, y_pred)

            print("\n    Naive Bayes Classifier Performance:")
            print(f"    Accuracy: {accuracy}")
            print(f"\n    Classification Report:\n{report}")
            print(f"\n    Confusion Matrix:\n{confusion}")

            # Store results
            results.append({
                "Dataset": dataset,
                "Feature Method": method_label,
                "Accuracy": accuracy,
                "Classification Report": report,
                "Confusion Matrix": confusion
            })

    return results

In [5]:

# Example usage: run the pipeline over every dataset and print a compact recap
if __name__ == "__main__":
    # Base file names only; the pipeline appends each feature-selection suffix itself
    datasets = [
        "preprocessed/filtered_preprocessed_GSE4290",
        "preprocessed/filtered_preprocessed_GSE19804",
        "preprocessed/filtered_preprocessed_GSE27562",
        "preprocessed/filtered_preprocessed_GSE33315",
        "preprocessed/filtered_preprocessed_GSE59856",
    ]

    # Label column shared by all CSVs; change it if the files use another name
    target_column = "Target"

    print("Running Naive Bayes classification on multiple datasets with different feature selection methods...")
    results = naive_bayes_pipeline_for_multiple_datasets(datasets, target_column=target_column)

    # One (printed label, result key) pair per summary line, in display order
    summary_fields = (
        ("\nSummary for Dataset:", "Dataset"),
        ("Feature Method:", "Feature Method"),
        ("Accuracy:", "Accuracy"),
    )
    for result in results:
        for heading, key in summary_fields:
            print(heading, result[key])

Running Naive Bayes classification on multiple datasets with different feature selection methods...

Processing dataset: preprocessed/filtered_preprocessed_GSE4290

  Feature selection method: _STD_MEAN

    Naive Bayes Classifier Performance:
    Accuracy: 0.4339622641509434

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.25      0.25      0.25         8
     glioblastoma       0.67      0.26      0.38        23
        non-tumor       0.36      0.71      0.48         7
oligodendroglioma       0.45      0.67      0.54        15

         accuracy                           0.43        53
        macro avg       0.43      0.47      0.41        53
     weighted avg       0.50      0.43      0.42        53


    Confusion Matrix:
[[ 2  2  3  1]
 [ 6  6  1 10]
 [ 0  1  5  1]
 [ 0  0  5 10]]

  Feature selection method: _ANOVA

    Naive Bayes Classifier Performance:
    Accuracy: 0.5094339622641509

    Classification Report