In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
def random_forest_pipeline_for_multiple_datasets(datasets, target_column="Target", n_estimators=100, test_size=0.3, random_state=42, feature_methods=None):
    """
    Train and evaluate a Random Forest classifier on every dataset / feature-selection file pair.

    For each base path in ``datasets`` and each suffix in ``feature_methods`` the CSV at
    ``base + suffix`` is read, split into stratified train/test sets, fitted and scored.
    Progress and all metrics are printed as a side effect.

    - datasets: Iterable of dataset file paths (base names without method suffix).
    - target_column: Name of the target column.
    - n_estimators: Number of trees in the forest.
    - test_size: Proportion of data for testing.
    - random_state: Random state used for both the split and the forest (reproducibility).
    - feature_methods: Optional iterable of file-name suffixes appended to each base path.
      ``None`` (the default) uses the three built-in suffixes
      ("_std_mean_selected.csv", "_anova_selected.csv", "_chi2_selected.csv").

    Returns a list with one dict per (dataset, method) pair, in processing order, holding
    the keys "Dataset", "Feature Method", "Accuracy", "Classification Report" and
    "Confusion Matrix".

    Errors raised by ``pd.read_csv`` (e.g. a missing file) or by the stratified split are
    not caught and propagate to the caller.
    """
    if feature_methods is None:
        feature_methods = (
            "_std_mean_selected.csv",
            "_anova_selected.csv",
            "_chi2_selected.csv",
        )

    results = []  # One entry per (dataset, method) pair, for later comparison

    for dataset in datasets:
        print(f"\nProcessing dataset: {dataset}")
        for method in feature_methods:
            # Computed once and reused for both the log line and the result record.
            # The leading underscore of the suffix is kept (e.g. "_ANOVA") so existing
            # consumers of the "Feature Method" value see the same labels as before.
            method_label = method.replace("_selected.csv", "").upper()
            print(f"\n  Feature selection method: {method_label}")

            # Load the dataset
            selected_features_file = dataset + method
            data = pd.read_csv(selected_features_file)

            # Separate features (X) and target (y); the first column is dropped along
            # with the target (the original author treats it as the sample ID column).
            X = data.drop(columns=[data.columns[0], target_column])
            y = data[target_column]

            # Stratified split keeps the class proportions in both partitions
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, stratify=y, random_state=random_state
            )

            # Initialize and train the Random Forest classifier
            rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
            rf.fit(X_train, y_train)

            # Make predictions
            y_pred = rf.predict(X_test)

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            # zero_division=0 reports 0.0 for classes that are never predicted (the same
            # number the default shows) without emitting UndefinedMetricWarning each time.
            report = classification_report(y_test, y_pred, zero_division=0)
            confusion = confusion_matrix(y_test, y_pred)

            print("\n    Random Forest Classifier Performance:")
            print(f"    Accuracy: {accuracy}")
            print(f"\n    Classification Report:\n{report}")
            print(f"\n    Confusion Matrix:\n{confusion}")

            # Store results
            results.append({
                "Dataset": dataset,
                "Feature Method": method_label,
                "Accuracy": accuracy,
                "Classification Report": report,
                "Confusion Matrix": confusion
            })

    return results


In [3]:

# Example usage
if __name__ == "__main__":
    # Base file names; the pipeline appends the feature-selection suffix to each one
    datasets = [
        "preprocessed/filtered_preprocessed_GSE4290",
        "preprocessed/filtered_preprocessed_GSE19804",
        "preprocessed/filtered_preprocessed_GSE27562",
        "preprocessed/filtered_preprocessed_GSE33315",
        "preprocessed/filtered_preprocessed_GSE59856",
    ]

    # Replace with the actual target column name
    target_column = "Target"

    print("Running Random Forest classification on multiple datasets with different feature selection methods...")
    results = random_forest_pipeline_for_multiple_datasets(
        datasets=datasets,
        target_column=target_column,
        n_estimators=100,
    )

    # Compact recap: one three-line summary per (dataset, method) run
    summary_fields = (
        ("\nSummary for Dataset:", "Dataset"),
        ("Feature Method:", "Feature Method"),
        ("Accuracy:", "Accuracy"),
    )
    for result in results:
        for heading, key in summary_fields:
            print(heading, result[key])


Running Random Forest classification on multiple datasets with different feature selection methods...

Processing dataset: preprocessed/filtered_preprocessed_GSE4290

  Feature selection method: _STD_MEAN


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    Random Forest Classifier Performance:
    Accuracy: 0.6037735849056604

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.00      0.00      0.00         8
     glioblastoma       0.61      0.83      0.70        23
        non-tumor       0.75      0.43      0.55         7
oligodendroglioma       0.56      0.67      0.61        15

         accuracy                           0.60        53
        macro avg       0.48      0.48      0.46        53
     weighted avg       0.52      0.60      0.55        53


    Confusion Matrix:
[[ 0  5  0  3]
 [ 0 19  1  3]
 [ 0  2  3  2]
 [ 0  5  0 10]]

  Feature selection method: _ANOVA


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    Random Forest Classifier Performance:
    Accuracy: 0.660377358490566

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.00      0.00      0.00         8
     glioblastoma       0.68      0.83      0.75        23
        non-tumor       0.70      1.00      0.82         7
oligodendroglioma       0.60      0.60      0.60        15

         accuracy                           0.66        53
        macro avg       0.49      0.61      0.54        53
     weighted avg       0.56      0.66      0.60        53


    Confusion Matrix:
[[ 0  4  0  4]
 [ 0 19  2  2]
 [ 0  0  7  0]
 [ 0  5  1  9]]

  Feature selection method: _CHI2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    Random Forest Classifier Performance:
    Accuracy: 0.6037735849056604

    Classification Report:
                   precision    recall  f1-score   support

      astrocytoma       0.00      0.00      0.00         8
     glioblastoma       0.63      0.83      0.72        23
        non-tumor       0.70      1.00      0.82         7
oligodendroglioma       0.46      0.40      0.43        15

         accuracy                           0.60        53
        macro avg       0.45      0.56      0.49        53
     weighted avg       0.50      0.60      0.54        53


    Confusion Matrix:
[[ 0  3  0  5]
 [ 0 19  2  2]
 [ 0  0  7  0]
 [ 0  8  1  6]]

Processing dataset: preprocessed/filtered_preprocessed_GSE19804

  Feature selection method: _STD_MEAN

    Random Forest Classifier Performance:
    Accuracy: 0.9444444444444444

    Classification Report:
                        precision    recall  f1-score   support

           lung cancer       1.00      0.89      0.94        18


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    Random Forest Classifier Performance:
    Accuracy: 0.5672514619883041

    Classification Report:
                precision    recall  f1-score   support

1_Hyperdiploid       0.48      0.60      0.53        35
   2_TCF3-PBX1       0.92      1.00      0.96        12
  3_ETV6_RUNX1       0.42      0.37      0.39        30
         4_MLL       1.00      0.44      0.62         9
          5_Ph       0.00      0.00      0.00         7
        6_Hypo       0.00      0.00      0.00         7
       7_Other       0.47      0.57      0.51        46
       8_T-ALL       0.79      0.92      0.85        25

      accuracy                           0.57       171
     macro avg       0.51      0.49      0.48       171
  weighted avg       0.53      0.57      0.54       171


    Confusion Matrix:
[[21  0  4  0  0  0 10  0]
 [ 0 12  0  0  0  0  0  0]
 [ 7  0 11  0  0  0 12  0]
 [ 0  0  1  4  0  0  2  2]
 [ 3  0  1  0  0  0  2  1]
 [ 3  0  1  0  0  0  3  0]
 [ 8  1  8  0  0  0 26  3]
 [ 2  0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    Random Forest Classifier Performance:
    Accuracy: 0.8070175438596491

    Classification Report:
                precision    recall  f1-score   support

1_Hyperdiploid       0.78      0.83      0.81        35
   2_TCF3-PBX1       0.92      1.00      0.96        12
  3_ETV6_RUNX1       0.91      0.97      0.94        30
         4_MLL       1.00      0.89      0.94         9
          5_Ph       0.00      0.00      0.00         7
        6_Hypo       0.00      0.00      0.00         7
       7_Other       0.68      0.78      0.73        46
       8_T-ALL       0.86      0.96      0.91        25

      accuracy                           0.81       171
     macro avg       0.64      0.68      0.66       171
  weighted avg       0.74      0.81      0.77       171


    Confusion Matrix:
[[29  0  0  0  0  0  5  1]
 [ 0 12  0  0  0  0  0  0]
 [ 0  0 29  0  0  0  0  1]
 [ 0  0  0  8  0  0  1  0]
 [ 1  0  0  0  0  0  6  0]
 [ 1  0  1  0  0  0  5  0]
 [ 5  1  2  0  0  0 36  2]
 [ 1  0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



    Random Forest Classifier Performance:
    Accuracy: 0.6432748538011696

    Classification Report:
                precision    recall  f1-score   support

1_Hyperdiploid       0.58      0.60      0.59        35
   2_TCF3-PBX1       0.92      1.00      0.96        12
  3_ETV6_RUNX1       0.83      0.67      0.74        30
         4_MLL       1.00      0.78      0.88         9
          5_Ph       0.00      0.00      0.00         7
        6_Hypo       0.00      0.00      0.00         7
       7_Other       0.43      0.63      0.51        46
       8_T-ALL       0.88      0.84      0.86        25

      accuracy                           0.64       171
     macro avg       0.58      0.56      0.57       171
  weighted avg       0.63      0.64      0.63       171


    Confusion Matrix:
[[21  0  0  0  0  0 14  0]
 [ 0 12  0  0  0  0  0  0]
 [ 0  0 20  0  0  0  9  1]
 [ 0  0  0  7  0  0  2  0]
 [ 2  0  0  0  0  0  5  0]
 [ 2  0  1  0  0  0  4  0]
 [11  1  3  0  0  0 29  2]
 [ 0  0  