In [2]:
import os

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.preprocessing import StandardScaler


def standard_deviation_by_mean_ratio(df, k):
    """
    Keep the k columns of df whose |std / mean| ratio is largest.

    - df: DataFrame of feature columns to rank.
    - k: Number of columns to keep.

    Returns the selected columns of df, ordered from the highest ratio
    downwards, with their values left untouched.

    NOTE(review): a column whose mean is 0 produces an infinite or NaN
    ratio here; confirm that such columns should be ranked this way.
    """
    spread = df.std()
    centre = df.mean()
    # The absolute value makes the ratio comparable across columns whose
    # mean is negative.
    variation = (spread / centre).abs()
    keep = variation.nlargest(k).index
    return df[keep]


def anova_f_value(X, y, k):
    """
    Select top k features based on ANOVA F-value.

    - X: DataFrame of feature columns (one row per sample).
    - y: Class labels, one per row of X.
    - k: Number of features to keep.

    Returns a DataFrame holding the k selected columns, labelled with their
    original column names and carrying the same row index as X.
    """
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    top_features = X.columns[selector.get_support()]
    # Re-attach X's row index: building the frame from the bare array alone
    # would relabel the samples 0..n-1 and break any later index-aligned
    # operation (e.g. concatenating the labels back on) for a non-default index.
    return pd.DataFrame(X_new, columns=top_features, index=X.index)


def chi_square_statistics(X, y, k):
    """
    Select top k features based on Chi-Square statistics.

    - X: DataFrame of feature columns (one row per sample).
    - y: Class labels, one per row of X.
    - k: Number of features to keep.

    Returns the k selected columns of X with their original values, column
    names and row index. The standardized, absolute-valued copy of X is used
    only to score the features; it is not what gets returned.

    NOTE(review): taking the absolute value of z-scores is a workaround to
    satisfy chi2's non-negative input requirement; confirm this scoring is
    statistically appropriate for the data.
    """
    # Standardize, then take absolute values so chi2 receives non-negative input.
    X_scaled = np.abs(StandardScaler().fit_transform(X))
    selector = SelectKBest(score_func=chi2, k=k)
    selector.fit(X_scaled, y)
    # Select from the untouched X so the caller gets real feature values
    # rather than the sign-stripped scoring matrix.
    return X.loc[:, selector.get_support()]


def feature_selection_pipeline(file_path, target_column, k_values, methods, output_dir):
    """
    Perform feature selection using various methods and save results.

    - file_path: Path to the dataset (CSV).
    - target_column: Name of the target column.
    - k_values: List of k values for feature selection.
    - methods: List of methods ['std_mean', 'anova', 'chi2'].
    - output_dir: Directory to save filtered datasets (created if missing).

    One CSV is written per (method, k) pair, named
    "<dataset>_<method>_k<k>_selected.csv", holding the selected feature
    columns followed by the target column.

    Raises ValueError if a method name is not one of the supported ones.
    """
    # Every selector is called as selector(X, y, k); std_mean ignores the labels.
    selectors = {
        'std_mean': lambda features, labels, k: standard_deviation_by_mean_ratio(features, k),
        'anova': anova_f_value,
        'chi2': chi_square_statistics,
    }

    # Load dataset
    df = pd.read_csv(file_path)
    X = df.drop(columns=[df.columns[0], target_column])  # Exclude Sample IDs and target column
    y = df[target_column]

    # The output name depends only on the input file, so derive it once.
    # os.path handles the platform's path separator, unlike split("/").
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    dataset_name = base_name.replace("filtered_preprocessed_", "")

    # to_csv does not create missing directories; do it up front.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for method in methods:
        try:
            select = selectors[method]
        except KeyError:
            raise ValueError(f"Unsupported method: {method}") from None

        for k in k_values:
            print(f"Running {method.upper()} with k={k} on {file_path}...")

            filtered_features = select(X, y, k)

            # Add the target column to the filtered features
            filtered_df = pd.concat([filtered_features, y], axis=1)

            # Save the filtered dataset
            output_file = os.path.join(output_dir, f"{dataset_name}_{method}_k{k}_selected.csv")
            filtered_df.to_csv(output_file, index=False)

            print(f"Saved {method} selected features (k={k}) to {output_file}")


# Example usage
if __name__ == "__main__":
    # Input CSVs to run through the pipeline, one per dataset.
    file_paths = [
        "preprocessed/filtered_preprocessed_GSE27562.csv",
        "preprocessed/filtered_preprocessed_GSE19804.csv",
        "preprocessed/filtered_preprocessed_GSE4290.csv",
        "preprocessed/filtered_preprocessed_GSE59856.csv",
        "preprocessed/filtered_preprocessed_GSE33315.csv"
    ]

    # Label column; replace with the actual target column name.
    target_column = "Target"

    # Feature counts to try: 100, 200, 300, 400, 500.
    k_values = list(range(100, 501, 100))

    # Selection strategies applied to every dataset.
    methods = ['std_mean', 'anova', 'chi2']

    # Where the filtered datasets are written.
    output_dir = "preprocessed/selected_features"

    for file_path in file_paths:
        feature_selection_pipeline(
            file_path=file_path,
            target_column=target_column,
            k_values=k_values,
            methods=methods,
            output_dir=output_dir,
        )


Running STD_MEAN with k=100 on preprocessed/filtered_preprocessed_GSE27562.csv...
Saved std_mean selected features (k=100) to preprocessed/selected_features/GSE27562_std_mean_k100_selected.csv
Running STD_MEAN with k=200 on preprocessed/filtered_preprocessed_GSE27562.csv...
Saved std_mean selected features (k=200) to preprocessed/selected_features/GSE27562_std_mean_k200_selected.csv
Running STD_MEAN with k=300 on preprocessed/filtered_preprocessed_GSE27562.csv...
Saved std_mean selected features (k=300) to preprocessed/selected_features/GSE27562_std_mean_k300_selected.csv
Running STD_MEAN with k=400 on preprocessed/filtered_preprocessed_GSE27562.csv...
Saved std_mean selected features (k=400) to preprocessed/selected_features/GSE27562_std_mean_k400_selected.csv
Running STD_MEAN with k=500 on preprocessed/filtered_preprocessed_GSE27562.csv...
Saved std_mean selected features (k=500) to preprocessed/selected_features/GSE27562_std_mean_k500_selected.csv
Running ANOVA with k=100 on preproc

  f = msb / msw
  f = msb / msw


Saved anova selected features (k=100) to preprocessed/selected_features/GSE59856_anova_k100_selected.csv
Running ANOVA with k=200 on preprocessed/filtered_preprocessed_GSE59856.csv...
Saved anova selected features (k=200) to preprocessed/selected_features/GSE59856_anova_k200_selected.csv
Running ANOVA with k=300 on preprocessed/filtered_preprocessed_GSE59856.csv...


  f = msb / msw


Saved anova selected features (k=300) to preprocessed/selected_features/GSE59856_anova_k300_selected.csv
Running ANOVA with k=400 on preprocessed/filtered_preprocessed_GSE59856.csv...


  f = msb / msw


Saved anova selected features (k=400) to preprocessed/selected_features/GSE59856_anova_k400_selected.csv
Running ANOVA with k=500 on preprocessed/filtered_preprocessed_GSE59856.csv...


  f = msb / msw


Saved anova selected features (k=500) to preprocessed/selected_features/GSE59856_anova_k500_selected.csv
Running CHI2 with k=100 on preprocessed/filtered_preprocessed_GSE59856.csv...
Saved chi2 selected features (k=100) to preprocessed/selected_features/GSE59856_chi2_k100_selected.csv
Running CHI2 with k=200 on preprocessed/filtered_preprocessed_GSE59856.csv...
Saved chi2 selected features (k=200) to preprocessed/selected_features/GSE59856_chi2_k200_selected.csv
Running CHI2 with k=300 on preprocessed/filtered_preprocessed_GSE59856.csv...
Saved chi2 selected features (k=300) to preprocessed/selected_features/GSE59856_chi2_k300_selected.csv
Running CHI2 with k=400 on preprocessed/filtered_preprocessed_GSE59856.csv...
Saved chi2 selected features (k=400) to preprocessed/selected_features/GSE59856_chi2_k400_selected.csv
Running CHI2 with k=500 on preprocessed/filtered_preprocessed_GSE59856.csv...
Saved chi2 selected features (k=500) to preprocessed/selected_features/GSE59856_chi2_k500_sel