In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.preprocessing import StandardScaler

In [2]:
def standard_deviation_by_mean_ratio(df, k):
    """
    Select top k features based on the ratio of standard deviation to mean.
    """
    # Calculate the ratio for each feature
    feature_scores = (df.std() / df.mean()).abs()
    top_features = feature_scores.nlargest(k).index
    return df[top_features]

In [3]:
def anova_f_value(X, y, k):
    """
    Select top k features based on ANOVA F-value.
    """
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    top_features = X.columns[selector.get_support()]
    return pd.DataFrame(X_new, columns=top_features)

In [4]:
def chi_square_statistics(X, y, k):
    """
    Select top k features based on Chi-Square statistics.
    """
    # Chi-Square requires non-negative values, scale if necessary
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = np.abs(X_scaled)  # Ensure all values are non-negative
    selector = SelectKBest(score_func=chi2, k=k)
    X_new = selector.fit_transform(X_scaled, y)
    top_features = X.columns[selector.get_support()]
    return pd.DataFrame(X_new, columns=top_features)

In [4]:
def chi_square_statistics(X, y, k):
    """
    Select top k features based on Chi-Square statistics.
    """
    # Chi-Square requires non-negative values, scale if necessary
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = np.abs(X_scaled)  # Ensure all values are non-negative
    selector = SelectKBest(score_func=chi2, k=k)
    X_new = selector.fit_transform(X_scaled, y)
    top_features = X.columns[selector.get_support()]
    return pd.DataFrame(X_new, columns=top_features)

In [11]:
def feature_selection_pipeline(file_path, target_column, k):
    """
    Perform feature selection on the given dataset using three methods.
    - file_path: Path to the filtered dataset.
    - target_column: Name of the target column.
    - k: Number of features to select.
    """
    # Load dataset
    data = pd.read_csv(file_path)

    # Separate features (X) and target (y)
    X = data.drop(columns=[data.columns[0], target_column])  # Exclude Sample IDs and target column
    y = data[target_column]

    # Standard Deviation by Mean Ratio
    print("Performing Standard Deviation by Mean Ratio...")
    std_mean_selected = standard_deviation_by_mean_ratio(X, k)
    std_mean_selected[target_column] = y  # Add the target column back
    std_mean_file = file_path.replace(".csv", "_std_mean_selected.csv")
    std_mean_selected.to_csv(std_mean_file, index=False)
    print(f"Standard Deviation by Mean Ratio results saved to {std_mean_file}")
    
    # ANOVA F-Value
    print("Performing ANOVA F-Value Selection...")
    anova_selected = anova_f_value(X, y, k)
    anova_selected[target_column] = y  # Add the target column back
    anova_file = file_path.replace(".csv", "_anova_selected.csv")
    anova_selected.to_csv(anova_file, index=False)
    print(f"ANOVA F-Value results saved to {anova_file}")
    
    # Chi-Square Statistics
    print("Performing Chi-Square Statistics Selection...")
    chi2_selected = chi_square_statistics(X, y, k)
    chi2_selected[target_column] = y  # Add the target column back
    chi2_file = file_path.replace(".csv", "_chi2_selected.csv")
    chi2_selected.to_csv(chi2_file, index=False)
    print(f"Chi-Square Statistics results saved to {chi2_file}")

    print("Feature selection completed for all methods.")

In [13]:
# Example usage
if __name__ == "__main__":
    # Preprocessed datasets
    file_paths = [
        "preprocessed/filtered_preprocessed_GSE27562.csv",
        "preprocessed/filtered_preprocessed_GSE19804.csv",
        "preprocessed/filtered_preprocessed_GSE4290.csv",
        "preprocessed/filtered_preprocessed_GSE59856.csv",
        "preprocessed/filtered_preprocessed_GSE33315.csv"
    ]
    target_column = "Target"  # Replace with the actual target column name
    k = 100  # Number of features to select

    for file_path in file_paths:
        print(f"Processing {file_path} for feature selection...")
        feature_selection_pipeline(file_path, target_column, k)

Processing preprocessed/filtered_preprocessed_GSE27562.csv for feature selection...
Performing Standard Deviation by Mean Ratio...
Standard Deviation by Mean Ratio results saved to preprocessed/filtered_preprocessed_GSE27562_std_mean_selected.csv
Performing ANOVA F-Value Selection...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std_mean_selected[target_column] = y  # Add the target column back


ANOVA F-Value results saved to preprocessed/filtered_preprocessed_GSE27562_anova_selected.csv
Performing Chi-Square Statistics Selection...
Chi-Square Statistics results saved to preprocessed/filtered_preprocessed_GSE27562_chi2_selected.csv
Feature selection completed for all methods.
Processing preprocessed/filtered_preprocessed_GSE19804.csv for feature selection...
Performing Standard Deviation by Mean Ratio...
Standard Deviation by Mean Ratio results saved to preprocessed/filtered_preprocessed_GSE19804_std_mean_selected.csv
Performing ANOVA F-Value Selection...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std_mean_selected[target_column] = y  # Add the target column back


ANOVA F-Value results saved to preprocessed/filtered_preprocessed_GSE19804_anova_selected.csv
Performing Chi-Square Statistics Selection...
Chi-Square Statistics results saved to preprocessed/filtered_preprocessed_GSE19804_chi2_selected.csv
Feature selection completed for all methods.
Processing preprocessed/filtered_preprocessed_GSE4290.csv for feature selection...
Performing Standard Deviation by Mean Ratio...
Standard Deviation by Mean Ratio results saved to preprocessed/filtered_preprocessed_GSE4290_std_mean_selected.csv
Performing ANOVA F-Value Selection...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std_mean_selected[target_column] = y  # Add the target column back


ANOVA F-Value results saved to preprocessed/filtered_preprocessed_GSE4290_anova_selected.csv
Performing Chi-Square Statistics Selection...
Chi-Square Statistics results saved to preprocessed/filtered_preprocessed_GSE4290_chi2_selected.csv
Feature selection completed for all methods.
Processing preprocessed/filtered_preprocessed_GSE59856.csv for feature selection...
Performing Standard Deviation by Mean Ratio...
Standard Deviation by Mean Ratio results saved to preprocessed/filtered_preprocessed_GSE59856_std_mean_selected.csv
Performing ANOVA F-Value Selection...
ANOVA F-Value results saved to preprocessed/filtered_preprocessed_GSE59856_anova_selected.csv
Performing Chi-Square Statistics Selection...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std_mean_selected[target_column] = y  # Add the target column back
  f = msb / msw


Chi-Square Statistics results saved to preprocessed/filtered_preprocessed_GSE59856_chi2_selected.csv
Feature selection completed for all methods.
Processing preprocessed/filtered_preprocessed_GSE33315.csv for feature selection...
Performing Standard Deviation by Mean Ratio...
Standard Deviation by Mean Ratio results saved to preprocessed/filtered_preprocessed_GSE33315_std_mean_selected.csv
Performing ANOVA F-Value Selection...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std_mean_selected[target_column] = y  # Add the target column back


ANOVA F-Value results saved to preprocessed/filtered_preprocessed_GSE33315_anova_selected.csv
Performing Chi-Square Statistics Selection...
Chi-Square Statistics results saved to preprocessed/filtered_preprocessed_GSE33315_chi2_selected.csv
Feature selection completed for all methods.
