# In [None]:
# feature_selection.py
# Select features by F-score or P-value

from sklearn.feature_selection import f_classif
import pandas as pd

def select_features(x, y, method='f_score', n_features=100, p_value_threshold=0.05):
    """Return the columns of ``x`` chosen by a univariate ``f_classif`` test.

    Args:
        x: Feature table. It must expose ``.columns`` and support selection
            by a collection of column labels (e.g. a pandas DataFrame).
        y: Target labels, passed straight through to ``f_classif``.
        method: ``'f_score'`` keeps the ``n_features`` highest-scoring
            columns; ``'p_value'`` keeps every column whose p-value is
            strictly below ``p_value_threshold``.
        n_features: Number of columns to keep when ``method='f_score'``.
        p_value_threshold: Exclusive upper bound on the p-value when
            ``method='p_value'``.

    Returns:
        ``x`` restricted to the selected columns. With ``'f_score'`` the
        columns are ordered from highest to lowest score; with ``'p_value'``
        they keep their original relative order.

    Raises:
        ValueError: If ``method`` is neither ``'f_score'`` nor ``'p_value'``.
    """
    # Validate before doing any work: an unknown method used to fall through
    # both branches and silently return None after computing the statistics.
    if method not in ('f_score', 'p_value'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'f_score' or 'p_value'."
        )

    f_values, p_values = f_classif(x, y)

    if method == 'f_score':
        # Pair each column label with its F-score, then keep the top N labels.
        f_scores_df = pd.DataFrame({'feature': x.columns, 'f_score': f_values})
        top_features = f_scores_df.nlargest(n_features, 'f_score')['feature']
        return x[top_features]

    # method == 'p_value': keep labels whose p-value is below the threshold.
    p_values_df = pd.DataFrame({'feature': x.columns, 'p_value': p_values})
    is_significant = p_values_df['p_value'] < p_value_threshold
    selected_features = p_values_df[is_significant]['feature']
    return x[selected_features]

# Sample usage:
# X_train_selected = select_features(X_train, y_train, method='p_value')
