# In [None]:
# feature_selection.py
# Select features by F-score or P-value

import pandas as pd
from sklearn.feature_selection import f_classif

def select_features(X_train, y_train, X_test, method='f_score', n_features=100, p_value_threshold=0.05):
    """Select features by ANOVA F-score ranking or by p-value threshold.

    Scores are computed with ``f_classif`` on the training data only; the
    resulting feature list is then applied to both the train and test frames.

    Args:
        X_train: Training features. Must expose ``.columns`` and ``.loc``
            (i.e. a pandas DataFrame).
        y_train: Training targets, passed straight to ``f_classif``.
        X_test: Test features with the same column labels as ``X_train``.
        method: ``'f_score'`` keeps the ``n_features`` highest-scoring
            columns; ``'p_value'`` keeps every column whose p-value is
            strictly below ``p_value_threshold``.
        n_features: Number of columns kept when ``method == 'f_score'``.
        p_value_threshold: Cut-off used when ``method == 'p_value'``.

    Returns:
        A ``(X_train_selected, X_test_selected)`` tuple of new frames that
        contain only the selected columns.

    Raises:
        ValueError: If ``method`` is neither ``'f_score'`` nor ``'p_value'``.
        KeyError: If a selected column is missing from ``X_test`` (raised by
            the pandas column lookup).
    """
    # Fail fast on a bad method before doing any scoring work.
    if method not in ('f_score', 'p_value'):
        raise ValueError("Method must be 'f_score' or 'p_value'")

    # Report duplicates while they are still present; after the drop below
    # this condition could never be true.
    if X_train.columns.duplicated().any():
        print("Warning: Duplicate feature names found in training data.")

    # Drop repeated column labels (first occurrence wins) BEFORE scoring, so
    # that the score arrays line up one-to-one with the remaining columns.
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    f_values, p_values = f_classif(X_train, y_train)

    if method == 'f_score':
        # NaN scores would make the sort order undefined (every comparison
        # with NaN is False), so rank them below every real score.
        ranked = sorted(
            zip(X_train.columns, f_values),
            key=lambda pair: float('-inf') if pd.isna(pair[1]) else pair[1],
            reverse=True,
        )
        top_features = [name for name, _ in ranked[:n_features]]
    else:
        # NaN p-values compare False against the threshold and are excluded.
        top_features = [
            name
            for name, p_value in zip(X_train.columns, p_values)
            if p_value < p_value_threshold
        ]

    # Return independent copies so later in-place edits by the caller do not
    # trigger chained-assignment warnings or touch the input frames.
    X_train_selected = X_train[top_features].copy()
    X_test_selected = X_test[top_features].copy()

    return X_train_selected, X_test_selected


# Sample usage:
# X_train, X_test = select_features(X_train, y_train, X_test, method='p_value')
