In [None]:
import os 
import pandas as pd 
import numpy as np
from boruta import BorutaPy
from preprocessing import imputation_scaling

from sklearn.ensemble import RandomForestClassifier

# Boruta

In [None]:
# Run configuration: dataset/timepoint identifier and the name of the label column.
datasetTimepoint = "12_sterol_discoveryValidation"
target = "disease_severity"

# Input: folder and file name of the cleaned dataset for this run.
dataPath = "../../results/preprocessing/cleaned"
dataset = f"{datasetTimepoint}_{target}_cleaned.csv"

# Output: per-dataset feature-selection folder (created if it does not exist yet).
resultsPath = f"../../results/featureSelection/{datasetTimepoint}"
os.makedirs(resultsPath, exist_ok=True)

In [None]:
# --- Read data ---------------------------------------------------------------
# The first CSV column is used as the row index; `target` holds the label.
data = pd.read_csv(f"{dataPath}/{dataset}", index_col=0)
X = data.drop(columns=target)
# Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the same 1-D array.
y = data[target].to_numpy()

# --- Prepare preprocessing ---------------------------------------------------
# Columns are grouped purely by dtype: float64 -> numeric, int64 -> binary,
# object -> categorical.
# NOTE(review): columns of any other dtype (e.g. bool, int32, float32) end up in
# none of the three groups; how imputation_scaling treats them is not visible here.
num_columns = X.select_dtypes(include=["float64"]).columns
bin_columns = X.select_dtypes(include=["int64"]).columns
cat_columns = X.select_dtypes(include=["object"]).columns
preprocessor = imputation_scaling(num_columns, bin_columns, cat_columns, X)

# --- Preprocess --------------------------------------------------------------
X_preproc = preprocessor.fit_transform(X)

# Feature names are read only after fitting: get_feature_names_out() requires a
# fitted transformer, so querying it before fit_transform() relied on
# imputation_scaling having fitted the preprocessor internally.
# The first 5 characters of every name are dropped; presumably a transformer
# prefix such as "num__" / "bin__" / "cat__" - TODO confirm against imputation_scaling.
columnOrderAfterPreprocessing = [
    featureName[5:] for featureName in preprocessor.get_feature_names_out()
]

In [None]:
# --- Boruta ------------------------------------------------------------------
# Boruta and the random forest are both stochastic. With random_state=None the
# selected feature set can change on every run, so a fixed seed is used to make
# the result (and the file written from it) reproducible.
SEED = 42

# Percentile of the shadow-feature importances used as the acceptance threshold
# (100 corresponds to the maximum, i.e. the strictest setting).
perc = 100

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# Define the Boruta feature selection method; n_estimators='auto' lets Boruta
# choose the number of trees itself.
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=SEED, perc=perc)
# Find all relevant features
feat_selector.fit(X_preproc, y)

# --- Get selected variables --------------------------------------------------
# support_ is the boolean mask of confirmed features; it is applied to the
# feature names in preprocessed column order. Shown as the cell output.
np.array(columnOrderAfterPreprocessing)[feat_selector.support_]

In [None]:
# Print how many features Boruta confirmed, then show their names as the cell output.
selectedFeatures = np.array(columnOrderAfterPreprocessing)[feat_selector.support_]
print(len(selectedFeatures))
list(selectedFeatures)

# Save outcome

In [None]:
# Write the names of the confirmed features to boruta.txt, one name per line.
with open(f"{resultsPath}/boruta.txt", "w") as outFile:
    confirmedNames = np.array(columnOrderAfterPreprocessing)[feat_selector.support_]
    outFile.writelines(name + "\n" for name in confirmedNames)