In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# Load the feature table and the label table; the first CSV column of each
# file is used as the row index.
df_X = pd.read_csv('../data/X.csv', index_col=0)
df_y = pd.read_csv('../data/y.csv', index_col=0)

# Plain NumPy arrays for scikit-learn: X keeps the 2-D layout of the frame,
# y is flattened to 1-D.
# Label encoding: 0 is HER2+, 1 is HR+, 2 is Triple Negative.
X = df_X.to_numpy()
y = df_y.to_numpy().ravel()

In [3]:
# Hold out 30% of the samples as a test set; the fixed seed makes the
# partition reproducible across runs.
# NOTE(review): the split is not stratified, while the cross-validation used
# later is StratifiedKFold -- consider whether `stratify=y` is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

In [4]:
# Recursive feature elimination with cross-validation (RFECV): drop one
# feature per iteration (step=1) and keep the feature count that gives the
# best accuracy over 5 stratified folds of the training data.
#
# The liblinear solver draws from a random number generator while fitting, so
# without a seed the selected feature set can differ from run to run.
# `random_state` pins it; 12345 is the same seed used for the train/test split.
estimator = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
    C=1,
    max_iter=100,
    penalty='l1',  # best parameters from grid search
    random_state=12345,
)
selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
selector.fit(X_train, y_train)  # fit() returns the selector itself; no rebind needed
print("Optimal number of features: ", selector.n_features_)

# `support_` is a boolean mask over the columns of X_train; report the
# positional column indices of the retained features.
selected_features = selector.support_
print("Selected features: ", np.flatnonzero(selected_features))

Optimal number of features:  33
Selected features:  [ 192  230  261  671  761  802  818  851  888 1303 1306 1559 1656 1663
 1679 1788 1895 1898 1910 1994 2130 2184 2212 2213 2218 2221 2547 2723
 2732 2742 2776 2791 2817]
