In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

In [68]:
# Load the feature table and the label table; in both CSVs the first column
# is used as the row index.
df_X = pd.read_csv('../data/X.csv', index_col=0)
df_y = pd.read_csv('../data/y.csv', index_col=0)

# scikit-learn works on plain arrays: a 2-D feature matrix and a flat label vector.
X = df_X.to_numpy()
# Label coding: 0 is HER2+, 1 is HR+, 2 is Triple Negative
y = df_y.to_numpy().ravel()

In [69]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the
# partition identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

# RFECV (recursive feature elimination with cross-validation)

In [70]:
# Recursive feature elimination with 5-fold stratified cross-validation,
# dropping one feature per step and scoring each candidate subset by accuracy.
#
# random_state is pinned because the liblinear solver shuffles the data
# internally; without a seed the selected feature set can change from one
# kernel restart to the next. 12345 is the same seed the train/test split uses.
estimator = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
    C=1,
    max_iter=100,
    penalty='l1',
    random_state=12345,
)  # best parameters from grid search
selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
selector = selector.fit(X_train, y_train)
print("Optimal number of features: ", selector.n_features_)

# support_ is a boolean mask over the columns of X_train; np.where turns it
# into 0-based positional column indices (not column labels).
selected_features = np.where(selector.support_)[0]
print("Selected features: ", selected_features)

Optimal number of features:  33
Selected features:  [ 192  230  261  671  761  800  818  851  888 1303 1306 1559 1656 1663
 1679 1788 1895 1898 1910 1994 2130 2184 2212 2213 2218 2221 2547 2723
 2732 2742 2776 2791 2817]


# Lasso (L1-penalized logistic regression)

In [71]:
# Two-round L1 feature selection: fit an L1-penalized logistic regression,
# keep the features with a non-zero coefficient for at least one class, then
# refit on that reduced set and keep the survivors again.
#
# The hyperparameters are defined once so both rounds are guaranteed to use
# the same settings. random_state is pinned because the liblinear solver
# shuffles the data internally; without it the selected features (and the
# accuracies below) are not reproducible across kernel restarts.
l1_logreg_params = dict(
    solver='liblinear',
    multi_class='ovr',
    C=1,
    max_iter=100,
    penalty='l1',
    random_state=12345,
)

# First Logistic Regression with L1 penalty
logreg1 = LogisticRegression(**l1_logreg_params)
logreg1.fit(X_train, y_train)

# Create a boolean mask for features with non-zero coefficients in any class
# (coef_ has one row per class, so reduce over axis 0).
features_first_round = np.any(logreg1.coef_ != 0, axis=0)
selected_features_first_round = df_X.columns[features_first_round].tolist()

# Apply the mask to reduce X to significant features only
X_reduced_train = X_train[:, features_first_round]
X_reduced_test = X_test[:, features_first_round]

# Second Logistic Regression with L1 penalty on the reduced feature set
logreg2 = LogisticRegression(**l1_logreg_params)
logreg2.fit(X_reduced_train, y_train)

# Identify features with non-zero coefficients in the second round.
# The second mask is relative to the reduced matrix, so it is applied on top
# of the first-round columns to recover the original column labels.
features_second_round = np.any(logreg2.coef_ != 0, axis=0)
selected_features_second_round = df_X.columns[features_first_round][features_second_round].tolist()

# Output selected features
print("Selected features in the first round:", selected_features_first_round,
      "\n length:", len(selected_features_first_round))
print("Selected features in the second round:", selected_features_second_round,
      "\n length:", len(selected_features_second_round))

# Evaluate both models on the held-out test set: the first model sees the
# full feature matrix, the second only the first-round feature subset.
y_pred_first = logreg1.predict(X_test)
accuracy_first = accuracy_score(y_test, y_pred_first)
print("Accuracy on the test set (First Model):", accuracy_first)

y_pred_second = logreg2.predict(X_reduced_test)
accuracy_second = accuracy_score(y_test, y_pred_second)
print("Accuracy on the test set (Second Model):", accuracy_second)


Selected features in the first round: ['174', '189', '192', '229', '230', '261', '263', '385', '623', '671', '744', '745', '761', '765', '771', '791', '800', '801', '802', '818', '851', '854', '857', '888', '1059', '1065', '1087', '1160', '1243', '1302', '1303', '1306', '1551', '1559', '1656', '1663', '1679', '1788', '1895', '1897', '1898', '1902', '1910', '1994', '2017', '2026', '2058', '2083', '2130', '2184', '2206', '2212', '2213', '2214', '2218', '2221', '2382', '2547', '2549', '2662', '2723', '2732', '2742', '2775', '2776', '2777', '2789', '2791', '2816', '2817', '2818', '2827'] 
 length: 72
Selected features in the second round: ['174', '189', '192', '229', '230', '261', '385', '623', '671', '744', '745', '761', '765', '771', '791', '800', '801', '802', '818', '851', '854', '857', '888', '1065', '1087', '1160', '1243', '1302', '1303', '1306', '1559', '1656', '1663', '1679', '1788', '1895', '1898', '1902', '1910', '1994', '2017', '2026', '2058', '2083', '2130', '2184', '2206', '22

In [72]:
# find overlaps between RFECV and Lasso
#
# The two selections are expressed differently: `selected_features` (RFECV)
# holds 0-based column positions, whereas `selected_features_second_round`
# (Lasso) holds column labels of df_X. Parsing the labels with int() and
# comparing them to positions is only correct when every label happens to
# equal its position, and fails outright for non-numeric labels. Map the
# labels to positions explicitly so both sides are positional indices.
second_round_features_int = [
    int(position)
    for position in df_X.columns.get_indexer(selected_features_second_round)
]

# Find the common elements (as plain Python ints, so the printed set does not
# depend on how the installed NumPy version formats its integer scalars).
common_features = set(second_round_features_int).intersection(
    int(position) for position in selected_features
)
print("Common features between RFECV and Lasso:", common_features,
      "\n length:", len(common_features))

Common features between RFECV and Lasso: {2817, 261, 2184, 1679, 1559, 1303, 1306, 671, 800, 2723, 2212, 2213, 2218, 2732, 2221, 818, 2742, 192, 1994, 2130, 851, 2776, 230, 1895, 2791, 1898, 2547, 888, 1910, 1656, 761, 1788, 1663} 
 length: 33
