# Recursive Feature Elimination (RFE)

In [2]:
import pandas as pd
%run "..\Model\DataHelpers.ipynb"

print(f'Loading gene data - Start')
df = pd.read_csv('../Data/geneDataPreProcessed.csv')
print(f'Loading gene data - End')

Loading gene data - Start
Loading gene data - End


# Select Features to use in Recursive Feature Elimination (RFE)

In [18]:
# Everything except the trailing two columns is treated as a candidate feature.
# NOTE(review): the "2" presumably matches the number of target/metadata
# columns held in constTargetAndMetadata - confirm against DataHelpers.
onlyFeatures = df.shape[1] - 2
features = df.columns.tolist()[:onlyFeatures]

# Tag used later when naming the exported CSV files.
variant = FeatureVariant.RFE

### Dataset split: training and test data

In [19]:
# Plain split: candidate features plus the target/metadata columns, with
# "tnbc" passed as the target name. The meaning of the trailing positional
# True is defined by split_data in the helper notebook.
(
    X, y,
    X_train, X_test,
    y_train, y_test,
    test_case_ids,
) = split_data(df[features + constTargetAndMetadata], "tnbc", True)

print("\nApplied Smote")

# Same column selection, routed through the SMOTE-applying helper.
(
    X_smote, y_smote,
    X_train_smote, X_test_smote,
    y_train_smote, y_test_smote,
    test_case_ids_smote,
) = split_data_apply_smote(df[features + constTargetAndMetadata], "tnbc")

X_train.shape=(781, 19938)
X_test.shape=(196, 19938)
y_train.shape=(781,)
y_test.shape=(196,)

Applied Smote
X_train.shape=(1379, 19938)
X_test.shape=(345, 19938)
y_train.shape=(1379,)
y_test.shape=(345,)


# Apply Recursive Feature Elimination (RFE)

In [21]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR # Used by RFE
import numpy as np

### train n = 781

In [22]:
# Linear-kernel SVR supplies the coef_ values RFE uses to rank features.
# NOTE(review): SVR is a regressor while the "tnbc" target looks like a class
# label - consider a linear classifier here; left unchanged pending confirmation.
estimator = SVR(kernel="linear")

# Keep 25% of the features. With RFE's default step=1 only one feature is
# dropped per refit; on the 19,938 training columns that means roughly 15,000
# SVR fits, and the saved run of this cell had to be interrupted before it
# finished. A fractional step removes that share of the original feature count
# per round (0.05 -> about 996 features), so the elimination completes in
# roughly 16 refits instead.
feat_selector = RFE(estimator=estimator, n_features_to_select=0.25, step=0.05)

print('RFE - Start')
# Fit RFE to the data
feat_selector.fit(X_train, y_train)
print('RFE - End')

RFE - Start



KeyboardInterrupt



# Apply Recursive Feature Elimination (RFE) SMOTE
### SMOTE train n = 1379

In [6]:
# Shallow, class-weighted random forest used as the ranking estimator for the
# SMOTE-resampled training data; the fixed seed keeps the forest reproducible.
estimator_smote = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    max_depth=5,
    random_state=42,
)

# Retain a quarter of the features.
feat_selector_smote = RFE(
    estimator=estimator_smote,
    n_features_to_select=0.25,
)

print('RFE SMOTE - Start')
# Run the elimination on the SMOTE training split.
# (The original comment carried the note "#1205"; its meaning is not recorded.)
feat_selector_smote.fit(X_train_smote, y_train_smote)
print('RFE SMOTE - End')

RFE SMOTE - Start
RFE SMOTE - End


# Convert RFE Output to Feature Lists (SMOTE and non-SMOTE)

In [7]:
# support_ is RFE's boolean mask over the training columns: pair each column
# name with its flag and keep the names whose flag is set.
featureList = [
    column
    for column, selected in zip(X_train.columns, feat_selector.support_)
    if selected
]

smoteFeatureList = [
    column
    for column, selected in zip(X_train_smote.columns, feat_selector_smote.support_)
    if selected
]

# Report how many features each selector retained.
print(f'Featurelist nSMOTE: {len(featureList)}')
print(f'Featurelist SMOTE: {len(smoteFeatureList)}')

Featurelist nSMOTE: 71
Featurelist SMOTE: 71


# Write the Selected-Feature DataFrames to CSV

In [8]:
# Export one CSV per selection: the chosen feature columns followed by the
# target/metadata columns, without the DataFrame index.
selected_columns = featureList + constTargetAndMetadata
df[selected_columns].to_csv(f'../Data/patient_genes_all_{variant}.csv', index=False)

selected_columns_smote = smoteFeatureList + constTargetAndMetadata
df[selected_columns_smote].to_csv(f'../Data/patient_genes_all_{variant}_smote.csv', index=False)