In [None]:
import pandas as pd
import numpy as np
import time
import operator
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
# SET THE VARIABLES FOR LOOP

# Load the database; the 'exam' column is used as the row index.
file_csv = pd.read_csv(r"training - linear after WEKA CfsSubsetEval.csv", index_col='exam')

# Encode the outcome labels as integers.
size_mapping = {"codeletion": 0, "noncodeletion": 1}
outcome_encoded = file_csv["outcome"].map(size_mapping)

# Series.map turns every label that is missing from the mapping into NaN.
# Fail here with an explicit message instead of letting NaN targets surface
# later as an obscure error in the stratified split or in the resampling step.
unmapped_labels = file_csv.loc[outcome_encoded.isna(), "outcome"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unexpected values in 'outcome' column (expected one of %s): %s"
        % (sorted(size_mapping), list(unmapped_labels))
    )
file_csv["outcome"] = outcome_encoded

# Number of iterations; the iteration index is also used as the split seed.
LOOP = range(300)

# Model used as the estimator inside RFECV.
model = RandomForestClassifier(random_state=1, n_estimators=100)

# Number of cross-validation folds.
CV = 10

# Absolute Spearman correlation above which a feature is dropped as redundant.
correlation_limit = 0.85

# How many of the top-ranked RFECV features to keep in each iteration.
n_features_CVFSE = 15

In [None]:
# START THE LOOP

# One entry per iteration: the top-ranked feature names kept for that split.
all_features_list=[]

for exp in LOOP:

    print ("Cicle number:",exp+1)
    start = time.time()
    data_df = file_csv
    y = data_df["outcome"].values

    # Stratified 70/30 split. The iteration index is the seed, so every cycle
    # uses a different but reproducible partition.
    train_df, test_df = train_test_split(data_df, test_size=0.3, random_state=exp, stratify=y)

    x_train_df = train_df.drop("outcome", axis=1)
    x_test_df = test_df.drop("outcome", axis=1)
    y_train_df = train_df["outcome"]
    y_test_df = test_df["outcome"]

    # StandardScaler: the scaling statistics are learned on the training split
    # only. The test split is transformed with those same statistics; re-fitting
    # the scaler on the test data would scale it inconsistently with the
    # training data and leak test-set information.
    ss = StandardScaler()
    x_train_np = ss.fit_transform(x_train_df)
    x_train_df = pd.DataFrame(x_train_np, index=x_train_df.index, columns=x_train_df.columns)
    x_test_np = ss.transform(x_test_df)
    x_test_df = pd.DataFrame(x_test_np, index=x_test_df.index, columns=x_test_df.columns)

    # Absolute Spearman correlation between the (scaled) training features.
    corr_matrix = x_train_df.corr(method='spearman').abs()
    # Keep only the strict upper triangle so each feature pair is examined once.
    # The builtin bool is used for the mask (np.bool no longer exists in NumPy).
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # A feature is dropped when it correlates above the limit with any feature
    # that precedes it in column order.
    to_drop = [column for column in upper.columns if (upper[column] > correlation_limit).any()]
    # Drop the redundant features by name.
    x_train_df2 = x_train_df.drop(to_drop, axis=1)
    y_train_df2 = y_train_df.copy()

    # Balance the training data with ADASYN before feature elimination.
    sm = ADASYN(random_state=1)
    x_train_NP2_balanced_for_RFE, y_train_NP2_balanced_for_RFE = sm.fit_resample(x_train_df2, y_train_df2)
    x_train_df2_balanced_RFE = pd.DataFrame(x_train_NP2_balanced_for_RFE, columns=x_train_df2.columns)
    y_train_df2_balanced_RFE = pd.DataFrame(y_train_NP2_balanced_for_RFE, columns=["outcome"])

    # Recursive feature elimination with cross-validation (RFECV).
    model_RFE = model # Choose the model for RFECV
    rfecv = RFECV(estimator=model_RFE, step=1, cv=StratifiedKFold(CV), scoring='accuracy')
    rfecv.fit(x_train_df2_balanced_RFE, y_train_df2_balanced_RFE.values.ravel())

    # Mean CV score per candidate subset. Newer scikit-learn exposes it through
    # cv_results_; older releases only provide grid_scores_.
    if hasattr(rfecv, "cv_results_"):
        mean_cv_scores = rfecv.cv_results_["mean_test_score"]
    else:
        mean_cv_scores = rfecv.grid_scores_
    rfecv.grid_scores_df = pd.DataFrame(mean_cv_scores)
    rfecv.grid_scores_df.index += 1  # shift the index from 0-based to 1-based

    number_of_features = n_features_CVFSE
    selected_feature_names = x_train_df2_balanced_RFE.columns[list(rfecv.support_)]

    # rfecv.estimator_ is the estimator fitted on the selected features, so its
    # importances are paired positionally with selected_feature_names.
    feat_import = list(rfecv.estimator_.feature_importances_)
    feat_dict = dict(zip(selected_feature_names, feat_import))
    sorted_list = sorted(feat_dict.items(), key=operator.itemgetter(1), reverse=True)

    # Keep the names of the most important features. Each name stays wrapped in
    # its own one-element list so the structure of the exported results is unchanged.
    features_list = [[name] for name, _importance in sorted_list[:number_of_features]]
    all_features_list.append(features_list)

    end = time.time()
    print("time elapsed", end - start)

In [None]:
# Collect the per-iteration feature selections in a pandas DataFrame
# (one row per appended iteration entry) and export them as CSV without the index.
RESULT_CSV = "training_linear_all_features_list_result.csv"
all_features_list_df = pd.DataFrame(data=all_features_list)
all_features_list_df.to_csv(RESULT_CSV, index=False)