In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from matplotlib.legend_handler import HandlerLine2D
from imblearn.over_sampling import ADASYN

In [None]:
# Import the CSV file of per-run selected-feature lists and convert it into a pandas DataFrame
all_features_list_df=pd.read_csv("training_cubic_all_features_list_result.csv",index_col=False)

# Count how often each value occurs across the stacked table. The result is a
# pandas Series (index = feature string, value = frequency), most frequent first.
# sort_index() runs first so that equally frequent features end up in index
# (label) order; this only holds if the subsequent sort is stable, so a stable
# algorithm is requested explicitly (the default quicksort is not stable).
all_features_count_df = (
    all_features_list_df.stack()
    .value_counts()
    .sort_index()
    .sort_values(ascending=False, kind="mergesort")
)

In [None]:
# LEARNING CURVE (AUC vs incremental number of features)

# Classifier that is re-fitted for every feature count and every CV fold
model = RandomForestClassifier(n_estimators=100, random_state=1)

# Number of folds used for cross validation
CV = 10

# Largest number of top-ranked features to evaluate
max_n_features_to_select = 50

# Accumulators filled once per evaluated feature count:
# pooled CV accuracy, pooled CV AUC, and the feature count itself
all_accuracy_score_CV, all_roc_auc_CV, list_n_features = [], [], []

# Load the training table once. It does not depend on the number of selected
# features, so re-reading and re-mapping it on every iteration was wasted I/O.
training_dataframe_df = pd.read_csv("training - cubic after WEKA CfsSubsetEval.csv",index_col='exam')
size_mapping = {"codeletion":0,"noncodeletion":1}
training_dataframe_df["outcome"] = training_dataframe_df["outcome"].map(size_mapping)
if training_dataframe_df["outcome"].isna().any():
    # .map() turns every label missing from size_mapping into NaN; fail here
    # with a clear message instead of deep inside the cross-validation.
    raise ValueError("'outcome' contains labels other than: " + ", ".join(size_mapping))

# Ranked feature names: the index entries are strings from which the first two
# and last two characters are stripped (presumably "['name']" -> "name"; verify
# against the feature-list CSV).
training_feature_names = [name[2:-2] for name in all_features_count_df.index]

# Never evaluate more features than are available: slicing past the end would
# silently repeat the same feature set under a wrong feature count.
n_features_upper_bound = min(max_n_features_to_select, len(training_feature_names))

for first_n_features_to_select in range(1, n_features_upper_bound + 1):
    print ("Computing with", first_n_features_to_select, "features:")

    # To create a dataframe with the first N ranked features (only from training dataset).
    # .copy() makes the selection an independent frame, so adding the
    # "outcome" column below does not trigger SettingWithCopyWarning.
    training_selected_features = training_feature_names[:first_n_features_to_select]
    training_New_dataframe = training_dataframe_df[training_selected_features].copy()
    training_New_dataframe["outcome"] = training_dataframe_df["outcome"]
    training_dataframe_with_selected_features_df = training_New_dataframe

    # To split the dataframe into X_np, Y_np (numpy arrays)
    X_np = training_dataframe_with_selected_features_df.drop('outcome',axis=1).values
    Y_np = training_dataframe_with_selected_features_df['outcome'].values

    # Run classifier with cross-validation and pool the out-of-fold results
    cv = StratifiedKFold(CV)
    Y_trues = []
    Y_predictions = []
    Y_probabilities = []
    # NOTE(review): tprs, aucs and mean_fpr are not used anywhere in this loop;
    # kept only in case a later cell reads them - confirm and remove if not.
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    for train, test in cv.split(X_np, Y_np):

        # StandardScaler: fitted on the training fold only, then applied to the test fold
        ss = StandardScaler()
        X_train_CV_SS_np = ss.fit_transform(X_np[train])
        X_test_CV_SS_np = ss.transform(X_np[test])

        # Balancing the scaled training fold with ADASYN (the test fold is left untouched).
        # fit_sample was renamed to fit_resample in imbalanced-learn and later
        # removed; prefer the new name and fall back for old installations.
        sm = ADASYN(random_state=1)
        resample = sm.fit_resample if hasattr(sm, "fit_resample") else sm.fit_sample
        X_train_CV_SS_BAL_np, y_train_CV_balanced = resample(X_train_CV_SS_np, Y_np[train])

        X_for_CV_model_training = X_train_CV_SS_BAL_np
        Y_for_CV_model_training = y_train_CV_balanced

        # Model fitting
        model.fit(X_for_CV_model_training, Y_for_CV_model_training)

        pred_ = model.predict(X_test_CV_SS_np)
        probas_ = model.predict_proba(X_test_CV_SS_np)

        # Pool the out-of-fold labels, predictions and class probabilities
        Y_trues.extend(Y_np[test])
        Y_predictions.extend(pred_)
        Y_probabilities.extend(probas_)

    # Compute Accuracy on the pooled out-of-fold predictions
    Y_trues_CV = Y_trues
    predicted_CV = Y_predictions
    accuracy_score_CV = accuracy_score(Y_trues_CV, predicted_CV)
    print ('Accuracy (computed with Cross Validation) for', first_n_features_to_select, 'features:', round(accuracy_score_CV,3))

    # Compute AUC from the pooled probabilities; column 1 of predict_proba is
    # the second class, i.e. label 1 under size_mapping.
    Y_probabilities_CV = Y_probabilities
    fpr_CV, tpr_CV, threshold_CV = roc_curve(Y_trues_CV, np.array(Y_probabilities_CV)[:,1])
    roc_auc_CV = auc(fpr_CV, tpr_CV)
    print ("AUC (computed with Cross Validation) for", first_n_features_to_select, 'features:', round(roc_auc_CV,3))

    # Store one point of the learning curve
    all_accuracy_score_CV.append(accuracy_score_CV)
    all_roc_auc_CV.append(roc_auc_CV)
    list_n_features.append(first_n_features_to_select)

# Plot the learning curve: both cross-validated metrics against the number of features.
# plt.plot returns a list of lines; each call here draws exactly one, so take element 0.
line1 = plt.plot(list_n_features, all_accuracy_score_CV, "r", label="Accuracy (Cross Validation)")[0]
line2 = plt.plot(list_n_features, all_roc_auc_CV, "b", label="AUC (Cross Validation)")[0]

# Axis labels
plt.xlabel("Number of features")
plt.ylabel("Metric")

# Legend: draw the accuracy entry with two marker points
accuracy_legend_handler = HandlerLine2D(numpoints=2)
plt.legend(handler_map={line1: accuracy_legend_handler})

plt.show()