In [None]:

!pip install mrmr_selection --quiet
!pip install skfeature-chappers --quiet
!pip install ReliefF --quiet
!pip install imbalanced-learn --quiet
!pip install scikit-posthocs --quiet

In [None]:
import pandas as pd
import heapq
import numpy as np
from sklearn.feature_selection import SelectFdr, RFE,f_classif
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVR, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import csv
import scipy
import scipy.io

from sklearn.model_selection import LeavePOut,LeaveOneOut, KFold
from sklearn.model_selection import cross_validate

# Feature selection preprocess
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import SelectKBest



In [None]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from scipy.io.arff import loadarff

# Duplicating a single instance for the sake of stratifiedKFold
def duplicate_single_instances(df):
    """Return ``df`` with every single-sample class duplicated once.

    The class label is read from the last column.  Each row whose label
    occurs exactly once is appended a second time so that every class has
    at least two samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column holds the class label.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no class is a singleton; otherwise a new frame
        with the duplicated rows appended and a fresh 0..n-1 index.
    """
    labels = df.iloc[:, -1]
    unique_labels, counts = np.unique(list(labels), return_counts=True)
    singletons = unique_labels[counts == 1]
    if len(singletons) == 0:
        return df

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement.  Rows are appended in sorted-label order, as before.
    extra_rows = [df[labels == label] for label in singletons]
    return pd.concat([df, *extra_rows], ignore_index=True)

# Converting labels to [1....n]
def classes_to_1_and_above(df):
    """Re-encode the last column of ``df`` in place as integers 1..n.

    Labels are ranked by their sorted order: the smallest distinct label
    becomes 1, the next 2, and so on.  ``bytes`` labels are decoded to
    ``str`` first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column holds the class labels; modified in place.

    Returns
    -------
    None
    """
    labels = df.iloc[:, -1]
    if len(labels) == 0:
        return

    # Positional access: the frame's index is not guaranteed to contain the
    # label 0 (e.g. after rows were dropped), so ``labels[0]`` could fail.
    if isinstance(labels.iloc[0], bytes):
        labels = [label.decode() for label in labels]

    # return_inverse gives, for every sample, the position of its label in
    # the sorted unique labels -- the same encoding for str and numeric input.
    _, inverse = np.unique(np.asarray(labels), return_inverse=True)
    df.iloc[:, -1] = (inverse + 1).tolist()

# Not using
def fill_labels(df):
    """Replace missing labels in the last column with ``max label + 1``.

    All missing labels are treated as one extra class whose id is one above
    the largest label present.  The frame is modified in place; nothing is
    changed when no label is missing or when every label is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column holds numeric class labels.

    Returns
    -------
    None
    """
    labels = df.iloc[:, -1]
    if not labels.isnull().values.any():
        return

    # Series.max() skips NaN.  Comparing against np.nan with == / != never
    # detects NaN, so the maximum is not tracked by hand.
    largest = labels.max()
    if pd.isna(largest):
        return  # every label is missing; there is no maximum to extend

    # Assign back through iloc: fillna(inplace=True) on df.iloc[:, -1] would
    # operate on a temporary and may leave ``df`` untouched.
    df.iloc[:, -1] = labels.fillna(largest + 1)
    
def impute(df):
    """Fill missing values in place with the mean of their column.

    Only numeric columns have a mean; other columns (for example a
    string or bytes label column) are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; modified in place.

    Returns
    -------
    None
    """
    # numeric_only=True: since pandas 2.0, mean() raises on non-numeric
    # columns instead of silently skipping them.
    column_means = df.mean(numeric_only=True)
    df.fillna(column_means, inplace=True)
    
# Main function for performing preprocessing.
# Optionally transposes the frame, turns the first row into the header, moves the first column to the end and deletes the first column.
# Additionally, performs imputation and conversion of the labels to [1....n].
def perform_df_processing(df,inv=False, FirstRowToHeader=False, putFirstLast=False,DeleteFirst=False):
    """Bring a raw dataset frame into the layout used by the experiments.

    The optional steps run in this order, followed by three fixed steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as loaded from disk.
    inv : bool
        Transpose the frame first.
    FirstRowToHeader : bool
        Use the first row as column names and drop that row.
    putFirstLast : bool
        Move the first column to the last position.
    DeleteFirst : bool
        Drop the first column.

    Returns
    -------
    pandas.DataFrame
        The processed frame: single-sample classes duplicated, missing
        values imputed, and the last column re-encoded to 1..n.
    """
    if inv:
        df = df.T
    if FirstRowToHeader:
        df.columns = df.iloc[0]  # the first row carries the column names
        df = df.drop(df.index[0])
    if putFirstLast:
        cols = df.columns.tolist()
        # cols[1:] keeps every remaining column; the former ``[1:-1]`` slice
        # silently discarded the last column while moving the first one.
        df = df[cols[1:] + cols[:1]]
    if DeleteFirst:
        # Non in-place drop: ``df`` may be a selection of another frame.
        df = df.drop(columns=[df.columns[0]])

    df = duplicate_single_instances(df)
    impute(df)
    classes_to_1_and_above(df)
    return df

def read_csv(name,inv=False, FirstRowToHeader=False, putFirstLast=False,DeleteFirst=False):
    """Load a CSV file with pandas and pass it through ``perform_df_processing``.

    The four flags are forwarded unchanged.
    """
    frame = pd.read_csv(name)
    return perform_df_processing(
        frame,
        inv=inv,
        FirstRowToHeader=FirstRowToHeader,
        putFirstLast=putFirstLast,
        DeleteFirst=DeleteFirst,
    )

def read_mat(name,inv=False, FirstRowToHeader=False, putFirstLast=False,DeleteFirst=False):
    """Load a MATLAB ``.mat`` file and pass it through ``perform_df_processing``.

    The file's ``'X'`` and ``'Y'`` entries are stacked side by side, so the
    ``'Y'`` values end up in the last column(s).  The four flags are
    forwarded unchanged.
    """
    contents = scipy.io.loadmat(name)
    stacked = np.hstack((contents['X'], contents['Y']))
    frame = pd.DataFrame(stacked)
    return perform_df_processing(
        frame,
        inv=inv,
        FirstRowToHeader=FirstRowToHeader,
        putFirstLast=putFirstLast,
        DeleteFirst=DeleteFirst,
    )

def read_arff(name,inv=False, FirstRowToHeader=False, putFirstLast=False,DeleteFirst=False):
    """Load an ARFF file and pass it through ``perform_df_processing``.

    Only the first element returned by ``loadarff`` is used.  The four
    flags are forwarded unchanged.
    """
    loaded = loadarff(name)
    frame = pd.DataFrame(loaded[0])
    return perform_df_processing(
        frame,
        inv=inv,
        FirstRowToHeader=FirstRowToHeader,
        putFirstLast=putFirstLast,
        DeleteFirst=DeleteFirst,
    )

# General read dataset file which reads all relevant kinds of dataset files
def read(name,inv=False, FirstRowToHeader=False, putFirstLast=False,DeleteFirst=False):
    """Load a dataset file, choosing the reader from the file-name suffix.

    Parameters
    ----------
    name : str
        Path of the dataset; must end in ``mat``, ``csv`` or ``arff``.
    inv, FirstRowToHeader, putFirstLast, DeleteFirst : bool
        Forwarded unchanged to the format-specific reader.

    Returns
    -------
    pandas.DataFrame
        The processed dataset.

    Raises
    ------
    ValueError
        If the suffix is none of the supported ones.  Previously ``None``
        was returned, which only failed later in an unrelated place.
    """
    if name.endswith("mat"):
        reader = read_mat
    elif name.endswith("csv"):
        reader = read_csv
    elif name.endswith("arff"):
        reader = read_arff
    else:
        raise ValueError(f"Unsupported dataset file type: {name!r}")
    return reader(name, inv, FirstRowToHeader, putFirstLast, DeleteFirst)
    
def get_x_y(df):
    """Split a frame into features and integer labels.

    Returns a pair: every column except the last, and the last column
    cast to ``int``.
    """
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1].astype('int')
    return features, target



# WRITING FUNCTION

In [None]:
from os.path import exists
# Header row of every per-dataset results file; the order matches the
# parameters of write_to_results.
csv_top = ['Dataset Name',
          'Number of samples', 'Original number of features', 
          'Filtering Algorithm', 'Learning Algorithm', 
          'Number of features selected (K)', 'CV Method',
          'Fold', 'Measure Type', 
          'Measure Value', 'List of Selected Features Names', 
          'Selected Features Scores', 'Training time', 
          'Testing time']

def write_to_results(   ds_name,
                        n_samples,
                        original_n_features,
                        filtering_algo, 
                        learning_algo,
                        n_features_selected,
                        cv_method,
                        fold,
                        measure_type, 
                        measure_value,
                        selected_features_names,
                        selected_features_scores, 
                        train_time,
                        test_time):
    """Append one result row to ``<ds_name>.csv``.

    The file is created with the ``csv_top`` header the first time a row is
    written for a dataset.  The arguments are written in the order given,
    which is the column order of ``csv_top``.

    Returns
    -------
    None
    """
    row = [
            ds_name,
            n_samples,
            original_n_features,
            filtering_algo,
            learning_algo,
            n_features_selected,
            cv_method,
            fold,
            measure_type,
            measure_value,
            selected_features_names,
            selected_features_scores,
            train_time,
            test_time]

    path = ds_name + ".csv"
    needs_header = not exists(path)
    # One handle for header and row.  newline='' is what the csv module
    # expects; the header used to be written without it, which yields
    # blank lines on platforms that translate line endings.
    with open(path, 'a', encoding='UTF8', newline='') as ds_file:
        writer = csv.writer(ds_file)
        if needs_header:
            writer.writerow(csv_top)
        writer.writerow(row)


In [None]:
# Default p for LeavePOut, and the sample-count thresholds that decide
# which cross-validation scheme a dataset gets.
base_lpo_fold = 2
small_size = 50
mid_size = 2 * small_size
large_size = 10 * mid_size

from sklearn.model_selection import StratifiedKFold

def get_lpo_fold(X=None):
    """Return the ``p`` to use for LeavePOut on ``X``.

    Parameters
    ----------
    X : sized object, optional
        The dataset.  When omitted, the module-level ``X`` is used; that
        fallback only exists for callers that still pass no argument.

    Returns
    -------
    int
        ``base_lpo_fold``, or ``len(X) - 1`` when ``X`` has no more than
        ``base_lpo_fold`` samples.
    """
    if X is None:
        X = globals()["X"]  # legacy behaviour: implicit global dataset
    n_samples = len(X)
    return base_lpo_fold if n_samples > base_lpo_fold else n_samples - 1


def get_cross_validation(X):
    """Pick the cross-validation splitter for a dataset from its size.

    * up to ``small_size`` samples: ``LeavePOut``
    * up to ``mid_size`` samples: ``LeaveOneOut``
    * up to ``large_size`` samples: 10 folds
    * above that: 5 folds

    For the fold-based cases a plain ``KFold`` is returned when the last
    column of ``X`` has fewer distinct values than folds, otherwise a
    ``StratifiedKFold``.
    """
    n_samples = len(X)
    if n_samples <= small_size:
        return LeavePOut(get_lpo_fold(X))
    if n_samples <= mid_size:
        return LeaveOneOut()

    n_splits = 10 if n_samples <= large_size else 5
    # NOTE(review): this inspects the last column of X.  evaluate_fs passes
    # the feature frame from get_x_y, so these may be feature values rather
    # than class labels -- confirm the intent.
    classes = np.unique(X.iloc[:, -1])
    if len(classes) < n_splits:
        return KFold(n_splits=n_splits)
    return StratifiedKFold(n_splits=n_splits)


def get_cross_validation_name(X):
    """Return the reported name of the CV scheme chosen for ``X``.

    Uses the same size thresholds as ``get_cross_validation``.  The fold-based
    names do not distinguish ``KFold`` from ``StratifiedKFold``.
    """
    n_samples = len(X)
    if n_samples <= small_size:
        return "LeavePOut"
    if n_samples <= mid_size:
        return "LeaveOneOut"
    if n_samples <= large_size:
        return "KFold 10"
    return "KFold 5"


def get_cross_validation_num_fold(X):
    """Return the fold parameter of the CV scheme chosen for ``X``.

    This is ``p`` for LeavePOut, 1 for LeaveOneOut, and the number of splits
    (10 or 5) for the fold-based schemes.
    """
    n_samples = len(X)
    if n_samples <= small_size:
        return get_lpo_fold(X)
    if n_samples <= mid_size:
        return 1
    if n_samples <= large_size:
        return 10
    return 5
        

# Datasets

In [None]:
def get_name(file_path):
    """Return the dataset name for a file path.

    The name is the file's base name up to its first dot, e.g.
    ``'../input/x/ALL.csv'`` gives ``'ALL'`` and
    ``'../input/x/COPDSexualDimorphism.data.csv'`` gives
    ``'COPDSexualDimorphism'``.

    The earlier implementation split the whole path on dots and took the
    third piece, which only worked for paths that start with ``'../'``.
    """
    base_name = file_path.rsplit("/", 1)[-1]
    return base_name.split(".", 1)[0]

# Every entry is unpacked into read(*entry), so the tuple layout is
# (path, inv, FirstRowToHeader, putFirstLast[, DeleteFirst]).
# A missing fifth element means DeleteFirst keeps its default (False).
datasets = [
    # bioconductor CSV files: transposed, first row used as header,
    # first column moved to the end
    ('../input/bioconductor/COPDSexualDimorphism.data.csv',True,True,True),
    ('../input/bioconductor/bcellViper.csv',True,True,True),
    ('../input/bioconductor/bladderbatch.csv',True,True,True),
    ('../input/bioconductor/ALL.csv',True,True,True),
    
    # ARFF files, read as they are
    ('../input/bioconductor/SRBCT.arff',False,False,False),
    ('../input/bioconductor/Lymphoma.arff',False,False,False),
    ('../input/bioconductor/Breast.arff',False,False,False),
    
    
    # microbiomic CSV files; the fifth flag drops the first column
    ('../input/microbiomic/CS.csv',False,False,False,True),
    ('../input/microbiomic/CSS.csv',False,False,False,True),
    ('../input/microbiomic/FS.csv',False,False,False,True),
    ('../input/microbiomic/FSH.csv',False,False,False,False),
    ('../input/microbiomic/CBH.csv',False,False,False,True),
    
    # MATLAB files, read as they are
    ('../input/sickitmat/ALLAML.mat',False,False,False),
    ('../input/sickitmat/Carcinom.mat',False,False,False),
    ('../input/sickitmat/pixraw10P.mat',False,False,False),
    ('../input/sickitmat/Prostate-GE.mat',False,False,False),
    ('../input/sickitmat/SMK-CAN-187.mat',False,False,False),
    
    
    # microarray CSV files, read as they are
    ('../input/microarray/chin.csv',False,False,False),
    ('../input/microarray/su.csv',False,False,False),
    ('../input/microarray/yeoh.csv',False,False,False),
]


# K-Values

In [None]:
# Numbers of selected features (K) evaluated for every algorithm:
# 1..5, then 10..30 in steps of 5, then 50 and 100.
k_values = [*range(1, 6), *range(10, 31, 5), 50, 100]

# Models and Measurements

# Models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVR
from sklearn.naive_bayes import GaussianNB

# Classifiers under evaluation, paired with the name written to the
# results file.  ``models`` and ``models_names`` are index-aligned.
_MODEL_TABLE = [
    ('KNN', KNeighborsClassifier(5)),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC(probability=True)),
    ('NB', GaussianNB()),
]
models = [estimator for _, estimator in _MODEL_TABLE]
models_names = [label for label, _ in _MODEL_TABLE]

# Metrics

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report



def pr_auc(y, y_pred_proba,**kwargs):
    """Macro-averaged area under the one-vs-rest precision-recall curves.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_pred_proba : array-like, 2-D
        Predicted probabilities, one column per class.
    **kwargs
        ``labels`` (optional): the class label of each probability column.
        It is used only when its length equals the number of columns;
        otherwise the sorted distinct values of ``y`` are assumed to match
        the columns.

    Returns
    -------
    float
        Mean of the per-class PR-AUC over the classes that occur in ``y``.
    """
    y_true = np.asarray(y)
    proba = np.asarray(y_pred_proba)

    labels = kwargs.get("labels")
    if labels is not None and len(labels) == proba.shape[1]:
        classes = np.asarray(labels)
    else:
        classes = np.unique(y_true)

    areas = []
    for column, class_label in enumerate(classes):
        is_positive = y_true == class_label
        if not is_positive.any():
            # A class without positives in this fold has no defined PR curve.
            continue
        precision, recall, _ = precision_recall_curve(is_positive.astype(int), proba[:, column])
        # The curve is the sequence of (recall, precision) pairs; sorting the
        # two arrays independently, as was done before, breaks that pairing.
        areas.append(auc(recall, precision))
    return sum(areas) / len(areas)

def roc_auc(y,y_pred_proba, **kwargs):
    """ROC-AUC (one-vs-one for the multiclass case), or -1 when undefined.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_pred_proba : array-like, 2-D
        Predicted probabilities, one column per class.
    **kwargs
        ``labels`` (optional): forwarded to ``roc_auc_score``.

    Returns
    -------
    float or int
        The score, or ``-1`` when ``roc_auc_score`` rejects the input with
        a ``ValueError``.
    """
    scores = np.array(y_pred_proba)
    if len(np.unique(y)) == 2:
        # Binary case: score with the second probability column only.
        scores = scores[:, 1]
    try:
        return roc_auc_score(y, scores, multi_class='ovo', labels=kwargs.get("labels"))
    except ValueError:
        # Keep the -1 sentinel for inputs the metric cannot handle, but no
        # longer hide unrelated errors behind a bare ``except``.
        return -1
    
def matthew(y,y_pred, **kwargs):
    """Matthews correlation coefficient; extra keyword arguments are ignored."""
    coefficient = matthews_corrcoef(y, y_pred)
    return coefficient
def acc(y,y_pred,**kwargs):
    """Accuracy score; extra keyword arguments are ignored."""
    accuracy = accuracy_score(y, y_pred)
    return accuracy
    
from sklearn.metrics import  make_scorer

# Returns a generic metric function for a standard API
def metric_closure(metric, needs_proba=False):
    """Wrap ``metric`` behind a uniform ``(estimator, X, y, ...)`` signature.

    ``metric`` is called as ``metric(y, predictions, labels=labels)``.
    With ``needs_proba`` the predictions are class probabilities, otherwise
    class labels.  Predictions passed in by the caller are used as they are;
    missing ones are computed from ``estimator``.
    """
    def apply_metric(estimator, X, y, y_pred=None, y_pred_proba=None,labels=None):
        if not needs_proba:
            predictions = estimator.predict(X) if y_pred is None else y_pred
            return metric(y, predictions, labels=labels)
        probabilities = estimator.predict_proba(X) if y_pred_proba is None else y_pred_proba
        return metric(y, probabilities, labels=labels)
    return apply_metric
        

# Some metrics need y to contain more than one class,
# otherwise an error is thrown.
# This is a wrapper function for these metrics.
def more_than_one_class(metric):
    """Guard ``metric`` so it is only evaluated when ``y`` has 2+ classes.

    The returned function yields ``None`` when ``y`` holds at most one
    distinct value and ``metric(y, y_pred, labels=labels)`` otherwise.
    """
    def more_than_one_class1(y,y_pred,labels=None):
        if len(np.unique(y)) > 1:
            return metric(y, y_pred, labels=labels)
        return None
    return more_than_one_class1

# Metric callables paired with the name written to the results file.
# ``metrics`` and ``metrics_names`` are index-aligned.
# AUC and ACC are wrapped so they return None when y holds a single class.
# NOTE(review): with LeaveOneOut every test fold has one sample, so both
# wrapped metrics return None there -- confirm that is intended for ACC.
_METRIC_TABLE = [
    ("AUC", metric_closure(more_than_one_class(roc_auc), True)),
    ("ACC", metric_closure(more_than_one_class(acc))),
    ("MCC", metric_closure(matthew)),
    ("PR-AUC", metric_closure(pr_auc, True)),
]
metrics = [scorer for _, scorer in _METRIC_TABLE]
metrics_names = [label for label, _ in _METRIC_TABLE]


# Feature selections

# # CL4-FS

In [None]:
def feature_selection1(X, Y, K):
    """Select the K features with the highest mutual information with Y.

    Reference kept from the original code:
    Thabtah, Fadi, et al. "Least Loss: A simplified filter method for feature selection."
    Information Sciences 534 (2020): 1-15
    NOTE(review): the implementation ranks features by
    ``sklearn.metrics.mutual_info_score`` only -- confirm the citation.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Samples in rows, features in columns.
    Y : array-like
        Class label of every sample.
    K : int
        Number of features to select; values above the number of columns
        select every column.

    Returns
    -------
    numpy.ndarray
        Integer mask with one entry per column: 1 for selected, 0 otherwise.
        Ties are resolved in favour of the lower column index.
    """
    from sklearn.metrics import mutual_info_score
    import numpy as np

    num_cols = X.shape[1]
    # A feature's score depends only on that feature and Y, so it is
    # computed once.  The previous loop recomputed it for every remaining
    # feature after each pick and took the minimum with the same value.
    scores = np.array(
        [mutual_info_score(Y, X[:, col]) for col in range(num_cols)],
        dtype=float,
    )

    # Stable descending order reproduces "repeated argmax".  Unlike the old
    # loop, which reset a picked score to 0, it cannot pick the same column
    # twice when the remaining scores are all 0.
    ranking = np.argsort(-scores, kind="stable")
    n_selected = max(min(K, num_cols), 0)

    mask = np.zeros(num_cols, dtype=int)
    mask[ranking[:n_selected]] = 1
    return mask

def feature_selection2(X, Y, K):
    """Select the K features that deviate most from independence with Y.

    Reference kept from the original code:
    R. Cai, Z. Hao, X. Yang, W. Wen
    An efficient gene selection algorithm based on mutual information
    Neurocomputing, 72 (4-6) (2009), pp. 991-999

    For every column the score is the sum, over all (feature value, class)
    pairs, of ``(p(x, y) - p(x) * p(y)) ** 2`` with probabilities estimated
    as relative frequencies.  Returns an integer mask with 1 for each of the
    K highest-scoring columns (lower index wins ties) and 0 elsewhere.
    """
    import numpy as np

    targets = np.asarray(Y)
    y_values = np.unique(Y)
    # Class frequencies are the same for every column.
    class_probs = [np.count_nonzero(targets == yv) / len(Y) for yv in y_values]

    def column_score(column):
        n = len(column)
        total = 0
        for xv in np.unique(column):
            x_hits = column == xv
            x_prob = np.count_nonzero(x_hits) / n
            for yv, y_prob in zip(y_values, class_probs):
                joint = np.count_nonzero(x_hits & (targets == yv)) / n
                total += (joint - (x_prob * y_prob)) ** 2
        return total

    num_cols = X.shape[1]
    scores = [column_score(X[:, col]) for col in range(num_cols)]

    # Stable descending sort: equal scores keep ascending column order.
    ranking = np.argsort(-np.asarray(scores, dtype=float), kind="stable")
    n_selected = max(min(K, num_cols), 0)

    mask = np.zeros(num_cols, dtype=int)
    mask[ranking[:n_selected]] = 1
    return mask
        

def feature_selection_improved(X, Y, K):
    """Variant of ``feature_selection2`` with a tolerance on feature values.

    Reference kept from the original code:
    R. Cai, Z. Hao, X. Yang, W. Wen
    An efficient gene selection algorithm based on mutual information
    Neurocomputing, 72 (4-6) (2009), pp. 991-999

    The score of a column is the sum, over all (feature value, class) pairs,
    of ``(p(x, y) - p(x) * p(y)) ** 2``.  Here ``p(x)`` counts every sample
    within 1% of the column's value range around ``x``; ``p(x, y)`` uses
    exact matches.
    NOTE(review): the joint term has no margin while the marginal has one --
    confirm that this asymmetry is intended.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Samples in rows, numeric features in columns.
    Y : array-like
        Class label of every sample.
    K : int
        Number of features to select.

    Returns
    -------
    numpy.ndarray
        Integer mask with 1 for each of the K highest-scoring columns
        (lower index wins ties) and 0 elsewhere.
    """
    import numpy as np

    targets = np.asarray(Y)
    y_values = np.unique(Y)
    class_probs = [np.count_nonzero(targets == yv) / len(Y) for yv in y_values]

    def column_score(column):
        n = len(column)
        margin = (np.amax(column) - np.amin(column)) * 0.01
        total = 0
        for xv in np.unique(column):
            # Fraction of samples inside [xv - margin, xv + margin].
            # Fixes: the old code counted the nonzero *indices* returned by
            # np.where (dropping sample 0) and never divided by n, so the
            # value was not a probability.  The bounds are inclusive so a
            # constant column (margin 0) still matches its own value.
            near = np.logical_and(column >= xv - margin, column <= xv + margin)
            x_prob = np.count_nonzero(near) / n
            exact = column == xv
            for yv, y_prob in zip(y_values, class_probs):
                joint = np.count_nonzero(exact & (targets == yv)) / n
                total += (joint - (x_prob * y_prob)) ** 2
        return total

    num_cols = X.shape[1]
    scores = [column_score(X[:, col]) for col in range(num_cols)]

    # Stable descending sort: equal scores keep ascending column order.
    ranking = np.argsort(-np.asarray(scores, dtype=float), kind="stable")
    n_selected = max(min(K, num_cols), 0)

    mask = np.zeros(num_cols, dtype=int)
    mask[ranking[:n_selected]] = 1
    return mask

In [None]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFdr
import heapq

import ReliefF
from skfeature.function.information_theoretical_based.MRMR import mrmr
from skfeature.function.similarity_based import reliefF


def get_k_biggest(a,K):
    """Return the positions of the K largest entries of ``a``, largest first.

    ``a`` must provide ``take`` (as numpy arrays do), which is used to look
    up the value at each position.
    """
    positions = range(len(a))
    return heapq.nlargest(K, positions, key=a.take)

# Creating a class for each Feature selection algorithm so 
# it would fit exactly to the pipeline of evaluating the algorithms

class mrmrclass:
    """Feature selector built on skfeature's ``mrmr``, with fit/transform methods."""

    def __init__(self,K):
        self.K = K
        self.features = None       # column positions chosen by fit()
        self.features_names = []   # column labels of the last transformed frame
        self.scores_ = []

    def fit(self,X,y):
        """Run ``mrmr`` on the numpy views of ``X`` and ``y`` and keep its result."""
        self.features = mrmr(X.to_numpy(), y.to_numpy(), mode="index", n_selected_features=self.K)
        # Only ``mrmr``'s return value is kept; a placeholder score of 1 is
        # stored for every selected feature.
        self.scores_ = [1] * len(self.features)

    def transform(self,X):
        """Return the selected columns of ``X`` and remember their names."""
        selected = X.iloc[:, self.features]
        self.features_names = list(selected.columns)
        return selected

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the placeholder scores stored by ``fit``."""
        return self.scores_
    
class reliefFclass:
    """Feature selector built on the ``ReliefF`` package, with fit/transform methods."""

    def __init__(self,K):
        self.K = K
        self.rel = None            # fitted ReliefF object
        self.features = None       # column positions of the top K features
        self.features_names = []   # their column labels
        self.scores_ = []          # their ReliefF scores

    def fit(self,X,y):
        """Fit ReliefF (5 neighbours) and store the top K features and their scores."""
        self.rel = ReliefF.ReliefF(n_neighbors=5, n_features_to_keep=self.K)
        self.rel.fit(X.to_numpy(), y.to_numpy())
        self.features = self.rel.top_features[:self.K]
        self.features_names = list(X.iloc[:, self.features].columns)
        self.scores_ = self.rel.feature_scores[self.features]

    def transform(self,X):
        """Return the columns of ``X`` named in ``features_names``."""
        return X.loc[:, self.features_names]

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the scores of the selected features."""
        return self.scores_
    
class RFEclass:
    """Feature selector built on sklearn's ``RFE`` with a linear ``SVR``."""

    def __init__(self,K):
        self.K = K
        self.rfe = None            # fitted RFE object
        self.features = None       # column positions kept by RFE
        self.features_names = []   # their column labels
        self.scores_ = []

    def fit(self,X,y):
        """Fit RFE down to K features and record which columns survive."""
        eliminator = RFE(SVR(kernel="linear"), n_features_to_select=self.K, verbose=2)
        self.rfe = eliminator.fit(X, y)
        self.features = self.rfe.get_support(True)
        self.features_names = list(X.iloc[:, self.features].columns)
        # A placeholder score of 1 is stored for every requested feature.
        self.scores_ = [1] * self.K

    def transform(self,X):
        """Return the selected columns of ``X``."""
        return X.iloc[:, self.features]

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the placeholder scores stored by ``fit``."""
        return self.scores_
    
class FDRclass:
    """Top-K feature selector ranked by the ``f_classif`` scores of ``SelectFdr``.

    ``SelectFdr`` is used only as a source of per-feature scores; the K
    highest-scoring columns are kept regardless of its own selection.
    """

    def __init__(self,K):
        self.features_names = []   # labels of the selected columns
        self.fdr = None            # fitted SelectFdr object
        self.K = K
        self.features = None       # positions of the selected columns
        self.scores_ = []          # score of every input column

    def fit(self,X,y):
        """Fit ``SelectFdr(f_classif, alpha=0.1)`` and keep its scores."""
        self.fdr = SelectFdr(f_classif, alpha=0.1)
        self.fdr.fit(X, y)
        self.scores_ = self.fdr.scores_

    def transform(self,X):
        """Return the K highest-scoring columns of ``X``, best first."""
        scores = np.asarray(self.scores_, dtype=float)
        # argsort places NaN last, so reversing it would rank NaN scores
        # first.  Treat NaN as the lowest possible score.
        comparable = np.where(np.isnan(scores), -np.inf, scores)
        self.features = np.argsort(comparable)[::-1][0:self.K]
        new_X = X.iloc[:, self.features]
        self.features_names = list(new_X.columns)
        return new_X

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the scores of the selected features, in selection order."""
        return [self.scores_[i] for i in self.features]
    
class fs1:
    """fit/transform wrapper around ``feature_selection1``."""

    def __init__(self,K):
        self.K = K
        self.features = None       # positions of the selected columns
        self.features_names = []   # their column labels
        self.scores_ = []

    def fit(self,X,y):
        """Run ``feature_selection1`` and record the columns it marks with 1."""
        # The function returns a 0/1 mask, e.g. [0,1,1,0,0,1].
        mask = feature_selection1(X.to_numpy(), y.to_numpy(), self.K)
        self.features = [position for position, flag in enumerate(mask) if flag == 1]
        self.features_names = np.array(X.columns[self.features])
        # A placeholder score of 1 is stored for every requested feature.
        self.scores_ = [1] * self.K

    def transform(self,X):
        """Return the columns of ``X`` named in ``features_names``."""
        return X.loc[:, self.features_names]

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the placeholder scores stored by ``fit``."""
        return self.scores_
    
class fs2:
    """fit/transform wrapper around ``feature_selection2``."""

    def __init__(self,K):
        self.K = K
        self.features = None       # positions of the selected columns
        self.features_names = []   # their column labels
        self.scores_ = []

    def fit(self,X,y):
        """Run ``feature_selection2`` and record the columns it marks with 1."""
        # The function returns a 0/1 mask, e.g. [0,1,1,0,0,1].
        mask = feature_selection2(X.to_numpy(), y.to_numpy(), self.K)
        self.features = [position for position, flag in enumerate(mask) if flag == 1]
        self.features_names = np.array(X.columns[self.features])
        # A placeholder score of 1 is stored for every requested feature.
        self.scores_ = [1] * self.K

    def transform(self,X):
        """Return the columns of ``X`` named in ``features_names``."""
        return X.loc[:, self.features_names]

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the placeholder scores stored by ``fit``."""
        return self.scores_
    
    
    
class fs2Improved:
    """fit/transform wrapper around ``feature_selection_improved``."""

    def __init__(self,K):
        self.K = K
        self.features = None       # positions of the selected columns
        self.features_names = []   # their column labels
        self.scores_ = []

    def fit(self,X,y):
        """Run ``feature_selection_improved`` and record the columns it marks with 1."""
        # The function returns a 0/1 mask, e.g. [0,1,1,0,0,1].
        mask = feature_selection_improved(X.to_numpy(), y.to_numpy(), self.K)
        self.features = [position for position, flag in enumerate(mask) if flag == 1]
        self.features_names = np.array(X.columns[self.features])
        # A placeholder score of 1 is stored for every requested feature.
        self.scores_ = [1] * self.K

    def transform(self,X):
        """Return the columns of ``X`` named in ``features_names``."""
        return X.loc[:, self.features_names]

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        self.fit(X, y)
        return self.transform(X)

    def get_scores(self):
        """Return the placeholder scores stored by ``fit``."""
        return self.scores_
    
# All feature selection algorithms
# All feature selection algorithms.
# Each entry is called as ``entry(k)`` and returns a selector object.
# The classes are stored directly instead of ``lambda k: cls(k)`` wrappers:
# a lambda looks the class name up when it is called, so rebinding that
# global name later (as the Part D loop does with ``fs1``) would break it.
fs = [
    fs1,
    fs2,
    fs2Improved,
    mrmrclass,
    FDRclass,
    reliefFclass,
    RFEclass,
]
# Names written to the results file, index-aligned with ``fs``.
fs_names = [
    "mutual_information",
    "least_loss",
    "least_loss_improved",
    "mrmr",
    "f_classif",
    "reliefF",
    "RFE",
]


# Reading All Of The Datasets into **dataframes**


In [None]:
# Load and preprocess every dataset once; df_arr[j] belongs to datasets[j].
# Each tuple in ``datasets`` is unpacked into the positional arguments of read().
df_arr = [read(*ds) for ds in datasets]

# Feature Selection Methods

In [None]:
import time
class FoldInfo:
    """Record of one metric value measured on one cross-validation fold."""

    def __init__(self,fold, measure_type, measure_value, selected_features, selected_features_scores,train_time, test_time):
        # Which fold and which metric the value belongs to.
        self.fold, self.measure_type, self.measure_value = fold, measure_type, measure_value
        # What the feature selector chose on that fold.
        self.selected_features, self.selected_features_scores = selected_features, selected_features_scores
        # Timings measured on that fold.
        self.train_time, self.test_time = train_time, test_time
        
# selects K-best features for a df, and returns a df instead of np.array
def selectKBest_df(X,y,k): # X,y are dataframes!
    """Return the columns of ``X`` that ``SelectKBest(k=k)`` keeps, as a DataFrame."""
    selector = SelectKBest(k=k)
    selector.fit(X, y)
    kept_positions = selector.get_support(True)
    return X.iloc[:, kept_positions]



def create_pipeline(fs,model,preprocess=None):
    """Build a two-step pipeline: feature selector ``"fs"`` then ``"model"``.

    ``preprocess`` is accepted but not used.
    """
    steps = [("fs", fs), ("model", model)]
    return Pipeline(steps)

def cross_validate_once(model,cv,X,y):
    """Run sklearn's ``cross_validate`` with every metric in ``metrics``.

    The scoring dict maps each name in ``metrics_names`` to the metric at
    the same position in ``metrics``.
    """
    scoring = {metrics_names[position]: scorer for position, scorer in enumerate(metrics)}
    return cross_validate(model, X, y, scoring=scoring, cv=cv, verbose=1)

def apply_with_time(f):
    """Call ``f()`` and measure how long the call takes.

    Parameters
    ----------
    f : callable
        Zero-argument function to run.

    Returns
    -------
    tuple
        ``(result, seconds)``: the return value of ``f`` and the elapsed
        time in seconds.
    """
    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    res = f()
    # start_time is a float; it used to be *called* here, raising TypeError.
    time_took = time.perf_counter() - start_time
    return (res, time_took)

from tqdm import tqdm
# Applying custom CV function given a cv.
# Calculating the relevant metrics for the model and return the results in a special object.
def apply_cv(model,fs_model,cv,X,y): # X,y are dataFrame!
    """Evaluate ``model`` on every split of ``cv`` and collect the results.

    Parameters
    ----------
    model : estimator
        Fitted and scored on each split by ``evaluate_metrics``.
    fs_model : selector
        Read after each fit for ``features_names`` and ``get_scores()``.
    cv : splitter
        Object providing ``split(X, y)``.
    X, y : pandas objects
        Features and labels, indexed by position with ``iloc``.

    Returns
    -------
    list of FoldInfo
        One entry per (fold, metric) pair; folds are numbered from 1.
    """
    folds_information = []
    for fold_counter, (train_index, test_index) in enumerate(tqdm(cv.split(X, y)), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_time, test_time, evaluations = evaluate_metrics(model, X_train, y_train, X_test, y_test)
        features_names = fs_model.features_names
        scores = fs_model.get_scores()
        # evaluations is ordered like ``metrics``, which is index-aligned
        # with ``metrics_names``.
        for measure_type, measure_value in zip(metrics_names, evaluations):
            folds_information.append(FoldInfo(fold_counter, measure_type, measure_value,
                                              features_names, scores, train_time, test_time))
    return folds_information
        
# returns the evaluation of all metrics for the model
def evaluate_metrics(model,X_train,y_train,X_test,y_test):
    """Fit ``model`` on the training split and score it with every metric.

    Returns ``(train_time, test_time, evaluations)``.  ``train_time`` covers
    ``fit``; ``test_time`` covers only the ``predict_proba`` call.
    ``evaluations`` holds one value per entry of ``metrics``, in order.
    """
    fit_started = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - fit_started

    # Only predict_proba is timed; predict runs after the clock is stopped.
    proba_started = time.time()
    y_pred_proba = model.predict_proba(X_test)
    test_time = time.time() - proba_started
    y_pred = model.predict(X_test)

    # Every class seen in either split of this fold.
    labels = np.union1d(np.unique(y_train), np.unique(y_test))

    evaluations = []
    for metric in metrics:
        evaluations.append(metric(model, X_test, y_test, y_pred, y_pred_proba, labels))
    return train_time, test_time, evaluations



# Starting the pipelining


In [None]:
# Evaluates all metrics on all models for a given feature selection algorithm.
# There are more ways of running this function for part D (passing a specific K, a specific model, etc.)
def evaluate_fs(fs,fs_name,X,y,ds_name,cv_function,model=None,model_name=None,k_only=None,fs_ready=False):
    """Evaluate one feature-selection algorithm and write every result row.

    Parameters
    ----------
    fs : callable or selector
        Called as ``fs(k)`` to build a selector, or used directly as the
        selector when ``fs_ready`` is true.
    fs_name : str
        Name written to the results file.
    X, y : pandas objects
        Features and labels of the dataset.
    ds_name : str
        Dataset name; also the base name of the results file.
    cv_function : callable
        Called as ``cv_function(pipeline, fs_model, cv, X, y)``; must return
        an iterable of ``FoldInfo``.
    model, model_name : optional
        Evaluate only this model instead of every entry of ``models``.
    k_only : optional
        Evaluate only this K instead of every entry of ``k_values``.
    fs_ready : bool
        See ``fs``.

    Returns
    -------
    None
    """
    number_of_samples = len(X)
    original_number_of_features = len(X.columns)

    if model is not None:
        candidate_models, candidate_names = [model], [model_name]
    else:
        candidate_models, candidate_names = models, models_names
    candidate_ks = k_values if k_only is None else [k_only]

    # The CV scheme depends only on X, so it is chosen once rather than
    # once per (model, k) pair.
    cv = get_cross_validation(X)
    cv_method = get_cross_validation_name(X)

    for base_model, learning_algorithm in zip(candidate_models, candidate_names):
        for k in candidate_ks:
            fs_model = fs if fs_ready else fs(k)
            pipeline = create_pipeline(fs_model, base_model)

            for info in cv_function(pipeline, fs_model, cv, X, y):
                write_to_results(ds_name, number_of_samples, original_number_of_features,
                                 fs_name, learning_algorithm, k, cv_method,
                                 info.fold, info.measure_type, info.measure_value,
                                 info.selected_features, info.selected_features_scores,
                                 info.train_time, info.test_time)

Running algorithms on all datasets in the datasets array

In [None]:
from tqdm import tqdm
# Evaluate every feature-selection algorithm on every dataset.
# X, y and ds_name are deliberately module-level names, as before.
for fs_factory, fs_label in tqdm(list(zip(fs, fs_names))): # For each feature selection
    for dataset_spec, frame in zip(datasets, df_arr): # For each dataset
        X, y = get_x_y(frame)
        X, y = X.copy(), y.copy()
        ds_name = get_name(dataset_spec[0])

        evaluate_fs(fs_factory, fs_label, X, y, ds_name, apply_cv)
print("Finished")

Part D

In [None]:
# Combined results of all runs.
results = pd.read_csv("../input/all-results/ds_results_combined2.csv")

# Distinct dataset names, in order of first appearance.
ds_names = results["Dataset Name"].unique()
# Column names without the first two and the last one.
column_names = results.columns.values.tolist()[2:-1]
# print(column_names)

In [None]:
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
import random


fs_col = "Filtering Algorithm"  # results column naming the feature-selection algorithm
k_col = "Number of features selected (K)"  # results column holding K
n_comp = 5  # n_components of both KernelPCA transformers in apply_section_d
model_col = "Learning Algorithm"  # results column naming the classifier


def apply_cv2(model,fs_model,cv,X,y): # X,y are dataFrame!
    """Like ``apply_cv``, but runs every split through ``apply_section_d`` first.

    Parameters
    ----------
    model : estimator
        Fitted and scored on each split by ``evaluate_metrics``.
    fs_model : selector
        Read after each fit for ``features_names`` and ``get_scores()``.
    cv : splitter
        Object providing ``split(X, y)``.
    X, y : pandas objects
        Features and labels, indexed by position with ``iloc``.

    Returns
    -------
    list of FoldInfo
        One entry per (fold, metric) pair; folds are numbered from 1.
    """
    folds_information = []
    for fold_counter, (train_index, test_index) in enumerate(tqdm(cv.split(X, y)), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Per-fold augmentation: extra features plus oversampling.
        X_train, y_train, X_test, y_test = apply_section_d(X_train, y_train, X_test, y_test)

        train_time, test_time, evaluations = evaluate_metrics(model, X_train, y_train, X_test, y_test)
        features_names = fs_model.features_names
        scores = fs_model.get_scores()
        # evaluations is ordered like ``metrics``, which is index-aligned
        # with ``metrics_names``.
        for measure_type, measure_value in zip(metrics_names, evaluations):
            folds_information.append(FoldInfo(fold_counter, measure_type, measure_value,
                                              features_names, scores, train_time, test_time))
    return folds_information
        

def get_minimal_samples(y_train):
    """Return the number of samples in the least frequent class of ``y_train``."""
    samples_per_class = y_train.value_counts()
    return samples_per_class.min()

def apply_section_d(X_train,y_train,X_test,y_test):
    """Augment one CV split with KernelPCA features and oversample the training part.

    Two KernelPCA transformers (linear and rbf, ``n_comp`` components each)
    are fitted on the training features.  Their outputs are appended as
    extra columns to both the training and the test features.  The training
    part is then oversampled with BorderlineSMOTE.

    Parameters
    ----------
    X_train, X_test : array-like, 2-D
        Features of the split.
    y_train, y_test : pandas.Series
        Labels of the split; ``y_test`` is returned unchanged.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` with the feature parts as
        DataFrames with positional column names.
    """
    # create KernelPCA of kind linear and rbf
    transformer1 = KernelPCA(n_components=n_comp, kernel='linear')
    transformer2 = KernelPCA(n_components=n_comp, kernel='rbf')

    # fit both transformers on the training features only
    print("KernelIPCA linear fit trans on train...")
    X1_train = transformer1.fit_transform(X_train)
    print("KernelIPCA rbf fit trans on train...")
    X2_train = transformer2.fit_transform(X_train)
    print("Done fit trans on train!\n\n")

    # original features followed by the linear and then the rbf components
    X_train = np.concatenate((X_train, X1_train, X2_train), axis=1)

    # apply the already fitted transformers to the test features
    print("KernelIPCA linear transform on test...")
    X1_test = transformer1.transform(X_test)
    print("KernelIPCA rbf transform on test...")
    X2_test = transformer2.transform(X_test)
    print("Done fit on test!\n\n")

    X_test = np.concatenate((X_test, X1_test, X2_test), axis=1)

    # Oversample the training part.  The neighbour counts are capped by the
    # size of the smallest class, because a sample cannot have more
    # same-class neighbours than that class has other members.
    print("Applying SMOTE...")
    min_samp = get_minimal_samples(y_train)
    kn = min(min_samp - 1, 5)
    mn = min(min_samp - 1, 10)
    if kn >= 1:
        oversample = BorderlineSMOTE(k_neighbors=kn, m_neighbors=mn)
        X_train, y_train = oversample.fit_resample(X_train, y_train)
    else:
        # A class with a single training sample leaves no neighbour to
        # interpolate with; keep the split as it is instead of failing.
        print("Skipping SMOTE: a class has a single training sample")

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    return X_train, y_train, X_test, y_test

# Part D driver: for each of the first 10 datasets, take the configuration
# (feature selector, K, classifier) with the highest recorded AUC and
# re-evaluate it with the augmentation of apply_cv2.
for name in ds_names[:10] :
    print()
    print("Computing ",name, " :")
    print()
    relevent_lines = results[results["Dataset Name"] == name]

    auc_lines = relevent_lines[relevent_lines["Measure Type"] == "AUC"]
    if auc_lines.empty:
        print("No AUC rows for", name, "- skipping")
        continue

    # Series.max() skips missing values.  The old ``!= None`` filter kept
    # every row, and the old ``DataFrame.append`` call discarded its result.
    auc_values = auc_lines["Measure Value"]
    max_lines = auc_lines[auc_values == auc_values.max()]
    # Fall back to the first AUC row when no row equals the maximum
    # (this happens when every value is missing).
    max_line = (auc_lines if max_lines.empty else max_lines).iloc[[0]]

    df = pd.read_csv("../input/allsets/" + name + ".csv")

    # read the data and divide to X and y
    X,y = get_x_y(df)

    # look up the feature selector, K and model of the best row
    k = max_line[k_col].values[0]
    fs_name = max_line[fs_col].values[0]
    fs_index = fs_names.index(fs_name)
    fs_algo = fs[fs_index]

    model_name = max_line[model_col].values[0]
    model_index = models_names.index(model_name)
    model = models[model_index]
    print("model:,",model_name,model)


    print(f"{fs_name} fitting k = {k}...\n\n")
    # Named ``selector`` rather than ``fs1``: the old name rebound the
    # global class ``fs1`` to an instance.
    selector = fs_algo(k)
    X = selector.fit_transform(X,y)

    # evaluate the chosen model on the reduced data with the augmented CV;
    # the rows are appended to the dataset's results file with "_Aug"
    # added to the model name
    evaluate_fs(selector,fs_name,X,y,name,apply_cv2,model,model_name+"_Aug",k,True)
    # report the results in according to the right CV algorithm (add the results to the csv of all the results)


Part E

In [None]:
def part_e():
    """Print a Friedman test and a Nemenyi post-hoc test on the AUC results.

    The combined results file is read, the AUC values are grouped by
    filtering algorithm, and the groups are compared with
    ``scipy.stats.friedmanchisquare`` followed by
    ``scikit_posthocs.posthoc_nemenyi_friedman``.

    Returns
    -------
    None
    """
    from scipy import stats
    import pandas as pd
    import numpy as np
    import scikit_posthocs as sp

    results = pd.read_csv("../input/all-results/ds_results_combined2.csv")

    f_names = results["Filtering Algorithm"].unique()
    auc_rows = results[results["Measure Type"] == "AUC"]

    # One list of AUC values per filtering algorithm, in order of first
    # appearance.  Works for any number of algorithms; the count was
    # previously hard-coded to 7.
    all_arrays = [
        auc_rows.loc[auc_rows["Filtering Algorithm"] == name, "Measure Value"].tolist()
        for name in f_names
    ]

    # NOTE(review): this pads the third group with the last five values of
    # the first group, presumably to equalise the group lengths.  Those
    # values were not measured for the third algorithm -- confirm this
    # workaround before relying on the test results.
    all_arrays[2] += all_arrays[0][-5:]

    print(stats.friedmanchisquare(*all_arrays))

    # combine the groups into one array, one row per algorithm
    data = np.array(all_arrays)

    # perform Nemenyi post-hoc test (expects one column per group)
    print(sp.posthoc_nemenyi_friedman(data.T))