In [1]:
!pip install pyCausalFS tabulate xgboost scipy feature_engine

Collecting pyCausalFS
  Downloading pyCausalFS-0.23-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.9/300.9 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting feature_engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyCausalFS, feature_engine
Successfully installed feature_engine-1.6.2 pyCausalFS-0.23


In [2]:
import pandas as pd
import os
import math
from pyCausalFS.CBD.MBs.IAMB import IAMB
from pyCausalFS.CBD.MBs.BAMB import BAMB
from pyCausalFS.CBD.MBs.STMB import STMB
from pyCausalFS.CBD.MBs.MBOR import MBOR
from pyCausalFS.CBD.MBs.LCMB import LRH
from pyCausalFS.CBD.MBs.MMMB.MMMB import MMMB
from pyCausalFS.CBD.MBs.HITON.HITON_MB import HITON_MB
from pyCausalFS.CBD.MBs.HITON.HITON_PC import HITON_PC
from pyCausalFS.CBD.MBs.MMMB.MMPC import MMPC
from pyCausalFS.CBD.MBs.GSMB import GSMB
from pyCausalFS.CBD.MBs.fast_IAMB import fast_IAMB
from pyCausalFS.CBD.MBs.inter_IAMB import inter_IAMB
from pyCausalFS.CBD.MBs.IAMBnPC import IAMBnPC
from pyCausalFS.CBD.MBs.interIAMBnPC import interIAMBnPC
from pyCausalFS.CBD.MBs.FBEDk import FBED
from pyCausalFS.CBD.MBs.PCMB.PCMB import PCMB
from pyCausalFS.CBD.MBs.semi_HITON.semi_HITON_MB import semi_HITON_MB
from pyCausalFS.CBD.MBs.IPCMB.IPCMB import IPC_MB
from pyCausalFS.LSL.MBs.PCDbyPCD import PCDbyPCD
from pyCausalFS.LSL.MBs.MBbyMB import MBbyMB
from pyCausalFS.LSL.MBs.CMB.CMB import CMB
from pyCausalFS.CBD.MBs.KIAMB import KIAMB
from pyCausalFS.CBD.MBs.TIE_star.TIEs import TIE_p
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import ADASYN
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, matthews_corrcoef as mcc_score,  roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from tabulate import tabulate
from scipy.io import arff
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif, RFECV, SequentialFeatureSelector
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection

In [3]:
# Significance level that build_model hands to every feature-selection
# function as its `alpha` argument.
significance_val = 0.05

# Feature Selection Algorithms

In [4]:
def _markov_blanket_or_empty(discover):
    """Run one Markov-blanket discovery call and tolerate its failure.

    Parameters
    ----------
    discover : callable
        Zero-argument callable that returns a ``(MB, ci_num)`` pair.

    Returns
    -------
    The ``MB`` element of the pair, or ``[]`` when the call (or the unpacking
    of its result) raises. The exception message is printed so the failure is
    still visible in the cell output; build_model treats an empty result as
    "skip this selector".
    """
    try:
        MB, _ci_num = discover()
        return MB
    except Exception as e:
        print(str(e))
        return []


def findGSMB(df, index, alpha, isDiscrete):
    """Return the MB found by GSMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: GSMB(df, index, alpha, isDiscrete))


def findIAMB(df, index, alpha, isDiscrete):
    """Return the MB found by IAMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: IAMB(df, index, alpha, isDiscrete))


def findInterIAMB(df, index, alpha, isDiscrete):
    """Return the MB found by inter_IAMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: inter_IAMB(df, index, alpha, isDiscrete))


def findFastIAMB(df, index, alpha, isDiscrete):
    """Return the MB found by fast_IAMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: fast_IAMB(df, index, alpha, isDiscrete))


def findIAMBnPC(df, index, alpha, isDiscrete):
    """Return the MB found by IAMBnPC(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: IAMBnPC(df, index, alpha, isDiscrete))


def findinterIAMBnPC(df, index, alpha, isDiscrete):
    """Return the MB found by interIAMBnPC(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: interIAMBnPC(df, index, alpha, isDiscrete))


def findLRH(df, index, alpha, isDiscrete):
    """Return the MB found by LRH(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: LRH(df, index, alpha, isDiscrete))


def findBAMB(df, index, alpha, isDiscrete):
    """Return the MB found by BAMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: BAMB(df, index, alpha, isDiscrete))


def findFBEDk(df, index, alpha, isDiscrete):
    """Return the MB found by FBED(df, index, 15, alpha, isDiscrete), or [] on error.

    The third positional argument passed to FBED is fixed at 15.
    """
    return _markov_blanket_or_empty(lambda: FBED(df, index, 15, alpha, isDiscrete))


def findMMMB(df, index, alpha, isDiscrete):
    """Return the MB found by MMMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: MMMB(df, index, alpha, isDiscrete))


def findPCMB(df, index, alpha, isDiscrete):
    """Return the MB found by PCMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: PCMB(df, index, alpha, isDiscrete))


def findHITON_MB(df, index, alpha, isDiscrete):
    """Return the MB found by HITON_MB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: HITON_MB(df, index, alpha, isDiscrete))


def findSemi_HITON_MB(df, index, alpha, isDiscrete):
    """Return the MB found by semi_HITON_MB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: semi_HITON_MB(df, index, alpha, isDiscrete))


def findMBOR(df, index, alpha, isDiscrete):
    """Return the MB found by MBOR(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: MBOR(df, index, alpha, isDiscrete))


def findIPCMB(df, index, alpha, isDiscrete):
    """Return the MB found by IPC_MB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: IPC_MB(df, index, alpha, isDiscrete))


def findSTMB(df, index, alpha, isDiscrete):
    """Return the MB found by STMB(df, index, alpha, isDiscrete), or [] on error."""
    return _markov_blanket_or_empty(lambda: STMB(df, index, alpha, isDiscrete))

def findKIAMB(df, index, alpha, isDiscrete, n_runs=100, k=0.1):
    """Union of the MB results of repeated KIAMB runs.

    KIAMB is called ``n_runs`` times as ``KIAMB(df, index, alpha, k, isDiscrete)``
    and every element of every returned MB is merged into one set.

    Parameters
    ----------
    df, index, alpha, isDiscrete
        Forwarded unchanged to KIAMB.
    n_runs : int, default 100
        Number of KIAMB calls whose results are merged (previously hard-coded).
    k : float, default 0.1
        Fourth positional argument passed to KIAMB (previously hard-coded).

    Returns
    -------
    list
        The merged elements in unspecified order, or ``[]`` if any run raises
        (the exception message is printed and partial results are discarded).
    """
    try:
        merged = set()
        for _ in range(n_runs):
            MB, _ci_num = KIAMB(df, index, alpha, k, isDiscrete)
            merged.update(MB)
        return list(merged)
    except Exception as e:
        print(str(e))
        return []


def findTIE_P(df, index, alpha, isDiscrete):
    """Flatten the result of TIE_p into one list of distinct elements.

    TIE_p(df, index, alpha, isDiscrete) is expected to return an iterable of
    iterables; every inner element is collected into a set, which is returned
    as a list (order unspecified). If anything raises, the message is printed
    and ``[]`` is returned.
    """
    try:
        blankets = TIE_p(df, index, alpha, isDiscrete)
        members = {member for blanket in blankets for member in blanket}
        return list(members)
    except Exception as e:
        print(str(e))
        return []

In [5]:
def _k_best_feature_indices(df, index, score_func, k):
    """Fit ``SelectKBest(score_func, k=k)`` on `df` and return the kept positions.

    The last column of `df` is the target; every other column is a predictor.
    ``range(0, index)`` is passed to ``get_feature_names_out`` as the feature
    "names", so the names that come back are the positional indices of the
    selected predictor columns; they are returned as an integer array.
    `index` therefore has to equal the number of predictor columns.
    """
    target_col = df.columns[-1]
    X = df.drop(columns=[target_col]).values
    y = df[target_col].values
    selector = SelectKBest(score_func, k=k).fit(X, y)
    return selector.get_feature_names_out(range(0, index)).astype(int)


def _chi_square_feature_indices(df, index, k):
    """Chi-square variant of the k-best selection that never raises.

    Any failure (the recorded output shows "Input X must be non-negative."
    for data with negative values) is printed and reported as ``[]``, which
    build_model turns into a placeholder row.
    """
    try:
        return _k_best_feature_indices(df, index, chi2, k)
    except Exception as e:
        print(str(e))
        return []


# `alpha` and the discreteness flag are accepted only so that every selector
# shares the (df, index, alpha, isDiscrete) call shape; they are unused here.

def findChiSquare1(df, index, alpha, isDicrete):
    """Indices of the 4 best features by chi-square score; [] on failure."""
    return _chi_square_feature_indices(df, index, 4)


def findChiSquare2(df, index, alpha, isDicrete):
    """Indices of the 6 best features by chi-square score; [] on failure."""
    return _chi_square_feature_indices(df, index, 6)


def findChiSquare3(df, index, alpha, isDicrete):
    """Indices of the 8 best features by chi-square score; [] on failure."""
    return _chi_square_feature_indices(df, index, 8)


def findChiSquare4(df, index, alpha, isDicrete):
    """Indices of the 10 best features by chi-square score; [] on failure."""
    return _chi_square_feature_indices(df, index, 10)


def findChiSquare5(df, index, alpha, isDicrete):
    """Indices of the 12 best features by chi-square score; [] on failure."""
    return _chi_square_feature_indices(df, index, 12)


def findMutualInfo1(df, index, alpha, isDicrete):
    """Indices of the 4 best features by mutual information (errors propagate)."""
    return _k_best_feature_indices(df, index, mutual_info_classif, 4)


def findMutualInfo2(df, index, alpha, isDicrete):
    """Indices of the 6 best features by mutual information (errors propagate)."""
    return _k_best_feature_indices(df, index, mutual_info_classif, 6)


def findMutualInfo3(df, index, alpha, isDicrete):
    """Indices of the 8 best features by mutual information (errors propagate)."""
    return _k_best_feature_indices(df, index, mutual_info_classif, 8)


def findMutualInfo4(df, index, alpha, isDicrete):
    """Indices of the 10 best features by mutual information (errors propagate)."""
    return _k_best_feature_indices(df, index, mutual_info_classif, 10)


def findMutualInfo5(df, index, alpha, isDicrete):
    """Indices of the 12 best features by mutual information (errors propagate)."""
    return _k_best_feature_indices(df, index, mutual_info_classif, 12)


def findAnovaFeat1(df, index, alpha, isDiscrete):
    """Indices of the 4 best features by ANOVA F-score (errors propagate)."""
    return _k_best_feature_indices(df, index, f_classif, 4)


def findAnovaFeat2(df, index, alpha, isDiscrete):
    """Indices of the 6 best features by ANOVA F-score (errors propagate)."""
    return _k_best_feature_indices(df, index, f_classif, 6)


def findAnovaFeat3(df, index, alpha, isDiscrete):
    """Indices of the 8 best features by ANOVA F-score (errors propagate)."""
    return _k_best_feature_indices(df, index, f_classif, 8)


def findAnovaFeat4(df, index, alpha, isDiscrete):
    """Indices of the 10 best features by ANOVA F-score (errors propagate)."""
    return _k_best_feature_indices(df, index, f_classif, 10)


def findAnovaFeat5(df, index, alpha, isDiscrete):
    """Indices of the 12 best features by ANOVA F-score (errors propagate)."""
    return _k_best_feature_indices(df, index, f_classif, 12)

def _correlation_filtered_indices(df, index, selector_cls, threshold):
    """Fit a correlation-based selector and return the kept column positions.

    The last column of `df` is the target; every other column is a predictor.
    ``selector_cls(threshold=threshold)`` is fitted on the predictor values and
    ``get_feature_names_out(range(0, index))`` is converted element-wise to a
    list of ``int``.

    NOTE(review): the selector is fitted on a bare NumPy array. Confirm that
    this feature_engine version maps the supplied 0..index-1 names onto the
    surviving columns; if it returns its own generated names, the ``int()``
    conversion would fail, and if it ignored the fit, nothing would be dropped.
    """
    target_col = df.columns[-1]
    X = df.drop(columns=[target_col]).values
    y = df[target_col].values
    fitted = selector_cls(threshold=threshold).fit(X, y)
    return [int(name) for name in fitted.get_feature_names_out(range(0, index))]


# `alpha` and `isDiscrete` are unused below; they exist only to match the
# (df, index, alpha, isDiscrete) call shape shared by every selector.

def findCorrelatedFeat1(df, index, alpha, isDiscrete):
    """Columns kept by DropCorrelatedFeatures(threshold=0.3)."""
    return _correlation_filtered_indices(df, index, DropCorrelatedFeatures, 0.3)


def findCorrelatedFeat2(df, index, alpha, isDiscrete):
    """Columns kept by DropCorrelatedFeatures(threshold=0.4)."""
    return _correlation_filtered_indices(df, index, DropCorrelatedFeatures, 0.4)


def findCorrelatedFeat3(df, index, alpha, isDiscrete):
    """Columns kept by DropCorrelatedFeatures(threshold=0.5)."""
    return _correlation_filtered_indices(df, index, DropCorrelatedFeatures, 0.5)


def findCorrelatedFeat4(df, index, alpha, isDiscrete):
    """Columns kept by DropCorrelatedFeatures(threshold=0.6)."""
    return _correlation_filtered_indices(df, index, DropCorrelatedFeatures, 0.6)


def findCorrelatedFeat5(df, index, alpha, isDiscrete):
    """Columns kept by DropCorrelatedFeatures(threshold=0.7)."""
    return _correlation_filtered_indices(df, index, DropCorrelatedFeatures, 0.7)


def findSmartCorrelatedFeat1(df, index, alpha, isDiscrete):
    """Columns kept by SmartCorrelatedSelection(threshold=0.3)."""
    return _correlation_filtered_indices(df, index, SmartCorrelatedSelection, 0.3)


def findSmartCorrelatedFeat2(df, index, alpha, isDiscrete):
    """Columns kept by SmartCorrelatedSelection(threshold=0.4)."""
    return _correlation_filtered_indices(df, index, SmartCorrelatedSelection, 0.4)


def findSmartCorrelatedFeat3(df, index, alpha, isDiscrete):
    """Columns kept by SmartCorrelatedSelection(threshold=0.5)."""
    return _correlation_filtered_indices(df, index, SmartCorrelatedSelection, 0.5)


def findSmartCorrelatedFeat4(df, index, alpha, isDiscrete):
    """Columns kept by SmartCorrelatedSelection(threshold=0.6)."""
    return _correlation_filtered_indices(df, index, SmartCorrelatedSelection, 0.6)


def findSmartCorrelatedFeat5(df, index, alpha, isDiscrete):
    """Columns kept by SmartCorrelatedSelection(threshold=0.7)."""
    return _correlation_filtered_indices(df, index, SmartCorrelatedSelection, 0.7)

def _rfecv_ranking(df, estimator):
    """Fit RFECV around `estimator` and return its ``ranking_`` array.

    The last column of `df` is the target; every other column is a predictor.
    Elimination removes one feature per step, scored by ROC AUC under a
    5-fold stratified split, down to a minimum of one feature.
    """
    target_col = df.columns[-1]
    X = df.drop(columns=[target_col]).values
    y = df[target_col].values
    rfecv = RFECV(
        estimator=estimator,
        step=1,
        cv=model_selection.StratifiedKFold(5),
        scoring="roc_auc",
        min_features_to_select=1,
    )
    rfecv.fit(X, y)
    return rfecv.ranking_


def RFELogistic(df, index, alpha, isDicrete):
    """RFECV feature ranking using a default LogisticRegression.

    `index`, `alpha` and `isDicrete` are unused; they keep the shared
    selector call shape.
    """
    return _rfecv_ranking(df, LogisticRegression())


def RFERandom(df, index, alpha, isDicrete):
    """RFECV feature ranking using a default RandomForestClassifier.

    `index`, `alpha` and `isDicrete` are unused; they keep the shared
    selector call shape.
    """
    return _rfecv_ranking(df, RandomForestClassifier())


def _best_ranked(ranking, k):
    """Return the positions of the `k` best-ranked features.

    In RFECV's ``ranking_`` the selected (best) features carry rank 1 and
    larger numbers mean the feature was eliminated earlier, so the best
    features are at the *front* of an ascending sort. The previous code took
    the tail (``[-k:]``), i.e. the k worst features.

    RFECV gives rank 1 to every feature it selects, so ties are common; the
    stable sort resolves them by column position, which keeps the result
    deterministic.
    """
    return np.argsort(ranking, kind="stable")[:k]


def findRFELogistic1(df, index, alpha, isDiscrete):
    """Indices of the 4 best-ranked features from RFELogistic."""
    return _best_ranked(RFELogistic(df, index, alpha, isDiscrete), 4)


def findRFELogistic2(df, index, alpha, isDiscrete):
    """Indices of the 6 best-ranked features from RFELogistic."""
    return _best_ranked(RFELogistic(df, index, alpha, isDiscrete), 6)


def findRFELogistic3(df, index, alpha, isDiscrete):
    """Indices of the 8 best-ranked features from RFELogistic."""
    return _best_ranked(RFELogistic(df, index, alpha, isDiscrete), 8)


def findRFELogistic4(df, index, alpha, isDiscrete):
    """Indices of the 10 best-ranked features from RFELogistic."""
    return _best_ranked(RFELogistic(df, index, alpha, isDiscrete), 10)


def findRFELogistic5(df, index, alpha, isDiscrete):
    """Indices of the 12 best-ranked features from RFELogistic."""
    return _best_ranked(RFELogistic(df, index, alpha, isDiscrete), 12)


def findRFERandom1(df, index, alpha, isDiscrete):
    """Indices of the 4 best-ranked features from RFERandom."""
    return _best_ranked(RFERandom(df, index, alpha, isDiscrete), 4)


def findRFERandom2(df, index, alpha, isDiscrete):
    """Indices of the 6 best-ranked features from RFERandom."""
    return _best_ranked(RFERandom(df, index, alpha, isDiscrete), 6)


def findRFERandom3(df, index, alpha, isDiscrete):
    """Indices of the 8 best-ranked features from RFERandom."""
    return _best_ranked(RFERandom(df, index, alpha, isDiscrete), 8)


def findRFERandom4(df, index, alpha, isDiscrete):
    """Indices of the 10 best-ranked features from RFERandom."""
    return _best_ranked(RFERandom(df, index, alpha, isDiscrete), 10)


def findRFERandom5(df, index, alpha, isDiscrete):
    """Indices of the 12 best-ranked features from RFERandom."""
    return _best_ranked(RFERandom(df, index, alpha, isDiscrete), 12)

def _sequential_feature_indices(df, index, n_features, direction):
    """Run sequential feature selection on `df` and return the kept positions.

    The last column of `df` is the target; every other column is a predictor.
    A default LogisticRegression is wrapped in SequentialFeatureSelector,
    scored by ROC AUC, and ``range(0, index)`` is supplied as the feature
    "names" so the result is the positional indices of the selected columns.

    The training data is taken from `df`. The previous code fitted on the
    names ``X`` and ``y``, which are not defined in these functions: they
    resolved to whatever the notebook's global X / y happened to hold (the
    full, un-resampled dataset) or raised NameError on a fresh kernel.
    """
    target_col = df.columns[-1]
    X = df.drop(columns=[target_col]).values
    y = df[target_col].values
    sfs = SequentialFeatureSelector(
        LogisticRegression(),
        n_features_to_select=n_features,
        direction=direction,
        scoring="roc_auc",
    )
    sfs.fit(X, y)
    return sfs.get_feature_names_out(range(0, index)).astype(int)


# `alpha` and `isDiscrete` are unused below; they exist only to match the
# (df, index, alpha, isDiscrete) call shape shared by every selector.

def findForward1(df, index, alpha, isDiscrete):
    """Forward sequential selection of 4 features."""
    return _sequential_feature_indices(df, index, 4, 'forward')


def findForward2(df, index, alpha, isDiscrete):
    """Forward sequential selection of 6 features."""
    return _sequential_feature_indices(df, index, 6, 'forward')


def findForward3(df, index, alpha, isDiscrete):
    """Forward sequential selection of 8 features."""
    return _sequential_feature_indices(df, index, 8, 'forward')


def findForward4(df, index, alpha, isDiscrete):
    """Forward sequential selection of 10 features."""
    return _sequential_feature_indices(df, index, 10, 'forward')


def findForward5(df, index, alpha, isDiscrete):
    """Forward sequential selection of 12 features."""
    return _sequential_feature_indices(df, index, 12, 'forward')


def findBackward1(df, index, alpha, isDiscrete):
    """Backward sequential selection down to 4 features."""
    return _sequential_feature_indices(df, index, 4, 'backward')


def findBackward2(df, index, alpha, isDiscrete):
    """Backward sequential selection down to 6 features."""
    return _sequential_feature_indices(df, index, 6, 'backward')


def findBackward3(df, index, alpha, isDiscrete):
    """Backward sequential selection down to 8 features."""
    return _sequential_feature_indices(df, index, 8, 'backward')


def findBackward4(df, index, alpha, isDiscrete):
    """Backward sequential selection down to 10 features."""
    return _sequential_feature_indices(df, index, 10, 'backward')


def findBackward5(df, index, alpha, isDiscrete):
    """Backward sequential selection down to 12 features."""
    return _sequential_feature_indices(df, index, 12, 'backward')

In [6]:
# Registry of feature-selection functions, keyed by the name shown in the
# results table. build_model looks an entry up by key and calls it as
# fn(df, index, alpha, isDiscrete). Only the uncommented entries are run;
# the commented-out lines are selectors that are currently switched off.
feature_algo = {
# Causal 
#     'GSMB': findGSMB,
#     'IAMB': findIAMB,
#     'IAMBnPC': findIAMBnPC,
#     'LRH': findLRH,
#     'BAMB': findBAMB,
#     'FBEDk': findFBEDk,
#     'MMMB': findMMMB,
#     'PCMB': findPCMB,
#     'HITON_MB': findHITON_MB,
#     'MBOR': findMBOR,
#     'IPCMB': findIPCMB,
#     'STMB': findSTMB,
#     'KIAMB': findKIAMB,
#     'TIE_P': findTIE_P,
    
# Non-Causal
    'Chi-Square1': findChiSquare1,
    'Chi-Square2': findChiSquare2,
    'Chi-Square3': findChiSquare3,
    'Chi-Square4': findChiSquare4,
#     'Chi-Square5': findChiSquare5,
    
    'Mutual-Info1': findMutualInfo1,
    'Mutual-Info2': findMutualInfo2,
    'Mutual-Info3': findMutualInfo3,
    'Mutual-Info4': findMutualInfo4,
#     'Mutual-Info5': findMutualInfo5,
    
    'Anova1': findAnovaFeat1,
    'Anova2': findAnovaFeat2,
    'Anova3': findAnovaFeat3,
    'Anova4': findAnovaFeat4,
#     'Anova5': findAnovaFeat5,
    
    'DropCorrelated1': findCorrelatedFeat1,
    'DropCorrelated2': findCorrelatedFeat2,
    'DropCorrelated3': findCorrelatedFeat3,
    'DropCorrelated4': findCorrelatedFeat4,
#     'DropCorrelated5': findCorrelatedFeat5,
    
#     'SmartCorrelated1': findSmartCorrelatedFeat1,
#     'SmartCorrelated2': findSmartCorrelatedFeat2,
#     'SmartCorrelated3': findSmartCorrelatedFeat3,
#     'SmartCorrelated4': findSmartCorrelatedFeat4,
#     'SmartCorrelated5': findSmartCorrelatedFeat5,
    
#     'RFE-Logistic1': findRFELogistic1,
#     'RFE-Logistic2': findRFELogistic2,
#     'RFE-Logistic3': findRFELogistic3,
#     'RFE-Logistic4': findRFELogistic4,
#     'RFE-Logistic5': findRFELogistic5,
    
#     'RFE-Random1': findRFERandom1,
#     'RFE-Random2': findRFERandom2,
#     'RFE-Random3': findRFERandom3,
#     'RFE-Random4': findRFERandom4,
#     'RFE-Random5': findRFERandom5,
    
#     'Forward1': findForward1,
#     'Forward2': findForward2,
#     'Forward3': findForward3,
#     'Forward4': findForward4,
#     'Forward5': findForward5,
    
#     'Backward1': findBackward1,
#     'Backward2': findBackward2,
#     'Backward3': findBackward3,
#     'Backward4': findBackward4,
#     'Backward5': findBackward5
}

# Data Processing

In [7]:
def get_list_of_csv(folder_path):
    """Return the entry names in `folder_path` that end with '.csv'.

    Names are bare file names (no directory part), in the order os.listdir
    yields them; the suffix match is case-sensitive.
    """
    return list(filter(lambda name: name.endswith('.csv'), os.listdir(folder_path)))

### AEEEM

In [8]:
def process_aeeem_dataset(df):
    """Return a cleaned copy of an AEEEM frame with a binary 'bugs' label.

    Steps, all applied to a copy so the caller's frame is left untouched
    (the previous version stripped and renamed columns on the input itself):

    1. Strip surrounding whitespace from the column names.
    2. Rename the last column to 'temp' and drop it together with the
       identifier / alternative-label columns listed below, when present.
       (Presumably the last column is the empty one produced by a trailing
       ';' in the source files; confirm against the data.)
    3. Set every 'bugs' value greater than 0 to 1.

    Raises KeyError if there is no 'bugs' column after the clean-up.
    """
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned = cleaned.rename(columns={cleaned.columns[-1]: 'temp'})

    columns_to_drop = ['temp', 'classname', 'nonTrivialBugs', 'majorBugs', 'criticalBugs', 'highPriorityBugs']
    columns_to_drop = [col for col in columns_to_drop if col in cleaned.columns]
    cleaned = cleaned.drop(columns=columns_to_drop)

    cleaned.loc[cleaned['bugs'] > 0, 'bugs'] = 1
    return cleaned

### JIRA

In [9]:
def process_jira_dataset(df):
    """Return a JIRA frame without its non-feature columns and with a binary label.

    The columns 'File', 'RealBug', 'HeuBug' and 'HeuBugCount' are dropped when
    they exist (a missing one is ignored, matching the AEEEM and PROMISE
    helpers instead of raising KeyError), and every 'RealBugCount' value
    greater than 0 is set to 1. The input frame is not modified.

    Raises KeyError if there is no 'RealBugCount' column.
    """
    columns_to_drop = ['File', 'RealBug', 'HeuBug', 'HeuBugCount']
    columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    df = df.drop(columns=columns_to_drop)
    df.loc[df['RealBugCount'] > 0, 'RealBugCount'] = 1
    return df

### TERA-PROMISE-ck

In [10]:
def process_promise_ck_dataset(df):
    """Return a PROMISE-ck frame without 'Name'/'version' and with a binary label.

    Whichever of the 'Name' and 'version' columns exist are removed, then
    every 'bug' value greater than 0 is set to 1. The input frame itself is
    not modified because the drop produces a new frame.
    """
    trimmed = df.drop(columns=['Name', 'version'], errors='ignore')
    is_defective = trimmed['bug'] > 0
    trimmed.loc[is_defective, 'bug'] = 1
    return trimmed

# Models

In [11]:
# Classifier instances keyed by display name; the same keys index
# models_search_params. Only uncommented entries are trained.
# Note: build_model fetches the instance stored here and calls set_params on
# it, so each entry is a single shared object that is re-configured in place.
models_list = {
    "KNN": KNeighborsClassifier(),
#     "Logistic Regression": LogisticRegression(),
#     "Decision Tree": tree.DecisionTreeClassifier(),
#     "XgBoost": XGBClassifier(),
#     "Random Forest": RandomForestClassifier(),
#     "SVM":  svm.SVC(kernel='rbf')
}
# Hyper-parameter search space per model, keyed like models_list and sampled
# by RandomizedSearchCV in get_best_model_params.
models_search_params = {
    "KNN": {
        'n_neighbors' : [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31],
        'weights' : ['uniform','distance'],
        'metric' : ['minkowski','euclidean','manhattan']
     },
    # NOTE(review): this grid crosses every penalty with every solver, but
    # several pairs are rejected by scikit-learn (e.g. 'l1' with 'lbfgs',
    # 'elasticnet' without an l1_ratio). Those draws fail during the search.
    # Consider splitting into one sub-grid per penalty before enabling
    # Logistic Regression in models_list.
    "Logistic Regression": {
        'penalty' : ['l1', 'l2', 'elasticnet'],
        'C' : [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
        'solver' : ['lbfgs','newton-cg','liblinear', 'newton-cholesky', 'sag', 'saga'],
        'max_iter' : [100, 1500, 3000]
     },
    "Decision Tree": {
        'max_features': [1, 2, 3, 5, 7, 10, 'log2','sqrt', None],
        'max_depth': [2, 3, 5, 7, 10, 20, 30, 40, 50, 60, 70, None],
        # Integer values must be >= 2 (a node cannot be split with fewer than
        # two samples); the former entry `1` made every draw containing it fail.
        'min_samples_split': [2, 3, 5, 7, 9, 10, 0.1, 0.2, 0.3],
        'min_samples_leaf': [1, 2, 3, 5, 7, 9, 10, 0.1, 0.2],
     },
    "Random Forest": {
        'max_depth': [2, 5, 10, None],
        'max_features': ['log2', 'sqrt', None],
    },
    "XgBoost": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 6, 10],
        'subsample': [0.5, 0.7, 1],
        'n_estimators': [100, 500]
    },
    "SVM": {
        'kernel': ['rbf']
    }
}

# METRICS

In [12]:
def g_measure_score(y_test, y_pred):
    """Harmonic mean of recall and (1 - false-positive rate).

    ``recall`` is the macro-averaged recall from ``recall_score`` and the
    false-positive rate is FP / (FP + TN) from the binary confusion matrix.

    Degenerate folds no longer yield NaN or a division error: a fold with no
    negative samples is given a false-positive rate of 0.0, and 0.0 is
    returned when recall and (1 - FPR) are both zero.

    NOTE(review): macro recall averages the recall of both classes. If the
    intended probability of detection is the recall of the defective class
    only, this should use the positive-class recall instead; confirm.
    """
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()
    negatives = FP + TN
    FPR = FP / negatives if negatives else 0.0
    recall = recall_score(y_test, y_pred, average='macro')
    true_negative_rate = 1 - FPR
    denominator = recall + true_negative_rate
    if denominator == 0:
        return 0.0
    return (2 * recall * true_negative_rate) / denominator

def bal_score(y_test, y_pred):
    """Balance score: 1 minus the normalised distance to the ideal point.

    With PD the macro-averaged recall and PF the false-positive rate
    FP / (FP + TN), the result is
    ``1 - sqrt((1 - PD)^2 + PF^2) / sqrt(2)``,
    so PD = 1 and PF = 0 give a score of 1.
    """
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()
    PD = recall_score(y_test, y_pred, average='macro')
    PF = FP/(FP+TN)
    missed = 1 - PD
    distance = math.sqrt(missed*missed + PF*PF)
    return 1 - (distance/math.sqrt(2))

In [13]:
def get_best_model_params(X_train, y_train, model):
    """Random-search the hyper-parameters of one registered model.

    Parameters
    ----------
    X_train, y_train
        Training data passed straight to ``RandomizedSearchCV.fit``.
    model : str
        Key into both ``models_list`` (the estimator) and
        ``models_search_params`` (its search space).

    Returns
    -------
    dict
        ``best_params_`` of a 30-draw randomized search scored by ROC AUC
        under a shuffled, seeded 5-fold stratified split.

    The search itself is now seeded as well (``random_state=42``, the same
    seed the splitter uses) so the sampled candidates, and therefore the
    reported parameters, are the same on every run.
    """
    classifier = models_list[model]
    cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = RandomizedSearchCV(
        classifier,
        models_search_params[model],
        n_iter=30,
        cv=cv,
        scoring='roc_auc',
        random_state=42,
    )
    search.fit(X_train, y_train)
    return search.best_params_

In [14]:
def build_model(file_name, X, y, model, feat_algo):
    """Cross-validate one model on one dataset and return a results-table row.

    Parameters
    ----------
    file_name : str
        Dataset label, copied into the first cell of the row.
    X, y : numpy arrays
        Features and binary labels; indexed with fold index arrays.
    model : str
        Key into ``models_list`` / ``models_search_params``.
    feat_algo : str
        Key into ``feature_algo``, or the literal string 'None' to keep every
        feature.

    Returns
    -------
    list of 10 cells
        ``[file_name, feat_algo, acc, prec, recall, f1, mcc, roc_auc, g-m, bal]``
        with each metric averaged over the folds and rounded to 2 decimals.
        If the selector returns nothing, the eight metric cells are '--'
        (the placeholder used to be one cell short of the table width).

    Notes
    -----
    * Each training fold is oversampled with ADASYN (now seeded, so runs are
      repeatable); the test fold is left as is.
    * Feature selection and hyper-parameter search are performed once, on the
      first training fold, and reused for the remaining folds. Rows that are
      test data in later folds were therefore seen during that selection.
    """
    fold_scores = {
        name: [] for name in
        ("acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal")
    }

    best_model_params = {}
    MB = []

    kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        adasyn = ADASYN(random_state=42)
        X_train, y_train = adasyn.fit_resample(X_train, y_train)

        if feat_algo != 'None':
            if len(MB) < 1:
                # First fold only: choose the feature subset on the balanced data.
                balanced_df = pd.concat([pd.DataFrame(X_train), pd.Series(y_train, name='bug')], axis=1)
                MB = feature_algo[feat_algo](balanced_df, balanced_df.shape[1]-1, significance_val, False)
                print(file_name, feat_algo, balanced_df.shape[1]-1, MB)

            if len(MB) < 1:
                # Selector failed or kept nothing: emit a full-width placeholder row.
                return [file_name, feat_algo] + ['--'] * len(fold_scores)

            X_train = X_train[:, MB]
            X_test = X_test[:, MB]

        if not best_model_params:
            # First fold only: tune once and reuse the parameters afterwards.
            best_model_params = get_best_model_params(X_train, y_train, model)
            print(file_name, feat_algo, model, best_model_params)

        classifier = models_list[model]
        classifier.set_params(**best_model_params)

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        fold_scores["acc"].append(accuracy_score(y_test, y_pred))
        fold_scores["prec"].append(precision_score(y_test, y_pred, average='macro'))
        fold_scores["recall"].append(recall_score(y_test, y_pred, average='macro'))
        fold_scores["f1"].append(f1_score(y_test, y_pred, average='macro'))
        fold_scores["mcc"].append(mcc_score(y_test, y_pred))
        fold_scores["roc_auc"].append(roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1]))
        fold_scores["g-m"].append(g_measure_score(y_test, y_pred))
        fold_scores["bal"].append(bal_score(y_test, y_pred))

    return [file_name, feat_algo] + [round(np.mean(scores), 2) for scores in fold_scores.values()]

# TRAINING

### TERA-PROMISE-ck

In [15]:
# Evaluate every enabled model on each sufficiently large TERA-PROMISE ck
# dataset: one baseline row without feature selection, then one row per
# enabled feature-selection algorithm, printed as one table per model.
folder_path = '/kaggle/input/defect-prediction/TeraPromise-defect-dataset/ck'
files_list = get_list_of_csv(folder_path)

for model in models_list:
    print(f'---------------- {model} ---------------------------')
    table = [["dataset", "feat_algo", "acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal"]]
    for file_name in files_list:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)

        # Datasets with 1500 rows or fewer are skipped.
        if len(df) <= 1500:
            continue

        df = process_promise_ck_dataset(df)
        # The label is the last column; everything before it is a feature.
        X = df.drop(columns=[df.columns[-1]]).values
        y = df[df.columns[-1]].values

        table.append(build_model(file_name, X, y, model, 'None'))
        for feat_algo in feature_algo:
            table.append(build_model(file_name, X, y, model, feat_algo))
        # Visual separator between datasets.
        table.append(['--------'] * 10)
    print(tabulate(table))

---------------- KNN ---------------------------
prop-3.csv None KNN {'weights': 'distance', 'n_neighbors': 19, 'metric': 'manhattan'}
prop-3.csv Chi-Square1 20 [ 3  4 10 17]
prop-3.csv Chi-Square1 KNN {'weights': 'distance', 'n_neighbors': 15, 'metric': 'manhattan'}
prop-3.csv Chi-Square2 20 [ 3  4  5  7 10 17]
prop-3.csv Chi-Square2 KNN {'weights': 'distance', 'n_neighbors': 17, 'metric': 'manhattan'}
prop-3.csv Chi-Square3 20 [ 3  4  5  6  7 10 17 18]
prop-3.csv Chi-Square3 KNN {'weights': 'distance', 'n_neighbors': 11, 'metric': 'manhattan'}
prop-3.csv Chi-Square4 20 [ 3  4  5  6  7 10 12 17 18 19]
prop-3.csv Chi-Square4 KNN {'weights': 'distance', 'n_neighbors': 13, 'metric': 'manhattan'}
prop-3.csv Mutual-Info1 20 [ 3  4  7 10]
prop-3.csv Mutual-Info1 KNN {'weights': 'distance', 'n_neighbors': 11, 'metric': 'manhattan'}
prop-3.csv Mutual-Info2 20 [ 3  4  7 10 14 17]
prop-3.csv Mutual-Info2 KNN {'weights': 'distance', 'n_neighbors': 5, 'metric': 'manhattan'}
prop-3.csv Mutual-Info

### JIRA

In [16]:
# Evaluate every enabled model on each sufficiently large JIRA dataset:
# one baseline row without feature selection, then one row per enabled
# feature-selection algorithm, printed as one table per model.
folder_path = '/kaggle/input/defect-prediction/JIRA-defect-dataset'
files_list = get_list_of_csv(folder_path)

for model in models_list:
    print(f'---------------- {model} ---------------------------')
    table = [["dataset", "feat_algo", "acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal"]]
    for file_name in files_list:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)

        # Datasets with 1500 rows or fewer are skipped.
        if len(df) <= 1500:
            continue

        df = process_jira_dataset(df)
        # The label is the last column; everything before it is a feature.
        X = df.drop(columns=[df.columns[-1]]).values
        y = df[df.columns[-1]].values

        table.append(build_model(file_name, X, y, model, 'None'))
        for feat_algo in feature_algo:
            table.append(build_model(file_name, X, y, model, feat_algo))
        # Visual separator between datasets.
        table.append(['--------'] * 10)
    print(tabulate(table))

---------------- KNN ---------------------------
camel-2.11.0.csv None KNN {'weights': 'distance', 'n_neighbors': 27, 'metric': 'manhattan'}
camel-2.11.0.csv Chi-Square1 65 [ 2 27 48 49]
camel-2.11.0.csv Chi-Square1 KNN {'weights': 'distance', 'n_neighbors': 9, 'metric': 'manhattan'}
camel-2.11.0.csv Chi-Square2 65 [ 2 23 27 31 48 49]
camel-2.11.0.csv Chi-Square2 KNN {'weights': 'distance', 'n_neighbors': 19, 'metric': 'manhattan'}
camel-2.11.0.csv Chi-Square3 65 [ 2 23 27 28 31 33 48 49]
camel-2.11.0.csv Chi-Square3 KNN {'weights': 'distance', 'n_neighbors': 15, 'metric': 'manhattan'}
camel-2.11.0.csv Chi-Square4 65 [ 2  7 17 23 27 28 31 33 48 49]
camel-2.11.0.csv Chi-Square4 KNN {'weights': 'distance', 'n_neighbors': 13, 'metric': 'manhattan'}
camel-2.11.0.csv Mutual-Info1 65 [18 23 28 29]
camel-2.11.0.csv Mutual-Info1 KNN {'weights': 'distance', 'n_neighbors': 11, 'metric': 'manhattan'}
camel-2.11.0.csv Mutual-Info2 65 [18 26 28 29 33 40]
camel-2.11.0.csv Mutual-Info2 KNN {'weights'

### AEEEM Defect Dataset

In [17]:
# Evaluate every enabled model on each sufficiently large AEEEM dataset
# (semicolon-delimited files): one baseline row without feature selection,
# then one row per enabled feature-selection algorithm, printed as one table
# per model.
folder_path = '/kaggle/input/defect-prediction/AEEEM-defect-dataset'
files_list = get_list_of_csv(folder_path)

for model in models_list:
    print(f'---------------- {model} ---------------------------')
    table = [["dataset", "feat_algo", "acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal"]]
    for file_name in files_list:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, delimiter=';')

        # Datasets with 1500 rows or fewer are skipped.
        if len(df) <= 1500:
            continue

        df = process_aeeem_dataset(df)
        # The label is the last column; everything before it is a feature.
        X = df.drop(columns=[df.columns[-1]]).values
        y = df[df.columns[-1]].values

        table.append(build_model(file_name, X, y, model, 'None'))
        for feat_algo in feature_algo:
            table.append(build_model(file_name, X, y, model, feat_algo))
        # Visual separator between datasets.
        table.append(['--------'] * 10)
    print(tabulate(table))

---------------- KNN ---------------------------
mylyn.csv None KNN {'weights': 'distance', 'n_neighbors': 9, 'metric': 'manhattan'}
Input X must be non-negative.
mylyn.csv Chi-Square1 15 []
Input X must be non-negative.
mylyn.csv Chi-Square2 15 []
Input X must be non-negative.
mylyn.csv Chi-Square3 15 []
Input X must be non-negative.
mylyn.csv Chi-Square4 15 []
mylyn.csv Mutual-Info1 15 [ 0  5  8 13]
mylyn.csv Mutual-Info1 KNN {'weights': 'distance', 'n_neighbors': 5, 'metric': 'manhattan'}
mylyn.csv Mutual-Info2 15 [ 0  4  8 10 11 13]
mylyn.csv Mutual-Info2 KNN {'weights': 'distance', 'n_neighbors': 13, 'metric': 'manhattan'}
mylyn.csv Mutual-Info3 15 [ 0  4  5  7  8 10 11 13]
mylyn.csv Mutual-Info3 KNN {'weights': 'distance', 'n_neighbors': 5, 'metric': 'manhattan'}
mylyn.csv Mutual-Info4 15 [ 0  1  4  5  7  8 10 11 12 13]
mylyn.csv Mutual-Info4 KNN {'weights': 'distance', 'n_neighbors': 7, 'metric': 'manhattan'}
mylyn.csv Anova1 15 [ 0  1 10 13]
mylyn.csv Anova1 KNN {'weights': 'di