In [1]:
!pip install tabulate



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
import tabulate
from tabulate import tabulate

In [3]:
def parsedf(df):
    """Clean a raw results frame and normalise its technique labels.

    The input frame is not modified; a new frame is returned.

    Steps, in order:
      1. Drop rows whose ``dataset`` equals ``'--------'`` (presumably
         separator rows in the exported sheet -- confirm against the CSVs).
      2. Replace every ``'--'`` cell with NaN.
      3. Remove the literal text ``'.csv'`` from ``dataset`` names.
      4. Cast ``feat_algo`` to the pandas ``string`` dtype.
      5. Drop the ``acc``, ``mcc`` and ``g-m`` columns.
      6. Strip the trailing variant digit from ``feat_algo`` when
         * the label is ``DropCorrelated1`` or ``SmartCorrelated1``; or
         * the dataset is one of prop-1..prop-4 / mylyn and the label ends
           with ``'1'``; or
         * the dataset is any other one, the label ends with ``'2'`` and it
           is not ``DropCorrelated2`` / ``SmartCorrelated2``.
         Rows with a missing ``feat_algo`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``dataset``, ``feat_algo``, ``acc``, ``mcc`` and ``g-m``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # Filter first, then copy: later assignments hit an independent frame
    # instead of a slice (avoids SettingWithCopy problems).
    df = df[df['dataset'] != '--------'].copy()
    df = df.replace('--', np.nan)

    # regex=False: '.csv' must be matched literally. With the regex default of
    # pandas < 2.0 the dot is a wildcard and would also eat e.g. "xcsv".
    df['dataset'] = df['dataset'].str.replace('.csv', '', regex=False)
    df['feat_algo'] = df['feat_algo'].astype('string')

    columns_to_drop = ['acc', 'mcc', 'g-m']
    df = df.drop(columns=columns_to_drop)

    algos1 = ['DropCorrelated1', 'SmartCorrelated1']
    algos2 = ['DropCorrelated2', 'SmartCorrelated2']
    datasets = ['prop-1', 'prop-2', 'prop-3', 'prop-4', 'mylyn']

    labels = df['feat_algo']
    in_listed_dataset = df['dataset'].isin(datasets)
    strip_suffix = (
        labels.isin(algos1)
        | (in_listed_dataset & labels.str.endswith('1'))
        | (~in_listed_dataset & ~labels.isin(algos2) & labels.str.endswith('2'))
    )
    # A missing label yields <NA> in the mask; such rows must stay unchanged.
    strip_suffix = strip_suffix.fillna(False).astype(bool)
    df['feat_algo'] = labels.mask(strip_suffix, labels.str[:-1])

    return df

In [4]:
def convert_df(df, col):
    """Return ``df[col]`` cast to ``float``."""
    selected = df[col]
    return selected.astype(float)

# Technique labels: matched against the 'feat_algo' column when slicing the
# results and used as the row labels of the comparison tables.
feat_algo = ['GSMB', 'IAMB', 'ATE-FS', 'Mutual-Info', 'Forward', 'Auto-Spearman']

In [5]:
def build_table(table, feat_algo_data_list, i, j, algos=None):
    """Append the p-value row and the mean-diff row for one technique pair.

    Parameters
    ----------
    table : list
        Rows accumulated so far; two new rows are appended in place.
    feat_algo_data_list : list of mapping
        One mapping per metric column, each from technique name to that
        technique's numeric values (anything ``np.asarray(..., dtype=float)``
        accepts, e.g. a float Series).
    i, j : int
        Positions of the two compared techniques in ``algos``.
    algos : sequence of str, optional
        Technique names. Defaults to the module-level ``feat_algo`` list.

    Returns
    -------
    None

    Notes
    -----
    * Row 1: ``"<a> vs <b>"``, ``"p-value"``, then one Mann-Whitney U p-value
      (rounded to 3 decimals) per metric.
    * Row 2: blank label of the same width, ``"mean-diff"``, then
      ``|mean(a) - mean(b)|`` rounded to 2 decimals followed by an arrow:
      up if technique ``a`` has the larger mean, down if the smaller one.
    * NaNs are ignored for both rows. Previously only the mean ignored them,
      so a single NaN could turn the p-value into NaN (or an undefined
      value, depending on the SciPy version).
    * NOTE(review): Mann-Whitney U treats the two samples as independent. If
      the values of both techniques are paired by dataset, confirm that this
      is the intended test.
    """
    if algos is None:
        algos = feat_algo
    first, second = algos[i], algos[j]
    label = f"{first} vs {second}"

    p_row = [label, "p-value"]
    diff_row = [" " * len(label), "mean-diff"]  # blank label keeps the columns aligned

    for feat_algo_data in feat_algo_data_list:
        a = np.asarray(feat_algo_data[first], dtype=float)
        b = np.asarray(feat_algo_data[second], dtype=float)

        a_valid = a[~np.isnan(a)]
        b_valid = b[~np.isnan(b)]
        if a_valid.size and b_valid.size:
            p_value = round(mannwhitneyu(a_valid, b_valid).pvalue, 3)
        else:
            # Nothing to compare: report NaN instead of raising.
            p_value = np.nan
        p_row.append(p_value)

        diff = np.nanmean(a) - np.nanmean(b)
        # The arrow uses the unrounded sign, so "0.0↓" means "slightly lower".
        arrow = "↑" if diff > 0 else ("↓" if diff < 0 else "")
        diff_row.append(f"{abs(round(diff, 2))}{arrow}")

    table.append(p_row)
    table.append(diff_row)
    

def get_mean_res(feat_algo_data_prec, feat_algo_data_rec, feat_algo_data_f1, feat_algo_data_roc, feat_algo_data_bal):
    """Build the pairwise comparison table for all techniques in ``feat_algo``.

    Each argument maps technique name -> values for one metric (precision,
    recall, F1, ROC-AUC, balance, in that order). The returned list starts
    with a header row; ``build_table`` then appends its rows for every pair
    of positions ``(i, j)`` with ``i < j`` in ``feat_algo``.
    """
    rows = [["Techniques", "Comparison", "Precision", "Recall", "F1-score", "Roc-Auc", "Balance"]]
    per_metric = [feat_algo_data_prec, feat_algo_data_rec, feat_algo_data_f1, feat_algo_data_roc, feat_algo_data_bal]

    n_algos = len(feat_algo)
    for first in range(n_algos):
        # Starting at first + 1 visits each unordered pair exactly once.
        for second in range(first + 1, n_algos):
            build_table(rows, per_metric, first, second)

    return rows
            

### KNN

In [6]:
# Read the KNN results export and clean it with parsedf.
df = parsedf(pd.read_csv("/kaggle/input/resuts-all/Copy-Final-Results-with-CFS - KNN.csv"))

In [7]:
# One {technique: float Series} mapping per metric column.
feat_algo_data_prec = {}
feat_algo_data_rec = {}
feat_algo_data_f1 = {}
feat_algo_data_roc = {}
feat_algo_data_bal = {}

for algo in feat_algo:
    feat_algo_data_prec[algo] = convert_df(df[df['feat_algo'] == algo], 'prec')
    feat_algo_data_rec[algo] = convert_df(df[df['feat_algo'] == algo], 'recall')
    feat_algo_data_f1[algo] = convert_df(df[df['feat_algo'] == algo], 'f1')
    feat_algo_data_roc[algo] = convert_df(df[df['feat_algo'] == algo], 'roc_auc')
    feat_algo_data_bal[algo] = convert_df(df[df['feat_algo'] == algo], 'bal')

print("KNN")
print(tabulate(get_mean_res(
    feat_algo_data_prec,
    feat_algo_data_rec,
    feat_algo_data_f1,
    feat_algo_data_roc,
    feat_algo_data_bal,
)))

KNN
----------------------------  ----------  ---------  ------  --------  -------  -------
Techniques                    Comparison  Precision  Recall  F1-score  Roc-Auc  Balance
GSMB vs IAMB                  p-value     0.896      0.808   0.814     0.856    0.924
                              mean-diff   0.0↓       0.0↓    0.0↑      0.0↓     0.0↓
GSMB vs ATE-FS                p-value     0.003      0.003   0.002     0.1      0.004
                              mean-diff   0.04↓      0.05↓   0.05↓     0.03↓    0.04↓
GSMB vs Mutual-Info           p-value     0.367      0.0     0.145     0.0      0.0
                              mean-diff   0.01↓      0.09↓   0.02↓     0.07↓    0.04↓
GSMB vs Forward               p-value     0.083      0.0     0.008     0.0      0.0
                              mean-diff   0.02↓      0.09↓   0.04↓     0.08↓    0.05↓
GSMB vs Auto-Spearman         p-value     0.124      0.0     0.012     0.477    0.0
                              mean-diff   0.02↓      

### Decision Tree

In [8]:
# Read the Decision Tree results export and clean it with parsedf.
df = parsedf(pd.read_csv("/kaggle/input/resuts-all/Copy-Final-Results-with-CFS - Decision Tree.csv"))

In [9]:
# One {technique: float Series} mapping per metric column.
feat_algo_data_prec = {}
feat_algo_data_rec = {}
feat_algo_data_f1 = {}
feat_algo_data_roc = {}
feat_algo_data_bal = {}

for algo in feat_algo:
    feat_algo_data_prec[algo] = convert_df(df[df['feat_algo'] == algo], 'prec')
    feat_algo_data_rec[algo] = convert_df(df[df['feat_algo'] == algo], 'recall')
    feat_algo_data_f1[algo] = convert_df(df[df['feat_algo'] == algo], 'f1')
    feat_algo_data_roc[algo] = convert_df(df[df['feat_algo'] == algo], 'roc_auc')
    feat_algo_data_bal[algo] = convert_df(df[df['feat_algo'] == algo], 'bal')

print("Decision Tree")
print(tabulate(get_mean_res(
    feat_algo_data_prec,
    feat_algo_data_rec,
    feat_algo_data_f1,
    feat_algo_data_roc,
    feat_algo_data_bal,
)))

Decision Tree
----------------------------  ----------  ---------  ------  --------  -------  -------
Techniques                    Comparison  Precision  Recall  F1-score  Roc-Auc  Balance
GSMB vs IAMB                  p-value     0.945      0.715   0.869     0.883    0.841
                              mean-diff   0.0↓       0.0↓    0.0↑      0.0↑     0.0↑
GSMB vs ATE-FS                p-value     0.002      0.221   0.008     0.027    0.012
                              mean-diff   0.06↓      0.02↓   0.04↓     0.04↓    0.03↓
GSMB vs Mutual-Info           p-value     0.014      0.001   0.008     0.002    0.0
                              mean-diff   0.04↓      0.04↓   0.04↓     0.05↓    0.04↓
GSMB vs Forward               p-value     0.006      0.002   0.002     0.0      0.0
                              mean-diff   0.05↓      0.04↓   0.05↓     0.06↓    0.04↓
GSMB vs Auto-Spearman         p-value     0.003      0.005   0.001     0.012    0.001
                              mean-diff  

### Logistic Regression

In [10]:
# Read the Logistic Regression results export and clean it with parsedf.
df = parsedf(pd.read_csv("/kaggle/input/resuts-all/Copy-Final-Results-with-CFS - Logistic Regression.csv"))

In [11]:
# One {technique: float Series} mapping per metric column.
feat_algo_data_prec = {}
feat_algo_data_rec = {}
feat_algo_data_f1 = {}
feat_algo_data_roc = {}
feat_algo_data_bal = {}

for algo in feat_algo:
    feat_algo_data_prec[algo] = convert_df(df[df['feat_algo'] == algo], 'prec')
    feat_algo_data_rec[algo] = convert_df(df[df['feat_algo'] == algo], 'recall')
    feat_algo_data_f1[algo] = convert_df(df[df['feat_algo'] == algo], 'f1')
    feat_algo_data_roc[algo] = convert_df(df[df['feat_algo'] == algo], 'roc_auc')
    feat_algo_data_bal[algo] = convert_df(df[df['feat_algo'] == algo], 'bal')

print("Logistic Regression")
print(tabulate(get_mean_res(
    feat_algo_data_prec,
    feat_algo_data_rec,
    feat_algo_data_f1,
    feat_algo_data_roc,
    feat_algo_data_bal,
)))

Logistic Regression
----------------------------  ----------  ---------  ------  --------  -------  -------
Techniques                    Comparison  Precision  Recall  F1-score  Roc-Auc  Balance
GSMB vs IAMB                  p-value     1.0        0.835   0.972     0.835    0.972
                              mean-diff   0.0↑       0.0↑    0.0↑      0.0↑     0.0↑
GSMB vs ATE-FS                p-value     0.004      0.007   0.0       0.0      0.0
                              mean-diff   0.05↓      0.05↓   0.08↓     0.12↓    0.09↓
GSMB vs Mutual-Info           p-value     0.006      0.002   0.001     0.002    0.0
                              mean-diff   0.04↓      0.05↓   0.06↓     0.05↓    0.06↓
GSMB vs Forward               p-value     0.003      0.001   0.0       0.001    0.0
                              mean-diff   0.04↓      0.06↓   0.09↓     0.06↓    0.09↓
GSMB vs Auto-Spearman         p-value     0.005      0.001   0.0       0.001    0.0
                              mean-diff

### Random Forest

In [12]:
# Read the Random Forest results export and clean it with parsedf.
df = parsedf(pd.read_csv("/kaggle/input/resuts-all/Copy-Final-Results-with-CFS - RandomForest.csv"))

In [13]:
# One {technique: float Series} mapping per metric column.
feat_algo_data_prec = {}
feat_algo_data_rec = {}
feat_algo_data_f1 = {}
feat_algo_data_roc = {}
feat_algo_data_bal = {}

for algo in feat_algo:
    feat_algo_data_prec[algo] = convert_df(df[df['feat_algo'] == algo], 'prec')
    feat_algo_data_rec[algo] = convert_df(df[df['feat_algo'] == algo], 'recall')
    feat_algo_data_f1[algo] = convert_df(df[df['feat_algo'] == algo], 'f1')
    feat_algo_data_roc[algo] = convert_df(df[df['feat_algo'] == algo], 'roc_auc')
    feat_algo_data_bal[algo] = convert_df(df[df['feat_algo'] == algo], 'bal')

print("Random Forest")
print(tabulate(get_mean_res(
    feat_algo_data_prec,
    feat_algo_data_rec,
    feat_algo_data_f1,
    feat_algo_data_roc,
    feat_algo_data_bal,
)))

Random Forest
----------------------------  ----------  ---------  ------  --------  -------  -------
Techniques                    Comparison  Precision  Recall  F1-score  Roc-Auc  Balance
GSMB vs IAMB                  p-value     0.788      0.876   0.715     0.89     0.626
                              mean-diff   0.0↑       0.0↑    0.0↑      0.0↑     0.0↑
GSMB vs ATE-FS                p-value     0.019      0.282   0.094     0.023    0.097
                              mean-diff   0.05↓      0.02↓   0.04↓     0.04↓    0.02↓
GSMB vs Mutual-Info           p-value     0.014      0.001   0.005     0.0      0.0
                              mean-diff   0.04↓      0.05↓   0.05↓     0.06↓    0.04↓
GSMB vs Forward               p-value     0.017      0.005   0.005     0.001    0.001
                              mean-diff   0.04↓      0.04↓   0.05↓     0.07↓    0.04↓
GSMB vs Auto-Spearman         p-value     0.0        0.066   0.001     0.0      0.006
                              mean-diff

### SVM

In [14]:
# Read the SVM results export and clean it with parsedf.
df = parsedf(pd.read_csv("/kaggle/input/resuts-all/Copy-Final-Results-with-CFS - SVM.csv"))

In [15]:
# One {technique: float Series} mapping per metric column.
feat_algo_data_prec = {}
feat_algo_data_rec = {}
feat_algo_data_f1 = {}
feat_algo_data_roc = {}
feat_algo_data_bal = {}

for algo in feat_algo:
    feat_algo_data_prec[algo] = convert_df(df[df['feat_algo'] == algo], 'prec')
    feat_algo_data_rec[algo] = convert_df(df[df['feat_algo'] == algo], 'recall')
    feat_algo_data_f1[algo] = convert_df(df[df['feat_algo'] == algo], 'f1')
    feat_algo_data_roc[algo] = convert_df(df[df['feat_algo'] == algo], 'roc_auc')
    feat_algo_data_bal[algo] = convert_df(df[df['feat_algo'] == algo], 'bal')

print("SVM")
print(tabulate(get_mean_res(
    feat_algo_data_prec,
    feat_algo_data_rec,
    feat_algo_data_f1,
    feat_algo_data_roc,
    feat_algo_data_bal,
)))

SVM
----------------------------  ----------  ---------  ------  --------  -------  -------
Techniques                    Comparison  Precision  Recall  F1-score  Roc-Auc  Balance
GSMB vs IAMB                  p-value     0.896      0.924   0.876     0.924    0.89
                              mean-diff   0.0↑       0.0↑    0.0↑      0.0↑     0.0↑
GSMB vs ATE-FS                p-value     0.001      0.008   0.0       0.008    0.0
                              mean-diff   0.05↓      0.06↓   0.11↓     0.06↓    0.13↓
GSMB vs Mutual-Info           p-value     0.003      0.003   0.0       0.003    0.0
                              mean-diff   0.04↓      0.06↓   0.09↓     0.06↓    0.12↓
GSMB vs Forward               p-value     0.002      0.014   0.0       0.014    0.0
                              mean-diff   0.04↓      0.05↓   0.1↓      0.05↓    0.11↓
GSMB vs Auto-Spearman         p-value     0.007      0.048   0.0       0.048    0.0
                              mean-diff   0.03↓      0.0

### XgBoost

In [16]:
# Read the XgBoost results export and clean it with parsedf.
df = parsedf(pd.read_csv("/kaggle/input/resuts-all/Copy-Final-Results-with-CFS - XgBoost.csv"))

In [17]:
# One {technique: float Series} mapping per metric column.
feat_algo_data_prec = {}
feat_algo_data_rec = {}
feat_algo_data_f1 = {}
feat_algo_data_roc = {}
feat_algo_data_bal = {}

for algo in feat_algo:
    feat_algo_data_prec[algo] = convert_df(df[df['feat_algo'] == algo], 'prec')
    feat_algo_data_rec[algo] = convert_df(df[df['feat_algo'] == algo], 'recall')
    feat_algo_data_f1[algo] = convert_df(df[df['feat_algo'] == algo], 'f1')
    feat_algo_data_roc[algo] = convert_df(df[df['feat_algo'] == algo], 'roc_auc')
    feat_algo_data_bal[algo] = convert_df(df[df['feat_algo'] == algo], 'bal')

print("XgBoost")
print(tabulate(get_mean_res(
    feat_algo_data_prec,
    feat_algo_data_rec,
    feat_algo_data_f1,
    feat_algo_data_roc,
    feat_algo_data_bal,
)))

XgBoost
----------------------------  ----------  ---------  ------  --------  -------  -------
Techniques                    Comparison  Precision  Recall  F1-score  Roc-Auc  Balance
GSMB vs IAMB                  p-value     0.993      0.903   0.931     0.952    0.807
                              mean-diff   0.0↓       0.0     0.0↑      0.0↑     0.0↑
GSMB vs ATE-FS                p-value     0.001      0.113   0.013     0.012    0.02
                              mean-diff   0.07↓      0.02↓   0.04↓     0.04↓    0.02↓
GSMB vs Mutual-Info           p-value     0.001      0.01    0.004     0.0      0.001
                              mean-diff   0.06↓      0.04↓   0.05↓     0.06↓    0.03↓
GSMB vs Forward               p-value     0.002      0.009   0.003     0.001    0.001
                              mean-diff   0.06↓      0.04↓   0.05↓     0.06↓    0.03↓
GSMB vs Auto-Spearman         p-value     0.0        0.012   0.001     0.0      0.001
                              mean-diff   0.