In [1]:
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT
from modt._initialization import *
from modt.visualization import *
from modt.utility import *

import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedKFold

In [2]:
# Keyword arguments handed to MoDT(**parameters). "X", "y" and
# "use_2_dim_gate_based_on" start out empty and are filled in per dataset /
# per gate reduction method by the experiment loop further down.
parameters = dict(
    X=None,
    y=None,
    n_experts=2,
    iterations=1,
    max_depth=2,
    init_learning_rate=100,
    learning_rate_decay=1,
    initialization_method="random",
    feature_names=None,
    class_names=None,
    use_2_dim_gate_based_on=None,
    use_2_dim_clustering=False,
    black_box_algorithm=None,
)

In [3]:
# (file stem, pickle extension) of every dataset used in this experiment.
_dataset_stems = [
    ("abalone", "pd"),
    ("adult", "pd"),  # Large
    ("banknote", "pd"),  # Easy
    ("bank", "pd"),  # Large
    ("breast_cancer", "np"),
    ("cars", "pd"),
    ("contraceptive", "pd"),
    ("generated6", "np"),
    ("hrss", "pd"),  # Large
    ("iris", "pd"),
    ("steel", "pd"),
    ("students", "pd"),
    # ("sensorless", "pd"),  # Very Large dataset
]

# Each entry is [input_file, target_file], e.g. ["iris_input.pd", "iris_target.pd"].
datasets = [
    [stem + "_input." + extension, stem + "_target." + extension]
    for stem, extension in _dataset_stems
]

In [4]:
# Values passed as "use_2_dim_gate_based_on"; every one is the common prefix
# "feature_importance" followed by a method-specific suffix (empty for the
# first entry).
_method_suffixes = ("", "_lda", "_lda_max", "_lr", "_lr_max", "_xgb", "_pca_loadings")
gate_reduction_methods = ["feature_importance" + suffix for suffix in _method_suffixes]

In [5]:
runs = 10  # MoDT initialisations per (dataset, gate reduction method) pair
dfs = []   # collects one long-format intersection table per dataset


def _load_pickle(path):
    """Unpickle the object stored at `path` and close the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted dataset files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


for dataset in datasets:
    # Each entry is [input_file, target_file]; both live in ../datasets/.
    data_input = _load_pickle("../datasets/" + dataset[0])
    data_target = _load_pickle("../datasets/" + dataset[1])
    parameters["X"] = data_input
    parameters["y"] = data_target

    # For every gate reduction method, record the feature indices chosen in
    # each of the `runs` model initialisations.
    features_by_method = {}
    for gate_reduction_method in gate_reduction_methods:
        parameters["use_2_dim_gate_based_on"] = gate_reduction_method
        selected_features_idx = []
        for _ in range(runs):
            modt = MoDT(**parameters)
            # The last mask entry is dropped (presumably the bias column -- TODO confirm).
            selected_features_idx.append(modt.X_top_2_mask[:-1])
        features_by_method[gate_reduction_method] = selected_features_idx

    # Calculate intersections for every ordered pair of methods (self-pairs included).
    rows = []
    for g1 in gate_reduction_methods:
        for g2 in gate_reduction_methods:
            # np.intersect1d flattens both inputs and compares their unique
            # values, so the indices of all runs are pooled per method first.
            # NOTE(review): dividing by 2.0 assumes every method settles on the
            # same two features in all runs; if runs disagree the pooled sets
            # grow and the ratio can exceed 1 -- confirm this is intended.
            shared = np.intersect1d(features_by_method[g1], features_by_method[g2])
            rows.append({
                "dataset": dataset[0],
                "name": g1 + " vs. " + g2,
                "intersection": len(shared) / 2.0,
            })
    df_i = pd.DataFrame(rows)
    dfs.append(df_i)


In [6]:
# Stack the per-dataset intersection tables row-wise into one long frame.
# Each table keeps its own row labels, so index values repeat across datasets.
df = pd.concat(dfs, axis=0, ignore_index=False)
df

Unnamed: 0,dataset,name,intersection
0,abalone_input.pd,feature_importance vs. feature_importance,1.0
1,abalone_input.pd,feature_importance vs. feature_importance_lda,0.5
2,abalone_input.pd,feature_importance vs. feature_importance_lda_max,0.5
3,abalone_input.pd,feature_importance vs. feature_importance_lr,0.5
4,abalone_input.pd,feature_importance vs. feature_importance_lr_max,0.5
...,...,...,...
44,students_input.pd,feature_importance_pca_loadings vs. feature_im...,0.0
45,students_input.pd,feature_importance_pca_loadings vs. feature_im...,0.0
46,students_input.pd,feature_importance_pca_loadings vs. feature_im...,0.0
47,students_input.pd,feature_importance_pca_loadings vs. feature_im...,0.0


In [7]:
# Names of all 21 unordered pairs of gate reduction methods, written as
# "<first> vs. <second>" where <first> precedes <second> in the list below.
_paired_methods = [
    "feature_importance",
    "feature_importance_lda",
    "feature_importance_lda_max",
    "feature_importance_lr",
    "feature_importance_lr_max",
    "feature_importance_xgb",
    "feature_importance_pca_loadings",
]
combinations = np.array([
    " vs. ".join((first, second))
    for position, first in enumerate(_paired_methods)
    for second in _paired_methods[position + 1:]
])

In [8]:
# Pivot the long table `df` into a wide one: one row per dataset and one
# column per method pair listed in `combinations`.
rows = []
for input_file, _ in datasets:
    # Restrict to this dataset once instead of re-filtering for every pair.
    per_dataset = df[df["dataset"] == input_file]
    record = {"dataset": input_file}
    for pair_name in combinations:
        matches = per_dataset.loc[per_dataset["name"] == pair_name, "intersection"]
        # Take the first matching value; raises IndexError if the pair is missing.
        record[pair_name] = matches.values[0]
    rows.append(record)
df_results = pd.DataFrame(rows)

In [9]:
# Show the wide table: one row per dataset, one column per method pair.
df_results

Unnamed: 0,dataset,feature_importance vs. feature_importance_lda,feature_importance vs. feature_importance_lda_max,feature_importance vs. feature_importance_lr,feature_importance vs. feature_importance_lr_max,feature_importance vs. feature_importance_xgb,feature_importance vs. feature_importance_pca_loadings,feature_importance_lda vs. feature_importance_lda_max,feature_importance_lda vs. feature_importance_lr,feature_importance_lda vs. feature_importance_lr_max,...,feature_importance_lda_max vs. feature_importance_lr,feature_importance_lda_max vs. feature_importance_lr_max,feature_importance_lda_max vs. feature_importance_xgb,feature_importance_lda_max vs. feature_importance_pca_loadings,feature_importance_lr vs. feature_importance_lr_max,feature_importance_lr vs. feature_importance_xgb,feature_importance_lr vs. feature_importance_pca_loadings,feature_importance_lr_max vs. feature_importance_xgb,feature_importance_lr_max vs. feature_importance_pca_loadings,feature_importance_xgb vs. feature_importance_pca_loadings
0,abalone_input.pd,0.5,0.5,0.5,0.5,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.5,0.5,1.0,0.5,0.5,0.5,0.5,0.0
1,adult_input.pd,0.5,0.5,0.5,0.5,0.5,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,banknote_input.pd,0.5,0.5,1.0,1.0,1.0,0.0,1.0,0.5,0.5,...,0.5,0.5,0.5,0.5,1.0,1.0,0.0,1.0,0.0,0.0
3,bank_input.pd,0.5,0.5,0.5,0.5,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.5,0.0,1.0,0.5,0.0,0.5,0.0,0.0
4,breast_cancer_input.np,0.0,0.0,1.0,1.0,0.5,0.5,1.0,0.0,0.0,...,0.0,0.0,0.5,0.0,1.0,0.5,0.5,0.5,0.5,0.5
5,cars_input.pd,1.0,0.5,1.0,1.0,1.0,0.5,0.5,1.0,1.0,...,0.5,0.5,0.5,1.0,1.0,1.0,0.5,1.0,0.5,0.5
6,contraceptive_input.pd,1.0,1.0,1.0,1.0,0.5,0.0,1.0,1.0,1.0,...,1.0,1.0,0.5,0.0,1.0,0.5,0.0,0.5,0.0,0.0
7,generated6_input.np,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,hrss_input.pd,0.0,0.0,0.0,0.0,0.5,0.5,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5
9,iris_input.pd,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0,...,1.0,1.0,1.0,0.5,1.0,1.0,0.5,1.0,0.5,0.5


In [10]:
# Copy every "<first> vs. <second>" column to a short "<A>-<B>" alias column.
# Abbreviation used for each gate reduction method in the alias:
_abbreviations = {
    'feature_importance': 'DT',
    'feature_importance_lda': 'LDA',
    'feature_importance_lda_max': 'LDAm',
    'feature_importance_lr': 'LR',
    'feature_importance_lr_max': 'LRm',
    'feature_importance_xgb': 'XGB',
    'feature_importance_pca_loadings': 'L',
}
_ordered_methods = list(_abbreviations)
# Walk the unordered pairs in list order so that the alias columns are appended
# in the sequence DT-LDA, DT-LDAm, ..., LRm-L, XGB-L.
for _position, _first in enumerate(_ordered_methods):
    for _second in _ordered_methods[_position + 1:]:
        _alias = _abbreviations[_first] + '-' + _abbreviations[_second]
        df_results[_alias] = df_results[_first + ' vs. ' + _second]

In [11]:
# Remove the long "<first> vs. <second>" columns listed in `combinations`;
# rebinding the name avoids mutating the frame in place.
df_results = df_results.drop(columns=combinations)

In [12]:
# Show the table again, now that the columns listed in `combinations` were dropped.
df_results

Unnamed: 0,dataset,DT-LDA,DT-LDAm,DT-LR,DT-LRm,DT-XGB,DT-L,LDA-LDAm,LDA-LR,LDA-LRm,...,LDAm-LR,LDAm-LRm,LDAm-XGB,LDAm-L,LR-LRm,LR-XGB,LR-L,LRm-XGB,LRm-L,XGB-L
0,abalone_input.pd,0.5,0.5,0.5,0.5,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.5,0.5,1.0,0.5,0.5,0.5,0.5,0.0
1,adult_input.pd,0.5,0.5,0.5,0.5,0.5,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,banknote_input.pd,0.5,0.5,1.0,1.0,1.0,0.0,1.0,0.5,0.5,...,0.5,0.5,0.5,0.5,1.0,1.0,0.0,1.0,0.0,0.0
3,bank_input.pd,0.5,0.5,0.5,0.5,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.5,0.0,1.0,0.5,0.0,0.5,0.0,0.0
4,breast_cancer_input.np,0.0,0.0,1.0,1.0,0.5,0.5,1.0,0.0,0.0,...,0.0,0.0,0.5,0.0,1.0,0.5,0.5,0.5,0.5,0.5
5,cars_input.pd,1.0,0.5,1.0,1.0,1.0,0.5,0.5,1.0,1.0,...,0.5,0.5,0.5,1.0,1.0,1.0,0.5,1.0,0.5,0.5
6,contraceptive_input.pd,1.0,1.0,1.0,1.0,0.5,0.0,1.0,1.0,1.0,...,1.0,1.0,0.5,0.0,1.0,0.5,0.0,0.5,0.0,0.0
7,generated6_input.np,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,hrss_input.pd,0.0,0.0,0.0,0.0,0.5,0.5,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5
9,iris_input.pd,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0,...,1.0,1.0,1.0,0.5,1.0,1.0,0.5,1.0,0.5,0.5


In [13]:
# Persist the intersection table. The context manager guarantees that the file
# is flushed and closed even if pickling fails part-way through.
with open("dataframes/ex4a_df_intersections_feature_selection.pd", "wb") as results_file:
    pickle.dump(df_results, results_file)