In [1]:
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT
from modt._initialization import *
from modt.visualization import *
from modt.utility import *

import pickle
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedKFold

In [2]:
def fi_DT(x,y):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(x, y)
    importances = clf.tree_.compute_feature_importances()
    return (-importances).argsort()[:2]

def fi_LDA(x,y):
    clf = LinearDiscriminantAnalysis()
    clf.fit(x, y)
    weights = np.abs(clf.coef_) / np.sum(np.abs(clf.coef_))
    return (-weights[0]).argsort()[:2]

def fi_intersect(x,y):
    return len(np.intersect1d(fi_LDA(x,y), fi_DT(x,y))) / 2.0

In [3]:
parameters = {
    "X": None,
    "y": None,
    "n_experts": 2,
    "iterations": 1,
    "max_depth": 2,
    "init_learning_rate": 100,
    "learning_rate_decay": 1,
    "initialize_with": "random",
    "initialization_method": None,
    "feature_names": None,
    "class_names": None,
    "use_2_dim_gate_based_on": None,
    "use_2_dim_clustering": False,
    "black_box_algorithm": None,
    }

In [4]:
datasets = [
    ["banknote_input.pd","banknote_target.pd"],
    ["adult_input.pd","adult_target.pd"],
    ["bank_input.pd","bank_target.pd"],
    ["breast_cancer_input.np","breast_cancer_target.np"],
    ["hrss_input.pd","hrss_target.pd"],
    ["iris_input.pd","iris_target.pd"],
    ["occupancy_input.pd","occupancy_target.pd"],
    ["pdm6_input.pd","pdm6_target.pd"],
    ["sensorless_input.pd","sensorless_target.pd"],
    ["steel_input.pd","steel_target.pd"],
]

In [5]:
if True:
    df_intersections = pickle.load(open("df_intersections.pd", "rb"))
else:
    runs = 100
    rows = []
    for dataset in datasets:
        data_input = pickle.load(open("../datasets/" + dataset[0], "rb"))
        data_target = pickle.load(open("../datasets/" + dataset[1], "rb"))
        parameters["X"] = data_input
        parameters["y"] = data_target
        modt = MoDT(**parameters)
        intersections = []
        for _ in range(runs):
            intersections.append(fi_intersect(modt.X,modt.y))
        intersection = np.sum(intersections) / runs
        dict1 = {
            "dataset" : dataset[0],
            "n_features" : modt.X.shape[1],
            "intersection" : intersection,
        }
        rows.append(dict1)
    df = pd.DataFrame(rows)
    df_intersections = df

In [6]:
df_intersections["n_features"] = df_intersections["n_features"] -1
df_intersections

Unnamed: 0,dataset,n_features,intersection
0,banknote_input.pd,4,0.5
1,adult_input.pd,104,0.0
2,bank_input.pd,63,0.0
3,breast_cancer_input.np,10,0.0
4,hrss_input.pd,18,0.0
5,iris_input.pd,4,1.0
6,occupancy_input.pd,5,0.0
7,pdm6_input.pd,8,0.5
8,sensorless_input.pd,48,0.0
9,steel_input.pd,27,0.0


In [14]:
parameters = {
    "X": None,
    "y": None,
    "n_experts": 3,
    "iterations": 60,
    "max_depth": 2,
    "init_learning_rate": 100,
    "learning_rate_decay": 1,
    "initialize_with": "random",
    "initialization_method": None,
    "feature_names": None,
    "class_names": None,
    "use_2_dim_gate_based_on": None,
    "use_2_dim_clustering": False,
    "black_box_algorithm": None,
    }
parameters_fit = {
    "optimization_method": "least_squares_linear_regression",
    "add_noise": False,
    "use_posterior": False,
    }

In [15]:
runs = 10
rows = []
for dataset in datasets:
    data_input = pickle.load(open("../datasets/" + dataset[0], "rb"))
    data_target = pickle.load(open("../datasets/" + dataset[1], "rb"))
    
    use_dataframe = False
    if isinstance(data_input, pd.core.frame.DataFrame):
        use_dataframe = True
        
      
    dimensionality_reduction = ["feature_importance", "feature_importance_lda", "PCA", None]
    dict_results = {
        "dataset" : dataset[0]
    } 
    for method in dimensionality_reduction:
        parameters["use_2_dim_gate_based_on"] = method

        train_accuracies = []
        val_accuracies = []
        rkf = RepeatedKFold(n_splits=5, n_repeats=1)
        for train_idx, val_idx in rkf.split(data_input):
            if use_dataframe:
                X_temp = data_input.iloc[train_idx]
                y_temp = data_target.iloc[train_idx]
                X_temp.reset_index(inplace=True, drop=True)
                y_temp.reset_index(inplace=True, drop=True)
            else:
                X_temp = data_input[train_idx]
                y_temp = data_target[train_idx]

            parameters["X"] = X_temp
            parameters["y"] = y_temp
            modt = MoDT(**parameters)
            modt.fit(**parameters_fit)
            train_accuracies.append(modt.score_internal_disjoint())

            if use_dataframe:
                X_temp = data_input.iloc[val_idx]
                y_temp = data_target.iloc[val_idx]
                X_temp.reset_index(inplace=True, drop=True)
                y_temp.reset_index(inplace=True, drop=True)
            else:
                X_temp = data_input[val_idx]
                y_temp = data_target[val_idx]
            val_accuracies.append(modt.score(X_temp, y_temp))

        train_accuracy = np.mean(train_accuracies)
        val_accuracy = np.mean(val_accuracies)
        dict_results[str(method) + "_train"] = train_accuracy
        dict_results[str(method) + "_test"] = val_accuracy
        
    rows.append(dict_results)
    
df_performance = pd.DataFrame(rows)

In [16]:
df_performance

Unnamed: 0,dataset,feature_importance_train,feature_importance_test,feature_importance_lda_train,feature_importance_lda_test,PCA_train,PCA_test,None_train,None_test
0,banknote_input.pd,0.984877,0.978856,0.996174,0.98834,0.984148,0.969399,0.997813,0.989078
1,adult_input.pd,0.83417,0.833533,0.839856,0.831973,0.841315,0.835853,0.84962,0.783501
2,bank_input.pd,0.909051,0.906551,0.908232,0.908032,0.907437,0.887007,0.913227,0.902423
3,breast_cancer_input.np,0.966609,0.926207,0.960897,0.9034,0.962219,0.915665,0.982861,0.91565
4,hrss_input.pd,0.771358,0.771157,0.768355,0.767858,0.772785,0.774075,0.774138,0.771114
5,iris_input.pd,0.99,0.96,0.991667,0.96,0.99,0.92,0.995,0.973333
6,occupancy_input.pd,0.992816,0.991158,0.9895,0.988456,0.989347,0.98772,0.992785,0.990666
7,pdm6_input.pd,0.975325,0.9726,0.9747,0.9717,0.973425,0.9716,0.984275,0.9822
8,sensorless_input.pd,0.712852,0.711377,0.367204,0.36502,0.603433,0.601839,0.747116,0.74486
9,steel_input.pd,0.64065,0.613079,0.611155,0.590943,0.615016,0.597637,0.706342,0.673358


In [17]:
pickle.dump(df_performance, open("df_fi_performance1.pd", "wb"))

In [13]:
df_c = pd.concat([df_intersections.reset_index(drop=True), df_performance], axis=1)
df_c

Unnamed: 0,dataset,n_features,intersection,dataset.1,feature_importance_train,feature_importance_test,feature_importance_lda_train,feature_importance_lda_test,PCA_train,PCA_test,None_train,None_test
0,banknote_input.pd,4,0.5,banknote_input.pd,0.949343,0.931488,0.97613,0.970845,0.962464,0.951883,0.971024,0.962126
1,adult_input.pd,104,0.0,adult_input.pd,0.825252,0.823486,0.840685,0.831343,0.836574,0.837312,0.833516,0.804457
2,bank_input.pd,63,0.0,bank_input.pd,0.906793,0.905337,0.907206,0.905385,0.90672,0.90354,0.906113,0.903103
3,breast_cancer_input.np,10,0.0,breast_cancer_input.np,0.96178,0.927993,0.94552,0.910309,0.947268,0.922621,0.961341,0.896305
4,hrss_input.pd,18,0.0,hrss_input.pd,0.768344,0.767477,0.766822,0.766758,0.76974,0.76955,0.770914,0.770057
5,iris_input.pd,4,1.0,iris_input.pd,0.981667,0.92,0.975,0.953333,0.976667,0.946667,0.986667,0.946667
6,occupancy_input.pd,5,0.0,occupancy_input.pd,0.989776,0.989193,0.989347,0.988702,0.989224,0.987965,0.990083,0.98993
7,pdm6_input.pd,8,0.5,pdm6_input.pd,0.97275,0.9715,0.97155,0.9697,0.971025,0.9697,0.971925,0.9702
8,sensorless_input.pd,48,0.0,sensorless_input.pd,0.61138,0.612283,0.349861,0.348938,0.463211,0.46133,0.508695,0.508913
9,steel_input.pd,27,0.0,steel_input.pd,0.576381,0.559498,0.582563,0.563116,0.607936,0.581178,0.638072,0.594025


In [19]:
df_c = df_c.loc[:,~df_c.columns.duplicated()]

In [22]:
df_c[["dataset","n_features","intersection","feature_importance_test","feature_importance_lda_test","PCA_test","None_test"]]

Unnamed: 0,dataset,n_features,intersection,feature_importance_test,feature_importance_lda_test,PCA_test,None_test
0,banknote_input.pd,4,0.5,0.931488,0.970845,0.951883,0.962126
1,adult_input.pd,104,0.0,0.823486,0.831343,0.837312,0.804457
2,bank_input.pd,63,0.0,0.905337,0.905385,0.90354,0.903103
3,breast_cancer_input.np,10,0.0,0.927993,0.910309,0.922621,0.896305
4,hrss_input.pd,18,0.0,0.767477,0.766758,0.76955,0.770057
5,iris_input.pd,4,1.0,0.92,0.953333,0.946667,0.946667
6,occupancy_input.pd,5,0.0,0.989193,0.988702,0.987965,0.98993
7,pdm6_input.pd,8,0.5,0.9715,0.9697,0.9697,0.9702
8,sensorless_input.pd,48,0.0,0.612283,0.348938,0.46133,0.508913
9,steel_input.pd,27,0.0,0.559498,0.563116,0.581178,0.594025


In [29]:
for index, row in df_c.iterrows():
    print(row.dataset, "&", row.n_features, "&", row.intersection, "&", "%.3f" % round(row.feature_importance_test,3), "&", "%.3f" % round(row.feature_importance_lda_test,3), "&","%.3f" % round(row.PCA_test,3),"&", "%.3f" % round(row.None_test,3), "\\\\")

banknote_input.pd & 4 & 0.5 & 0.931 & 0.971 & 0.952 & 0.962 \\
adult_input.pd & 104 & 0.0 & 0.823 & 0.831 & 0.837 & 0.804 \\
bank_input.pd & 63 & 0.0 & 0.905 & 0.905 & 0.904 & 0.903 \\
breast_cancer_input.np & 10 & 0.0 & 0.928 & 0.910 & 0.923 & 0.896 \\
hrss_input.pd & 18 & 0.0 & 0.767 & 0.767 & 0.770 & 0.770 \\
iris_input.pd & 4 & 1.0 & 0.920 & 0.953 & 0.947 & 0.947 \\
occupancy_input.pd & 5 & 0.0 & 0.989 & 0.989 & 0.988 & 0.990 \\
pdm6_input.pd & 8 & 0.5 & 0.972 & 0.970 & 0.970 & 0.970 \\
sensorless_input.pd & 48 & 0.0 & 0.612 & 0.349 & 0.461 & 0.509 \\
steel_input.pd & 27 & 0.0 & 0.559 & 0.563 & 0.581 & 0.594 \\
