In [1]:
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT
from modt._initialization import *
from modt.visualization import *
from modt.utility import *

import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedKFold

In [2]:
def fi_DT(x, y):
    """Return the indices of the two features a decision tree ranks highest.

    A ``DecisionTreeClassifier`` with default settings is fitted on ``(x, y)``
    and the importances computed by the fitted tree are ranked in descending
    order.

    Args:
        x: Feature matrix handed to ``DecisionTreeClassifier.fit``.
        y: Target values handed to ``DecisionTreeClassifier.fit``.

    Returns:
        Array with (at most) two feature indices, most important first.
    """
    fitted_tree = tree.DecisionTreeClassifier().fit(x, y)
    scores = fitted_tree.tree_.compute_feature_importances()
    # argsort is ascending, so rank the negated scores to get a descending order.
    ranking = np.argsort(-scores)
    return ranking[:2]

def fi_LDA(x, y):
    """Return the indices of the two features with the largest LDA weights.

    A ``LinearDiscriminantAnalysis`` model is fitted on ``(x, y)``; the absolute
    coefficients are normalised by their total sum and the first row of the
    result is ranked in descending order.

    Args:
        x: Feature matrix handed to ``LinearDiscriminantAnalysis.fit``.
        y: Target values handed to ``LinearDiscriminantAnalysis.fit``.

    Returns:
        Array with (at most) two feature indices, largest weight first.
    """
    lda = LinearDiscriminantAnalysis()
    lda.fit(x, y)
    magnitudes = np.abs(lda.coef_)
    shares = magnitudes / np.sum(magnitudes)
    # NOTE(review): only row 0 of coef_ is ranked. If coef_ holds one row per
    # class for multi-class targets, the other classes are ignored — confirm
    # that this is intended.
    return np.argsort(-shares[0])[:2]

def fi_intersect(x, y):
    """Measure how much the top-2 features of LDA and the decision tree agree.

    Args:
        x: Feature matrix forwarded to ``fi_LDA`` and ``fi_DT``.
        y: Target values forwarded to ``fi_LDA`` and ``fi_DT``.

    Returns:
        Number of feature indices selected by both methods, divided by 2.0.
    """
    top_lda = fi_LDA(x, y)
    top_dt = fi_DT(x, y)
    shared = np.intersect1d(top_lda, top_dt)
    return len(shared) / 2.0

In [3]:
# Keyword arguments later unpacked into MoDT(**parameters) for the
# feature-importance overlap experiment. "X" and "y" are placeholders that are
# filled in per dataset before the model is constructed.
parameters = dict(
    X=None,
    y=None,
    n_experts=2,
    iterations=1,
    max_depth=2,
    init_learning_rate=100,
    learning_rate_decay=1,
    initialize_with="random",
    initialization_method=None,
    feature_names=None,
    class_names=None,
    use_2_dim_gate_based_on=None,
    use_2_dim_clustering=False,
    black_box_algorithm=None,
)

In [4]:
# Benchmark datasets as [input_file, target_file] pairs. The file names are
# later appended to "../datasets/" and read with pickle.
datasets = list(map(list, (
    ("banknote_input.pd", "banknote_target.pd"),
    ("adult_input.pd", "adult_target.pd"),
    ("bank_input.pd", "bank_target.pd"),
    ("breast_cancer_input.np", "breast_cancer_target.np"),
    ("hrss_input.pd", "hrss_target.pd"),
    ("iris_input.pd", "iris_target.pd"),
    ("occupancy_input.pd", "occupancy_target.pd"),
    ("pdm6_input.pd", "pdm6_target.pd"),
    ("sensorless_input.pd", "sensorless_target.pd"),
    ("steel_input.pd", "steel_target.pd"),
)))

In [5]:
# Feature-importance overlap per dataset: how often the top-2 features chosen
# by LDA and by a decision tree coincide.
#
# The table is read from a pickled cache when that file exists; otherwise it is
# recomputed from the raw datasets. The recomputed table is not written back to
# disk by this cell.
intersections_cache = "df_intersections.pd"

if os.path.exists(intersections_cache):
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    with open(intersections_cache, "rb") as cache_file:
        df_intersections = pickle.load(cache_file)
else:
    runs = 100  # repetitions of the overlap measurement per dataset
    rows = []
    for dataset in datasets:
        # Context managers make sure the dataset files are closed again.
        with open("../datasets/" + dataset[0], "rb") as input_file:
            data_input = pickle.load(input_file)
        with open("../datasets/" + dataset[1], "rb") as target_file:
            data_target = pickle.load(target_file)

        parameters["X"] = data_input
        parameters["y"] = data_target
        # The model is only constructed (never fitted) here; the overlap is
        # measured on the X / y attributes the MoDT instance exposes.
        modt = MoDT(**parameters)

        intersections = [fi_intersect(modt.X, modt.y) for _ in range(runs)]
        rows.append({
            "dataset": dataset[0],
            "n_features": modt.X.shape[1],
            "intersection": np.mean(intersections),
        })
    df_intersections = pd.DataFrame(rows)

In [6]:
# Report one feature fewer than was recorded for each dataset.
# NOTE(review): presumably the recorded count includes one extra column that is
# not an original feature — confirm against MoDT.
# Caution: this overwrites the column in place, so running the cell a second
# time subtracts one again.
df_intersections["n_features"] = df_intersections["n_features"].sub(1)
df_intersections

Unnamed: 0,dataset,n_features,intersection
0,banknote_input.pd,4,0.5
1,adult_input.pd,104,0.0
2,bank_input.pd,63,0.0
3,breast_cancer_input.np,10,0.0
4,hrss_input.pd,18,0.0
5,iris_input.pd,4,1.0
6,occupancy_input.pd,5,0.0
7,pdm6_input.pd,8,0.5
8,sensorless_input.pd,48,0.0
9,steel_input.pd,27,0.0


In [7]:
# Keyword arguments unpacked into MoDT(**parameters) for the accuracy
# benchmark. This replaces the earlier `parameters` dict; "X", "y" and
# "use_2_dim_gate_based_on" are overwritten inside the benchmark loop.
parameters = dict(
    X=None,
    y=None,
    n_experts=3,
    iterations=100,
    max_depth=2,
    init_learning_rate=100,
    learning_rate_decay=0.995,
    initialize_with="random",
    initialization_method=None,
    feature_names=None,
    class_names=None,
    use_2_dim_gate_based_on=None,
    use_2_dim_clustering=False,
    black_box_algorithm=None,
)

# Keyword arguments unpacked into modt.fit(**parameters_fit).
parameters_fit = dict(
    optimization_method="least_squares_linear_regression",
    add_noise=False,
    use_posterior=False,
)

In [8]:
def _select_rows(data, row_idx, use_dataframe):
    """Return the rows of ``data`` at the positions in ``row_idx``.

    Args:
        data: Container holding all samples (inputs or targets).
        row_idx: Integer positions of the rows to extract.
        use_dataframe: When True, ``data`` is sliced positionally via ``.iloc``
            and the result gets a fresh 0..n-1 index; otherwise ``data`` is
            indexed directly with ``row_idx``.

    Returns:
        The selected rows as a new object; ``data`` itself is not modified.
    """
    if use_dataframe:
        # Non-inplace reset avoids mutating a slice of the original frame.
        return data.iloc[row_idx].reset_index(drop=True)
    return data[row_idx]


# Values tried for the "use_2_dim_gate_based_on" MoDT argument; None is passed
# through unchanged. The list is the same for every dataset.
dimensionality_reduction = ["feature_importance", "feature_importance_lda", "PCA", None]

start = timer()
runs = 1  # number of repetitions of the 5-fold split
rows = []
for dataset in datasets:
    print("Starting",dataset[0],"...")
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    with open("../datasets/" + dataset[0], "rb") as input_file:
        data_input = pickle.load(input_file)
    with open("../datasets/" + dataset[1], "rb") as target_file:
        data_target = pickle.load(target_file)

    # The type of the inputs decides how both inputs and targets are sliced.
    use_dataframe = isinstance(data_input, pd.DataFrame)

    dict_results = {"dataset": dataset[0]}

    for method in dimensionality_reduction:
        print("Starting",method,"...")
        parameters["use_2_dim_gate_based_on"] = method

        train_accuracies = []
        val_accuracies = []
        # NOTE(review): no random_state is passed, so the folds (and therefore
        # the reported accuracies) differ between executions.
        rkf = RepeatedKFold(n_splits=5, n_repeats=runs)
        for train_idx, val_idx in rkf.split(data_input):
            # Fit on the training fold.
            parameters["X"] = _select_rows(data_input, train_idx, use_dataframe)
            parameters["y"] = _select_rows(data_target, train_idx, use_dataframe)
            modt = MoDT(**parameters)
            modt.fit(**parameters_fit)
            train_accuracies.append(modt.score_internal_disjoint())

            # Score on the held-out fold.
            X_val = _select_rows(data_input, val_idx, use_dataframe)
            y_val = _select_rows(data_target, val_idx, use_dataframe)
            val_accuracies.append(modt.score(X_val, y_val))

        # Average over all folds; str(method) turns None into "None".
        dict_results[str(method) + "_train"] = np.mean(train_accuracies)
        dict_results[str(method) + "_test"] = np.mean(val_accuracies)

    rows.append(dict_results)

print("Duration", timer() - start)
df_performance = pd.DataFrame(rows)

Starting banknote_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting adult_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting bank_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting breast_cancer_input.np ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting hrss_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting iris_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting occupancy_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting PCA ...
Starting None ...
Starting pdm6_input.pd ...
Starting feature_importance ...
Starting fea

In [9]:
# Display the per-dataset train/test accuracy table for every gating variant.
df_performance

Unnamed: 0,dataset,feature_importance_train,feature_importance_test,feature_importance_lda_train,feature_importance_lda_test,PCA_train,PCA_test,None_train,None_test
0,banknote_input.pd,0.981958,0.97449,0.993256,0.990522,0.984694,0.974487,0.999271,0.996353
1,adult_input.pd,0.834071,0.826504,0.839533,0.839434,0.843387,0.817484,0.851096,0.802302
2,bank_input.pd,0.908044,0.908347,0.908693,0.906575,0.907437,0.905337,0.911267,0.903322
3,breast_cancer_input.np,0.969684,0.924406,0.957819,0.910371,0.960459,0.903369,0.984187,0.920975
4,hrss_input.pd,0.769201,0.76938,0.77085,0.770649,0.771812,0.772341,0.7762,0.775851
5,iris_input.pd,0.988333,0.953333,0.99,0.926667,0.981667,0.94,0.995,0.96
6,occupancy_input.pd,0.992754,0.992386,0.989347,0.987229,0.989899,0.988088,0.992632,0.990791
7,pdm6_input.pd,0.9744,0.9727,0.9743,0.9715,0.973775,0.9713,0.984275,0.9806
8,sensorless_input.pd,0.723397,0.71912,0.387701,0.385102,0.604565,0.602813,0.695205,0.691262
9,steel_input.pd,0.64168,0.617202,0.601879,0.589907,0.626995,0.582693,0.730035,0.675418


In [10]:
# Persist the accuracy table. The context manager guarantees that the file is
# flushed and closed, which a bare open(...) passed to pickle.dump does not.
with open("df_fi_performance1.pd", "wb") as performance_file:
    pickle.dump(df_performance, performance_file)

In [11]:
# Put the overlap table and the accuracy table side by side. Rows are matched
# by index label, so the overlap table is given a fresh 0..n-1 index first.
# Both tables carry a "dataset" column, which therefore appears twice here.
df_c = pd.concat(
    objs=[df_intersections.reset_index(drop=True), df_performance],
    axis="columns",
)
df_c

Unnamed: 0,dataset,n_features,intersection,dataset.1,feature_importance_train,feature_importance_test,feature_importance_lda_train,feature_importance_lda_test,PCA_train,PCA_test,None_train,None_test
0,banknote_input.pd,4,0.5,banknote_input.pd,0.981958,0.97449,0.993256,0.990522,0.984694,0.974487,0.999271,0.996353
1,adult_input.pd,104,0.0,adult_input.pd,0.834071,0.826504,0.839533,0.839434,0.843387,0.817484,0.851096,0.802302
2,bank_input.pd,63,0.0,bank_input.pd,0.908044,0.908347,0.908693,0.906575,0.907437,0.905337,0.911267,0.903322
3,breast_cancer_input.np,10,0.0,breast_cancer_input.np,0.969684,0.924406,0.957819,0.910371,0.960459,0.903369,0.984187,0.920975
4,hrss_input.pd,18,0.0,hrss_input.pd,0.769201,0.76938,0.77085,0.770649,0.771812,0.772341,0.7762,0.775851
5,iris_input.pd,4,1.0,iris_input.pd,0.988333,0.953333,0.99,0.926667,0.981667,0.94,0.995,0.96
6,occupancy_input.pd,5,0.0,occupancy_input.pd,0.992754,0.992386,0.989347,0.987229,0.989899,0.988088,0.992632,0.990791
7,pdm6_input.pd,8,0.5,pdm6_input.pd,0.9744,0.9727,0.9743,0.9715,0.973775,0.9713,0.984275,0.9806
8,sensorless_input.pd,48,0.0,sensorless_input.pd,0.723397,0.71912,0.387701,0.385102,0.604565,0.602813,0.695205,0.691262
9,steel_input.pd,27,0.0,steel_input.pd,0.64168,0.617202,0.601879,0.589907,0.626995,0.582693,0.730035,0.675418


In [15]:
# Keep only the first occurrence of every column label (drops repeated columns).
is_repeated_label = df_c.columns.duplicated(keep="first")
df_c = df_c.loc[:, ~is_repeated_label]

In [16]:
# Show the dataset description next to the test accuracy of each gating variant.
df_c.loc[:, [
    "dataset",
    "n_features",
    "intersection",
    "feature_importance_test",
    "feature_importance_lda_test",
    "PCA_test",
    "None_test",
]]

Unnamed: 0,dataset,n_features,intersection,feature_importance_test,feature_importance_lda_test,PCA_test,None_test
0,banknote_input.pd,4,0.5,0.97449,0.990522,0.974487,0.996353
1,adult_input.pd,104,0.0,0.826504,0.839434,0.817484,0.802302
2,bank_input.pd,63,0.0,0.908347,0.906575,0.905337,0.903322
3,breast_cancer_input.np,10,0.0,0.924406,0.910371,0.903369,0.920975
4,hrss_input.pd,18,0.0,0.76938,0.770649,0.772341,0.775851
5,iris_input.pd,4,1.0,0.953333,0.926667,0.94,0.96
6,occupancy_input.pd,5,0.0,0.992386,0.987229,0.988088,0.990791
7,pdm6_input.pd,8,0.5,0.9727,0.9715,0.9713,0.9806
8,sensorless_input.pd,48,0.0,0.71912,0.385102,0.602813,0.691262
9,steel_input.pd,27,0.0,0.617202,0.589907,0.582693,0.675418


In [14]:
# Emit one LaTeX table row per dataset: description columns as-is, test
# accuracies with three decimals, cells separated by " & " and terminated by
# a double backslash.
accuracy_columns = (
    "feature_importance_test",
    "feature_importance_lda_test",
    "PCA_test",
    "None_test",
)
for row_label, row in df_c.iterrows():
    cells = [str(row["dataset"]), str(row["n_features"]), str(row["intersection"])]
    cells += ["%.3f" % round(row[column], 3) for column in accuracy_columns]
    print(" & ".join(cells), "\\\\")

banknote_input.pd & 4 & 0.5 & 0.974 & 0.991 & 0.974 & 0.996 \\
adult_input.pd & 104 & 0.0 & 0.827 & 0.839 & 0.817 & 0.802 \\
bank_input.pd & 63 & 0.0 & 0.908 & 0.907 & 0.905 & 0.903 \\
breast_cancer_input.np & 10 & 0.0 & 0.924 & 0.910 & 0.903 & 0.921 \\
hrss_input.pd & 18 & 0.0 & 0.769 & 0.771 & 0.772 & 0.776 \\
iris_input.pd & 4 & 1.0 & 0.953 & 0.927 & 0.940 & 0.960 \\
occupancy_input.pd & 5 & 0.0 & 0.992 & 0.987 & 0.988 & 0.991 \\
pdm6_input.pd & 8 & 0.5 & 0.973 & 0.972 & 0.971 & 0.981 \\
sensorless_input.pd & 48 & 0.0 & 0.719 & 0.385 & 0.603 & 0.691 \\
steel_input.pd & 27 & 0.0 & 0.617 & 0.590 & 0.583 & 0.675 \\
