In [1]:
import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

In [2]:
# Each entry is an [input_file, target_file] pair of pickle file names.
# The pairs are generated from a (stem, extension) table so that the
# "<stem>_input.<ext>" / "<stem>_target.<ext>" naming scheme is spelled out once.
datasets = [
    [stem + "_input." + ext, stem + "_target." + ext]
    for stem, ext in (
        ("abalone", "pd"),
        ("adult", "pd"),  # Large
        ("banknote", "pd"),  # Easy
        ("bank", "pd"),  # Large
        ("breast_cancer", "np"),
        ("cars", "pd"),
        ("contraceptive", "pd"),
        ("generated6", "np"),
        ("hrss", "pd"),  # Large
        ("iris", "pd"),
        ("steel", "pd"),
        ("students", "pd"),
        ("sensorless", "pd"),  # Very Large dataset
    )
]

In [3]:
# Number of repeated fits averaged for every timing figure produced by get_info.
runs = 5

In [4]:
def _load_pickle(filename):
    """Unpickle and return the object stored in ``../datasets/<filename>``."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only point this at dataset files from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open("../datasets/" + filename, "rb") as handle:
        return pickle.load(handle)


def _mean_fit_seconds(X, Y, early_stopping):
    """Average the wall-clock time of ``runs`` construct-and-fit cycles.

    Each cycle builds ``MoDT(X, Y, 3, 100, 2)`` and calls
    ``fit(early_stopping=early_stopping)``; model construction is part of the
    timed region. The number of cycles comes from the module-level ``runs``.

    Returns:
        The mean duration in seconds, rounded to one decimal place and then
        truncated to an ``int``.
    """
    durations = []
    for _ in range(runs):
        start = timer()
        modt = MoDT(X, Y, 3, 100, 2)
        modt.fit(early_stopping=early_stopping)
        durations.append(timer() - start)
    # NOTE(review): int() truncates after rounding to 0.1 s (e.g. 1.9 -> 1).
    # Kept as-is so reported numbers do not change; confirm whether
    # round-to-nearest was intended.
    return int(round(np.mean(durations), 1))


def get_info(dataset, time=False):
    """Collect summary statistics, and optionally fit timings, for one dataset.

    Args:
        dataset: Pair ``[input_file, target_file]`` of pickle file names
            located in ``../datasets/``.
        time: When true, additionally time repeated MoDT fits and return the
            timing row instead of the description row.

    Returns:
        If ``time`` is false:
            ``[name, n, t, f, num, cat, f_transformed]``
        If ``time`` is true:
            ``[name, n, f_transformed, seconds_2D, seconds_FG, seconds_FG_l,
            seconds_FG_a]``

        where ``name`` is the part of the input file name before the first
        underscore, ``n`` / ``f`` are the row / column counts of the input,
        ``t`` is the number of distinct values in the first target column,
        ``num`` / ``cat`` are the counts of numeric / non-numeric input
        columns, and ``f_transformed`` is the column count of ``MoDT.X``
        minus one.
    """
    X = _load_pickle(dataset[0])
    Y = _load_pickle(dataset[1])

    if time:
        # Timings run on the data exactly as loaded, before any DataFrame
        # conversion below, and in the same order as they were originally taken.
        seconds_FG = _mean_fit_seconds(X, Y, False)
        seconds_FG_l = _mean_fit_seconds(X, Y, "likelihood")
        seconds_FG_a = _mean_fit_seconds(X, Y, "accuracy")
        # NOTE(review): this run is configured identically to seconds_FG
        # (same MoDT arguments, early_stopping=False). If "2D" was meant to
        # use a different model setup, it is not reflected here -- confirm.
        seconds_2D = _mean_fit_seconds(X, Y, False)

    # Some datasets are stored as numpy arrays; wrap them so the pandas-based
    # statistics below work uniformly.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(Y, pd.DataFrame):
        Y = pd.DataFrame(Y)

    # Only the text before the first "_" is kept, so "breast_cancer_input.np"
    # is reported as "breast".
    name = dataset[0].split("_")[0]
    n = X.shape[0]
    f = X.shape[1]
    t = len(Y.iloc[:, 0].unique())
    num = len(X.select_dtypes(include=["number"]).columns)
    cat = len(X.select_dtypes(exclude=["number"]).columns)
    # A model is constructed only to read the width of its internal input
    # matrix. The "- 1" presumably discounts a bias column added by MoDT --
    # TODO confirm against the MoDT implementation.
    modt = MoDT(X, Y, 2, 1, 1)
    f_transformed = modt.X.shape[1] - 1

    if time:
        return [name, n, f_transformed, seconds_2D, seconds_FG, seconds_FG_l, seconds_FG_a]
    return [name, n, t, f, num, cat, f_transformed]

In [5]:
# Emit one LaTeX table row per dataset (fields separated by " & "), each
# followed by an \hline rule.
for dataset in datasets:
    print(" & ".join(str(field) for field in get_info(dataset)) + " & \\\\")
    print("\\hline")

abalone & 4177 & 3 & 8 & 7 & 1 & 10 & \\
\hline
adult & 30162 & 2 & 14 & 6 & 8 & 104 & \\
\hline
banknote & 1372 & 2 & 4 & 4 & 0 & 4 & \\
\hline
bank & 41188 & 2 & 20 & 10 & 10 & 63 & \\
\hline
breast & 569 & 2 & 10 & 10 & 0 & 10 & \\
\hline
cars & 1728 & 4 & 6 & 0 & 6 & 21 & \\
\hline
contraceptive & 1473 & 3 & 9 & 2 & 7 & 24 & \\
\hline
generated6 & 5000 & 5 & 2 & 2 & 0 & 2 & \\
\hline
hrss & 23645 & 2 & 18 & 18 & 0 & 18 & \\
\hline
iris & 150 & 3 & 4 & 4 & 0 & 4 & \\
\hline
steel & 1941 & 7 & 27 & 27 & 0 & 27 & \\
\hline
students & 666 & 4 & 11 & 0 & 11 & 49 & \\
\hline


KeyboardInterrupt: 

In [None]:
# Emit one LaTeX table row of fit timings per dataset; every row is
# terminated by a LaTeX line break.
for dataset in datasets:
    print(" & ".join(str(field) for field in get_info(dataset, time=True)) + " \\\\")

abalone & 4177 & 10 & 1 & 1 & 1 & 1 \\
adult & 30162 & 104 & 22 & 21 & 23 & 22 \\
banknote & 1372 & 4 & 0 & 1 & 1 & 0 \\
bank & 41188 & 63 & 22 & 22 & 24 & 22 \\
breast & 569 & 10 & 0 & 0 & 0 & 0 \\
cars & 1728 & 21 & 0 & 0 & 1 & 0 \\
contraceptive & 1473 & 24 & 0 & 0 & 1 & 0 \\
generated6 & 5000 & 2 & 1 & 1 & 1 & 1 \\
hrss & 23645 & 18 & 9 & 8 & 8 & 6 \\
iris & 150 & 4 & 0 & 0 & 0 & 0 \\
steel & 1941 & 27 & 2 & 2 & 2 & 2 \\
students & 666 & 49 & 0 & 0 & 0 & 0 \\
