In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
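# Select the geomstats backend. The variable is set here, ahead of the imports
# below, so that it is already in the environment when geomstats is first loaded.
# Shell equivalent: export GEOMSTATS_BACKEND=numpy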
import os
os.environ['GEOMSTATS_BACKEND'] = 'numpy'

import numpy as np
import hyperdt.benchmarking as benchmarking
from hyperdt.product_space_DT import ProductSpace, ProductSpaceDT
from hyperdt.forest import ProductSpaceRF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

INFO: Using numpy backend


In [3]:
# Benchmark configuration shared by the cells below.
NUM_POINTS = 1000
NUM_CLASSES = 4

# One entry per benchmarked product space. Every entry is a list of 2-tuples,
# one tuple per factor. The table-formatting cells print the first value as-is
# and use the sign of the second value to pick the LaTeX macro (negative -> \H,
# zero -> \E, positive -> \S) -- presumably (dimension, curvature) of a
# hyperbolic / Euclidean / spherical factor.
signatures = [
    # --- disabled: single factors and mixed-dimension products ---
    # [(3, -1.)],
    # [(3, 1.)],
    # [(3, -1.), (4, -.5)],
    # [(3, -1.), (4, 1.)],
    # [(3, -1.), (4, 1.), (5, 0.)],
    # [(3, -1.), (4, -.5), (3, .5), (4, 1.)],
    # [(3, -1.), (4, -.5), (3, .5), (4, 1.), (5, 0.)],
    # --- active: two 5-dimensional factors ---
    [(5, -1)] * 2,
    [(5, 1)] * 2,
    [(5, -1), (5, 1)],
    # --- active: five 2-dimensional factors ---
    [(2, -1)] * 5,
    [(2, 1)] * 5,
    [(2, -1)] * 2 + [(2, 0)] + [(2, 1)] * 2,
    # --- disabled: single 10-dimensional factors ---
    # [(10,-1)],
    # [(10, 1)],
    # [(10, 0)],
]


In [4]:
import pandas as pd
from tqdm import tqdm

# Base seed handed to the benchmark; also recorded in every result row.
seed = 0

# Single-tree benchmark over every configured signature.
# The second positional argument is taken from NUM_POINTS (previously a repeated
# literal 1000, which left the config constant without any effect) -- presumably
# it is the number of sampled points; confirm against
# benchmarking.compute_scores_by_signature.
rnd_seeds, psdt_scores_by_signature, dt_scores_by_signature = benchmarking.compute_scores_by_signature(
    signatures, NUM_POINTS, NUM_CLASSES, seed=seed, max_depth=3, n_seeds=100
)

# Collect one record per signature and build the DataFrame in a single step, so
# that `results` is only ever bound to a DataFrame (never to a temporary list).
result_rows = [
    {
        "seed": seed,
        "signature": signature,
        "psdt_score": psdt_score,
        "dt_score": dt_score,
    }
    for signature, psdt_score, dt_score in zip(
        signatures, psdt_scores_by_signature, dt_scores_by_signature
    )
]

results = pd.DataFrame(result_rows)
results

INFO: NumExpr defaulting to 8 threads.
100%|██████████| 600/600 [09:04<00:00,  1.10it/s]


Unnamed: 0,seed,signature,psdt_score,dt_score
0,0,"[(5, -1), (5, -1)]","[0.97, 0.98, 0.915, 0.925, 0.98, 1.0, 0.995, 0...","[0.95, 0.89, 0.865, 0.835, 0.9, 0.845, 0.985, ..."
1,0,"[(5, 1), (5, 1)]","[0.675, 0.54, 0.56, 0.6, 0.51, 0.545, 0.67, 0....","[0.675, 0.535, 0.545, 0.625, 0.535, 0.5, 0.665..."
2,0,"[(5, -1), (5, 1)]","[0.97, 0.98, 0.915, 0.925, 0.98, 1.0, 0.995, 0...","[0.95, 0.885, 0.865, 0.835, 0.9, 0.87, 0.985, ..."
3,0,"[(2, -1), (2, -1), (2, -1), (2, -1), (2, -1)]","[0.705, 0.955, 0.72, 0.805, 0.6, 0.66, 0.84, 0...","[0.72, 0.935, 0.71, 0.805, 0.615, 0.665, 0.8, ..."
4,0,"[(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]","[0.655, 0.715, 0.49, 0.685, 0.45, 0.575, 0.665...","[0.6, 0.695, 0.495, 0.65, 0.42, 0.6, 0.66, 0.5..."
5,0,"[(2, -1), (2, -1), (2, 0), (2, 1), (2, 1)]","[0.77, 0.955, 0.75, 0.805, 0.625, 0.69, 0.84, ...","[0.735, 0.95, 0.735, 0.78, 0.625, 0.65, 0.825,..."


In [5]:
import scipy.stats as stats

# Normal critical value used for the printed 95% interval half-width.
Z_95 = 1.96
# Significance threshold of the paired t-test that adds the "^*" marker.
ALPHA = 0.05


def signature_to_latex(signature):
    r"""Render a signature as a LaTeX product such as ``$\H{5, 1} \times \S{5, 1}$``.

    Each factor is a 2-tuple. A negative second value yields ``\H{first, -second}``,
    zero yields ``\E{first}`` and a positive value yields ``\S{first, second}``.
    The ``\H`` / ``\E`` / ``\S`` macros are presumably defined in the target
    LaTeX document.

    Raw strings are used for every backslash sequence: ``"\H"``, ``"\E"``, ``"\S"``
    and ``"\p"`` are invalid escapes in ordinary literals and trigger a
    DeprecationWarning (a SyntaxWarning from Python 3.12 on).
    """
    factors = []
    for dim, curvature in signature:
        if curvature < 0:
            factors.append(r"\H{" + f"{dim}, {-curvature}" + "}")
        elif curvature == 0:
            factors.append(r"\E{" + f"{dim}" + "}")
        else:
            factors.append(r"\S{" + f"{dim}, {curvature}" + "}")
    return "$" + r" \times ".join(factors) + "$"


def mean_ci_latex(scores):
    r"""Format ``scores`` as ``mean \pm half-width`` in percent with one decimal.

    The half-width is ``Z_95 * std / sqrt(n)``.
    NOTE(review): ``np.std`` defaults to ``ddof=0`` (population std); a sample
    standard error would use ``ddof=1`` and be slightly wider. Kept unchanged so
    the printed numbers stay identical to the previously reported ones.
    """
    mean_pct = np.mean(scores) * 100
    half_width_pct = np.std(scores) / np.sqrt(len(scores)) * Z_95 * 100
    return rf"{mean_pct:.1f} \pm {half_width_pct:.1f}"


# One LaTeX table row per signature: label & baseline DT & product-space DT.
# The label is read from the row itself, so it cannot get out of sync with the
# scores if the global `signatures` list is edited after the benchmark ran.
for row in results.itertuples(index=False):
    # Paired test: both score lists come from the same benchmark call.
    p_value = stats.ttest_rel(row.dt_score, row.psdt_score).pvalue
    marker = "^*" if p_value < ALPHA else ""
    cells = [
        signature_to_latex(row.signature),
        f"${mean_ci_latex(row.dt_score)}$",
        f"${mean_ci_latex(row.psdt_score)}{marker}$",
    ]
    print(" & ".join(cells) + r"\\")

$\H{5, 1} \times \H{5, 1}$ & $93.6 \pm 1.1$ & $97.3 \pm 0.5^*$\\
$\S{5, 1} \times \S{5, 1}$ & $62.2 \pm 1.6$ & $64.1 \pm 1.5^*$\\
$\H{5, 1} \times \S{5, 1}$ & $93.7 \pm 1.1$ & $97.3 \pm 0.5^*$\\
$\H{2, 1} \times \H{2, 1} \times \H{2, 1} \times \H{2, 1} \times \H{2, 1}$ & $76.9 \pm 1.9$ & $78.6 \pm 1.9^*$\\
$\S{2, 1} \times \S{2, 1} \times \S{2, 1} \times \S{2, 1} \times \S{2, 1}$ & $60.4 \pm 1.9$ & $60.6 \pm 1.9$\\
$\H{2, 1} \times \H{2, 1} \times \E{2} \times \S{2, 1} \times \S{2, 1}$ & $78.2 \pm 1.9$ & $79.5 \pm 1.9^*$\\


In [6]:
# RF version
import pandas as pd

# Same benchmark call as the single-tree run, but with rf=True and 20 seeds.
# The second positional argument is taken from NUM_POINTS instead of a repeated
# literal 1000 so the config constant actually controls it (presumably the
# number of sampled points; confirm against benchmarking.compute_scores_by_signature).
# NOTE: this rebinds rnd_seeds / psdt_scores_by_signature / dt_scores_by_signature;
# read scores from the `results_rf` DataFrame below rather than from these names.
rnd_seeds, psdt_scores_by_signature, dt_scores_by_signature = benchmarking.compute_scores_by_signature(
    signatures, NUM_POINTS, NUM_CLASSES, seed=0, max_depth=3, n_seeds=20, rf=True
)

# Collect one record per signature and build the DataFrame in a single step, so
# that `results_rf` is only ever bound to a DataFrame (never to a temporary list).
rf_rows = [
    {"signature": signature, "psdt_score": psdt_score, "dt_score": dt_score}
    for signature, psdt_score, dt_score in zip(
        signatures, psdt_scores_by_signature, dt_scores_by_signature
    )
]

results_rf = pd.DataFrame(rf_rows)
results_rf

  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [12:42<00:00,  6.36s/it]


Unnamed: 0,signature,psdt_score,dt_score
0,"[(5, -1), (5, -1)]","[0.97, 0.98, 0.915, 0.95, 0.985, 1.0, 1.0, 0.9...","[0.96, 0.905, 0.875, 0.875, 0.88, 0.855, 0.995..."
1,"[(5, 1), (5, 1)]","[0.675, 0.575, 0.585, 0.62, 0.5, 0.575, 0.675,...","[0.675, 0.58, 0.585, 0.63, 0.52, 0.58, 0.67, 0..."
2,"[(5, -1), (5, 1)]","[0.97, 0.98, 0.915, 0.95, 0.985, 1.0, 1.0, 0.9...","[0.89, 0.935, 0.895, 0.835, 0.875, 0.845, 0.98..."
3,"[(2, -1), (2, -1), (2, -1), (2, -1), (2, -1)]","[0.78, 0.95, 0.725, 0.805, 0.625, 0.735, 0.85,...","[0.745, 0.94, 0.715, 0.79, 0.625, 0.68, 0.775,..."
4,"[(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]","[0.705, 0.7, 0.51, 0.69, 0.49, 0.615, 0.62, 0....","[0.645, 0.725, 0.49, 0.705, 0.465, 0.61, 0.72,..."
5,"[(2, -1), (2, -1), (2, 0), (2, 1), (2, 1)]","[0.79, 0.95, 0.735, 0.805, 0.63, 0.74, 0.85, 0...","[0.765, 0.94, 0.73, 0.81, 0.575, 0.71, 0.795, ..."


In [7]:
import scipy.stats as stats

# Print the two random-forest columns (baseline & product-space) for every row
# of `results_rf`. No signature label and no trailing "\\" are printed here, so
# the per-row label is no longer built (it was computed and then discarded).
#
# NOTE(review): the half-width uses the normal critical value 1.96 and np.std's
# default ddof=0. With only 20 seeds a Student-t value (about 2.09) and ddof=1
# would give somewhat wider intervals; kept as-is so the printed numbers stay
# identical to the previously reported ones.
for row in results_rf.itertuples(index=False):
    cells = []
    for scores in (row.dt_score, row.psdt_score):
        mean_pct = np.mean(scores) * 100
        half_width_pct = np.std(scores) / np.sqrt(len(scores)) * 1.96 * 100
        # Raw f-string: "\p" is an invalid escape in an ordinary literal.
        cells.append(rf"{mean_pct:.1f} \pm {half_width_pct:.1f}")

    # Paired test: both score lists come from the same benchmark call.
    p_value = stats.ttest_rel(row.dt_score, row.psdt_score).pvalue
    marker = "^*" if p_value < .05 else ""

    print(f"${cells[0]}$ & ${cells[1]}{marker}$")

$94.0 \pm 1.9$ & $98.0 \pm 1.0^*$
$64.5 \pm 3.4$ & $64.1 \pm 3.6$
$92.5 \pm 2.1$ & $98.0 \pm 1.0^*$
$81.1 \pm 4.1$ & $82.0 \pm 4.1$
$62.4 \pm 4.6$ & $61.2 \pm 4.2$
$81.7 \pm 4.4$ & $82.7 \pm 4.0$
