In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Set geomstats background
# export GEOMSTATS_BACKEND=numpy
import os
os.environ['GEOMSTATS_BACKEND'] = 'numpy'

import numpy as np
import hyperdt.benchmarking as benchmarking
from hyperdt.product_space_DT import ProductSpace, ProductSpaceDT
from hyperdt.forest import ProductSpaceRF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

INFO: Using numpy backend


In [3]:
NUM_POINTS = 1000
NUM_CLASSES = 4
signatures = [
    # [(3, -1.)],
    # [(3, 1.)],
    # [(3, -1.), (4, -.5)],
    # [(3, -1.), (4, 1.)],
    # [(3, -1.), (4, 1.), (5, 0.)],
    # [(3, -1.), (4, -.5), (3, .5), (4, 1.)],
    # [(3, -1.), (4, -.5), (3, .5), (4, 1.), (5, 0.)],
    [(5,-1), (5,-1)],
    [(5, 1), (5, 1)],
    [(5, -1), (5, 1)],
    [(2,-1), (2,-1), (2,-1), (2,-1), (2,-1)],
    [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)],
    [(2, -1), (2, -1), (2, 0), (2, 1), (2, 1)]
    # [(10,-1)],
    # [(10, 1)],
    # [(10, 0)],
]


In [6]:
import pandas as pd
from tqdm import tqdm

results = []

rnd_seeds, psdt_scores_by_signature, dt_scores_by_signature = benchmarking.compute_scores_by_signature(
    signatures, 1000, NUM_CLASSES, seed=seed, max_depth=3, n_seeds=100)
for signature, psdt_score, dt_score in zip(signatures, psdt_scores_by_signature, dt_scores_by_signature):
    results.append({
        "seed": seed,
        "signature": signature,
        "psdt_score": psdt_score,
        "dt_score": dt_score
    })

results = pd.DataFrame(results)
results

NameError: name 'seed' is not defined

In [37]:
import scipy.stats as stats

# for signature, psdt_scores, dt_scores in zip(signatures, psdt_scores_by_signature, dt_scores_by_signature):
for signature, (i, row) in zip(signatures, results.iterrows()):
    dt_scores = row["dt_score"]
    psdt_scores = row["psdt_score"]
    s = []
    for component in signature:
        if component[1] < 0:
            s.append("\H{" + f"{component[0]}, {-component[1]}" + "}")
        elif component[1] == 0:
            s.append("\E{" + f"{component[0]}" + "}")
        else:
            s.append("\S{" + f"{component[0]}, {component[1]}" + "}")
    print("$" + " \\times ".join(s) + "$", end=" & ")
    print(f"${np.mean(dt_scores) * 100:.1f} \pm {np.std(dt_scores) / np.sqrt(len(dt_scores)) * 1.96 * 100 :.1f}$", end=" & ")
    print(f"${np.mean(psdt_scores) * 100:.1f} \pm {np.std(psdt_scores) / np.sqrt(len(psdt_scores)) * 1.96 * 100 :.1f}", end="")

    t, p = stats.ttest_rel(dt_scores, psdt_scores)
    if p < .05:
        print("^*$", end="")
    else:
        print("$", end="")

    print("\\\\")

$\H{5, 1} \times \H{5, 1}$ & $93.6 \pm 1.1$ & $97.3 \pm 0.5^*$\\
$\S{5, 1} \times \S{5, 1}$ & $62.2 \pm 1.6$ & $64.1 \pm 1.5^*$\\
$\H{5, 1} \times \S{5, 1}$ & $93.7 \pm 1.1$ & $97.3 \pm 0.5^*$\\
$\H{2, 1} \times \H{2, 1} \times \H{2, 1} \times \H{2, 1} \times \H{2, 1}$ & $76.9 \pm 1.9$ & $78.6 \pm 1.9^*$\\
$\S{2, 1} \times \S{2, 1} \times \S{2, 1} \times \S{2, 1} \times \S{2, 1}$ & $60.4 \pm 1.9$ & $60.6 \pm 1.9$\\
$\H{2, 1} \times \H{2, 1} \times \E{2} \times \S{2, 1} \times \S{2, 1}$ & $78.2 \pm 1.9$ & $79.5 \pm 1.9^*$\\


In [4]:
# RF version
import pandas as pd

results_rf = []

rnd_seeds, psdt_scores_by_signature, dt_scores_by_signature = benchmarking.compute_scores_by_signature(
    signatures, 1000, NUM_CLASSES, seed=0, max_depth=3, n_seeds=20, rf=True
)
for signature, psdt_score, dt_score in zip(signatures, psdt_scores_by_signature, dt_scores_by_signature):
    results_rf.append({"signature": signature, "psdt_score": psdt_score, "dt_score": dt_score})

results_rf = pd.DataFrame(results_rf)
results_rf

100%|██████████| 120/120 [10:42<00:00,  5.35s/it]


Unnamed: 0,signature,psdt_score,dt_score
0,"[(5, -1), (5, -1)]","[0.975, 0.99, 0.865, 0.925, 0.975, 0.97, 0.995...","[0.875, 0.835, 0.84, 0.835, 0.88, 0.845, 0.96,..."
1,"[(5, 1), (5, 1)]","[0.685, 0.485, 0.535, 0.665, 0.525, 0.555, 0.6...","[0.66, 0.475, 0.535, 0.63, 0.455, 0.465, 0.675..."
2,"[(5, -1), (5, 1)]","[0.975, 0.99, 0.865, 0.95, 0.975, 0.97, 0.995,...","[0.885, 0.88, 0.85, 0.86, 0.87, 0.875, 0.92, 0..."
3,"[(2, -1), (2, -1), (2, -1), (2, -1), (2, -1)]","[0.73, 0.94, 0.69, 0.8, 0.65, 0.7, 0.83, 0.825...","[0.715, 0.935, 0.705, 0.795, 0.54, 0.665, 0.70..."
4,"[(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]","[0.585, 0.73, 0.445, 0.645, 0.435, 0.545, 0.6,...","[0.65, 0.69, 0.45, 0.715, 0.44, 0.575, 0.61, 0..."
5,"[(2, -1), (2, -1), (2, 0), (2, 1), (2, 1)]","[0.785, 0.94, 0.73, 0.8, 0.65, 0.74, 0.82, 0.8...","[0.73, 0.94, 0.635, 0.78, 0.545, 0.71, 0.765, ..."


In [7]:
import scipy.stats as stats

# for signature, psdt_scores, dt_scores in zip(signatures, psdt_scores_by_signature, dt_scores_by_signature):
for signature, (i, row) in zip(signatures, results_rf.iterrows()):
    dt_scores = row["dt_score"]
    psdt_scores = row["psdt_score"]
    s = []
    for component in signature:
        if component[1] < 0:
            s.append("\H{" + f"{component[0]}, {-component[1]}" + "}")
        elif component[1] == 0:
            s.append("\E{" + f"{component[0]}" + "}")
        else:
            s.append("\S{" + f"{component[0]}, {component[1]}" + "}")
    # print("$" + " \\times ".join(s) + "$", end=" & ")
    print(f"${np.mean(dt_scores) * 100:.1f} \pm {np.std(dt_scores) / np.sqrt(len(dt_scores)) * 1.96 * 100 :.1f}$", end=" & ")
    print(f"${np.mean(psdt_scores) * 100:.1f} \pm {np.std(psdt_scores) / np.sqrt(len(psdt_scores)) * 1.96 * 100 :.1f}", end="")

    t, p = stats.ttest_rel(dt_scores, psdt_scores)
    if p < .05:
        print("^*$", end="")
    else:
        print("$", end="")

    # print("\\\\")
    print()

$90.2 \pm 3.2$ & $97.0 \pm 1.4^*$
$60.3 \pm 4.4$ & $61.1 \pm 4.0$
$90.3 \pm 2.5$ & $97.1 \pm 1.4^*$
$78.0 \pm 4.7$ & $80.8 \pm 4.2^*$
$58.2 \pm 4.4$ & $58.3 \pm 4.7$
$78.1 \pm 4.8$ & $81.6 \pm 3.9^*$
