In [1]:
import matplotlib
matplotlib.use('pdf')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import os
plt.style.use('default')

In [3]:
def human_format(x):
    """Format a number with three significant digits and a k/M/B suffix.

    Values that print below 1000 are returned without a suffix
    (e.g. ``"500"``); larger values are scaled by a power of 1000 and
    followed by a space and ``k``, ``M`` or ``B`` (e.g. ``"1.5 k"``).

    The suffix is chosen from the value *as it will be printed*: a number such
    as 999999 rounds to 1000 at three significant digits, which ``{:.3g}``
    would render as ``"1e+03 k"``.  Such values are promoted to the next
    suffix instead (``"1 M"``).  ``B`` is the largest suffix, so values of
    1e12 and above are still expressed in billions.

    Args:
        x: The number to format.

    Returns:
        The formatted string.
    """
    suffixes = ["", "k", "M", "B"]
    tier = 0
    sigpart = x
    # Step up one suffix at a time while the rounded mantissa would still
    # need four digits; stop at the last suffix regardless.
    while tier < len(suffixes) - 1 and float("{:.3g}".format(sigpart)) >= 1000:
        tier += 1
        # Divide the original value once (not repeatedly) so the mantissa
        # matches a direct x / 1000, x / 1000000, ... computation.
        sigpart = x / 1000 ** tier
    if tier == 0:
        return "{:.3g}".format(x)
    return "{:.3g} {}".format(sigpart, suffixes[tier])
def get_algorithm(path):
    """Return the first known algorithm name that occurs in ``path``.

    Candidates are tried in a fixed order and matched as plain substrings, so
    when several names occur in the path the earliest one in the list wins.
    Returns ``None`` when no candidate occurs in ``path``.
    """
    known_algorithms = ("rkde", "ic2", "sklearn", "nocut", "simple")
    return next((name for name in known_algorithms if name in path), None)
def get_dataset(path):
    """Return the dataset label for the first known marker found in ``path``.

    Each marker is matched as a plain substring, in the order listed below;
    the first hit decides the label.  Returns ``None`` when no marker occurs
    in ``path``.
    """
    marker_to_label = (
        ("bgauss", "gauss"),
        ("hep", "hep"),
        ("tmy3_4", "tmy3_4"),
        ("tmy3.", "tmy3"),
        ("mnist_64", "mnist"),
        ("home", "home"),
    )
    for marker, label in marker_to_label:
        if marker in path:
            return label
    return None

In [4]:
def get_p(path):
    """Parse the integer ``p`` from a path shaped like ``..._<p>.out``.

    The number is taken from between the last underscore that precedes
    ``.out`` and the ``.out`` itself.  Anchoring on the *last* underscore
    (rather than the first one in the whole path) keeps the parse correct when
    a directory or dataset name contains underscores, e.g. ``tmy3_4``.  With
    the first underscore such paths either fail or are silently mis-read,
    because ``int`` accepts underscores as digit separators
    (``int("4_10") == 410``).

    Args:
        path: Output path containing ``_<p>.out``.

    Returns:
        The parsed integer.

    Raises:
        ValueError: If ``.out`` is missing, no underscore precedes it, or the
            text between them is not an integer.
    """
    dot_idx = path.index(".out")
    underscore_idx = path.rindex("_", 0, dot_idx)
    return int(path[underscore_idx+1:dot_idx])

In [7]:
# Load the per-run results table.  Later cells read its out_path, num_test,
# num_train, test_time, train_time and dim columns.
df = pd.read_csv("../experiments/p_raiders/scale_p.csv")

In [8]:
# Annotate each run with fields parsed from its output path and with derived
# throughput metrics.  Columns are added in place, in the order below.
out_paths = df["out_path"]
df["r_algorithm"] = out_paths.map(get_algorithm)
df["r_dataset"] = out_paths.map(get_dataset)

# Test queries per 1000 units of test_time
# (presumably times are in milliseconds, i.e. queries/second -- TODO confirm).
df["throughput"] = 1000 * df["num_test"] / df["test_time"]

# Same rate, but with test_time rescaled from num_test to num_train queries
# and train_time added to the denominator.
train_count = df["num_train"]
rescaled_test_time = df["test_time"] * train_count / df["num_test"]
df["a_throughput"] = 1000 * train_count / (rescaled_test_time + df["train_time"])

df["p"] = out_paths.map(get_p)

In [13]:
# Order rows by algorithm and then by p, so that each algorithm's rows are
# contiguous and in increasing-p order when they are plotted.
df = df.sort_values(by=["r_algorithm", "p"])

In [18]:
# Algorithms drawn as measured curves by the plotting cell, in legend order.
algorder = ["ic2"]

In [20]:
# Plot query throughput against the quantile threshold (p / 100) for every
# algorithm in `algorder`, add flat reference lines for the "simple" and
# sklearn baselines, and save the figure to p_scale.pdf.
fig, ax = plt.subplots(figsize=(6, 4), dpi=300)
for alg in algorder:
    cur_df = df[df["r_algorithm"] == alg]
    ax.semilogy(
        cur_df["p"]/100,
        cur_df["throughput"],
        marker="o",
        label=alg)

# x positions 0.0, 0.1, ..., 1.0 used to draw the horizontal reference lines.
xs = np.arange(0,1.1,.1)

# Baseline: mean throughput over the "simple" runs.  The label is spelled out
# here; taking it from the loop variable `alg` would repeat the name of the
# last algorithm in `algorder` in the legend.
baseline_alg = "simple"
cur_df = df[df["r_algorithm"] == baseline_alg]
ax.semilogy(
    xs,
    np.repeat(cur_df["throughput"].mean(), len(xs)),
    label=baseline_alg,
    ls="--"
)

# Hard-coded sklearn reference throughput; its source is not recorded in this
# notebook -- TODO confirm where the value comes from.
SKLEARN_THROUGHPUT = 11.3
ax.semilogy(
    xs,
    np.repeat(SKLEARN_THROUGHPUT, len(xs)),
    label="sklearn",
    ls="-."
)

ax.set_xlabel("Quantile Threshold")
ax.set_ylabel("Query Throughput")
# n and d are read from the first row of the (sorted) results table.
ax.set_title("Adjusting Threshold: tmy3, n={}, d={}".format(
        human_format(df.num_train.iloc[0]),
        df.dim.iloc[0]))
ax.legend(loc=0)
ax.grid(True, ls="dotted", alpha=0.5)
fig.tight_layout()
fig.savefig("p_scale.pdf")
fig.clear()



In [33]:
# Show the parsed and derived per-run columns for inspection.
df[["r_algorithm", "p", "train_time", "test_time", "throughput", "a_throughput"]]

Unnamed: 0,r_algorithm,p,train_time,test_time,throughput,a_throughput
4,ic2,1,10072,60001,51183.663606,39895.89989
2,ic2,5,13934,60001,13777.620373,12464.355286
3,ic2,10,13777,60001,8657.889035,8125.93676
10,ic2,20,23152,60005,2286.059495,2221.529637
7,ic2,30,65996,60001,840.852652,816.000695
0,ic2,40,120613,60053,432.517942,420.479356
1,ic2,50,336867,60090,180.512564,174.682831
8,ic2,60,814921,60003,78.312751,75.662652
5,ic2,70,398399,60030,51.940696,51.357437
9,ic2,80,439092,60140,32.357832,32.107467
