In [1]:
import matplotlib
# Select the PDF backend; this call runs before matplotlib.pyplot is imported
# in the next cell.
matplotlib.use('pdf')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import os
# Apply matplotlib's 'default' style sheet to the global rc settings.
plt.style.use('default')

In [3]:
def human_format(x):
    """Format a number compactly using at most three significant digits.

    The value is scaled down by powers of 1000 and tagged with "k"
    (thousands), "M" (millions) or "B" (billions); values whose magnitude
    stays below 1000 after rounding get no suffix.

    The suffix is chosen *after* rounding to three significant digits, so a
    value that rounds up to 1000 of one unit is reported in the next unit
    (999999 -> "1M", not "1e+03k"). Negative values are scaled by magnitude.

    Args:
        x: A real number (int, float or numpy scalar).

    Returns:
        The formatted string, e.g. "42", "1.5k", "1.82M", "2.5B".
    """
    suffixes = ("", "k", "M", "B")
    scaled = x
    for suffix in suffixes[:-1]:
        # Round first: "{:.3g}" switches to exponent notation once the
        # rounded magnitude reaches 1000, which is the cue to move up a unit.
        rounded = float("{:.3g}".format(scaled))
        if abs(rounded) < 1000:
            return "{:.3g}{}".format(scaled, suffix)
        scaled = scaled / 1000
    # "B" is the largest unit available; anything bigger stays in billions.
    return "{:.3g}{}".format(scaled, suffixes[-1])
def get_algorithm(path):
    """Return the algorithm label for the first known tag found in ``path``.

    Tags are tested as plain substrings in a fixed order and the first hit
    wins. The tag "ic2" is reported as "tkdc"; every other tag is returned
    unchanged. Returns None when no tag occurs in ``path``.
    """
    tag_labels = (
        ("rkde", "rkde"),
        ("ic2", "tkdc"),
        ("sklearn", "sklearn"),
        ("nocut", "nocut"),
        ("simple", "simple"),
        ("ks", "ks"),
    )
    return next((label for tag, label in tag_labels if tag in path), None)
def get_dataset(path):
    """Return the dataset label for the first known pattern found in ``path``.

    Patterns are tested as plain substrings in the order listed below, so
    "tmy3_4" is tried before "tmy3.". Returns None when no pattern occurs
    in ``path``.
    """
    pattern_labels = (
        ("bgauss", "gauss"),
        ("hep", "hep"),
        ("tmy3_4", "tmy3_4"),
        ("tmy3.", "tmy3"),
        ("mnist_64", "mnist"),
        ("home", "home"),
    )
    for pattern, label in pattern_labels:
        if pattern in path:
            return label
    return None
# Matplotlib marker code used for each algorithm label in the plots.
markers = dict(
    tkdc="o",
    simple="^",
    sklearn="s",
    rkde="d",
    nocut="8",
    ks="*",
)

In [4]:
# Load the experiment results that the remaining cells analyse and plot.
df = pd.read_csv(
    filepath_or_buffer="../experiments/r_raiders/scale_r.csv",
)

In [5]:
# Every row in this file is labelled as an rkde run.
df["r_algorithm"] = "rkde"

# Dataset label: match on the output path first, then fall back to the
# dataset path for rows where the output path contains no known pattern.
# NOTE(review): the saved rows at the bottom of this notebook have out_path
# values like "./out/tmy3_t0.out", which match none of get_dataset's patterns,
# while their dataset column ("bigdata/otmy3.csv") matches "tmy3." — confirm
# which column is the intended source.
df["r_dataset"] = (
    df["out_path"].map(get_dataset)
    .fillna(df["dataset"].map(get_dataset))
)

# Query-only throughput.
# Presumably test_time/train_time are in milliseconds, since the plot labels
# the result "Queries / s" — TODO confirm.
df["throughput"] = (
    1000 * df["num_test"] / df["test_time"]
)

# Amortized throughput: the measured test time is extrapolated from num_test
# queries to num_train queries, and the training time is added on top.
df["a_throughput"] = (
    1000 * df["num_train"]
    / (df["test_time"]*df["num_train"]/df["num_test"] + df["train_time"])
)

# Relative deviation of the reported percentile from the reference value
# 4.2e-5 (the origin of that reference is not shown in this notebook).
df["error"] = np.abs(df["percentile"] - 4.2e-5) / (4.2e-5)

In [6]:
# Order rows by algorithm label and then by radius, so the radius series
# plotted later runs in increasing order.
df = df.sort_values(
    by=["r_algorithm", "radius"],
    ascending=True,
)

In [7]:
# Plot amortized rkde throughput against the radius cutoff on a log y-axis,
# together with a flat tkdc reference line, and save the figure as a PDF.
fig = plt.figure(figsize=(6,3.5),dpi=300)
ax = fig.add_subplot(111)
ax.semilogy(
    df["radius"],
    df["a_throughput"],
    marker=markers["rkde"],
    label="rkde",
)

# tkdc is drawn as a constant 35800 at x = 0..5.
# NOTE(review): 35800 is hard-coded and not derived anywhere in this
# notebook — confirm its source.
xs = np.arange(0,6,1)
ys = np.repeat(35800, len(xs))
ax.plot(xs, ys, marker=markers["tkdc"], label="tkdc")

ax.set_xlabel("Radius Cutoff (multiples of bandwidth)")
ax.set_ylabel("Queries / s")
# Training-set size and dimensionality are read from the first row.
ax.set_title("Adjusting Radial Distance: tmy3, n={}, d={}".format(
        human_format(df.num_train.iloc[0]),
        df.dim.iloc[0]))
ax.legend(loc=0, bbox_to_anchor = [1.0, 0.7])
ax.yaxis.grid(True, ls="dotted", alpha=0.5)
fig.tight_layout()
fig.savefig("r_scale.pdf")
# Close the figure instead of only clearing it: a cleared figure stays
# registered with pyplot, so each re-run of this cell would keep one more
# figure alive.
plt.close(fig)



In [47]:
# Show the full results table.
# NOTE(review): the saved output below appears to be stale — this cell's
# execution count is out of sequence with the cells above, and the table has
# an `acc` column that no visible cell creates. Re-run from a fresh kernel.
df

Unnamed: 0,algorithm,dataset,dim,num_kernels,num_test,num_train,out_path,percentile,radius,test_time,train_time,r_algorithm,r_dataset,throughput,a_throughput,acc,error
4,ic2,bigdata/otmy3.csv,4,710802848,10437,1822080,./out/tmy3_t0.out,5.094524e-07,0.459044,60012,3448,rkde,tmy3,173.915217,173.857999,0.98787,0.98787
0,ic2,bigdata/otmy3.csv,4,704743383,2714,1822080,./out/tmy3_t1.out,1.469739e-05,1.17741,60023,3438,rkde,tmy3,45.216001,45.212143,0.650062,0.650062
7,ic2,bigdata/otmy3.csv,4,675657781,1569,1822080,./out/tmy3_t2.out,3.943299e-05,1.794123,60012,3964,rkde,tmy3,26.144771,26.143284,0.061119,0.061119
5,ic2,bigdata/otmy3.csv,4,756396720,1470,1822080,./out/tmy3_t3.out,4.821091e-05,2.145966,60012,3458,rkde,tmy3,24.495101,24.493962,0.147879,0.147879
2,ic2,bigdata/otmy3.csv,4,698734077,1253,1822080,./out/tmy3_t4.out,5.009675e-05,2.447747,60018,3191,rkde,tmy3,20.87707,20.876307,0.19278,0.19278
6,ic2,bigdata/otmy3.csv,4,739760240,1194,1822080,./out/tmy3_t5.out,4.9798e-05,3.034854,60002,3208,rkde,tmy3,19.899337,19.89864,0.185667,0.185667
3,ic2,bigdata/otmy3.csv,4,714154475,1051,1822080,./out/tmy3_t6.out,4.503997e-05,3.716922,60046,3722,rkde,tmy3,17.503248,17.502622,0.07238,0.07238
1,ic2,bigdata/otmy3.csv,4,429092807,557,1822080,./out/tmy3_t7.out,4.554112e-05,4.798526,60059,3454,rkde,tmy3,9.274214,9.274051,0.084312,0.084312
