In [18]:
import matplotlib
# Select matplotlib's PDF backend; this runs before pyplot is imported in the next cell.
matplotlib.use('pdf')

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import os

In [20]:
def human_format(x):
    """Format a number compactly with a k/M/B magnitude suffix.

    Values below 1000 (including negatives) are printed as-is with three
    significant digits. Larger values are divided by 10**3, 10**6 or 10**9
    and given the suffix "k", "M" or "B" respectively, e.g. ``1500`` ->
    ``"1.5k"``.

    If rounding to three significant digits pushes the scaled value up to
    1000 (e.g. ``999999`` -> ``999.999`` -> ``1e+03``), the next suffix is
    used instead, so the result reads ``"1M"`` rather than ``"1e+03k"``.
    "B" is the largest suffix, so no promotion happens beyond it.

    Args:
        x: A real number (int or float).

    Returns:
        str: The formatted value.
    """
    suffixes = ("", "k", "M", "B")

    # Same thresholds as a plain magnitude lookup: pick the suffix tier first.
    if x < 1000:
        tier = 0
    elif x < 1000000:
        tier = 1
    elif x < 1000000000:
        tier = 2
    else:
        tier = 3

    # Tier 0 formats x itself so small values are passed through untouched.
    text = "{:.3g}".format(x if tier == 0 else x / 1000 ** tier)

    # "{:.3g}" switches to scientific notation once the rounded value hits
    # 1000; when a larger suffix exists, re-scale by one more tier instead.
    if tier < len(suffixes) - 1 and float(text) >= 1000:
        tier += 1
        text = "{:.3g}".format(x / 1000 ** tier)

    return text + suffixes[tier]
def get_algorithm(path):
    """Return the first known algorithm tag that occurs as a substring of ``path``.

    Tags are tried in a fixed priority order, so a path containing several
    tags yields the earliest one in that order. Returns ``None`` when no tag
    is found.
    """
    known_tags = ("rkde", "ic2", "sklearn", "nocut", "simple")
    return next((tag for tag in known_tags if tag in path), None)
def get_dataset(path):
    """Map a result path to a short dataset label.

    Each (substring, label) pair is tried in order and the label of the first
    substring found in ``path`` is returned; ``None`` when none is found.
    Note the trailing dot in ``"tmy3."``: that key is not a substring of
    ``tmy3_4`` names, so the two datasets stay distinct.
    """
    substring_labels = (
        ("bgauss", "gauss"),
        ("hep", "hep"),
        ("tmy3_4", "tmy3_4"),
        ("tmy3.", "tmy3"),
        ("mnist_64", "mnist"),
        ("home", "home"),
    )
    for substring, label in substring_labels:
        if substring in path:
            return label
    return None
# Matplotlib marker code used for each series name when plotting.
markers = dict(
    ic2="o",
    simple="^",
    sklearn="s",
    rkde="d",
    nocut="8",
    ks="*",
)

In [33]:
# Load the results table; the path is relative to the working directory.
df = pd.read_csv("../experiments/n_raiders/scale_n.csv")

In [34]:
# Tag each row with the algorithm and dataset names found in its output path
# (None where get_algorithm / get_dataset find no known substring).
df["r_algorithm"] = df["out_path"].map(get_algorithm)
df["r_dataset"] = df["out_path"].map(get_dataset)
# Test points processed per unit of test_time, scaled by 1000
# (presumably converting millisecond timings to per-second rates — TODO confirm units).
df["throughput"] = (
    1000 * df["num_test"] / df["test_time"]
)
# Amortized rate over num_train points: the denominator is the test time
# rescaled from num_test to num_train points, plus the training time.
df["a_throughput"] = (
    1000 * df["num_train"] 
    / (df["test_time"]*df["num_train"]/df["num_test"] + df["train_time"])
)

In [35]:
# Order rows by algorithm, then by training-set size, so each algorithm's
# rows are in increasing num_train order.
df = df.sort_values(by=["r_algorithm", "num_train"])

In [36]:
# Algorithms to plot, in the order they are drawn.
algorder = ["ic2", "sklearn", "simple", "rkde"]

In [37]:
# Sample sizes for reference curves: from 10**4 up to (excluding) 10**8 in steps of 10**6.
xs = np.arange(10**4, 10**8, 10**6)
# Reciprocal of each sample size; the float exponent yields a floating-point array.
ys = xs ** (-1.0)

In [38]:
# Query throughput vs. training-set size on log-log axes, one line per
# algorithm in `algorder`, plus n^-1 and n^-1/2 guide lines for comparing slopes.
matplotlib.rcParams.update({'font.size': 11})
fig = plt.figure(figsize=(6, 4), dpi=300)
ax = fig.add_subplot(111)

# NOTE(review): rows are filtered by algorithm only, while the title below says
# "gauss, d=2" — confirm the CSV holds a single dataset/dimension.
for alg in algorder:
    cur_df = df[df["r_algorithm"] == alg]
    ax.loglog(
        cur_df["num_train"],
        cur_df["throughput"],
        marker=markers[alg],
        linewidth=1.0,
        markersize=4.0,
        label=alg)

# Reference slopes; the constant factors only shift each guide vertically.
ax.loglog(xs, xs**(-1.0)*10**7, ls="dashed", label="$n^{-1}$",
          linewidth=1.0)
ax.loglog(xs, xs**(-0.5)*10**8, ls="dotted", label=r'$n^{-\frac{1}{2}}$',
          linewidth=1.0)

ax.set_xlabel("Dataset Size")
ax.set_ylabel("Query Throughput")
ax.set_title("Adjusting Data Size, gauss, d=2")
ax.legend(loc=0, ncol=2)
ax.set_xlim(8000, 10**8)
fig.tight_layout()
fig.savefig("n_scale.pdf")
# Close the figure rather than only clearing it: a cleared figure stays
# registered with pyplot, so re-running this cell would accumulate open figures.
plt.close(fig)

