In [1]:
import matplotlib
# Select the PDF backend up front; this runs before matplotlib.pyplot is
# imported in the next cell, and figures are written to disk with savefig.
matplotlib.use('pdf')

In [2]:
# NOTE(review): numpy, math and os are not referenced in the cells visible
# here; confirm whether later cells need them before removing.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import os
# Use matplotlib's built-in "default" style sheet for all figures below.
plt.style.use('default')

In [3]:
def human_format(x):
    """Format a number with three significant digits and a k/M/B suffix.

    Values whose rounded magnitude is below 1000 are returned without a
    suffix (e.g. ``"999"``); larger values are scaled by powers of 1000 and
    returned as ``"<digits> <suffix>"`` with suffix ``k``, ``M`` or ``B``
    (e.g. ``"1.5 k"``, ``"2.5 M"``).

    The suffix is chosen from the *rounded* value, so inputs that round up
    to a power of 1000 are promoted to the next suffix instead of falling
    into scientific notation (``999999`` -> ``"1 M"``, not ``"1e+03 k"``).
    Negative values are scaled by magnitude (``-5000`` -> ``"-5 k"``).
    Magnitudes of 1e12 and above stay in the ``B`` tier, as there is no
    larger suffix.

    Args:
        x: An int or float to format.

    Returns:
        The formatted string.
    """
    suffixes = ("", "k", "M", "B")
    tier = 0
    sigpart = x
    # Decide the tier after rounding to 3 significant digits: "{:.3g}" switches
    # to exponent notation once the rounded magnitude reaches 1000, which is
    # exactly the case that must move up one suffix.
    while (
        tier < len(suffixes) - 1
        and abs(float("{:.3g}".format(sigpart))) >= 1000
    ):
        tier += 1
        # Divide the original value in one step to avoid accumulating
        # floating-point error across repeated divisions.
        sigpart = x / 1000 ** tier
    if tier == 0:
        return "{:.3g}".format(sigpart)
    return "{:.3g} {}".format(sigpart, suffixes[tier])
def get_algorithm(path):
    """Return the first known algorithm name found as a substring of ``path``.

    Candidates are checked in a fixed order (rkde, ic2, sklearn, nocut,
    simple). Returns ``None`` when no candidate occurs in ``path``.
    """
    known = ("rkde", "ic2", "sklearn", "nocut", "simple")
    return next((name for name in known if name in path), None)
def get_dataset(path):
    """Return a short dataset label inferred from substrings of ``path``.

    Markers are tested in insertion order and the label of the first marker
    contained in ``path`` is returned; ``None`` is returned when no marker
    matches.

    The MNIST marker is the bare prefix ``"mnist"`` so that every
    per-dimension output file (``mnist_1.out`` ... ``mnist_256.out``) maps to
    ``"mnist"``, not only ``mnist_64``.
    """
    datasets = {
        "bgauss": "gauss",
        "hep": "hep",
        "tmy3_4": "tmy3_4",
        "tmy3.": "tmy3",
        "mnist": "mnist",
        "home": "home",
    }
    for marker, label in datasets.items():
        if marker in path:
            return label
    return None

In [4]:
# Load the dimension-scaling experiment results from disk.
df = pd.read_csv(filepath_or_buffer="../experiments/d_raiders/scale_d.csv")

In [5]:
# Show the loaded results table for a visual sanity check.
df

Unnamed: 0,algorithm,dataset,dim,num_kernels,num_test,num_train,out_path,test_time,train_time
0,ic2,bigdata/pmnist.csv,128,13761047,497,70000,./ic2/out/mnist_128.out,20027,245429
1,ic2,bigdata/pmnist.csv,4,161574342,97876,70000,./ic2/out/mnist_4.out,20001,1963
2,ic2,bigdata/pmnist.csv,64,23286200,1185,70000,./ic2/out/mnist_64.out,20010,77559
3,ic2,bigdata/pmnist.csv,32,54142113,2664,70000,./ic2/out/mnist_32.out,20002,37366
4,ic2,bigdata/pmnist.csv,2,261528485,1215929,70000,./ic2/out/mnist_2.out,20001,712
5,ic2,bigdata/pmnist.csv,1,291987663,6977570,70000,./ic2/out/mnist_1.out,20001,477
6,ic2,bigdata/pmnist.csv,16,82687978,4947,70000,./ic2/out/mnist_16.out,20003,18618
7,ic2,bigdata/pmnist.csv,8,123788131,11921,70000,./ic2/out/mnist_8.out,20001,8855
8,ic2,bigdata/pmnist.csv,256,9538099,316,70000,./ic2/out/mnist_256.out,20029,679610


In [6]:
# Recover algorithm and dataset labels from each run's output path.
df["r_algorithm"] = df["out_path"].map(get_algorithm)
df["r_dataset"] = df["out_path"].map(get_dataset)

# Test queries per unit of test_time, scaled by 1000
# (presumably ms -> s; confirm the unit of the timing columns).
df["throughput"] = df["num_test"].mul(1000).div(df["test_time"])

# Amortized throughput: extrapolate the measured test time to num_train
# queries, add the training time, and express the total as a rate.
df["a_throughput"] = df["num_train"].mul(1000).div(
    df["test_time"]
    .mul(df["num_train"])
    .div(df["num_test"])
    .add(df["train_time"])
)

In [11]:
# Order rows by algorithm and then dimension so each plotted line runs
# from the smallest to the largest dimension.
df = df.sort_values(["r_algorithm", "dim"])

In [9]:
# Algorithms to draw, in the order they are plotted and listed in the legend.
algorder = ["ic2"]

In [13]:
# Log-log plot of query throughput against dataset dimension, one line per
# algorithm listed in `algorder`, written to d_scale.pdf.
fig, ax = plt.subplots(figsize=(6, 4), dpi=300)
for alg in algorder:
    cur_df = df.loc[df["r_algorithm"] == alg]
    ax.loglog(cur_df["dim"], cur_df["throughput"], marker="o", label=alg)
ax.set(
    xlabel="Dataset Dimension",
    ylabel="Query Throughput",
    title="MNIST Dataset",
)
ax.legend(loc="best")
fig.tight_layout()
fig.savefig("d_scale.pdf")
# NOTE(review): clear() empties the figure but pyplot keeps tracking it;
# consider plt.close(fig) if the figure is not reused afterwards.
fig.clear()

