In [2]:
import matplotlib
# Select the 'PDF' backend here, ahead of the matplotlib.pyplot import in the next cell.
matplotlib.use('PDF')

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import pickle

In [4]:
# Apply the 'grayscale' style sheet to the figures created in the cells below.
plt.style.use('grayscale')

In [5]:
def human_format(x):
    """Format a number with three significant digits and a k/M/B suffix.

    Values that display below 1000 are returned without a suffix
    (``5 -> "5"``). Larger values are scaled by powers of 1000 and returned
    as ``"<digits> <suffix>"`` (``1500 -> "1.5 k"``, ``2500000 -> "2.5 M"``,
    ``3e9 -> "3 B"``).

    The suffix is chosen from the value *after* rounding to three significant
    digits. Inputs just under a unit boundary are therefore promoted to the
    next unit (``999999 -> "1 M"``) instead of being rendered in scientific
    notation (``"1e+03 k"``).

    "B" is the largest suffix, so values that round to 1000 B or more keep
    the "B" suffix and fall back to ``.3g`` formatting. Negative inputs are
    never scaled and are formatted with ``.3g`` alone.

    Parameters
    ----------
    x : int or float
        The number to format.

    Returns
    -------
    str
        The formatted number.
    """
    suffixes = ("", "k", "M", "B")
    value = float(x)
    tier = 0
    # Test the *rounded* value: 999.999 rounds to 1000 at three significant
    # digits and must move up one unit rather than print as "1e+03".
    while tier < len(suffixes) - 1 and float("{:.3g}".format(value)) >= 1000:
        value /= 1000.0
        tier += 1
    digits = "{:.3g}".format(value)
    if not suffixes[tier]:
        return digits
    return "{} {}".format(digits, suffixes[tier])

# Throughput

In [16]:
# Load the throughput measurements from ./throughput.csv and preview the first two rows.
throughput = pd.read_csv("./throughput.csv")
throughput.head(2)

Unnamed: 0,algorithm,dataset,dimension,dataset size,num_scored,train,score,effective_time,throughput
0,ic2,energy,1,500000,500000,2.68,1.62,4.3,116279.0698
1,ic2,energy,2,500000,500000,2.02,0.91,2.93,170648.4642


In [17]:
# Algorithms and datasets drawn in the throughput figure; the names are compared
# against the `algorithm` and `dataset` columns of `throughput`.
# NOTE: both names are rebound in later sections of this notebook, so re-run this
# cell before re-running the throughput plot.
algorithms = ["ic2", "ks", "sklearn", "naive"]
datasets = ["energy", "home"]

In [18]:
# Marker and line style for each entry of `algorithms` (looked up by the same index).
markers = ["o","^","s","."]
linestyles = ["-","--","-","-"]

In [1]:
# Throughput vs. dimensionality: one log-scale panel per dataset, one line per algorithm.
# The panel count and both loops follow the lists themselves instead of the
# hard-coded 2 and 4, so editing `datasets` / `algorithms` cannot desynchronise them.
assert len(algorithms) == len(markers) == len(linestyles), \
    "algorithms, markers and linestyles must have one entry per algorithm"

fig, axs = plt.subplots(nrows=len(datasets), figsize=(7, 8), dpi=300)

# np.atleast_1d keeps the loop valid when there is a single dataset and
# plt.subplots returns a bare Axes instead of an array.
for ax, d in zip(np.atleast_1d(axs), datasets):
    for a, m, ls in zip(algorithms, markers, linestyles):
        data = throughput[(throughput.dataset == d) & (throughput.algorithm == a)]
        ax.semilogy(data.dimension, data.throughput,
                    label=a, marker=m, linestyle=ls)
    ax.set_xlim(0.5, 8.5)
    ax.yaxis.grid(True)
    ax.set_xlabel("Dataset Dimensionality")
    ax.set_ylabel("Throughput (pts / s)")
    ax.set_title("Data: {}".format(d))
    ax.legend(loc=0, ncol=2)
fig.tight_layout()
fig.savefig("../figures/throughput_mpl.pdf")
fig.clear()

NameError: name 'plt' is not defined

In [None]:
        # NOTE(review): this cell starts indented and uses `d`, `a`, `data` and `ax`, which are
        # only bound inside the loops of the throughput plot cell. It looks like a fragment
        # lifted out of that inner loop rather than a standalone cell -- TODO confirm.
        # For every non-'naive' algorithm it labels each plotted point with the ratio of
        # that algorithm's throughput to the 'naive' throughput at the same dimension.
        baseline = throughput[(throughput.dataset == d) & (throughput.algorithm == 'naive')]

        if a != 'naive':
            for curdim in data.dimension:
                cur_throughput = data[data.dimension == curdim].throughput.values[0]
                base_throughput = baseline[baseline.dimension == curdim].throughput.values[0]
                curratio = cur_throughput / base_throughput
                ax.annotate(
                    human_format(curratio),
                    (curdim,cur_throughput),
                    xytext=(3, 3), textcoords='offset points')

# Accuracy

In [22]:
# Load the accuracy results from ./rawacc_2.csv and preview the first two rows.
acc = pd.read_csv("./rawacc_2.csv")
acc.head(2)

Unnamed: 0,algorithm,dataset,dimension,true_below,est_below,tp_below,precision,recall,f_score
0,sklearn_t0,energy,2,500,500,500,1.0,1.0,1.0
1,sklearn_t0,energy,4,500,500,500,1.0,1.0,1.0


In [108]:
# Configuration for the accuracy bar chart in the next cell:
# - algorithms / datasets: values selected from the `algorithm` / `dataset` columns of `acc`
# - colors: bar colour per dataset (looked up by the same index as `datasets`)
# - dimensions: the `dimension` values shown in each subplot (one inner list per subplot)
# NOTE: this rebinds `algorithms` and `datasets` from the Throughput section.
algorithms = ["sklearn", "ic2", "ks"]
datasets = ["energy", "home", "shuttle"]
colors = ["0.2", "0.7", "1.0"]
dimensions = [[2], [4], [7,8]]

In [112]:
# F-score bar chart: one panel per entry of `dimensions`, bars grouped by algorithm,
# one bar per dataset inside each group.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(7, 8), dpi=300)
# Vertical spacing is set by hand; this is the layout the saved PDF uses.
fig.subplots_adjust(hspace=0.5)

# Group g (the g-th algorithm) starts at x = 5 * g; dataset number `si` is
# shifted `si` units to the right inside every group.
group_start = {name: 5 * gi for gi, name in enumerate(algorithms)}

for di, (ax, dims) in enumerate(zip(axs, dimensions)):
    for si, dname in enumerate(datasets):
        data = acc[(acc.dataset == dname) & (acc.dimension.isin(dims))]
        data = data[data.algorithm.isin(algorithms)]
        # Place every bar by the algorithm named in its own row. The previous
        # version paired rows with x positions by row order and special-cased
        # "two rows" as "last algorithm missing", which mislabels bars whenever
        # the CSV order differs from `algorithms` or a different one is absent.
        xlocs = data.algorithm.map(group_start).values + si
        # The x positions are passed positionally because the `left=` keyword is
        # not accepted by current matplotlib. align='edge' is explicit so the
        # bars keep the left-edge placement that the +1.5 tick offset below is
        # tuned for, whatever the installed version's default alignment is.
        ax.bar(xlocs, data.f_score.values, align='edge',
               label=dname, color=colors[si])
    ax.set_xticks(np.arange(0, len(algorithms)) * 5 + 1.5)
    ax.set_xlim(-0.3 * 5, 3 * 5)
    ax.set_xticklabels(algorithms)
    if di == 0:
        ax.set_ylim(0.90, 1.0)
    if di == 2:
        ax.set_ylim(0.80, 1.0)
        ax.legend(loc=0)
    ax.yaxis.grid(True)
    ax.set_xlabel("Algorithm")
    ax.set_ylabel("F-score")
    ax.set_title("Dimensions: {}".format(dims))
# A plt.tight_layout() call used to follow savefig; it ran after the file was
# written and right before fig.clear(), so it never affected anything and is dropped.
fig.savefig("../figures/rawacc_mpl.pdf")
fig.clear()



# Scalability

In [15]:
# Load the runtime-vs-dimensionality measurements from ./scale_dim.csv and display them.
# NOTE: this rebinds `dimensions`, which the Accuracy section defines as a list of
# dimension groups; re-run that section's configuration cell before re-running its plot.
dimensions = pd.read_csv("./scale_dim.csv")
dimensions

Unnamed: 0,dim,train,score,data
0,1,2.72,1.11,energy500k
1,2,1.93,1.1,
2,3,2.29,1.76,
3,4,2.89,3.71,
4,5,4.13,5.31,
5,6,5.78,7.48,
6,7,9.57,10.06,
7,8,10.07,12.38,


In [19]:
# Training and scoring runtime as a function of dataset dimensionality.
fig, ax = plt.subplots(figsize=(6, 3), dpi=300)
# Each line carries a label: ax.legend() below only lists labelled artists, and
# without labels it produced an empty legend.
ax.plot(dimensions["dim"], dimensions["train"],
        marker='s', linestyle='--', label="Training")
ax.plot(dimensions["dim"], dimensions["score"],
        marker='o', label="Scoring")
ax.set_xlim(0.5, 8.5)
ax.yaxis.grid(True)
ax.set_xlabel("Dataset Dimensionality")
ax.set_ylabel("Runtime (s)")
ax.set_title("US Energy 500k Dataset")
ax.legend(loc=2)
fig.tight_layout()
fig.savefig("../figures/scalability_dim.pdf")
fig.clear()



In [6]:
# Load the runtime-vs-dataset-size measurements and display them.
# NOTE(review): this file is read from ../results/, whereas the other CSVs in this
# notebook are read from the current directory -- confirm the path is intended.
size = pd.read_csv("../results/scale_size.csv")
size

Unnamed: 0,Dataset Size,train,score
0,100000,1.78,0.389
1,200000,1.85,1.0
2,300000,2.78,1.96
3,400000,3.73,2.84
4,500000,3.8,4.0
5,700000,4.51,7.22
6,1000000,6.22,12.0


In [10]:
# Dataset size in units of 100,000 points, as an integer; used as the x values of the plot below.
size["size100"] = (size["Dataset Size"] / 100000).astype(int)

In [13]:
# Training and scoring runtime as a function of dataset size (in units of 100k points).
fig, ax = plt.subplots(figsize=(6, 3), dpi=300)
# Each line carries a label: ax.legend() below only lists labelled artists, and
# without labels it produced an empty legend.
ax.plot(size["size100"], size["train"],
        marker='s', linestyle='--', label="Training")
ax.plot(size["size100"], size["score"],
        marker='o', label="Scoring")
ax.set_xlim(0, 11)
ax.set_ylim(0, 13)
ax.yaxis.grid(True)
ticklocs = np.arange(1, 11, 1)
ax.xaxis.set_ticks(ticklocs)
# Build a real list: under Python 3 `map` returns a one-shot iterator, while
# set_ticklabels expects a sequence of labels.
ax.xaxis.set_ticklabels([human_format(t) for t in ticklocs])
ax.set_xlabel("Dataset Size (100k Points)")
ax.set_ylabel("Runtime (s)")
ax.set_title("US Energy 4d Dataset")
ax.legend(loc=2)
fig.tight_layout()
fig.savefig("../figures/scalability_size.pdf")
fig.clear()



# Impact of Quantile Estimation Precision

In [20]:
# Load the per-quantile timing and throughput measurements from ./pscaling.csv and display them.
quantiles = pd.read_csv("./pscaling.csv")
quantiles

Unnamed: 0,p,Num Scored,Training Time,Scoring Time,Throughput
0,0.01,500000,4.5,4.23,57273.76861
1,0.05,500000,5.52,11.5,29377.20329
2,0.1,500000,5.02,23.04,17818.95937
3,0.2,500000,13.81,55.57,7206.687806
4,0.3,500000,17.34,100.07,4258.581041
5,0.4,500000,28.51,143.68,2903.769092
6,0.5,500000,30.95,116.08,3400.666531
7,0.6,500000,69.0,181.6,1995.211492
8,0.7,500000,159.8,158.9,1568.873549
9,0.8,500000,170.4,170.0,1468.860165


In [23]:
# Throughput as a function of the threshold quantile p, on a log-scale y axis.
fig, ax = plt.subplots(figsize=(6, 3), dpi=300)

# Measured series, labelled "IC2".
p_values = quantiles["p"]
measured_throughput = quantiles["Throughput"]
ax.semilogy(p_values, measured_throughput, marker='o', label="IC2")

# Dashed horizontal reference line at 65 pts / s from p = 0.01 to p = 1,
# labelled "sk-learn".
reference_p = [0.01, 1]
reference_throughput = [65, 65]
ax.semilogy(reference_p, reference_throughput, linestyle="--", label="sk-learn")

# Axis limits, grid and text.
ax.set_ylim(10, 2 * 10**5)
ax.yaxis.grid(True)
ax.set_xlabel("Threshold Quantile p")
ax.set_ylabel("Throughput (pts / s)")
ax.set_title("Energy Dataset, 500k points, Throughput vs Quantile")
ax.legend(loc=0, ncol=2)

# Lay out, write the PDF, then empty the figure.
fig.tight_layout()
fig.savefig("../figures/pscaling.pdf")
fig.clear()



# Probability Density Estimation

In [115]:
# Load the per-algorithm MSE results from ./pacc.csv.
pacc = pd.read_csv("./pacc.csv")

In [116]:
# Display the whole table.
pacc

Unnamed: 0,algorithm,params,MSE
0,kde,bw=0.266,6.4e-07
1,kde,bw=0.1,3.98e-06
2,kde,bw=0.4,2.54e-06
3,histogram,bw=0.35,1.520558e-05
4,knn,k=10,1.190581e-05
5,knn,k=350,3.339049e-07
6,knn,k=5000,2.503313e-06


In [117]:
# Algorithms shown in the MSE plot below; values of the `algorithm` column of `pacc`.
# NOTE: this rebinds `algorithms` once more (it is also set in the Throughput and Accuracy sections).
algorithms = ["histogram", "kde", "knn"]

In [None]:
# MSE of the pdf estimates: one group of bars per algorithm, one bar per
# parameter setting of that algorithm.
#
# Fixes relative to the previous draft of this cell, which could not run:
# - plt.subplots(nrows=1, ncols=1) returns a single Axes, so it is used directly
#   instead of being indexed with [0];
# - enumerate() is given `algorithms`;
# - the column is `MSE` (not `mse`), taken from the per-algorithm rows in `data`;
# - `si` / `dname` (stale names from the accuracy cell) are replaced by this
#   loop's own index and algorithm name;
# - x positions are passed positionally (`left=` is not accepted by current matplotlib);
# - the layout is tightened before the figure is saved rather than after.
fig, ax = plt.subplots(nrows=1, ncols=1, dpi=300)

group_centers = []
for ai, alg in enumerate(algorithms):
    data = pacc[pacc.algorithm == alg]
    # Group `ai` starts at x = 5 * ai; its parameter settings sit one unit apart.
    xlocs = ai * 5 + np.arange(len(data))
    mse_values = data.MSE.values
    # `colors` is the list defined in the Accuracy section; wrap around in case
    # there are more algorithms than colours.
    ax.bar(xlocs, mse_values, label=alg, color=colors[ai % len(colors)])
    # Write the parameter setting above each bar so bars within a group can be told apart.
    for x, mse, params in zip(xlocs, mse_values, data.params.values):
        ax.annotate(params, (x, mse), xytext=(0, 3),
                    textcoords='offset points', ha='center', fontsize=6)
    # Tick under the middle of the group, however many bars it has.
    group_centers.append(ai * 5 + max(len(data) - 1, 0) / 2.0)

ax.set_xticks(group_centers)
ax.set_xlim(-0.3 * 5, 3 * 5)
ax.set_xticklabels(algorithms)
ax.yaxis.grid(True)
ax.set_xlabel("Algorithm")
ax.set_ylabel("MSE")
ax.set_title("MSE of pdf Estimates")
ax.legend(loc=0)
fig.tight_layout()
fig.savefig("../figures/pdfacc.pdf")
fig.clear()