In [1]:
%matplotlib inline

In [2]:
import numpy as np
import sklearn
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time
import pickle

from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)
from sklearn.preprocessing import (
    RobustScaler
)

In [12]:
# Root directory of the experiment outputs.
expdir = "../../experiments/smallacc"

# Dataset identifiers, each of the form "<name>_d<dim>".
datasets = [
    "energy_d2", "energy_d4", "energy_d8",
    "home_d2", "home_d4", "home_d8",
    "shuttle_d2", "shuttle_d4", "shuttle_d7",
]

# Algorithm label -> (path prefix, file-name suffix). A suffix of None marks
# an algorithm that has no suffix component.
algorithms = dict([
    ("ks_binned", ("ks", "bTRUE")),
    ("ks_unbinned", ("ks", "bFALSE")),
    ("sklearn_t0", ("sklearn", "t0.0")),
    ("sklearn_t1", ("sklearn", "t0.1")),
    ("tkde", ("tkde", None)),
])

# Two-level table addressed as csvs[algorithm][dataset]; every slot starts
# out as None. Each algorithm gets its own inner dict.
csvs = {label: dict.fromkeys(datasets) for label in algorithms}

In [48]:
# Fill csvs[algorithm][dataset] with one column read from each score file.
for alg_name, (tool, variant) in algorithms.items():
    for dataset in datasets:
        # The file name carries the variant suffix only when one is defined.
        template = (
            "{expdir}/{prefix}/scores/{dataset}.csv"
            if variant is None
            else "{expdir}/{prefix}/scores/{dataset}_{suffix}.csv"
        )
        # str.format ignores keyword arguments the chosen template does not use.
        path = template.format(
            expdir=expdir,
            prefix=tool,
            dataset=dataset,
            suffix=variant,
        )

        if tool != "ks":
            # Every tool other than "ks": take the first column.
            scores = pd.read_csv(path).iloc[:, 0]
        elif "d8" in dataset or "d7" in dataset:
            # No file is read for "ks" on the d7/d8 datasets; the slot stays
            # None. NOTE(review): presumably no ks results exist at these
            # dimensions - confirm.
            scores = None
        else:
            # "ks" files: take the second column.
            # NOTE(review): presumably the first column is a row index - confirm.
            scores = pd.read_csv(path).iloc[:, 1]

        csvs[alg_name][dataset] = scores

In [108]:
def compare(d1, d2, percentile=1.0):
    """Fraction of d1's lowest-scoring positions that d2 also ranks lowest.

    A position is "flagged" in a score vector when its value lies strictly
    below that vector's own ``percentile``-th percentile. The result is the
    share of positions flagged in ``d1`` that are flagged in ``d2`` as well.

    Parameters
    ----------
    d1 : array-like
        Reference scores.
    d2 : array-like or None
        Scores to evaluate, same length as ``d1``. ``None`` means there is
        nothing to compare.
    percentile : float, optional
        Percentile (0-100) used as the threshold for both vectors.
        Defaults to 1.0.

    Returns
    -------
    float or None
        ``None`` when ``d2`` is ``None``; NaN when no position of ``d1`` falls
        below its threshold (the ratio is undefined); otherwise a value in
        [0, 1].

    Raises
    ------
    AssertionError
        If ``d1`` and ``d2`` differ in length.
    """
    if d2 is None:
        return None
    assert len(d1) == len(d2), "score vectors must have the same length"

    # Work on plain arrays so the two masks are combined position by
    # position; pandas Series would otherwise be aligned on index labels.
    ref = np.asarray(d1)
    cand = np.asarray(d2)

    ref_flagged = ref < np.percentile(ref, percentile)
    cand_flagged = cand < np.percentile(cand, percentile)

    n_flagged = int(np.sum(ref_flagged))
    if n_flagged == 0:
        # Nothing below the threshold in the reference: ratio is undefined.
        return float("nan")
    n_shared = int(np.sum(ref_flagged & cand_flagged))
    # Explicit float division keeps the result fractional on any Python version.
    return n_shared / float(n_flagged)

In [109]:
# Per-dataset scores of the "sklearn_t0" algorithm; these serve as the
# reference (first argument) in every compare() call below.
base = csvs["sklearn_t0"]

In [119]:
# One record per (algorithm, dataset) pair; algorithms vary slowest.
rows = []
for alg_name, dataset in itertools.product(algorithms, datasets):
    # Dataset ids look like "<name>_d<dim>": cut at the last underscore and
    # skip the "d" that precedes the dimension number.
    cut = dataset.rfind("_")
    record = {
        "data": dataset[:cut],
        "dim": int(dataset[cut + 2:]),
        "dataset": dataset,
        "alg": alg_name,
        # Overlap with the reference scores; None when this algorithm has
        # no scores for the dataset.
        "recall": compare(base[dataset], csvs[alg_name][dataset]),
    }
    rows.append(record)

In [120]:
# Assemble the results table and pin the column order explicitly.
columnsTitles = ["data", "dim", "dataset", "alg", "recall"]
df = pd.DataFrame(rows).reindex(columns=columnsTitles)

In [121]:
# Write the results table to disk; index=False leaves the DataFrame's row
# index out of the file.
df.to_csv("../results/rawacc.csv", index=False)