In [2]:
%matplotlib inline

In [56]:
import numpy as np
import sklearn
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time
import pickle

from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)
from sklearn.preprocessing import (
    RobustScaler
)

# Helper Functions

In [43]:
def get_self_density(d, n):
    """Return the standard normal density at the origin in ``d`` dimensions, divided by ``n``.

    Parameters
    ----------
    d : int
        Dimensionality of the zero-mean, identity-covariance Gaussian.
    n : int or float
        Divisor applied to the peak density (callers in this notebook pass
        the number of data points).

    Returns
    -------
    float
        ``pdf(0) / n`` for the ``d``-dimensional standard normal.
    """
    origin = np.zeros(d)
    peak_density = scipy.stats.multivariate_normal.pdf(
        origin,
        mean=np.zeros(d),
        cov=np.identity(d),
    )
    return peak_density / n
def estimate_kde_bw(data):
    """Estimate a per-column bandwidth from the interquartile range.

    For each column the result is ``IQR * n ** (-1 / (d + 4))`` where
    ``n = data.shape[0]`` and ``d = data.shape[1]``.

    Parameters
    ----------
    data : array-like with a two-element ``shape``
        Rows are samples, columns are dimensions.

    Returns
    -------
    numpy.ndarray
        One bandwidth per column of ``data``.
    """
    n_rows = data.shape[0]
    n_cols = data.shape[1]
    lower_quartile = np.percentile(data, 25, axis=0)
    upper_quartile = np.percentile(data, 75, axis=0)
    spread = upper_quartile - lower_quartile
    # Sample-size shrink factor: more rows (or fewer columns) -> narrower kernels.
    shrink = n_rows ** (-1.0 / (n_cols + 4))
    return spread * shrink
def get_scores(data, tol=0, bw=None):
    """Score every row of ``data`` with a Gaussian KDE fitted on ``data`` itself.

    The data is divided by the bandwidth so that a unit-bandwidth kernel can
    be used, the value returned by ``get_self_density`` is subtracted from
    each score, and the result is divided by ``prod(bw)`` to undo the
    rescaling.

    Parameters
    ----------
    data : array-like with a two-element ``shape``
        Rows are samples, columns are dimensions.
    tol : float, optional
        Passed to ``KernelDensity`` as ``rtol``.
    bw : array-like or scalar, optional
        Bandwidth to scale by; when ``None`` it is computed with
        ``estimate_kde_bw(data)``.

    Returns
    -------
    numpy.ndarray
        One denormalized density score per row of ``data``.
    """
    print("Starting", flush=True)
    bandwidth = estimate_kde_bw(data) if bw is None else bw
    print("BW calculated", flush=True)
    normalized = data / bandwidth

    # Work in bandwidth-normalized coordinates, hence bandwidth=1 here.
    estimator = KernelDensity(
        kernel='gaussian',
        algorithm='kd_tree',
        bandwidth=1,
        rtol=tol,
    )
    estimator.fit(normalized)
    print("Trained", flush=True)
    log_density = estimator.score_samples(normalized)
    density = np.exp(log_density)
    print("Scored", flush=True)

    n_points = data.shape[0]
    n_dims = data.shape[1]
    # Remove the zero-distance kernel term (standard normal peak / n) from each score.
    density_without_self = density - get_self_density(n_dims, n_points)

    # Convert from normalized coordinates back to the original units.
    return density_without_self / np.prod(bandwidth)
def compare_outliers(scores1, scores2, percentile=1.0):
    """Jaccard similarity of the low-score sets flagged in two score arrays.

    In each array, the entries strictly below that array's own
    ``percentile``-th percentile are flagged. The result is
    ``|flagged1 & flagged2| / |flagged1 | flagged2|``.

    Parameters
    ----------
    scores1, scores2 : array-like
        Score arrays of the same shape, aligned element by element.
    percentile : float, optional
        Percentile (0-100) used as the cutoff in each array. Defaults to
        1.0, i.e. the lowest 1% of scores.

    Returns
    -------
    float
        Jaccard similarity in ``[0, 1]``. If neither array flags anything
        (for example, all scores are equal), the two empty sets agree
        trivially and 1.0 is returned instead of dividing zero by zero.
    """
    scores1 = np.asarray(scores1)
    scores2 = np.asarray(scores2)

    flagged1 = scores1 < np.percentile(scores1, percentile)
    flagged2 = scores2 < np.percentile(scores2, percentile)

    union_size = np.sum(flagged1 | flagged2)
    if union_size == 0:
        # Both flagged sets are empty; avoid a 0/0 NaN and RuntimeWarning.
        return 1.0
    return np.sum(flagged1 & flagged2) / union_size

In [5]:
# Load the energy metrics table; it is the `data` argument of the benchmark run below.
energy = pd.read_csv("../us_energy_1p0_metrics.csv")

In [5]:
# Load the taxi dataset. NOTE(review): no cell visible here reads `taxi` afterwards.
taxi = pd.read_csv("../taxi_data.csv")

# Perf + Jaccard Benchmarks

In [46]:
def run_benchmarks(data, ns=(10000, 50000), tols=(0.0, 0.1), dims=(1, 2, 4, 8)):
    """Time ``get_scores`` over every combination of size, tolerance and dimension.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. For each run the columns labelled ``0 .. dim-1`` and
        the first ``n`` rows are used.
        NOTE(review): this assumes integer column labels; confirm against
        how the frame is loaded.
    ns : iterable of int, optional
        Row counts to benchmark.
    tols : iterable of float, optional
        Tolerances forwarded to ``get_scores`` as ``tol``.
    dims : iterable of int, optional
        Numbers of leading columns to use.

    Returns
    -------
    dict
        ``{"timings": {...}, "scores": {...}}``, both keyed by the
        ``(n, tol, dim)`` tuple. Timings are elapsed seconds; scores are the
        arrays returned by ``get_scores``.
    """
    timings = {}
    scores = {}
    for key in itertools.product(ns, tols, dims):
        n, tol, dim = key
        print("n: {}, tol: {}, dim: {}".format(n, tol, dim))
        columns = list(range(dim))
        # Bind the slice to its own name. Rebinding `data` here would replace
        # the source frame with a NumPy array and break every later iteration.
        subset = data[columns].iloc[:n].values
        # perf_counter is a monotonic high-resolution clock, unaffected by
        # system clock adjustments during long runs.
        start = time.perf_counter()
        scores[key] = get_scores(subset, tol=tol)
        elapsed = time.perf_counter() - start
        print("Elapsed: {}".format(elapsed))
        timings[key] = elapsed
    return {"timings": timings, "scores": scores}

In [53]:
# Run the full grid on the energy data: 2 sizes x 2 tolerances x 4 dimensionalities = 16 runs.
results = run_benchmarks(
    data=energy,
    ns=[10000,50000],
    tols=[0.0,0.1],
    dims=[1,2,4,8]
)

n: 10000, tol: 0.0, dim: 1
Starting
BW calculated
Trained
Scored
Elapsed: 4.1302809715271
n: 10000, tol: 0.0, dim: 2
Starting
BW calculated
Trained
Scored
Elapsed: 3.8664679527282715
n: 10000, tol: 0.0, dim: 4
Starting
BW calculated
Trained
Scored
Elapsed: 4.413316011428833
n: 10000, tol: 0.0, dim: 8
Starting
BW calculated
Trained
Scored
Elapsed: 5.624083042144775
n: 10000, tol: 0.1, dim: 1
Starting
BW calculated
Trained
Scored
Elapsed: 1.5577480792999268
n: 10000, tol: 0.1, dim: 2
Starting
BW calculated
Trained
Scored
Elapsed: 1.5478770732879639
n: 10000, tol: 0.1, dim: 4
Starting
BW calculated
Trained
Scored
Elapsed: 2.051440954208374
n: 10000, tol: 0.1, dim: 8
Starting
BW calculated
Trained
Scored
Elapsed: 2.9179069995880127
n: 50000, tol: 0.0, dim: 1
Starting
BW calculated
Trained
Scored
Elapsed: 127.07230186462402
n: 50000, tol: 0.0, dim: 2
Starting
BW calculated
Trained
Scored
Elapsed: 122.14469289779663
n: 50000, tol: 0.0, dim: 4
Starting
BW calculated
Trained
Scored
Elapsed: 13

In [54]:
results["timings"]

{(10000, 0.0, 1): 4.1302809715271,
 (10000, 0.0, 2): 3.8664679527282715,
 (10000, 0.0, 4): 4.413316011428833,
 (10000, 0.0, 8): 5.624083042144775,
 (10000, 0.1, 1): 1.5577480792999268,
 (10000, 0.1, 2): 1.5478770732879639,
 (10000, 0.1, 4): 2.051440954208374,
 (10000, 0.1, 8): 2.9179069995880127,
 (50000, 0.0, 1): 127.07230186462402,
 (50000, 0.0, 2): 122.14469289779663,
 (50000, 0.0, 4): 131.79677200317383,
 (50000, 0.0, 8): 198.65502095222473,
 (50000, 0.1, 1): 38.09627103805542,
 (50000, 0.1, 2): 38.75493597984314,
 (50000, 0.1, 4): 42.627718925476074,
 (50000, 0.1, 8): 72.32149314880371}

In [59]:
# Save the timings and scores to disk; the 50000-row runs take minutes each to recompute.
with open("energy_benchmark.pickle", 'wb') as f:
    pickle.dump(results, f)

In [66]:
# For the 50000-row runs, measure how closely the tol=0.1 scores reproduce the
# outliers flagged by the tol=0.0 scores (Jaccard similarity of the flagged sets).
# Only d = 2, 4, 8 are compared; the d = 1 run is not included here.
for d in [2,4,8]:
    jaccard = compare_outliers(
        results["scores"][(50000,0.0,d)],
        results["scores"][(50000,0.1,d)]
    )
    print("Jaccard similarity between 0.0 and 0.1 for d{}: {}".format(
         d,
         jaccard))

Jaccard similarity between 0.0 and 0.1 for d2: 0.9801980198019802
Jaccard similarity between 0.0 and 0.1 for d4: 0.9801980198019802
Jaccard similarity between 0.0 and 0.1 for d8: 0.9157088122605364


In [81]:
# Map each (n, tol, dim) key to a CSV of precomputed scores stored under `prefix`.
# NOTE(review): `prefix` is an absolute path on one user's machine, so this cell
# will not run elsewhere without editing it; consider a configurable directory.
# The file names appear to encode tolerance as a percentage (tol1 -> 0.01,
# tol10 -> 0.1) — confirm against whatever produced the files.
prefix = "/Users/egan/Documents/Projects/tKDE/experiments/end2end/scores/"
filenames = {
    (50000, 0.01, 2): "energy_n50_d2_tol1.csv",
    (50000, 0.01, 4): "energy_n50_d4_tol1.csv",
    (50000, 0.01, 8): "energy_n50_d8_tol1.csv",
    (50000, 0.1, 2): "energy_n50_d2_tol10.csv",
    (50000, 0.1, 4): "energy_n50_d4_tol10.csv",
    (50000, 0.1, 8): "energy_n50_d8_tol10.csv"
}
# Full path for every key.
files = {k: prefix+filenames[k] for k in filenames}

In [84]:
# Compare each score file against this notebook's tol=0.0 scores for the same n and d.
# Each file is read as a headerless CSV and flattened to a 1-D array of densities.
# NOTE(review): the files are presumably produced by a separate implementation — confirm.
for k,f in files.items():
    densities = pd.read_csv(f, header=None).values.flatten()
    # k is (n, tol, dim); the tolerance is only used in the printed label.
    n=k[0]
    d=k[2]
    jaccard = compare_outliers(
        results["scores"][(n,0.0,d)],
        densities
    )
    print("Jaccard similarity between exact and {}: {}".format(
         k,
         jaccard))

Jaccard similarity between exact and (50000, 0.01, 4): 0.9960079840319361
Jaccard similarity between exact and (50000, 0.1, 2): 0.9920318725099602
Jaccard similarity between exact and (50000, 0.1, 4): 0.9920318725099602
Jaccard similarity between exact and (50000, 0.01, 2): 1.0
Jaccard similarity between exact and (50000, 0.1, 8): 0.9960079840319361
Jaccard similarity between exact and (50000, 0.01, 8): 0.9960079840319361
