In [1]:
%matplotlib inline

In [2]:
import numpy as np
import sklearn
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time
import pickle

from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)
from sklearn.preprocessing import (
    RobustScaler
)

# Helper Functions

In [23]:
def get_self_density(d, n):
    return scipy.stats.multivariate_normal.pdf(
        np.zeros(d), 
        mean=np.zeros(d), 
        cov=np.identity(d)) / n;
def estimate_kde_bw(data):
    q3 = np.percentile(data, 75, axis=0)
    q1 = np.percentile(data, 25, axis=0)
    iqr = q3 - q1
    bw = iqr * (data.shape[0])**(-1.0/(data.shape[1]+4))
    return bw
def get_scores(data, numScore=None, tol=0, bw=None):
    print("Starting", flush=True)
    trainstart = time.time()
    if bw is None:
        bw = estimate_kde_bw(data)
    if numScore is None:
        numScore = len(data)
    print("BW calculated", flush=True)
    scaled_data = data / bw
    
    # Normalized Computations
    kde = KernelDensity(
        bandwidth=1,
        kernel='gaussian',
        algorithm='kd_tree',
        rtol=tol,
    )
    kde.fit(scaled_data)
    print("Trained", flush=True)
    print("Trained in {}".format(time.time()-trainstart), flush=True)
    scorestart = time.time()
    scores = np.exp(kde.score_samples(scaled_data[:numScore]))
    print("Scored", flush=True)
    print("Scored in {}".format(time.time()-scorestart), flush=True)
    
    self_density = get_self_density(data.shape[1], data.shape[0])
    scores_minus_self = scores - self_density
    
    # Denormalize
    denorm_scores = scores_minus_self / np.prod(bw)
    return denorm_scores
def compare_outliers(scores1, scores2):
    cut1 = np.percentile(scores1, 1.0)
    flag1 = scores1 < cut1
    
    cut2 = np.percentile(scores2, 1.0)
    flag2 = scores2 < cut2
    return np.sum(flag1 & flag2) / np.sum(flag1 | flag2)

In [4]:
energy = pd.read_csv("../us_energy_1p0_metrics.csv")

In [5]:
taxi = pd.read_csv("../taxi_data.csv")

# Perf + Jaccard Benchmarks

In [24]:
def run_benchmarks(df, ns, numScores, tols, dims):
    timings = {}
    scores = {}
    combinations = itertools.product(ns, numScores, tols, dims)
    for t in combinations:
        n = t[0]
        numScore = t[1]
        tol = t[2]
        dim = t[3]
        print("n: {}, numScore: {}, tol: {}, dim: {}".format(n, numScore, tol, dim))
        columns = list(range(dim))
        data = df[columns].iloc[:n].values
        start = time.time()
        scores[t] = get_scores(data, numScore=numScore, tol=tol)
        elapsed = time.time() - start
        print("Elapsed: {}".format(elapsed))
        timings[t] = elapsed
    return {"timings": timings, "scores": scores}

# Perf Benchmark

In [26]:
presults = run_benchmarks(
    df=energy,
    ns=[500000],
    numScores=[5000],
    tols=[0.0,0.1],
    dims=[2,4,8]
)

n: 500000, numScore: 5000, tol: 0.0, dim: 2
Starting
BW calculated
Trained
Trained in 3.829257011413574
Scored
Scored in 212.3846960067749
Elapsed: 216.21685004234314
n: 500000, numScore: 5000, tol: 0.0, dim: 4
Starting
BW calculated
Trained
Trained in 4.26601505279541
Scored
Scored in 274.46328496932983
Elapsed: 278.73914909362793
n: 500000, numScore: 5000, tol: 0.0, dim: 8
Starting
BW calculated
Trained
Trained in 46.50823497772217
Scored
Scored in 314.4873068332672
Elapsed: 361.0077340602875
n: 500000, numScore: 5000, tol: 0.1, dim: 2
Starting
BW calculated
Trained
Trained in 5.352800130844116
Scored
Scored in 58.92997694015503
Elapsed: 64.28789591789246
n: 500000, numScore: 5000, tol: 0.1, dim: 4
Starting
BW calculated
Trained
Trained in 6.848196029663086
Scored
Scored in 79.82740187644958
Elapsed: 86.68513584136963
n: 500000, numScore: 5000, tol: 0.1, dim: 8
Starting
BW calculated
Trained
Trained in 45.6171441078186
Scored
Scored in 76.96474409103394
Elapsed: 122.59193396568298


# Accuracy Benchmark

In [53]:
results = run_benchmarks(
    data=energy,
    ns=[10000,50000],
    tols=[0.0,0.1],
    dims=[1,2,4,8]
)

n: 10000, tol: 0.0, dim: 1
Starting
BW calculated
Trained
Scored
Elapsed: 4.1302809715271
n: 10000, tol: 0.0, dim: 2
Starting
BW calculated
Trained
Scored
Elapsed: 3.8664679527282715
n: 10000, tol: 0.0, dim: 4
Starting
BW calculated
Trained
Scored
Elapsed: 4.413316011428833
n: 10000, tol: 0.0, dim: 8
Starting
BW calculated
Trained
Scored
Elapsed: 5.624083042144775
n: 10000, tol: 0.1, dim: 1
Starting
BW calculated
Trained
Scored
Elapsed: 1.5577480792999268
n: 10000, tol: 0.1, dim: 2
Starting
BW calculated
Trained
Scored
Elapsed: 1.5478770732879639
n: 10000, tol: 0.1, dim: 4
Starting
BW calculated
Trained
Scored
Elapsed: 2.051440954208374
n: 10000, tol: 0.1, dim: 8
Starting
BW calculated
Trained
Scored
Elapsed: 2.9179069995880127
n: 50000, tol: 0.0, dim: 1
Starting
BW calculated
Trained
Scored
Elapsed: 127.07230186462402
n: 50000, tol: 0.0, dim: 2
Starting
BW calculated
Trained
Scored
Elapsed: 122.14469289779663
n: 50000, tol: 0.0, dim: 4
Starting
BW calculated
Trained
Scored
Elapsed: 13

In [59]:
with open("energy_benchmark.pickle", 'wb') as f:
    pickle.dump(results, f)

In [29]:
with open("energy_benchmark.pickle", 'rb') as f:
    results = pickle.load(f)

In [30]:
for d in [2,4,8]:
    jaccard = compare_outliers(
        results["scores"][(50000,0.0,d)],
        results["scores"][(50000,0.1,d)]
    )
    print("Jaccard similarity between 0.0 and 0.1 for d{}: {}".format(
         d,
         jaccard))

Jaccard similarity between 0.0 and 0.1 for d2: 0.9801980198019802
Jaccard similarity between 0.0 and 0.1 for d4: 0.9801980198019802
Jaccard similarity between 0.0 and 0.1 for d8: 0.9157088122605364


In [31]:
prefix = "/Users/egan/Documents/Projects/tKDE/experiments/end2end/scores/"
filenames = {
    (50000, 0.01, 2): "energy_n50_d2_tol1.csv",
    (50000, 0.01, 4): "energy_n50_d4_tol1.csv",
    (50000, 0.01, 8): "energy_n50_d8_tol1.csv",
    (50000, 0.1, 2): "energy_n50_d2_tol10.csv",
    (50000, 0.1, 4): "energy_n50_d4_tol10.csv",
    (50000, 0.1, 8): "energy_n50_d8_tol10.csv"
}
files = {k: prefix+filenames[k] for k in filenames}

In [32]:
for k,f in files.items():
    densities = pd.read_csv(f, header=None).values.flatten()
    n=k[0]
    d=k[2]
    jaccard = compare_outliers(
        results["scores"][(n,0.0,d)],
        densities
    )
    print("Jaccard similarity between exact and {}: {}".format(
         k,
         jaccard))

Jaccard similarity between exact and (50000, 0.01, 4): 0.9960079840319361
Jaccard similarity between exact and (50000, 0.1, 2): 0.9880715705765407
Jaccard similarity between exact and (50000, 0.1, 4): 0.9880715705765407
Jaccard similarity between exact and (50000, 0.01, 2): 1.0
Jaccard similarity between exact and (50000, 0.1, 8): 0.9960079840319361
Jaccard similarity between exact and (50000, 0.01, 8): 0.9960079840319361
