In [1]:
%matplotlib inline
import numpy as np
import sklearn
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time
import pickle

from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)
from sklearn.preprocessing import (
    RobustScaler
)

In [2]:
def estimate_kde_bw(data):
    q3 = np.percentile(data, 75, axis=0)
    q1 = np.percentile(data, 25, axis=0)
    iqr = q3 - q1
    bw = iqr * (data.shape[0])**(-1.0/(data.shape[1]+4))
    return bw
def get_scores(data, numScore=None, k=10, bw=None):
    print("Starting", flush=True)
    trainstart = time.time()
    if bw is None:
        bw = estimate_kde_bw(data)
    if numScore is None:
        numScore = len(data)
    print("BW calculated {}".format(bw), flush=True)
    scaled_data = data / bw
    
    # Normalized Computations
    kdtree = KDTree(scaled_data)
    print("Trained", flush=True)
    print("Trained in {}".format(time.time()-trainstart), flush=True)
    scorestart = time.time()
    scores,_ = kdtree.query(
        scaled_data[:numScore],
        k=k,
        dualtree=True
    )
    
    print("Scored {}".format(numScore), flush=True)
    print("Scored in {}".format(time.time()-scorestart), flush=True)
        
    # Denormalize
    return scores[:,k-1]


In [3]:
energy = pd.read_csv("../data/us_energy_1p0_metrics.csv")

In [7]:
home = pd.read_csv("../data/home_sensor.csv")

In [11]:
shuttle = pd.read_csv("../data/shuttle.csv")

In [13]:
print("energy")
for d in [2,4,8]:
    print("d: {}".format(d))
    scores = get_scores(
        energy[list(range(d))].iloc[:1000000], k=10)
    p = np.percentile(scores, 99.0) ** 2
    print("percentile: {}".format(p))

energy
d: 2
Starting
BW calculated [ 23.25610346   5.44144229]
Trained
Trained in 30.69838809967041
Scored 1000000
Scored in 76.91624593734741
percentile: 0.011750693237512998
d: 4
Starting
BW calculated [ 41.35584993   9.67640479   7.05733137   5.10227938]
Trained
Trained in 33.21978712081909
Scored 1000000
Scored in 96.37732315063477
percentile: 0.16678952431251357
d: 8
Starting
BW calculated [ 73.54225642  17.2073514   12.54990707   9.07327836  12.53429871
  32.21027153   4.02154726  19.15801935]
Trained
Trained in 211.20607995986938
Scored 1000000
Scored in 454.22492718696594
percentile: 5.952844557163472


In [10]:
print("home")
for d in [2,4,8]:
    print("d: {}".format(d))
    scores = get_scores(
        home[list(range(3,d+3))].iloc[:1000000], k=10)
    p = np.percentile(scores, 99.0) ** 2
    print("percentile: {}".format(p))

home
d: 2
Starting
BW calculated [ 0.07445847  0.18192618]
Trained
Trained in 2.605396032333374
Scored 928991
Scored in 52.430914878845215
percentile: 0.09881739664872093
d: 4
Starting
BW calculated [ 0.13200222  0.32252423  0.3471802   0.36840824]
Trained
Trained in 2.8492391109466553
Scored 928991
Scored in 149.36745405197144
percentile: 0.6485160239999515
d: 8
Starting
BW calculated [ 0.23401753  0.5717807   0.61549155  0.65312526  1.12627497  0.81856817
  0.49597398  0.54345201]
Trained
Trained in 5.992555141448975
Scored 928991
Scored in 146.45897698402405
percentile: 1.0547936762814547


In [12]:
print("shuttle")
print("d: 2")
scores = get_scores(
    home[[0,2]].iloc[:43500], k=10)
p = np.percentile(scores, 99.0) ** 2
print("percentile: {}".format(p))
print("d: 4")
scores = get_scores(
    home[[0,2,4,5]].iloc[:43500], k=10)
p = np.percentile(scores, 99.0) ** 2
print("percentile: {}".format(p))
print("d: 7")
scores = get_scores(
    home[[0,2,4,5,6,7,8]].iloc[:43500], k=10)
p = np.percentile(scores, 99.0) ** 2
print("percentile: {}".format(p))

shuttle
d: 2
Starting
BW calculated [  7.77746182e+04   2.33287348e-01]
Trained
Trained in 0.07338595390319824
Scored 43500
Scored in 0.5004849433898926
percentile: 0.027775475279007568
d: 4
Starting
BW calculated [  1.21369140e+05   3.64050449e-01   4.73221284e-01   5.05329115e-01]
Trained
Trained in 0.06426000595092773
Scored 43500
Scored in 1.446791172027588
percentile: 1.4588345538925607
d: 7
Starting
BW calculated [  1.74678152e+05   5.23952464e-01   6.81074445e-01   7.27285010e-01
   7.77416283e-01   1.34257471e+00   9.75887759e-01]
Trained
Trained in 0.08105611801147461
Scored 43500
Scored in 2.2270028591156006
percentile: 6.793650606503634
