In [1]:
%matplotlib inline
import numpy as np
import sklearn
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time
import pickle

from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)
from sklearn.preprocessing import (
    RobustScaler
)

In [10]:
def estimate_kde_bw(data):
    """Estimate a per-column bandwidth from the interquartile range.

    Each column's IQR (75th minus 25th percentile, taken along axis 0) is
    multiplied by ``n ** (-1 / (d + 4))``, where ``n = data.shape[0]`` and
    ``d = data.shape[1]``.

    Parameters
    ----------
    data : 2-D array-like exposing ``.shape``
        Rows are treated as samples and columns as dimensions.

    Returns
    -------
    The scaled IQR, one value per column.
    """
    n_samples, n_dims = data.shape[0], data.shape[1]
    lower, upper = np.percentile(data, [25, 75], axis=0)
    spread = upper - lower
    # Sample-size factor: shrinks the bandwidth as the number of rows grows.
    shrink = n_samples ** (-1.0 / (n_dims + 4))
    return spread * shrink
def get_scores(data, numScore=None, k=10, bw=None):
    """Score points by their distance to the k-th nearest neighbour.

    The data is divided by a bandwidth, a KD-tree is built on the scaled
    points, and the first ``numScore`` scaled points are queried against that
    same tree. Progress and timing messages are printed along the way.

    Parameters
    ----------
    data : 2-D array-like
        Rows are samples. Passed to ``estimate_kde_bw`` when ``bw`` is None
        and divided by ``bw`` before the tree is built.
    numScore : int, optional
        Number of leading rows to score. Defaults to ``len(data)``.
    k : int, default 10
        Neighbour rank to report. The query points are themselves members of
        the tree, so each point is a candidate neighbour of itself at
        distance zero and counts towards ``k``.
    bw : scalar or array-like, optional
        Bandwidth(s) to divide ``data`` by. Estimated from ``data`` when None.

    Returns
    -------
    Column ``k - 1`` of the distance matrix returned by ``KDTree.query``.
    Distances are expressed in bandwidth-scaled units; they are not converted
    back to the original units of ``data``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if the bandwidth contains a zero or
        non-finite entry (scaling would yield inf/NaN coordinates).
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    print("Starting", flush=True)
    # perf_counter is monotonic, so elapsed times are immune to clock changes.
    train_t0 = time.perf_counter()
    if bw is None:
        bw = estimate_kde_bw(data)
    if numScore is None:
        numScore = len(data)
    print("BW calculated {}".format(bw), flush=True)

    # A zero IQR (e.g. a constant column) produces a zero bandwidth; fail
    # loudly here instead of feeding inf/NaN coordinates to the tree.
    bw_values = np.asarray(bw, dtype=float)
    if not np.all(np.isfinite(bw_values)) or np.any(bw_values == 0):
        raise ValueError(
            "bandwidth must be finite and non-zero, got {}".format(bw)
        )
    scaled_data = data / bw

    # Build the tree in bandwidth-scaled space.
    kdtree = KDTree(scaled_data)
    print("Trained", flush=True)
    print("Trained in {}".format(time.perf_counter() - train_t0), flush=True)

    score_t0 = time.perf_counter()
    # Only the distances are needed; the neighbour indices are discarded.
    scores, _ = kdtree.query(
        scaled_data[:numScore],
        k=k,
        dualtree=True
    )

    print("Scored {}".format(numScore), flush=True)
    print("Scored in {}".format(time.perf_counter() - score_t0), flush=True)

    # KDTree.query sorts neighbours by distance by default, so the last of the
    # k columns holds the k-th nearest distance (still in scaled units).
    return scores[:, k - 1]


In [3]:
# Read the metrics CSV into a DataFrame (path is relative to the notebook).
energy = pd.read_csv(filepath_or_buffer="../data/us_energy_1p0_metrics.csv")

In [19]:
# Score the first 1,000,000 rows using the first two columns. Columns are
# picked by position with .iloc: the frame's labels come from the CSV header,
# so the integer keys in `energy[[0,1]]` raise KeyError on current pandas.
scores = get_scores(energy.iloc[:1000000, [0, 1]], k=20)

Starting
BW calculated [ 23.25610346   5.44144229]
Trained
Trained in 24.151832103729248
Scored 1000000
Scored in 71.62647700309753


In [16]:
# Square of the 99th percentile of the k-th-neighbour distances.
pow(np.percentile(scores, q=99.0), 2)

5.5295806120391768