In [1]:
%matplotlib inline

In [35]:
import numpy as np
import sklearn
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math

from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)
from sklearn.preprocessing import (
    RobustScaler
)

# Helper Functions

In [42]:
def get_self_density(d, n):
    """Return the standard-normal peak density in ``d`` dimensions, divided by ``n``.

    The d-dimensional normal pdf with zero mean and identity covariance is
    evaluated at the origin, and the result is divided by ``n``.

    Parameters
    ----------
    d : int
        Number of dimensions of the normal distribution.
    n : int or float
        Divisor applied to the peak density.

    Returns
    -------
    float
        ``pdf(0) / n``.
    """
    origin = np.zeros(d)
    peak_density = scipy.stats.multivariate_normal.pdf(
        origin,
        mean=origin,
        cov=np.eye(d),
    )
    return peak_density / n

In [79]:
def estimate_kde_bw(data):
    """Estimate one KDE bandwidth per column from the interquartile range.

    For each column the bandwidth is ``IQR * n ** (-1 / (d + 4))`` where
    ``n = data.shape[0]`` and ``d = data.shape[1]``.

    Side effect: prints the per-column first and third quartiles.

    Parameters
    ----------
    data : 2-D array-like exposing ``.shape``
        Percentiles are taken along axis 0.

    Returns
    -------
    numpy.ndarray
        One bandwidth per column.
    """
    lower_q, upper_q = np.percentile(data, [25, 75], axis=0)
    print("Q1: {}\nQ3: {}".format(lower_q, upper_q))

    # Sample-size shrink factor n^(-1/(d+4)).
    shrink = data.shape[0] ** (-1.0 / (data.shape[1] + 4))
    return (upper_q - lower_q) * shrink

In [82]:
def get_scores(data, bw=None):
    """Score every row of ``data`` by its Gaussian KDE density, excluding itself.

    The data is divided by the per-column bandwidth so that a unit-bandwidth
    Gaussian kernel can be fitted. Each row's density is evaluated on that
    rescaled data, the value returned by ``get_self_density`` is subtracted,
    and the result is divided by the product of the bandwidths to undo the
    rescaling.

    Parameters
    ----------
    data : 2-D array-like exposing ``.shape``
        Rows are the samples to fit and to score.
    bw : sequence of float, optional
        Per-column bandwidth. When ``None`` it is computed with
        ``estimate_kde_bw(data)``.

    Returns
    -------
    numpy.ndarray
        One density score per row of ``data``.
    """
    if bw is None:
        bw = estimate_kde_bw(data)

    # Work in bandwidth units so the kernel itself can use bandwidth 1.
    unit_bw_data = data / bw
    estimator = KernelDensity(
        bandwidth=1,
        kernel='gaussian',
        algorithm='kd_tree',
        rtol=0.0,
    )
    estimator.fit(unit_bw_data)

    # score_samples returns log-densities; exponentiate to get densities.
    densities = np.exp(estimator.score_samples(unit_bw_data))

    # Remove each point's own kernel contribution from its score.
    own_contribution = get_self_density(data.shape[1], data.shape[0])
    densities_without_self = densities - own_contribution

    # Convert from bandwidth units back to the original data units.
    return densities_without_self / np.prod(bw)

# KDE Density Percentiles

In [3]:
# Load the energy metrics CSV; the path is relative to the notebook's working directory.
energy = pd.read_csv("../us_energy_1p0_metrics.csv")

In [66]:
# Number of leading rows to keep for the experiment.
n = 10000
# Select the first two columns by position. `energy[[0, 1]]` is a label lookup,
# and the labels produced by read_csv from a header row are strings, so integer
# keys raise KeyError on current pandas. `.iloc` makes the positional intent explicit.
e2 = energy.iloc[:n, [0, 1]]

In [80]:
# Estimate the per-column bandwidth for the two-column subset and display it.
bw = estimate_kde_bw(e2)
bw

Q1: [ 35.09384733  15.98888948]
Q3: [ 266.46675362   65.87145206]


array([ 49.84778156,  10.74687233])

In [83]:
# NOTE(review): the bandwidth is hard-coded here instead of reusing `bw`, and its
# first component (49.857...) differs slightly from the value estimate_kde_bw
# returned for the same data (49.847...) — confirm this is intentional.
scores = get_scores(e2, bw = [49.8570585894351, 10.746872325671712])
# 1st percentile of the density scores.
np.percentile(scores, q=1.0)

2.4597108911493338e-07

In [88]:
# The three smallest density scores.
sorted(scores)[0:3]

[5.9297633902026493e-08, 6.0179169670397597e-08, 7.6291235621811238e-08]