In [1]:
import numpy as np
import sklearn
import scipy.stats
import matplotlib
matplotlib.use('PDF')
%matplotlib inline
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time

import sklearn.neighbors
from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)

In [2]:
d = 4
mu = np.zeros(d)
cov = np.identity(d)

In [3]:
rv = scipy.stats.multivariate_normal(mu, cov)

In [4]:
samples = rv.rvs(size=10000000)
s_pdfs = rv.pdf(samples)
q_p = np.percentile(s_pdfs, 1)

In [5]:
n = 100000
m = 20000
score_samples = rv.rvs(size=n)
score_pdfs = rv.pdf(score_samples)

In [11]:
def eval(t, pdfs):
    predict = pdfs < t
    true = score_pdfs[:m] < t
    both = predict & true
    return (np.sum(both), np.sum(predict), np.sum(true))

# KDE

In [77]:
bw = (4/(d+4))**(1/(d+4)) * m ** (-1.0/(d+4))
bw

0.26591479484724945

In [78]:
kde = KernelDensity(
    bandwidth=bw,
    kernel='gaussian',
    algorithm='kd_tree',
    rtol=1e-3,
)
kde.fit(samples[:n])
kde_scores = kde.score_samples(score_samples[:m])
kde_pdfs = np.exp(kde_scores)
kde_deltas = kdf_pdfs-score_pdfs[:m]
print((np.mean(kde_deltas**2)))
print(eval(q_p, kde_pdfs))

7.26484595601e-07
(143, 161, 193)


In [72]:
kde = KernelDensity(
    bandwidth=0.1,
    kernel='gaussian',
    algorithm='kd_tree',
    rtol=1e-3,
)
kde.fit(samples[:n])
kde_scores = kde.score_samples(score_samples[:m])
kde_pdfs = np.exp(kde_scores)
kde_deltas = kde_pdfs-score_pdfs[:m]
print((np.mean(kde_deltas**2)))
print(eval(q_p, kde_pdfs))

4.03996208344e-06
(182, 844, 193)


In [79]:
kde = KernelDensity(
    bandwidth=0.4,
    kernel='gaussian',
    algorithm='kd_tree',
    rtol=1e-3,
)
kde.fit(samples[:n])
kde_scores = kde.score_samples(score_samples[:m])
kde_pdfs = np.exp(kde_scores)
kde_deltas = kde_pdfs-score_pdfs[:m]
print((np.mean(kde_deltas**2)))
print(eval(q_p, kde_pdfs))

2.70921269421e-06
(105, 105, 193)


# Histogram

In [80]:
bw = 3.5 * 1 * (n) ** (-1.0/(2+d))
bw

0.5137297436677244

In [47]:
math.log(n) * n ** (-1.0/3)

0.2480384600548332

In [27]:
def calc_hist(bdelta):
    bins_1d = np.arange(-4, 4.1, bdelta)
    bins = np.array([bins_1d] * d)
    def to_bin(x):
        idxs = np.searchsorted(bins_1d, x)
        return np.minimum(np.maximum(idxs,0),len(bins_1d)-2)
    H, edges = np.histogramdd(samples[:n], bins=bins, normed=True)
    hist_scores = [
        H[tuple(to_bin(cursample))] for cursample in score_samples[:m]
    ]
    return hist_scores

In [81]:
hist_scores = calc_hist(0.51)
deltas = hist_scores-score_pdfs[:m]
print((np.mean(deltas**2)))
print(eval(q_p, hist_scores))

2.23197325223e-05
(150, 1437, 193)


In [51]:
hist_scores = calc_hist(0.3)
deltas = hist_scores-score_pdfs[:m]
print((np.mean(deltas**2)))
print(eval(q_p, hist_scores))

1.54572120755e-05
(187, 4213, 193)


In [52]:
hist_scores = calc_hist(0.7)
deltas = hist_scores-score_pdfs[:m]
print((np.mean(deltas**2)))
print(eval(q_p, hist_scores))

3.49451432945e-05
(112, 1079, 193)


# KNN

In [54]:
bk = n ** (4/(4+d))
bk

316.22776601683796

In [19]:
import scipy.special
def vol_sphere(n,r):
    return (math.pi)**(n/2)/(scipy.special.gamma(n/2+1)) * r**n

In [20]:
vsphere = vol_sphere(4,1)

In [21]:
tree = KDTree(samples[:n])

In [22]:
def calc_knn(k):
    distances, _ = tree.query(score_samples[:m], k=k)
    knndistances = distances[:,k-1]
    knn_scores = (k/n) / (vsphere * np.power(knndistances, d))
    return knn_scores

In [71]:
knn_scores = calc_knn(3)
knn_deltas = knn_scores-score_pdfs[:m]
print((np.mean(knn_deltas**2)))
print(eval(q_p, knn_scores))

0.000174160834315
(124, 142, 193)


In [82]:
knn_scores = calc_knn(316)
knn_deltas = knn_scores-score_pdfs[:m]
print((np.mean(knn_deltas**2)))
print(eval(q_p, knn_scores))

3.77459536199e-07
(19, 19, 193)


In [63]:
knn_scores = calc_knn(1000)
knn_deltas = knn_scores-score_pdfs[:m]
print((np.mean(knn_deltas**2)))
print(eval(q_p, knn_scores))

6.20704965401e-07
(6, 6, 193)
