# Initialization

In [None]:
# %load init.ipy
%reload_ext autoreload
%autoreload 2

# %load init.ipy
import os, sys, logging, datetime, warnings, shutil
from importlib import reload

import numpy as np
import scipy as sp
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt
from nose import tools

import kalepy as kale
import kalepy.utils
import kalepy.plot
# The `nbshow` command runs `plt.show()` in interactive jupyter notebooks, but closes
#   figures when run from the command-line (notebooks are converted to scripts and run as tests)
from kalepy.plot import nbshow

import warnings
# warnings.simplefilter("error")   # WARNING: this is breaking jupyter at the moment (2021-02-14)

# Complex

## Different bandwidths per variable

Examine the PDFs

In [None]:
# Compare a default-bandwidth KDE against one with an explicit bandwidth per variable.
# Seed the global NumPy RNG before generating the test dataset.
np.random.seed(1122)
data = kale.utils._random_data_2d_02(num=1e3)

# Corner plot constructed from the number of entries in `data`.
corner = kale.Corner(len(data))

# Default bandwidth: draw the KDE, then the raw data on the same axes.
kde_def = kale.KDE(data)
corner.plot_kde(kde_def)
corner.plot_data(data)

# Explicit bandwidth for each of the two variables, overplotted in red.
kde_test = kale.KDE(data, bandwidth=[1.5, 0.02])
corner.plot_kde(kde_test, color='r')

nbshow()

Try resampling:

In [None]:
# Resample from a default-bandwidth KDE and from a per-variable-bandwidth KDE,
# and overplot both sample sets on the original data.
np.random.seed(1122)
NUM = 1e3
data = kale.utils._random_data_2d_02(num=NUM)

corner = kale.Corner(len(data))
# Original data in black.
corner.plot_data(data, color='k')

# Samples drawn from the default-bandwidth KDE, in blue.
kde_def = kale.KDE(data)
# corner.plot_kde(kde_def, color='b')
samp_def = kde_def.resample(NUM)
corner.plot_data(samp_def, color='b')

# Samples drawn from the KDE with explicit per-variable bandwidths, in red.
kde_test = kale.KDE(data, bandwidth=[2.0, 0.02])
# corner.plot_kde(kde_test, color='r')
samp_test = kde_test.resample(NUM)
corner.plot_data(samp_test, color='r')

nbshow()

## Keep Dimensions in Resample

In [None]:
# Check that `resample(..., keep=ii)` leaves dimension `ii` untouched, and compare the
# remaining resampled dimensions against the original data.
np.random.seed(9235)
NUM = 200
half = NUM // 2
a1 = np.random.normal(6.0, 1.0, half)
# a2 = np.random.lognormal(0, 0.5, size=NUM//2)
a2 = np.random.lognormal(1.0, 0.5, size=half)
aa = np.concatenate([a1, a2])

bb = np.random.normal(3.0, 0.02, NUM) + aa/100

data = [aa, bb]
edges = [
    kale.utils.spacing(dd, 'lin', np.sqrt(NUM), stretch=1.0)
    for dd in data
]
kde2d = kale.KDE(data)

fig, axes = plt.subplots(figsize=[15, 8], nrows=2, ncols=3)

# Constant value inserted as an extra dimension; it must survive resampling unchanged.
norm = 2.3

for ii in range(3):
    # Insert the constant row at position `ii` of the 2D dataset
    test = np.array(data)
    test = np.insert(test, ii, norm*np.ones_like(test[0]), axis=0)

    kde3d = kale.KDE(test)

    samples = kde3d.resample(NUM, keep=ii)
    param_samp = samples[ii]
    kept_ok = np.allclose(param_samp, norm)
    if not kept_ok:
        raise ValueError("Failure!")

    # Drop the constant row again so rows line up with `data`
    samples = np.delete(samples, ii, axis=0)

    for jj, ax in enumerate(axes[:, ii]):
        stuff = [samples[jj], data[jj]]
        hist_kw = dict(bins=edges[jj], rwidth=0.9, alpha=0.5)
        # resampled values in red, original data in blue
        ax.hist(stuff[0], color='r', **hist_kw)
        ax.hist(stuff[1], color='b', **hist_kw)

        ks, pv = sp.stats.ks_2samp(*stuff)
        msg = "{} {} :: {:.2e} {:.2e}".format(ii, jj, ks, pv)
        # print(msg)
        # zplot.text(ax, msg, loc='ur')

nbshow()

# Reflecting Boundaries

## PDF

### 1D

In [None]:
# Compare KDE density estimates of log-normal samples for several reflecting-boundary
# settings against the analytic log-normal PDF.
np.random.seed(5142)
NUM = 1000
aa = np.random.lognormal(size=NUM)
# aa = aa[aa < 10.0]

extr = [0.0, 10.0]
edges = kale.utils.spacing(extr, 'lin', 30)
grid = kale.utils.spacing(extr, 'lin', 200, stretch=0.5)

fig, ax = plt.subplots(figsize=[8, 4])

# Histogram of the samples, normalized to a density
ax.hist(aa, bins=edges, alpha=0.5, color='0.3', edgecolor='k', density=True)


# boundaries = [None, [0.0], [2.0], [0.0, 2.0]]
# lines = ['-', '--', ':', '-.']
boundaries = [None, [0.0, None], [True, None]]
lines = ['-', '-.', ':']
colors = ['r', 'b', 'g']
pdf = sp.stats.lognorm(1.0).pdf(grid)
ax.plot(grid, pdf, 'k-', label='True', alpha=0.5, lw=2.0)

# The KDE depends only on the data, so build it once; the reflection is passed per call.
kde = kale.KDE(aa)
for bnd, cc, ls in zip(boundaries, colors, lines):
    _, pdf = kde.density(grid, reflect=bnd, probability=True)
    ax.plot(grid, pdf, color=cc, ls=ls, lw=2.0, alpha=0.8, label=str(bnd))


ax.set_xlim([-2, 8])
plt.legend()
nbshow()

In [None]:
# For uniform samples on `extr`, evaluate the KDE PDF under several reflecting-boundary
# settings and assert that each PDF integrates to unity over the evaluation grid.
np.random.seed(124)
NUM = 300
extr = [0.0, 2.0]
aa = np.random.uniform(*extr, NUM)

edges = kale.utils.spacing(extr, 'lin', np.sqrt(NUM))
# `egrid` are grid edges, `cgrid` the matching midpoints, `delta` the edge spacings
egrid = kale.utils.spacing(extr, 'lin', 200, stretch=0.5)
cgrid = kale.utils.midpoints(egrid, 'lin')
delta = np.diff(egrid)
fig, ax = plt.subplots()

ax.hist(aa, bins=edges, alpha=0.5, color='0.3', edgecolor='k', density=True)

boundaries = [None, [0.0, None], [None, 2.0], extr]
lines = ['-', '--', ':', '-.']
colors = ['0.5', 'red', 'blue', 'orange']

# The KDE depends only on the data, so build it once; the reflection is passed per call.
kde = kale.KDE(aa)
for bnd, cc, ls in zip(boundaries, colors, lines):
    _, pdf = kde.density(cgrid, reflect=bnd, probability=True)
    ax.plot(cgrid, pdf, color=cc, ls=ls, lw=4.0, alpha=0.8, label=str(bnd))

    # Midpoint-rule integral of the PDF over the grid
    tot = np.sum(pdf * delta)
    print("Total = {:.4e}".format(tot))
    assert np.isclose(tot, 1.0, rtol=1e-2), "Unitarity is violated!"

plt.legend()
nbshow()

### 2D

In [None]:
# 2D reflecting-boundary test: check the KDE PDF is positive inside the boundaries, zero
# outside them, integrates to ~1, then draw PDF contours for several reflection settings.
np.random.seed(124)
NUM = 400
xx = np.random.uniform(0.0, 2.0, NUM)
yy = np.random.normal(1.0, 1.0, NUM)
# Truncate `yy` at 2.0, then refill to `NUM` values by drawing from the surviving ones
yy = yy[yy < 2.0]
yy = np.concatenate([yy, np.random.choice(yy, NUM-yy.size)])

data = [xx, yy]
# Per-dimension: histogram edges, evaluation-grid edges, grid midpoints, and grid widths
edges = [kale.utils.spacing(aa, 'lin', np.sqrt(NUM)) for aa in [xx, yy]]
egrid = [kale.utils.spacing(ee, 'lin', 60, stretch=0.5) for ee in edges]
cgrid = [kale.utils.midpoints(ee, 'lin') for ee in egrid]
width = [np.diff(ee) for ee in egrid]

xc, yc = np.meshgrid(*cgrid, indexing='ij')

# Flattened grid of midpoints, stacked as two rows (x values, y values)
grid = np.vstack([xc.ravel(), yc.ravel()])

fig, axes = plt.subplots(figsize=[15, 4], ncols=4)

for ax in axes:
    ax.scatter(xx, yy, alpha=0.2)

# Only `hist.shape` is used below, to reshape the flattened PDF back onto the 2D grid
hist, *_ = np.histogram2d(*data, bins=egrid, density=True)


kde = kale.KDE(data)
reflect = [[0.0, 2.0], [None, 2.0]]
_, pdf_1d = kde.pdf(grid, reflect=reflect)
pdf = pdf_1d.reshape(hist.shape)
# ax.contour(*cgrid, pdf)

# Build masks of grid points strictly inside every boundary (`inside`), and strictly
# outside the boundaries in every dimension (`outside`).
inside = True
outside = True
for ii, ref in enumerate(reflect):
    # NOTE: this replaces `None` bounds with infinities in place, i.e. it mutates the
    # lists held in `reflect` (after they have been passed to `kde.pdf` above).
    if ref[0] is None:
        ref[0] = -np.inf
    if ref[1] is None:
        ref[1] = np.inf
    inside = inside & (ref[0] < grid[ii]) & (grid[ii] < ref[1])
    outside = outside & ((grid[ii] < ref[0]) | (ref[1] < grid[ii]))

assert np.all(pdf_1d[inside] > 0.0), "Inside has zero values!"
assert np.allclose(pdf_1d[outside], 0.0), "Outside has non-zero values!"

# Cell areas from the outer product of the per-dimension widths; sum gives total probability
area = width[0][:, np.newaxis] * width[1][np.newaxis, :]
prob_tot = np.sum(pdf * area)
assert np.isclose(prob_tot, 1.0, rtol=3e-2), "`prob_tot` = '{}' is not unity!".format(prob_tot)

# One panel per reflection setting: both dimensions, x only, y only, none
reflections = [
    [[0.0, 2.0], [None, 2.0]],
    [[0.0, 2.0], None],
    [None, [None, 2.0]],
    None
]
for ii, (ax, reflect) in enumerate(zip(axes, reflections)):
    _, pdf_1d = kde.pdf(grid, reflect=reflect)
    pdf = pdf_1d.reshape(hist.shape)
    # `pdf` is indexed [x, y] (meshgrid used indexing='ij'), so it is transposed for contour
    ax.contour(*cgrid, pdf.T)

    ax.set(xlim=[-1.0, 3.0], ylim=[-2, 3], title=str(reflect))

nbshow()

## Resampling

### 1D

In [None]:
# Resample from a 1D KDE with and without reflecting boundaries.  Without reflection some
# samples must fall outside `extr`; with reflection none may, and the new sample must be
# consistent with the original data (two-sample KS test).
np.random.seed(1245)
NUM = 1000

extr = [0.0, 2.0]
data = np.random.uniform(*extr, NUM)
fig, axes = plt.subplots(figsize=[10, 5], ncols=2)

# edges = np.linspace(-0.4, 2.4, 20)
edges = np.linspace(-0.2, 2.2, 25)

# Original data on both panels: tick marks below the axis plus a density histogram
for ax in axes:
    ax.scatter(data, -0.05*np.ones_like(data), alpha=0.1, marker='|')
    ax.hist(data, bins=edges, density=True, edgecolor='k', alpha=0.5)

kde = kale.KDE(data)

reflections = [None, extr]
for ii, reflect in enumerate(reflections):
    samp = kde.resample(NUM, reflect=reflect)
    axes[ii].scatter(samp, -0.07*np.ones_like(samp), alpha=0.1, color='r', marker='|')
    axes[ii].hist(samp, bins=edges, density=True, edgecolor='k', alpha=0.4, color='r', rwidth=0.5)

    # True if any sample lies strictly outside the [extr[0], extr[1]] interval
    some_outside = np.any((samp < extr[0]) | (extr[1] < samp))
    print("reflect = '{}', outside = '{}'".format(reflect, some_outside))
    if reflect is None:
        assert some_outside, "There should be some samples outside!"
    else:
        assert (not some_outside), "There should not be any samples outside!"
        ks, pv = sp.stats.ks_2samp(data, samp)
        print("ks = '{}', pv = '{}'".format(ks, pv))
        # Check new sample is consistent
        assert (pv > 0.1), "New sample is inconsistent!"

# Show the figure in notebooks / close it when run as a script, like the other cells
nbshow()


In [None]:
# Plot data, KDE PDF and resampled values for a 1D uniform dataset, with and without
# reflecting boundaries, and assert the resampled values respect the boundaries.
np.random.seed(1243)
NUM = 300

extr = [0.0, 2.0]
data = np.random.uniform(*extr, NUM)

# Histogram bins across `extr`, extended by four extra bins on either side
edges = kale.utils.spacing(extr, 'lin', 20)
edges = np.concatenate([
    edges[0] - np.arange(1, 5)[::-1]*np.diff(edges)[0],
    edges,
    edges[-1] + np.arange(1, 5)*np.diff(edges)[-1],
])

egrid = kale.utils.spacing(data, 'lin', 100, stretch=0.5)
cgrid = kale.utils.midpoints(egrid, 'lin')
delta = np.diff(egrid)

fig, axes = plt.subplots(figsize=[10, 4], ncols=2, sharex=True, sharey=True)
plt.subplots_adjust(hspace=0.05, wspace=0.05, left=0.08, right=0.98, bottom=0.08, top=0.92)

kde = kale.KDE(data)

reflections = [None, extr]
titles = ['No Reflection', 'Reflection']
for ii, reflect in enumerate(reflections):
    ax = axes[ii]
    ax.grid(alpha=0.2)
    ax.set_title(titles[ii])
    # Histogram this cell's own `data` (previously a stale `aa` from an earlier cell)
    *_, l0 = ax.hist(data, bins=edges, alpha=0.75, color='0.3', edgecolor='k', density=True)
    pdf = kde.pdf(cgrid, reflect=reflect)[1]
    l1, = ax.plot(cgrid, pdf, color='r', ls='-', lw=2.0, alpha=0.8)
    # One short tick per data point, just below the axis
    for dd in data:
        l2, = ax.plot([dd, dd], [0.0, -0.03], color='0.5', alpha=0.5, lw=0.5)

    samp = kde.resample(NUM, reflect=reflect)
    *_, l3 = ax.hist(samp, bins=edges, alpha=0.25, color='r', edgecolor='b', density=True)
    # One short tick per resampled point, below the data ticks
    for ss in samp:
        l4, = ax.plot([ss, ss], [-0.03, -0.06], color='r', alpha=0.5, lw=0.5)

    # True if any sample lies strictly outside the [extr[0], extr[1]] interval
    some_outside = np.any((samp < extr[0]) | (extr[1] < samp))
    if reflect is None:
        assert some_outside, "There should be some samples outside!"
    else:
        assert (not some_outside), "There should not be any samples outside!"
        ks, pv = sp.stats.ks_2samp(data, samp)
        assert (pv > 0.4), "New sample is inconsistent!"

# Legend handles taken from the last panel drawn
lines = [l1, l0[0], l3[0]]
names = ['KDE', 'Data', 'Samples']
plt.legend(lines, names)
nbshow()

# fname = 'kde_1d_reflect.png'
# fname = os.path.abspath(fname)
# fig.savefig(fname)
# print("Saved to '{}'".format(fname))

### 2D

In [None]:
# 2D resampling with reflecting boundaries: one column of panels per reflection setting,
# original data on the top row and resampled points on the bottom row, each with the
# KDE PDF contours drawn on top.
seed = np.random.randint(int(1e4))
seed = 8067
# print(seed)
np.random.seed(seed)
NUM = 1000

# 2D dataset: uniform `xx`; normal `yy` truncated at 2.0 and refilled to `NUM` values
xx = np.random.uniform(0.0, 2.0, NUM)
yy = np.random.normal(1.0, 1.5, NUM)
yy = yy[yy < 2.0]
yy = np.concatenate([yy, np.random.choice(yy, NUM-yy.size)])
data = [xx, yy]

# Per-dimension grid edges, grid midpoints, and grid widths
# edges = [kale.utils.spacing(aa, 'lin', 30) for aa in [xx, yy]]
# egrid = [kale.utils.spacing(ee, 'lin', 100, stretch=0.5) for ee in edges]
egrid = [kale.utils.spacing(aa, 'lin', 60, stretch=0.5) for aa in data]
cgrid = [kale.utils.midpoints(ee, 'lin') for ee in egrid]
width = [np.diff(ee) for ee in egrid]

xc, yc = np.meshgrid(*cgrid, indexing='ij')
grid = np.vstack([xc.ravel(), yc.ravel()])

fig, axes = plt.subplots(figsize=[10, 6], ncols=4, nrows=2, sharex=True, sharey=True)
plt.subplots_adjust(hspace=0.05, wspace=0.05, left=0.08, right=0.98, bottom=0.08, top=0.92)

# Only the shape of `hist` is used, to fold the flattened PDF back onto the grid
hist, *_ = np.histogram2d(*data, bins=egrid, density=True)

levels = [0.05, 0.1, 0.15, 0.2, 0.25]

kde = kale.KDE(data)

reflections = [
    [[0.0, 2.0], [None, 2.0]],
    [[0.0, 2.0], None],
    [None, [None, 2.0]],
    None
]
kw = dict(edgecolor='0.5', alpha=0.1)
fs = 14
# Columns run through the reflection settings in reverse order (no reflection first)
for ii, (axcol, reflect) in enumerate(zip(axes.T, reversed(reflections))):
    title = "Reflection:\n" + str(reflect)
    pdf_1d = kde.pdf(grid, reflect=reflect)[1]
    pdf = pdf_1d.reshape(hist.shape)
    uu, vv = kde.resample(reflect=reflect)
    for jj, ax in enumerate(axcol):
        top_row = (jj == 0)
        # Only the left-most column gets row labels
        ylab = None
        if ii == 0:
            ylab = 'Data' if top_row else 'Samples'

        if top_row:
            ax.scatter(xx, yy, facecolor='b', **kw)
            ax.set_title(title)
        else:
            ax.scatter(uu, vv, facecolor='r', **kw)

        ax.contour(*cgrid, pdf.T, zorder=10, levels=levels)
        ax.set(xlim=[-0.5, 2.5], ylim=[-3, 2.8])
        ax.set_ylabel(ylab, size=fs)

nbshow()

# fname = 'kde_2d_reflect.png'
# fname = os.path.abspath(fname)
# fig.savefig(fname)
# print("Saved to '{}'".format(fname))

# Particular Axes/Dimensions

Construct Data

In [None]:
# Build a 3-parameter test dataset: correlated normal draws, rescaled point-by-point,
# with uniform noise appended.  No seed is set here, so the values depend on the
# state of the global NumPy RNG when the cell runs.
NUM = 3000
# bandwidth = 0.2

sigma = [1.0, 0.2, 1.5]
# NOTE(review): the [0, 0] entry is 1.4 rather than 1.0, so this is not a unit-diagonal
#               correlation matrix -- confirm this is intended.
corr = [
    [+1.4, +0.8, +0.4],
    [+0.8, +1.0, -0.25],
    [+0.4, -0.25, +1.0]
]

# NOTE: `s2` is not used anywhere else in this cell
s2 = np.square(sigma)

# Covariance: each `corr` entry scaled by the two corresponding `sigma` values
cov = np.zeros_like(corr)
for (ii, jj), cc in np.ndenumerate(corr):
    cov[ii, jj] = cc * sigma[ii] * sigma[jj]

# Transposed so that `data` has one row per parameter
data = np.random.multivariate_normal(np.zeros_like(sigma), cov, NUM).T
# Scale every point (all parameters) by the square-root of a shifted/scaled copy of
# the second parameter's values.
dd = data[1, :]
# NOTE(review): this divides by the maximum of the un-shifted values, not by the range,
#               so the result is not generally confined to [0, 1] -- confirm intent.
dd = (dd - dd.min())/dd.max()
data *= np.sqrt(dd)[np.newaxis, :]

# Append `NUM//5` uniform-noise points spanning each parameter's extrema
# (with `pc = 0` the percentiles are the 0th and 100th, i.e. the min and max)
pc = 0
extr = [np.percentile(dd, [0+pc, 100-pc]) for dd in data]
noise = [np.random.uniform(*ex, NUM//5) for ex in extr]
data = np.append(data, noise, axis=1)

Plot Data and KDE Contours

In [None]:
# Corner-style figure of `data`: 1D marginal PDFs on the diagonal, 2D marginal PDF
# contours with the scattered data below the diagonal; panels above it are hidden.
NEDGE = 100
FIGSIZE = 8

# One array of evaluation points per parameter; parameter `ii` gets NEDGE+1+ii points
edges = [
    np.linspace(xx.min(), xx.max(), NEDGE+1+ii)
    for ii, xx in enumerate(data)
]
kde = kale.KDE(data)
npars = len(edges)

fig, axes = plt.subplots(figsize=[FIGSIZE, FIGSIZE], ncols=npars, nrows=npars)
for (ii, jj), ax in np.ndenumerate(axes):
    # Upper-triangle panels are not drawn
    if jj > ii:
        ax.set_visible(False)
        continue

    xe = edges[jj]
    dx = data[jj]

    if ii != jj:
        # Off-diagonal: 2D PDF over parameters (jj, ii), evaluated on a grid
        ye = edges[ii]
        dy = data[ii]
        pdf = kde.pdf([xe, ye], params=[jj, ii], grid=True)[1]
        xx, yy = np.meshgrid(xe, ye, indexing='ij')

        # ax.pcolormesh(xx, yy, pdf, cmap='Blues_r')
        ax.contour(xx, yy, pdf, cmap='Blues', alpha=0.8, zorder=4)
        ax.scatter(dx, dy, color='r', alpha=0.03, s=4, zorder=10)
        continue

    # Diagonal: 1D PDF of parameter `jj` taken from the full KDE ...
    pdf = kde.pdf(xe, params=[jj])[1]
    ax.plot(xe, pdf, lw=2.0, alpha=0.7)

    # ... compared with a stand-alone 1D KDE using the matching bandwidth element
    bw = kde.kernel.bandwidth[jj, jj]
    kde_1d = kale.KDE(data[jj], bandwidth=bw)
    ax.plot(*kde_1d.pdf(xe), 'r--', alpha=0.5)

    kale.plot.carpet(dx, ax=ax, color='r')

nbshow()

# Test

In [None]:
# data = kale.utils._random_data_3d_02()


# reflect = [None, [-0.15, None], None]

# kde = kale.KDE(data, reflect=reflect)

# ndim = len(data)
# print(ndim)

# fig, axes = plt.subplots(figsize=[10, 8], nrows=ndim, ncols=ndim)
# extrema = [None, None, None]
# for (ii, jj), ax in np.ndenumerate(axes):
#     if jj > ii:
#         continue
        
#     if jj == ii:
#         xx, yy = kde._density(params=ii)
#         ax.plot(xx, yy, 'k-', alpha=0.5)
#         extrema[ii] = kale.utils.minmax(xx, stretch=0.2)
        
#         # zz = kde.density(xx, params=ii)
#         # ax.plot(xx, yy, 'b--', alpha=0.5, lw=2.0)

#         bw = kde.kernel.bandwidth[jj, jj]
#         test = kale.KDE(data[ii], reflect=reflect[ii], bandwidth=bw)
#         _, zz = test.density(xx)
#         ax.plot(xx, zz, 'r--', alpha=0.5, lw=2.0)
#     else:
#         pars = (jj, ii)
#         grid, yy = kde._density(params=pars)
#         cnt = ax.contour(*grid, yy.T, alpha=0.5, colors='k')
#         # print(cnt.levels)

#         bw = kde.kernel.bandwidth[np.ix_(pars, pars)]
#         test = kale.KDE([data[jj], data[ii]], reflect=[reflect[jj], reflect[ii]], bandwidth=bw)
#         grid, zz = test.density()
#         print([np.shape(uu) for uu in grid], np.shape(zz))
#         ax.contour(*grid, zz.T, linestyles='--', color='r', alpha=0.5, lw=2.0)
        

# for (ii, jj), ax in np.ndenumerate(axes):
#     if jj > ii:
#         continue
#     if ii == jj:
#         ax.set_xlim(extrema[jj])
#     else:
#         ax.set(xlim=extrema[jj], ylim=extrema[ii])

    
        
# #         grid, yy = kde.density(params=pars)
# #         cnt = ax.contour(*grid, yy.T, linestyles='--', alpha=0.5, linewidths=2.0, colors='b', levels=cnt.levels)

# plt.show()

# Alternative Constructor

Construct a KDE from a histogram instead.

## 1D

In [None]:
# Create input histograms and output kdes
# Build KDEs from unit-normal "histograms" of increasing resolution, and evaluate each
# one on a common fine grid.
ns = 10**np.arange(2, 5)
xx_all = np.linspace(-5, 5, ns.max() + 1)

xxs = []
pdf_kdes = []
for n in ns:
    print(n)
    # Bin centers and the unit-normal PDF evaluated at them
    xx = np.linspace(-5, 5, n + 1)
    pdf = np.exp(-0.5 * np.square(xx)) / np.sqrt(2*np.pi)
    dx = xx[1] - xx[0]
    # Bin edges: half a step below each center, plus half a step above the last one
    bins = np.append(xx - 0.5 * dx, xx[-1] + 0.5 * dx)

    # Construct a KDE from the histogram
    kde = kale.KDE.from_hist(bins, pdf)

    # Evaluate the KDE on the common grid
    xx_all, pdf_kde = kde.density(xx_all, probability=True)

    xxs.append(xx)
    pdf_kdes.append(pdf_kde)

In [None]:
# Plot the true unit-normal PDF against the KDEs constructed from each histogram.
fig = plt.figure( figsize=(8,6), facecolor='w' )
ax = plt.gca()

# Evaluate the true PDF on the plotting grid itself, instead of relying on the `pdf`
# left over from the last pass of the construction loop happening to have the same length.
pdf = np.exp(-xx_all*xx_all/2) / np.sqrt(2*np.pi)

ax.plot(
    xx_all,
    pdf,
    color = 'k',
    linewidth = 15,
    zorder = -10,
    label = 'true',
)

for i, pdf_kde in enumerate( pdf_kdes ):

    ax.plot(
        xx_all,
        pdf_kde,
        # color and line width both track the resolution of the input histogram
        color = plt.get_cmap( 'viridis' )( np.log10( ns[i] ) / np.log10( ns.max() ) ),
        label = 'dx = {:.3g}'.format( xxs[i][2] - xxs[i][1] ),
        zorder = 10 - i,
        linewidth = 3 + i*2,
    )

ax.legend(
    prop = { 'size': 16, },
)

## 2D

In [None]:
# Create input histograms and output kdes
# Build a KDE from a 2D histogram of a correlated Gaussian, and evaluate it at the
# histogram's own grid points.
xx = np.linspace(-5, 5, 101)
yy = np.linspace(-5, 5, 101)
xx_grid, yy_grid = np.meshgrid( xx, yy )
hist = np.exp(-xx_grid*xx_grid/2 - yy_grid*yy_grid/2 - xx_grid*yy_grid/2)

# The grid spacing must be defined before it is used to normalize `pdf`; otherwise a
# stale `dx` from an earlier cell (a different resolution) is silently picked up.
dx = xx[1] - xx[0]
pdf = hist / ( hist.sum() * dx * dx )

# Bin edges: half a step below each center, plus half a step above the last one;
# the same edges are used for both dimensions.
bins = np.zeros(xx.size + 1)
bins[:-1] = xx - 0.5 * dx
bins[-1] = xx[-1] + 0.5 * dx
bins = np.array([ bins, ]*2)

# Construct a KDE from the histogram
kde = kale.KDE.from_hist(bins, hist)

# Evaluate the KDE pdf at the flattened grid points, for comparison with the true pdf
points = [ xx_grid.flatten(), yy_grid.flatten() ]
points, pdf_kde = kde.density( points, probability=True)

In [None]:
# Overlay contours of the true 2D PDF (thick black) and of the histogram-built KDE (red).
fig = plt.figure( figsize=(8,8), facecolor='w' )
ax = plt.gca()

# NOTE(review): `labels` does not look like a recognized `contour` keyword (matplotlib is
#               expected to report it as unused) -- confirm, and drop it if so.  The legend
#               entry is actually set through `set_label` below.
cs = ax.contour(
    xx_grid,
    yy_grid,
    pdf,
    colors = 'k',
    linewidths = 10,
    labels = 'true',
)
# Label only the first contour level so the legend shows a single 'true' entry.
# NOTE(review): `ContourSet.collections` is deprecated in recent matplotlib releases --
#               verify against the matplotlib version in use.
cs.collections[0].set_label( 'true' )

# `pdf_kde` was evaluated on flattened points, so fold it back onto the grid shape
cs_kde = ax.contour(
    xx_grid,
    yy_grid,
    pdf_kde.reshape( xx_grid.shape ),
    linewidths = 3,
    colors = 'r',
)
cs_kde.collections[0].set_label( 'KDE' )

ax.set_aspect( 'equal' )

ax.legend(
    prop = { 'size': 16, },
)

# Other Kernels

In [None]:
# Repeat the 1D reflection test with a Triweight kernel: plot data, PDF and resampled
# values with and without reflection; check boundaries are respected and the PDF is unitary.
np.random.seed(12345)
NUM = 200

extr = [0.0, 2.0]
data = np.random.uniform(*extr, NUM)

# Histogram bins across `extr`, extended by four extra bins on either side
edges = kale.utils.spacing(extr, 'lin', 14)
steps = np.arange(1, 5)
below = edges[0] - steps[::-1]*np.diff(edges)[0]
above = edges[-1] + steps*np.diff(edges)[-1]
edges = np.concatenate([below, edges, above])

# Evaluation grid: edges, midpoints and spacings
egrid = kale.utils.spacing(data, 'lin', 100, stretch=0.5)
cgrid = kale.utils.midpoints(egrid, 'lin')
delta = np.diff(egrid)

fig, axes = plt.subplots(figsize=[10, 4], ncols=2, sharex=True, sharey=True)
plt.subplots_adjust(hspace=0.05, wspace=0.05, left=0.08, right=0.98, bottom=0.08, top=0.92)

kde = kale.KDE(data, kernel=kale.kernels.Triweight, bandwidth=0.5)

reflections = [None, extr]
titles = ['No Reflection', 'Reflection']
for ii, (reflect, ax) in enumerate(zip(reflections, axes)):
    print(ii, reflect)
    ax.grid(alpha=0.2)
    ax.set_title(titles[ii])

    # Original data: histogram, PDF curve, and a carpet of the data points
    *_, l0 = ax.hist(data, bins=edges, alpha=0.75, color='0.3', edgecolor='k', density=True)
    pdf = kde.pdf(cgrid, reflect=reflect)[1]
    l1, = ax.plot(cgrid, pdf, color='r', ls='-', lw=2.0, alpha=0.8)
#     for dd in data:
#         l2, = ax.plot([dd, dd], [0.0, -0.03], color='0.5', alpha=0.5, lw=0.5)
    kale.plot.carpet(data, ax=ax, color='0.5', yave=-0.03, ystd=0.01)

    # Midpoint-rule integral of the PDF over the grid (checked at the end of the pass)
    tot = np.sum(pdf*delta)

    # Resampled values: histogram and carpet
    samp = kde.resample(NUM, reflect=reflect)
    *_, l3 = ax.hist(samp, bins=edges, alpha=0.25, color='r', edgecolor='b', density=True)
#     for ss in samp:
#         l4, = ax.plot([ss, ss], [-0.03, -0.06], color='r', alpha=0.5, lw=0.5)
    kale.plot.carpet(samp, ax=ax, color='r', yave=-0.06, ystd=0.01)

    some_outside = np.any((samp < extr[0]) | (extr[1] < samp))
    if reflect is not None:
        assert (not some_outside), "There should not be any samples outside!"
    else:
        assert some_outside, "There should be some samples outside!"

    assert np.isclose(tot, 1.0, rtol=2e-2), "Unitarity violated!  Prob tot: {:.4e}".format(tot)

# Legend handles taken from the last panel drawn
lines = [l1, l0[0], l3[0]]
names = ['KDE', 'Data', 'Samples']
plt.legend(lines, names)
nbshow()

In [None]:
# Repeat the 2D reflection test with a 'box' kernel: for each reflection setting plot
# data / samples with PDF contours, and check the PDF is positive inside the boundaries,
# zero outside them, and integrates to ~1.
seed = 8067
# print(seed)
np.random.seed(seed)
NUM = 300
xx = np.random.uniform(0.0, 2.0, NUM)
yy = np.random.normal(1.0, 1.5, NUM)
# Truncate `yy` at 2.0, then refill to `NUM` values by drawing from the surviving ones
yy = yy[yy < 2.0]
yy = np.concatenate([yy, np.random.choice(yy, NUM-yy.size)])

data = [xx, yy]
# Per-dimension: histogram edges, evaluation-grid edges, grid midpoints, and grid widths
edges = [kale.utils.spacing(aa, 'lin', 20) for aa in [xx, yy]]
egrid = [kale.utils.spacing(ee, 'lin', 50, stretch=0.5) for ee in edges]
cgrid = [kale.utils.midpoints(ee, 'lin') for ee in egrid]
width = [np.diff(ee) for ee in egrid]
# Grid-cell areas (independent of the reflection setting)
area = width[0][:, np.newaxis] * width[1][np.newaxis, :]

xc, yc = np.meshgrid(*cgrid, indexing='ij')

grid = np.vstack([xc.ravel(), yc.ravel()])

fig, axes = plt.subplots(figsize=[10, 6], ncols=4, nrows=2, sharex=True, sharey=True)
plt.subplots_adjust(hspace=0.05, wspace=0.05, left=0.08, right=0.98, bottom=0.08, top=0.92)

# Only the shape of `hist` is used, to fold the flattened PDF back onto the grid
hist, *_ = np.histogram2d(*data, bins=egrid, density=True)

levels = [0.05, 0.1, 0.15, 0.2, 0.25]

kde = kale.KDE(data, kernel='box')
# With a kernel flagged `FINITE` only some interior points are required to be positive;
# otherwise every interior point must be.
inside_test_func = np.all if not kde.kernel.FINITE else np.any

reflections = [
    [[0.0, 2.0], [None, 2.0]],
    [[0.0, 2.0], None],
    [None, [None, 2.0]],
    None
]
kw = dict(edgecolor='0.5', alpha=0.1)
fs = 14
for ii, (axcol, reflect) in enumerate(zip(axes.T, reflections[::-1])):
    title = "Reflection:\n" + str(reflect)
    ylab = None
    pdf_1d = kde.pdf(grid, reflect=reflect)[1]
    pdf = pdf_1d.reshape(hist.shape)
    uu, vv = kde.resample(reflect=reflect)
    for jj, ax in enumerate(axcol):
        if jj == 0:
            if ii == 0:
                ylab = 'Data'
            ax.scatter(xx, yy, facecolor='b', **kw)
            ax.set_title(title)
        else:
            if ii == 0:
                ylab = 'Samples'
            ax.scatter(uu, vv, facecolor='r', **kw)

        ax.contour(*cgrid, pdf.T, zorder=10, levels=levels)
        ax.set(xlim=[-0.5, 2.5], ylim=[-3, 2.8])
        ax.set_ylabel(ylab, size=fs)

    # Masks of grid points strictly inside every boundary (`inside`), and strictly
    # outside the boundaries in every dimension (`outside`).
    inside = np.ones_like(pdf_1d, dtype=bool)
    if reflect is None:
        outside = np.zeros_like(pdf_1d, dtype=bool)
    else:
        outside = np.ones_like(pdf_1d, dtype=bool)
        # `dim` (not `ii`) so the column index is not clobbered; the bounds are unpacked
        # into locals so the lists stored in `reflections` are left unmodified.
        for dim, ref in enumerate(reflect):
            lo, hi = (None, None) if ref is None else ref
            lo = -np.inf if lo is None else lo
            hi = np.inf if hi is None else hi
            inside = inside & (lo < grid[dim]) & (grid[dim] < hi)
            outside = outside & ((grid[dim] < lo) | (hi < grid[dim]))

    print("reflect: {}, inside: {}".format(reflect, kale.utils.stats_str(pdf_1d[inside])))
    assert inside_test_func(pdf_1d[inside] > 0.0), "PDF is not positive inside the boundaries!"
    assert np.allclose(pdf_1d[outside], 0.0), "Outside has non-zero values!"

    prob_tot = np.sum(pdf * area)
    print(ii, reflect, "prob_tot = {:.4e}".format(prob_tot))
    print("\t", kale.utils.stats_str(pdf))
    assert np.isclose(prob_tot, 1.0, rtol=3e-2), "`prob_tot` = '{}' is not unity!".format(prob_tot)


nbshow()

# fname = 'kde_2d_reflect.png'
# fname = os.path.abspath(fname)
# fig.savefig(fname)
# print("Saved to '{}'".format(fname))

# Derived Statistics

## CDF - Cumulative Distribution Function

### 1D

In [None]:
def check_1d_samples_cdf(samples, xx, tru_cdf, tru_pdf=None):
    """Compare the CDF of a KDE built from `samples` with a known CDF, and plot the result.

    The fractional error of the KDE CDF is compared with an "expected" error derived from
    Poisson statistics; the excess is integrated over `xx` and normalized by the span of
    `xx` to give a single total-error measure.

    Parameters
    ----------
    samples : array_like
        Sample values from which the `kale.KDE` is constructed.
    xx : array_like
        Test points at which the CDF and PDF are evaluated.  The callers in this notebook
        pass them sorted in ascending order.
    tru_cdf : array_like
        True CDF evaluated at `xx`.
    tru_pdf : array_like or None
        True PDF evaluated at `xx`.  If given it is plotted, and its maximum is used to
        scale the plotted PDFs; otherwise the maximum of the KDE PDF is used.

    Returns
    -------
    fig : matplotlib figure
        Left panel: (scaled) PDFs and CDFs.  Right panel (log-scale): error measures.

    Raises
    ------
    ValueError
        If the total integrated excess error is larger than 0.2.

    """
    xx = np.asarray(xx)
    # Construct KDE and calculate CDF
    print("samples = ", np.shape(samples), "xx = ", np.shape(xx))
    kde = kale.KDE(samples)
    kde_cdf = kde.cdf(xx)
    # Calculate error relative to true CDF
    kde_err = np.fabs(1 - kde_cdf/tru_cdf)

    # Create and Setup Figure/Axes
    fig, axes = plt.subplots(figsize=[12, 6], ncols=2)
    # tw = ax.twinx()
    ax = axes[0]
    tw = axes[1]
    tw.set_yscale('log')

    kde_pdf = kde.pdf(xx)[1]
    if tru_pdf is not None:
        norm = tru_pdf.max()
        ax.plot(xx, tru_pdf / norm, 'k--', alpha=0.5)
    else:
        norm = kde_pdf.max()

    ax.plot(xx, kde_pdf / norm, 'b--', alpha=0.5)

    ax.plot(xx, tru_cdf, 'k-', lw=3.0, alpha=0.6)
    ax.plot(xx, kde_cdf, 'b-', lw=2.0, alpha=0.7)
    tw.plot(xx, kde_err, 'r-', lw=3.0, alpha=0.7, label='kde error')

    # Calculate an "expected" error based on Poisson statistics
    zz = np.sort(samples)
    cnt = np.arange(1, zz.size+1)
    # Poisson error up to each sample-point
    err = 1 / np.sqrt(cnt)
    # Interpolate to test-points, and scale by number of bins up to each bin
    err_samp = np.interp(xx, zz, err) / np.sqrt(np.arange(1, len(xx)+1))
    # Calculate a fractional, integrated error measure
    # This is the excess of KDE error above expected
    int_err = np.clip(kde_err - err_samp, 0.0, None)
    # Integrate over `xx` and divide by its span.  This ratio is unchanged by any shift or
    # rescaling of `xx`, so no distribution parameters (previously the notebook-level
    # globals `ave` and `std`) are needed here.
    int_err = kale.utils.cumtrapz(int_err, xx) / (xx.max() - xx.min())
    tot_err = int_err[-1]
    print("tot err  = ", tot_err)

    tw.plot(xx, err_samp, 'r--', lw=3.0, alpha=0.5, label='expected error')
    tw.fill_between(xx, 0.0, err_samp, color='r', alpha=0.1)
    tw.plot(xx, int_err, 'r-.', label='Frac Int Err')

    tw.legend()  # loc='center left', title='Right Axes')

    # warnings.warn("Error {:.2e} is large".format(tot_err))   # (for `tot_err > 0.1`)
    if tot_err > 0.2:
        raise ValueError("Error {:.2e} is unexceptably large!".format(tot_err))

    return fig

### 1D Normal Distribution

In [None]:
# Check the KDE CDF of normally distributed samples against the analytic normal CDF.
NUM_POINTS = 3e3
NUM_PDF = 1e2
np.random.seed(1234)

# Construct random values
# (two draws, log-uniform between 0.1 and 10, used as the mean and standard deviation)
ave, std = 10**np.random.uniform(-1, 1, 2)
print("Normal Distribution: {:.2e} ± {:.2e}".format(ave, std))
# Choose random samples
samples = np.random.normal(ave, std, size=int(NUM_POINTS))
# Choose test-points (x-axis values)
# (uniformly random between the sample extrema, sorted ascending)
xx = sorted(np.random.uniform(samples.min(), samples.max(), int(NUM_PDF)))
# Find the "true" CDF
tru_cdf = sp.stats.norm.cdf(xx, ave, std)
tru_pdf = sp.stats.norm.pdf(xx, ave, std)

# Perform Test
fig = check_1d_samples_cdf(samples, xx, tru_cdf, tru_pdf)
nbshow()

### 1D Uniform Distribution

In [None]:
# Check the KDE CDF of uniformly distributed samples against the analytic uniform CDF.
NUM_POINTS = 3e3
NUM_PDF = 1e2
np.random.seed(4321234)

# Construct random values
# (two draws, log-uniform between 0.1 and 10; here `ave` is used as the lower edge of
#  the uniform distribution and `std` as its width)
ave, std = 10**np.random.uniform(-1, 1, 2)
print("Uniform Distribution: {:.2e} ± {:.2e}".format(ave, std))
# Choose random samples
samples = np.random.uniform(ave, ave+std, size=int(NUM_POINTS))
# Choose test-points (x-axis values)
# (uniformly random between the sample extrema, sorted ascending)
xx = sorted(np.random.uniform(samples.min(), samples.max(), int(NUM_PDF)))
# Find the "true" CDF
tru_cdf = sp.stats.uniform.cdf(xx, ave, std)
tru_pdf = sp.stats.uniform.pdf(xx, ave, std)

# Perform Test
fig = check_1d_samples_cdf(samples, xx, tru_cdf, tru_pdf)
nbshow()

### 1D Composite

In [None]:
# Check the KDE CDF of a half-normal / half-lognormal mixture against the mixture's CDF.
NUM_POINTS = 3e3
NUM_PDF = 1e2
np.random.seed(4321234)

# Construct random values
# ave, std = 10**np.random.uniform(-1, 1, 2)
ave = 3.63e-01
std = 2.35e-01
print("Distribution params: {:.2e} ± {:.2e}".format(ave, std))

# Choose random samples
# (split the points between the two components)
n1 = int(NUM_POINTS*0.5)
n2 = int(NUM_POINTS - n1)
a1 = np.random.normal(ave, std, n1)
# a2 = np.random.lognormal(ave, std, size=n2)
# Draw the log-normal component by passing uniform deviates through the inverse CDF.
# NOTE(review): the positional arguments after the first appear to map onto scipy's
#               lognorm shape and `loc` parameters (not a log-mean and log-sigma) -- confirm
#               this is intended.  The same arguments are used for the CDF/PDF below.
a2 = sp.stats.lognorm.ppf(np.random.uniform(0, 1, n2), ave, std)
samples = np.concatenate([a1, a2])

# edges, pdf = kale.pdf(aa)
# plt.plot(edges, pdf)

# Choose test-points (x-axis values)
xx = sorted(np.random.uniform(samples.min(), samples.max(), int(NUM_PDF)))
# Find the "true" CDF
# (each component weighted by its share of the samples)
c1 = sp.stats.norm.cdf(xx, ave, std) * n1 / (n1 + n2)
c2 = sp.stats.lognorm.cdf(xx, ave, std) * n2 / (n1 + n2)
tru_cdf = c1 + c2

# The "true" PDF, with the same component weights
p1 = sp.stats.norm.pdf(xx, ave, std) * n1 / (n1 + n2)
p2 = sp.stats.lognorm.pdf(xx, ave, std) * n2 / (n1 + n2)
tru_pdf = p1 + p2

# Perform Test
fig = check_1d_samples_cdf(samples, xx, tru_cdf, tru_pdf)
nbshow()

### 2D

In [None]:
def check_2d_samples_cdf(samples, edges, tru_cdf, tru_pdf=None):
    """Compare the CDF of a KDE built from `samples` with a known CDF, and plot the result.

    NOTE: apart from taking its test points from `edges`, the body mirrors the 1D check
    (`np.sort` / `np.interp` over the samples and line plots against the test points), so
    it only makes sense for one-dimensional `samples` and `edges`.  A genuinely
    two-dimensional comparison is not implemented here.

    Parameters
    ----------
    samples : array_like
        Sample values from which the `kale.KDE` is constructed.
    edges : array_like
        Test points at which the CDF and PDF are evaluated.
    tru_cdf : array_like
        True CDF evaluated at `edges`.
    tru_pdf : array_like or None
        True PDF evaluated at `edges`.  If given it is plotted, and its maximum is used to
        scale the plotted PDFs; otherwise the maximum of the KDE PDF is used.

    Returns
    -------
    fig : matplotlib figure
        Left panel: (scaled) PDFs and CDFs.  Right panel (log-scale): error measures.

    Raises
    ------
    ValueError
        If the total integrated excess error is larger than 0.2.  A warning is issued if
        it is larger than 0.1.

    """
    # The test points come from the `edges` argument (there is no `xx` parameter)
    xx = np.asarray(edges)
    # Construct KDE and calculate CDF
    kde = kale.KDE(samples)
    kde_cdf = kde.cdf(xx)
    # Calculate error relative to true CDF
    kde_err = np.fabs(1 - kde_cdf/tru_cdf)

    # Create and Setup Figure/Axes
    fig, axes = plt.subplots(figsize=[12, 6], ncols=2)
    # tw = ax.twinx()
    ax = axes[0]
    tw = axes[1]
    tw.set_yscale('log')

    kde_pdf = kde.pdf(xx)[1]
    if tru_pdf is not None:
        norm = tru_pdf.max()
        ax.plot(xx, tru_pdf / norm, 'k--', alpha=0.5)
    else:
        norm = kde_pdf.max()

    ax.plot(xx, kde_pdf / norm, 'b--', alpha=0.5)

    ax.plot(xx, tru_cdf, 'k-', lw=3.0, alpha=0.6)
    ax.plot(xx, kde_cdf, 'b-', lw=2.0, alpha=0.7)
    tw.plot(xx, kde_err, 'r-', lw=3.0, alpha=0.7, label='kde error')

    # Calculate an "expected" error based on Poisson statistics
    zz = np.sort(samples)
    cnt = np.arange(1, zz.size+1)
    # Poisson error up to each sample-point
    err = 1 / np.sqrt(cnt)
    # Interpolate to test-points, and scale by number of bins up to each bin
    err_samp = np.interp(xx, zz, err) / np.sqrt(np.arange(1, len(xx)+1))
    # Calculate a fractional, integrated error measure
    # This is the excess of KDE error above expected
    int_err = np.clip(kde_err - err_samp, 0.0, None)
    # Integrate over `xx` and divide by its span.  This ratio is unchanged by any shift or
    # rescaling of `xx`, so no distribution parameters (previously the notebook-level
    # globals `ave` and `std`) are needed here.
    int_err = kale.utils.cumtrapz(int_err, xx) / (xx.max() - xx.min())
    tot_err = int_err[-1]
    print("tot err  = ", tot_err)

    tw.plot(xx, err_samp, 'r--', lw=3.0, alpha=0.5, label='expected error')
    tw.fill_between(xx, 0.0, err_samp, color='r', alpha=0.1)
    tw.plot(xx, int_err, 'r-.', label='Frac Int Err')

    tw.legend()  # loc='center left', title='Right Axes')

    if tot_err > 0.1:
        warnings.warn("Error {:.2e} is large".format(tot_err))
        if tot_err > 0.2:
            raise ValueError("Error {:.2e} is unexceptably large!".format(tot_err))

    return fig

In [None]:
# NUM_PTS = 1e3
# NUM_EDGES = 40
# np.random.seed(1234)

# aves = np.random.uniform(-10, 10, 2)
# stds = np.random.uniform(0.0, 3.0, 2)
# cov = np.random.uniform(-1.0, 1.0, (2,2))
# cov[np.arange(2), np.arange(2)] = stds
# cov[1, 0] = cov[0, 1]

# # print(aves)
# # print(cov)

# dist = sp.stats.multivariate_normal(aves, cov)
# samples = dist.rvs(int(NUM_PTS)).T
# # print(samples.shape)
# edges = [[xx.min(), xx.max()] for xx in samples]
# edges = [np.linspace(*ex, int(NUM_EDGES)) for ex in edges]
# edges = np.array(edges)

# xx, yy = np.meshgrid(*edges, indexing='ij')
# grid = np.moveaxis([xx, yy], 0, -1)

# pdf = dist.pdf(grid)
# cdf = dist.cdf(grid)
# # print("pdf = ", kale.utils.stats_str(pdf))
# # print("cdf = ", kale.utils.stats_str(cdf))

# pdf_levels = [1e-4, 1e-3, 1e-2, 0.02, 0.05, 0.1]
# cdf_levels = [1e-3, 1e-2, 1e-1, 0.2, 0.5, 0.9, 0.95]

# kde = kale.KDE(samples)

# kde_pdf = kde.pdf(edges, grid=True)[1]
# kde_cdf = kde.cdf_grid(edges)

# pdf_rat = kde_pdf / pdf
# cdf_rat = kde_cdf / cdf

# ratios = np.array([pdf_rat, cdf_rat])
# idx = (ratios != 0.0)
# rextr = [np.min(ratios[idx]), np.max(ratios[idx])]
# # print("Ratio extrema = ", rextr)

# pdf_rat = np.log10(pdf_rat)
# cdf_rat = np.log10(cdf_rat)
# rextr = [-1.0, 1.0]


# fig, axes = plt.subplots(figsize=[15, 5], ncols=4, sharex=True, sharey=True)
# plt.subplots_adjust(bottom=0.05, left=0.03, right=0.9, top=0.95, wspace=0.02)
# cbax = fig.add_axes([0.92, 0.1, 0.03, 0.8])

# for ax in axes[:2]:
#     ax.scatter(*samples, alpha=0.1, s=10, color='0.5')

# ax = axes[0]
# ax.contour(xx, yy, pdf, colors='r', levels=pdf_levels)  # cmap='Reds', 
# ax.contour(xx, yy, cdf, colors='b', levels=cdf_levels)  # cmap='Blues', 

# ax = axes[1]
# ax.contour(xx, yy, kde_pdf, colors='r', levels=pdf_levels)  # cmap='Reds', 
# ax.contour(xx, yy, kde_cdf, colors='b', levels=cdf_levels)  # cmap='Blues', 

# ax = axes[2]
# smap = kale.plot._get_smap(rextr, cmap='RdBu')

# for ax, rat in zip(axes[2:], ratios):
#     pcm = ax.pcolormesh(xx, yy, pdf_rat, cmap=smap.cmap, norm=smap.norm, shading='auto')

# plt.colorbar(smap, cax=cbax, orientation='vertical')

# ax = axes[0]
# xlim = ax.get_xlim()
# ylim = ax.get_ylim()
# wid = 0.5 * np.max([xx[1] - xx[0] for xx in [xlim, ylim]])
# cen = [0.5*(xx[1] + xx[0]) for xx in [xlim, ylim]]
# lims = [[cc - wid, cc + wid] for cc in cen]
# for ff, ll in zip(['set_xlim', 'set_ylim'], lims):
#     getattr(ax, ff)(ll)

    
# fname = "2d-gaussian_cdf-test.pdf"
# save_fig(fig, fname)
# nbshow()

In [None]:
# Self-contained usage example: build a KDE from normal samples, compare its PDF with
# the true PDF, then resample and compare the new samples with the original data.
import numpy as np
np.random.seed(1234)
data = np.random.normal(0.0, 1.0, 1000)

# Construct `KDE` instance using this data, and the default bandwidth and kernels.

import kalepy as kale
kde = kale.KDE(data)

# Compare original PDF and the data to the reconstructed PDF from the KDE:

xx = np.linspace(-3, 3, 400)
pdf_tru = np.exp(-xx*xx/2) / np.sqrt(2*np.pi)
pdf_kde = kde.pdf(xx)[1]

import matplotlib.pyplot as plt
ll = plt.plot(xx, pdf_tru, 'k--', label='Normal PDF')
# Keep the histogram's bin edges so the resampled values below can reuse them
_, bins, _ = plt.hist(data, bins=14, density=True, \
                          color='0.5', rwidth=0.9, alpha=0.5, label='Data')
ll = plt.plot(xx, pdf_kde, 'r-', label='KDE')
ll = plt.legend()

# Compare the KDE reconstructed PDF to the "true" PDF, make sure the chi-squared is consistent:

# The statistic is the sum of squared *fractional* differences (each difference is divided
# by the true PDF value), averaged over the number of test points minus one.
dof = xx.size - 1
x2 = np.sum(np.square(pdf_kde - pdf_tru)/pdf_tru**2)
x2 = x2 / dof
print(x2 < 0.1)
# True
print("Chi-Squared: {:.1e}".format(x2))
# Chi-Squared: 1.7e-02

# Draw new samples from the data and make sure they are consistent with the original data:

import scipy as sp
samp = kde.resample()
ll = plt.hist(samp, bins=bins, density=True, color='r', alpha=0.5, rwidth=0.5, \
                  label='Samples')
# Two-sample Kolmogorov-Smirnov test between the original data and the new samples
ks, pv = sp.stats.ks_2samp(data, samp)
print(pv > 0.05)
# True
print("p-value: {:.1e}".format(pv))
# p-value: 9.5e-01
