internal/go-moremath/stats/kde.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stats

import (
	"fmt"
	"math"
)

// A KDE is a distribution that estimates the underlying distribution
// of a Sample using kernel density estimation.
//
// Kernel density estimation is a method for constructing an estimate
// ƒ̂(x) of an unknown distribution ƒ(x) given a sample from that
// distribution. Unlike many techniques, kernel density estimation is
// non-parametric: in general, it doesn't assume any particular true
// distribution (note, however, that the resulting distribution
// depends deeply on the selected bandwidth, and many bandwidth
// estimation techniques assume normal reference rules).
//
// A kernel density estimate is similar to a histogram, except that it
// is a smooth probability estimate and does not require choosing a
// bin size and discretizing the data.
//
// Sample is the only required field. All others have reasonable
// defaults.
type KDE struct {
	// Sample is the data sample underlying this KDE.
	Sample Sample

	// Kernel is the kernel to use for the KDE. The zero value
	// selects EpanechnikovKernel.
	Kernel KDEKernel

	// Bandwidth is the bandwidth to use for the KDE.
	//
	// If this is zero, the bandwidth is computed from the
	// provided data using a default bandwidth estimator
	// (currently BandwidthScott). The computed value is stored
	// back into this field the first time the KDE is evaluated.
	Bandwidth float64

	// BoundaryMethod is the boundary correction method to use for
	// the KDE. The default value is BoundaryReflect; however, the
	// default bounds are effectively +/-inf, which is equivalent
	// to performing no boundary correction.
	BoundaryMethod KDEBoundaryMethod

	// [BoundaryMin, BoundaryMax) specify a bounded support for
	// the KDE. If both are 0 (their default values), they are
	// treated as +/-inf.
	//
	// To specify a half-bounded support, set BoundaryMin to
	// math.Inf(-1) or BoundaryMax to math.Inf(1). Boundary
	// correction is enabled as soon as either field is non-zero,
	// so a field left at 0 while the other is set is taken as a
	// literal bound of 0, not as infinity.
	BoundaryMin float64
	BoundaryMax float64
}

// BandwidthSilverman estimates a KDE bandwidth using Silverman's Rule
// of Thumb: 1.06 * σ * n^(-1/5), where σ is data.StdDev() and n is
// data.Weight(). It's fast, but not very robust to outliers as it
// assumes data is approximately normal.
//
// Silverman, B. W. (1986) Density Estimation.
func BandwidthSilverman(data interface {
	StdDev() float64
	Weight() float64
}) float64 {
	scaledSigma := 1.06 * data.StdDev()
	return scaledSigma * math.Pow(data.Weight(), -1.0/5)
}

// BandwidthScott is a bandwidth estimator implementing Scott's Rule.
// This is generally robust to outliers: it chooses the minimum
// between the sample's standard deviation and a robust estimator of
// a Gaussian distribution's standard deviation (IQR/1.349), and
// scales it by 1.06 * n^(-1/5) where n is data.Weight().
//
// If the interquartile range is not positive (for example, when most
// of the sample is a single repeated value) but the standard
// deviation is, the standard deviation is used instead, so that a
// sample that does have some spread is not assigned a zero
// bandwidth.
//
// Scott, D. W. (1992) Multivariate Density Estimation: Theory,
// Practice, and Visualization.
func BandwidthScott(data interface {
	StdDev() float64
	Weight() float64
	Percentile(float64) float64
}) float64 {
	iqr := data.Percentile(0.75) - data.Percentile(0.25)
	hScale := 1.06 * math.Pow(data.Weight(), -1.0/5)
	stdDev := data.StdDev()

	// IQR/1.349 is a robust estimator of the standard deviation
	// of a Gaussian distribution.
	robust := iqr / 1.349

	// Prefer the standard deviation when it is the smaller of the
	// two (Silverman's Rule of Thumb), or when the robust
	// estimate has collapsed to zero while the sample still has
	// spread. A NaN standard deviation fails both comparisons and
	// falls through to the robust estimate.
	if stdDev < robust || (robust <= 0 && stdDev > 0) {
		return hScale * stdDev
	}
	return hScale * robust
}

// TODO(austin) Implement bandwidth estimator from Botev, Grotowski,
// Kroese. (2010) Kernel Density Estimation via Diffusion.

// KDEKernel represents a kernel to use for a KDE. Its zero value is
// EpanechnikovKernel.
type KDEKernel int

//go:generate stringer -type=KDEKernel

const (
	// An EpanechnikovKernel is a smooth kernel with bounded
	// support. As a result, the KDE will also have bounded
	// support. It is "optimal" in the sense that it minimizes the
	// asymptotic mean integrated squared error (AMISE).
	//
	// This is the zero value of KDEKernel, so it is the kernel a
	// KDE uses when its Kernel field is left unset.
	EpanechnikovKernel KDEKernel = iota

	// A GaussianKernel is a Gaussian (normal) kernel.
	GaussianKernel

	// A DeltaKernel is a Dirac delta function. The PDF of such a
	// KDE is not well-defined, but the CDF will represent each
	// sample as an instantaneous increase. This kernel ignores
	// bandwidth and never requires boundary correction.
	DeltaKernel
)

// KDEBoundaryMethod represents a boundary correction method for
// constructing a KDE with bounded support. Its zero value is
// BoundaryReflect.
type KDEBoundaryMethod int

//go:generate stringer -type=KDEBoundaryMethod

const (
	// BoundaryReflect reflects the density estimate at the
	// boundaries. For example, for a KDE with support [0, inf),
	// this is equivalent to ƒ̂ᵣ(x)=ƒ̂(x)+ƒ̂(-x) for x>=0. This is a
	// simple and fast technique, but enforces that ƒ̂ᵣ'(0)=0, so
	// it may not be applicable to all distributions.
	BoundaryReflect KDEBoundaryMethod = iota
)

// kdeKernel is the internal interface through which a KDE evaluates
// its kernel. Both methods take a slice of evaluation points and
// return a slice of results. The KDE passes the offsets
// x - Sample.Xs[i] and pairs the i'th result with the i'th sample
// weight, so an implementation is expected to return exactly one
// value per input, in the same order.
type kdeKernel interface {
	// pdfEach evaluates the kernel's density at each of xs.
	pdfEach(xs []float64) []float64
	// cdfEach evaluates the kernel's cumulative distribution at
	// each of xs.
	cdfEach(xs []float64) []float64
}

// prepare resolves the KDE's defaults and returns the kernel to
// evaluate together with a flag reporting whether boundary
// correction is in effect.
//
// If Bandwidth is zero it is computed with BandwidthScott and stored
// back into kde.Bandwidth, so prepare may modify the receiver.
// Boundary correction is reported as enabled when either BoundaryMin
// or BoundaryMax is non-zero.
//
// prepare panics if kde.Kernel is not one of the known kernels.
func (kde *KDE) prepare() (kdeKernel, bool) {
	// Compute bandwidth.
	if kde.Bandwidth == 0 {
		kde.Bandwidth = BandwidthScott(kde.Sample)
	}

	// Construct kernel.
	var kernel kdeKernel
	switch kde.Kernel {
	case EpanechnikovKernel:
		kernel = epanechnikovKernel{kde.Bandwidth}
	case GaussianKernel:
		kernel = NormalDist{0, kde.Bandwidth}
	case DeltaKernel:
		kernel = DeltaDist{0}
	default:
		// Report the offending kernel value rather than the
		// whole KDE (which would dump the entire sample).
		panic(fmt.Sprintf("unknown kernel %v", kde.Kernel))
	}

	// Use boundary correction?
	bc := kde.BoundaryMin != 0 || kde.BoundaryMax != 0

	return kernel, bc
}

// TODO: For KDEs of histograms, make histograms able to create a
// weighted Sample and simply require the caller to provide a
// good bandwidth from a StreamStats.

// normalizedXs returns the offset of x from every sample point, that
// is, x - kde.Sample.Xs[i] for each i. Evaluating one kernel centered
// at the origin at these offsets is equivalent to evaluating, at x, a
// copy of the kernel shifted to each sample point.
func (kde *KDE) normalizedXs(x float64) []float64 {
	samples := kde.Sample.Xs
	offsets := make([]float64, 0, len(samples))
	for i := range samples {
		offsets = append(offsets, x-samples[i])
	}
	return offsets
}

// PDF returns the estimated probability density at x.
//
// Without boundary correction this is the weighted average, over the
// sample points, of the kernel centered at each sample point and
// evaluated at x. With boundary correction (see BoundaryMin and
// BoundaryMax) the density is 0 outside [BoundaryMin, BoundaryMax),
// and inside it the unbounded estimate is corrected according to
// BoundaryMethod.
//
// PDF panics if Kernel is not a known kernel, or if boundary
// correction is in effect and BoundaryMethod is not a known method.
func (kde *KDE) PDF(x float64) float64 {
	kernel, bc := kde.prepare()

	// Apply boundary
	if bc && (x < kde.BoundaryMin || x >= kde.BoundaryMax) {
		return 0
	}

	// y is the unbounded density estimate.
	y := func(x float64) float64 {
		// Shift kernel to each of kde.xs and evaluate at x
		ys := kernel.pdfEach(kde.normalizedXs(x))

		// Kernel samples are weighted according to the weights of xs
		wys := Sample{Xs: ys, Weights: kde.Sample.Weights}

		return wys.Sum() / wys.Weight()
	}
	if !bc {
		return y(x)
	}
	switch kde.BoundaryMethod {
	default:
		panic("unknown boundary correction method")
	case BoundaryReflect:
		if math.IsInf(kde.BoundaryMax, 1) {
			// Support [BoundaryMin, inf): add the mirror
			// image of x about BoundaryMin.
			return y(x) + y(2*kde.BoundaryMin-x)
		} else if math.IsInf(kde.BoundaryMin, -1) {
			// Support (-inf, BoundaryMax): add the mirror
			// image of x about BoundaryMax.
			return y(x) + y(2*kde.BoundaryMax-x)
		}

		// Both bounds are finite. Reflecting repeatedly at
		// both boundaries maps x to the images x + k*d and
		// (2*BoundaryMin - x) + k*d for every integer k,
		// where d is twice the width of the support. With
		// w = 2*(x - BoundaryMin), the second family is
		// x - w + k*d. This is the derivative of the series
		// CDF sums, so PDF and CDF stay consistent.
		//
		// NOTE(review): series is defined elsewhere in this
		// package; it is assumed to sum f(n) over
		// n = 0, 1, 2, ...
		d := 2 * (kde.BoundaryMax - kde.BoundaryMin)
		w := 2 * (x - kde.BoundaryMin)
		return series(func(n float64) float64 {
			// Images with k = n >= 0.
			return y(x+n*d) + y(x+n*d-w)
		}) + series(func(n float64) float64 {
			// Images with k = -(n+1) < 0.
			return y(x-(n+1)*d) + y(x-(n+1)*d-w)
		})
	}
}

// CDF returns the estimated cumulative probability at x.
//
// Without boundary correction this is the weighted average, over the
// sample points, of the kernel's CDF centered at each sample point
// and evaluated at x. With boundary correction (see BoundaryMin and
// BoundaryMax) the result is 0 below BoundaryMin, 1 at or above
// BoundaryMax, and otherwise the unbounded estimate corrected
// according to BoundaryMethod.
//
// CDF panics if Kernel is not a known kernel, or if boundary
// correction is in effect and BoundaryMethod is not a known method.
func (kde *KDE) CDF(x float64) float64 {
	kernel, bounded := kde.prepare()

	// Outside the support the answer is fixed.
	if bounded {
		switch {
		case x < kde.BoundaryMin:
			return 0
		case x >= kde.BoundaryMax:
			return 1
		}
	}

	// unbounded is the CDF estimate with no boundary correction:
	// the kernel CDF at the offset from each sample point,
	// averaged using the sample weights.
	unbounded := func(at float64) float64 {
		weighted := Sample{
			Xs:      kernel.cdfEach(kde.normalizedXs(at)),
			Weights: kde.Sample.Weights,
		}
		return weighted.Sum() / weighted.Weight()
	}
	if !bounded {
		return unbounded(x)
	}

	if kde.BoundaryMethod != BoundaryReflect {
		panic("unknown boundary correction method")
	}

	lo, hi := kde.BoundaryMin, kde.BoundaryMax
	switch {
	case math.IsInf(hi, 1):
		// Bounded below only: remove the mass at or below the
		// mirror image of x about the lower bound.
		return unbounded(x) - unbounded(2*lo-x)
	case math.IsInf(lo, -1):
		// Bounded above only: add the mass above the mirror
		// image of x about the upper bound.
		return unbounded(x) + (1 - unbounded(2*hi-x))
	}

	// Bounded on both sides: accumulate the mass of every
	// reflected copy of the window ending at x. Each window has
	// width `width` and the copies repeat with period `period`.
	period := 2 * (hi - lo)
	width := 2 * (x - lo)
	upward := series(func(n float64) float64 {
		// Windows >= x-width
		return unbounded(x+n*period) - unbounded(x+n*period-width)
	})
	downward := series(func(n float64) float64 {
		// Windows < x-width
		return unbounded(x-(n+1)*period) - unbounded(x-(n+1)*period-width)
	})
	return upward + downward
}

// Bounds returns an interval [low, high] suitable for sampling or
// plotting the KDE. It locates the points where the CDF reaches 0.005
// and 0.995 (so the interval holds about 99% of the estimated
// probability mass), widens that interval by 10% of its width on each
// side, and, if boundary correction is in effect, clips the result to
// [BoundaryMin, BoundaryMax].
func (kde *KDE) Bounds() (low float64, high float64) {
	_, bounded := kde.prepare()

	// TODO(austin) If this KDE came from a histogram, we'd better
	// not sample at a significantly higher rate than the
	// histogram.  Maybe we want to just return the bounds of the
	// histogram?

	// TODO(austin) It would be nice if this could be instructed
	// to include all original data points, even if they are in
	// the tail.  Probably that should just be up to the caller to
	// pass an axis derived from the bounds of the original data.

	const (
		lowY      = 0.005
		highY     = 0.995
		tolerance = 0.001
	)

	// Seed the search bracket with the extreme samples, making
	// sure it has non-zero width.
	lo, hi := kde.Sample.Bounds()
	if lo == hi {
		lo--
		hi++
	}

	// bisect needs the root to be bracketed, so grow the bracket
	// (by its current width each step) until it spans both target
	// quantiles.  TODO(austin) This can definitely be done
	// faster.
	for kde.CDF(lo) > lowY {
		lo -= hi - lo
	}
	for kde.CDF(hi) < highY {
		hi += hi - lo
	}

	// quantile finds the point in the bracket where the CDF
	// crosses p. bisect's second result is deliberately ignored:
	// discontinuities are accepted, since the kernel may be
	// discontinuous.
	quantile := func(p float64) float64 {
		root, _ := bisect(func(x float64) float64 { return kde.CDF(x) - p }, lo, hi, tolerance)
		return root
	}
	low = quantile(lowY)
	high = quantile(highY)

	// Expand width by 20% to give some margins
	margin := 0.1 * (high - low)
	low, high = low-margin, high+margin

	// Limit to bounds
	if bounded {
		low = math.Max(low, kde.BoundaryMin)
		high = math.Min(high, kde.BoundaryMax)
	}

	return low, high
}

// epanechnikovKernel is the Epanechnikov kernel centered at zero with
// bandwidth h. Its density is (3/(4h)) * (1 - (x/h)²) for |x| < h and
// zero elsewhere.
type epanechnikovKernel struct {
	h float64
}

// pdfEach returns the kernel density at each of xs. Points that are
// not strictly inside (-h, h) get a density of 0.
func (k epanechnikovKernel) pdfEach(xs []float64) []float64 {
	h := k.h
	peak := 0.75 / h
	invSq := 1 / (h * h)

	out := make([]float64, len(xs))
	for i := range xs {
		x := xs[i]
		if !(-h < x && x < h) {
			// Outside the support; leave the zero value.
			continue
		}
		out[i] = peak * (1 - x*x*invSq)
	}
	return out
}

// cdfEach returns the kernel's cumulative distribution at each of
// xs: 1 above h, (2 + 3u - u³)/4 with u = x/h for x in (-h, h], and 0
// otherwise.
func (k epanechnikovKernel) cdfEach(xs []float64) []float64 {
	h := k.h
	scale := 1 / h

	out := make([]float64, len(xs))
	for i := range xs {
		x := xs[i]
		switch {
		case x > h:
			out[i] = 1
		case x > -h:
			u := x * scale
			out[i] = 0.25 * (2 + 3*u - u*u*u)
		}
	}
	return out
}