Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Log-normal probability distribution #218

Merged
merged 14 commits into from Apr 18, 2024
2 changes: 2 additions & 0 deletions skpro/distributions/__init__.py
Expand Up @@ -7,6 +7,7 @@
"Laplace",
"Mixture",
"Normal",
"LogNormal",
"Poisson",
"QPD_S",
"QPD_B",
Expand All @@ -17,6 +18,7 @@

from skpro.distributions.empirical import Empirical
from skpro.distributions.laplace import Laplace
from skpro.distributions.lognormal import LogNormal
from skpro.distributions.mixture import Mixture
from skpro.distributions.normal import Normal
from skpro.distributions.poisson import Poisson
Expand Down
174 changes: 174 additions & 0 deletions skpro/distributions/lognormal.py
@@ -0,0 +1,174 @@
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Log-Normal probability distribution."""

__author__ = ["bhavikar04", "fkiraly"]

import numpy as np
import pandas as pd
from scipy.special import erf, erfinv

from skpro.distributions.base import BaseDistribution


class LogNormal(BaseDistribution):
"""Log-Normal distribution (skpro native).

Parameters
----------
mean : float or array of float (1D or 2D)
mean of the logarithm of the distribution
sd : float or array of float (1D or 2D), must be positive
standard deviation the logarithm of the distribution
index : pd.Index, optional, default = RangeIndex
columns : pd.Index, optional, default = RangeIndex

Example
-------
>>> from skpro.distributions.lognormal import LogNormal

>>> n = LogNormal(mu=[[0, 1], [2, 3], [4, 5]], sigma=1)
"""

_tags = {
"authors": ["bhavikar04", "fkiraly"],
"capabilities:approx": ["pdflognorm"],
"capabilities:exact": ["mean", "var", "energy", "pdf", "log_pdf", "cdf", "ppf"],
"distr:measuretype": "continuous",
}

def __init__(self, mu, sigma, index=None, columns=None):
self.mu = mu
self.sigma = sigma
self.index = index
self.columns = columns

# todo: untangle index handling
# and broadcast of parameters.
# move this functionality to the base class
self._mu, self._sigma = self._get_bc_params()
shape = self._mu.shape

if index is None:
index = pd.RangeIndex(shape[0])

if columns is None:
columns = pd.RangeIndex(shape[1])

super().__init__(index=index, columns=columns)

def _get_bc_params(self):
"""Fully broadcast parameters of self, given param shapes and index, columns."""
to_broadcast = [self.mu, self.sigma]
if hasattr(self, "index") and self.index is not None:
to_broadcast += [self.index.to_numpy().reshape(-1, 1)]
if hasattr(self, "columns") and self.columns is not None:
to_broadcast += [self.columns.to_numpy()]
bc = np.broadcast_arrays(*to_broadcast)
return bc[0], bc[1]

# commented out, seems incorrect
# def energy(self, x=None):
# r"""Energy of self, w.r.t. self or a constant frame x.

# Let :math:`X, Y` be i.i.d. random variables with the distribution of `self`.

# If `x` is `None`, returns :math:`\mathbb{E}[|X-Y|]` (per row), "self-energy".
# If `x` is passed, returns :math:`\mathbb{E}[|X-x|]-0.5\mathbb{E}[|X-Y|]`
# (per row), "CRPS wrt x".

# Parameters
# ----------
# x : None or pd.DataFrame, optional, default=None
# if pd.DataFrame, must have same rows and columns as `self`

# Returns
# -------
# pd.DataFrame with same rows as `self`, single column `"energy"`
# each row contains one float, self-energy/energy as described above.
# """
# if x is None:
# return super().energy(x)
# # explicit formula for CRPS of log-normal cross-term
# # obtained by bhavikar04 via wolfram alpha
# else:
# d = self.loc[x.index, x.columns]
# mu_arr, sd_arr = d._mu, d._sigma
# c_arr = x * (2 * self.cdf(x) - 1)
# c_arr2 = -2 * np.exp((mu_arr + sd_arr**2) / 2)
# c_arr3 = self.cdf((np.log(x) - mu_arr - sd_arr**2) / sd_arr)
# c_arr3 = c_arr3 + self.cdf(sd_arr / mu_arr**0.5) - 1
# c_arr2 = c_arr2 * c_arr3
# c_arr = c_arr + c_arr2

# energy_arr = np.sum(c_arr, axis=1)
# energy = pd.DataFrame(energy_arr, index=self.index, columns=["energy"])
# return energy

def mean(self):
r"""Return expected value of the distribution.

Let :math:`X` be a random variable with the distribution of `self`.
Returns the expectation :math:`\mathbb{E}[X]`

Returns
-------
pd.DataFrame with same rows, columns as `self`
expected value of distribution (entry-wise)
"""
mean_arr = np.exp(self._mu + self._sigma**2 / 2)
return pd.DataFrame(mean_arr, index=self.index, columns=self.columns)

def var(self):
r"""Return element/entry-wise variance of the distribution.

Let :math:`X` be a random variable with the distribution of `self`.
Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2`

Returns
-------
pd.DataFrame with same rows, columns as `self`
variance of distribution (entry-wise)
"""
mu = self._mu
sigma = self._sigma
sd_arr = np.exp(2 * mu + 2 * sigma**2) - np.exp(2 * mu + sigma**2)
return pd.DataFrame(sd_arr, index=self.index, columns=self.columns) ** 2

def pdf(self, x):
"""Probability density function."""
d = self.loc[x.index, x.columns]
pdf_arr = np.exp(-0.5 * ((np.log(x.values) - d.mu) / d.sigma) ** 2)
pdf_arr = pdf_arr / (x.values * d.sigma * np.sqrt(2 * np.pi))
return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns)

def log_pdf(self, x):
"""Logarithmic probability density function."""
d = self.loc[x.index, x.columns]
lpdf_arr = -0.5 * ((np.log(x.values) - d.mu) / d.sigma) ** 2
lpdf_arr = lpdf_arr - np.log(x.values * d.sigma * np.sqrt(2 * np.pi))
return pd.DataFrame(lpdf_arr, index=x.index, columns=x.columns)

def cdf(self, x):
"""Cumulative distribution function."""
d = self.loc[x.index, x.columns]
cdf_arr = 0.5 + 0.5 * erf((np.log(x.values) - d.mu) / (d.sigma * np.sqrt(2)))
return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns)

def ppf(self, p):
"""Quantile function = percent point function = inverse cdf."""
d = self.loc[p.index, p.columns]
icdf_arr = d.mu + d.sigma * np.sqrt(2) * erfinv(2 * p.values - 1)
icdf_arr = np.exp(self._sigma * icdf_arr)
return pd.DataFrame(icdf_arr, index=p.index, columns=p.columns)

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator."""
params1 = {"mu": [[0, 1], [2, 3], [4, 5]], "sigma": 1}
params2 = {
"mu": 0,
"sigma": 1,
"index": pd.Index([1, 2, 5]),
"columns": pd.Index(["a", "b"]),
}
return [params1, params2]