Skip to content

Commit

Permalink
add native log and entropy expression (#2952)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 23, 2022
1 parent 7791655 commit ad2f32a
Show file tree
Hide file tree
Showing 16 changed files with 207 additions and 16 deletions.
1 change: 1 addition & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ dot_diagram = ["polars-lazy/dot_diagram"]
dataframe_arithmetic = ["polars-core/dataframe_arithmetic"]
product = ["polars-core/product"]
unique_counts = ["polars-core/unique_counts", "polars-lazy/unique_counts"]
log = ["polars-core/log", "polars-lazy/log"]

series_from_anyvalue = ["polars-core/series_from_anyvalue"]

Expand Down
2 changes: 2 additions & 0 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ ewma = ["polars-utils"]
dataframe_arithmetic = []
product = []
unique_counts = []
log = []

dynamic_groupby = ["dtype-datetime", "dtype-date"]

Expand Down Expand Up @@ -135,6 +136,7 @@ docs-selection = [
"string_encoding",
"product",
"unique_counts",
"log",
]

[dependencies]
Expand Down
54 changes: 54 additions & 0 deletions polars/polars-core/src/chunked_array/ops/apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,60 @@ macro_rules! apply_enumerate {
}};
}

/// Apply `f` to every value of every chunk and build a new `ChunkedArray` called `name`.
///
/// Each chunk is downcast to a `PrimitiveArray<S::Native>` (panicking if the chunk has
/// another type). The values are then overwritten in place when `into_mut` hands back a
/// mutable array; otherwise a new array is computed with arrow's `unary` kernel.
fn apply_in_place_impl<S, F>(name: &str, chunks: Vec<ArrayRef>, f: F) -> ChunkedArray<S>
where
    F: Fn(S::Native) -> S::Native + Copy,
    S: PolarsNumericType,
{
    use arrow::Either::*;

    let mut new_chunks: Vec<ArrayRef> = Vec::with_capacity(chunks.len());
    for arr in chunks {
        let primitive = arr
            .as_any()
            .downcast_ref::<PrimitiveArray<S::Native>>()
            .unwrap()
            .clone();
        // Release the incoming handle first, so that only `primitive` still
        // refers to the data when we ask for mutable access below.
        drop(arr);

        let updated: ArrayRef = match primitive.into_mut() {
            // No mutable access was granted: compute the result into a new array.
            Left(shared) => Arc::new(arrow::compute::arity::unary(
                &shared,
                f,
                S::get_dtype().to_arrow(),
            )),
            // Mutable access was granted: overwrite the values where they are.
            Right(mut exclusive) => {
                for v in exclusive.values_mut_slice() {
                    *v = f(*v);
                }
                exclusive.into_arc()
            }
        };
        new_chunks.push(updated);
    }
    ChunkedArray::<S>::from_chunks(name, new_chunks)
}

impl<T: PolarsNumericType> ChunkedArray<T> {
    /// Cast this numeric array to the numeric type `S` and apply `f` to the casted values,
    /// mutating the casted buffers in place where possible.
    /// This saves an allocation.
    ///
    /// Panics if the cast to `S` fails.
    pub fn cast_and_apply_in_place<F, S>(&self, f: F) -> ChunkedArray<S>
    where
        F: Fn(S::Native) -> S::Native + Copy,
        S: PolarsNumericType,
    {
        let target_dtype = S::get_dtype();
        // The cast is expected to produce fresh arrow buffers. We keep only a clone of
        // its chunks: the casted series itself is a temporary that is dropped at the end
        // of this statement, so the cloned chunks are the sole remaining handles and can
        // be mutated in place.
        let chunks = self.cast(&target_dtype).unwrap().chunks().clone();
        apply_in_place_impl(self.name(), chunks, f)
    }
}

impl<'a, T> ChunkApply<'a, T::Native, T::Native> for ChunkedArray<T>
where
T: PolarsNumericType,
Expand Down
34 changes: 34 additions & 0 deletions polars/polars-core/src/series/ops/log.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use crate::prelude::*;

/// Cast `ca` to `Float64` and replace every value with its logarithm to the given `base`.
fn log<T: PolarsNumericType>(ca: &ChunkedArray<T>, base: f64) -> Float64Chunked {
    let log_with_base = move |v: f64| v.log(base);
    ca.cast_and_apply_in_place(log_with_base)
}

impl Series {
    /// Compute the logarithm to a given base.
    ///
    /// The computation runs on the physical representation of the series.
    /// `Float32` input yields `Float32` output (with `base` narrowed to `f32`);
    /// every other supported dtype yields `Float64`.
    ///
    /// # Panics
    ///
    /// Panics if the physical dtype is not one of the integer or float types handled below.
    #[cfg_attr(docsrs, doc(cfg(feature = "log")))]
    pub fn log(&self, base: f64) -> Series {
        let s = self.to_physical_repr();
        let s = s.as_ref();

        use DataType::*;
        match s.dtype() {
            Int32 => log(s.i32().unwrap(), base).into_series(),
            Int64 => log(s.i64().unwrap(), base).into_series(),
            UInt32 => log(s.u32().unwrap(), base).into_series(),
            UInt64 => log(s.u64().unwrap(), base).into_series(),
            Float32 => s.f32().unwrap().apply(|v| v.log(base as f32)).into_series(),
            Float64 => s.f64().unwrap().apply(|v| v.log(base)).into_series(),
            // The small integer types have no dedicated branch: widen them to
            // `Float64` and reuse the float path instead of panicking.
            Int8 | Int16 | UInt8 | UInt16 => s.cast(&Float64).unwrap().log(base),
            dt => panic!("`log` is not implemented for dtype {:?}", dt),
        }
    }

    /// Compute the entropy as `-sum(pk * log(pk))`,
    /// where `pk` are discrete probabilities.
    ///
    /// The values are used as given; they are not normalized to sum to one.
    /// Returns the negated result of summing `pk * log(pk)` as `f64`, or `None`
    /// when that sum yields `None`.
    ///
    /// NOTE(review): a probability of exactly `0` gives `0 * log(0)`, which is
    /// presumably `NaN` in floating point rather than the conventional `0` —
    /// confirm and filter zeros upstream if needed.
    #[cfg_attr(docsrs, doc(cfg(feature = "log")))]
    pub fn entropy(&self, base: f64) -> Option<f64> {
        let pk = self;
        let log_pk = pk.log(base);
        (pk * &log_pk).sum::<f64>().map(|v| -v)
    }
}
8 changes: 8 additions & 0 deletions polars/polars-core/src/series/ops/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
#[cfg(feature = "diff")]
#[cfg_attr(docsrs, doc(cfg(feature = "diff")))]
pub mod diff;
#[cfg(feature = "ewma")]
#[cfg_attr(docsrs, doc(cfg(feature = "ewma")))]
mod ewm;
mod extend;
#[cfg(feature = "log")]
#[cfg_attr(docsrs, doc(cfg(feature = "log")))]
mod log;
#[cfg(feature = "moment")]
#[cfg_attr(docsrs, doc(cfg(feature = "moment")))]
pub mod moment;
mod null;
#[cfg(feature = "pct_change")]
#[cfg_attr(docsrs, doc(cfg(feature = "pct_change")))]
pub mod pct_change;
#[cfg(feature = "round_series")]
#[cfg_attr(docsrs, doc(cfg(feature = "round_series")))]
mod round;
mod to_list;
mod unique;
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ dynamic_groupby = ["polars-core/dynamic_groupby"]
ewma = ["polars-core/ewma"]
dot_diagram = []
unique_counts = ["polars-core/unique_counts"]
log = ["polars-core/log"]

# no guarantees whatsoever
private = ["polars-time/private"]
Expand Down
35 changes: 35 additions & 0 deletions polars/polars-lazy/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2103,6 +2103,41 @@ impl Expr {
.with_fmt("unique_counts")
}

#[cfg(feature = "log")]
#[cfg_attr(docsrs, doc(cfg(feature = "log")))]
/// Take the logarithm of the values to the given `base`.
///
/// The declared output dtype is `Float32` for `Float32` input and `Float64` otherwise.
pub fn log(self, base: f64) -> Self {
    let output_type = GetOutput::map_dtype(|dt| match dt {
        DataType::Float32 => DataType::Float32,
        _ => DataType::Float64,
    });
    self.map(move |s| Ok(s.log(base)), output_type)
        .with_fmt("log")
}

#[cfg(feature = "log")]
#[cfg_attr(docsrs, doc(cfg(feature = "log")))]
/// Compute the entropy as `-sum(pk * log(pk))`,
/// where `pk` are discrete probabilities.
///
/// The result is a single-value series holding the `Option<f64>` returned by
/// `Series::entropy`, so the output dtype is `Float64` for every input dtype.
pub fn entropy(self, base: f64) -> Self {
    self.apply(
        move |s| Ok(Series::new(s.name(), [s.entropy(base)])),
        // The closure above always builds the series from an `Option<f64>`;
        // declaring `Float32` for `Float32` input would make the declared
        // schema disagree with the data actually produced.
        GetOutput::map_dtype(|_| DataType::Float64),
    )
    .with_fmt("entropy")
}

#[cfg(feature = "strings")]
pub fn str(self) -> string::StringNameSpace {
string::StringNameSpace(self)
Expand Down
1 change: 1 addition & 0 deletions polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@
//! - `diff` - `diff` operation.
//! - `pct_change` - Compute change percentages.
//! - `unique_counts` - Count unique values in expressions.
//! - `log` - Logarithms for `Series`.
//! * `DataFrame` pretty printing (Choose one or none, but not both):
//! - `fmt` - Activate DataFrame formatting
//!
Expand Down
2 changes: 2 additions & 0 deletions polars/tests/it/core/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
mod list;
#[cfg(feature = "rows")]
mod pivot;

use polars::prelude::*;
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ features = [
"ndarray",
"series_from_anyvalue",
"unique_counts",
"log",
]

# [patch.crates-io]
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ Computations
Expr.pct_change
Expr.skew
Expr.kurtosis
Expr.entropy
Expr.sqrt
Expr.sin
Expr.cos
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ Computations
Series.pct_change
Series.skew
Series.kurtosis
Series.entropy
Series.sqrt
Series.sin
Series.cos
Expand Down
36 changes: 26 additions & 10 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
except ImportError: # pragma: no cover
_DOCUMENTING = True

import math

from polars import internals as pli
from polars.datatypes import (
DataType,
Expand Down Expand Up @@ -232,20 +234,11 @@ def sqrt(self) -> "Expr":
"""
return self ** 0.5

def log(self) -> "Expr":
"""
Natural logarithm, element-wise.
The natural logarithm log is the inverse of the exponential function, so that log(exp(x)) = x.
The natural logarithm is logarithm in base e.
"""
return np.log(self) # type: ignore

def log10(self) -> "Expr":
"""
Return the base 10 logarithm of the input array, element-wise.
"""
return np.log10(self) # type: ignore
return self.log(10.0)

def exp(self) -> "Expr":
"""
Expand Down Expand Up @@ -2864,6 +2857,29 @@ def unique_counts(self) -> "Expr":
"""
return wrap_expr(self._pyexpr.unique_counts())

def log(self, base: float = math.e) -> "Expr":
    """
    Compute the logarithm of the values to a given base.

    Parameters
    ----------
    base
        Base of the logarithm; defaults to `e`, i.e. the natural logarithm.
    """
    logged = self._pyexpr.log(base)
    return wrap_expr(logged)

def entropy(self, base: float = math.e) -> "Expr":
    """
    Compute the entropy as `-sum(pk * log(pk))`,
    where `pk` are discrete probabilities.

    Parameters
    ----------
    base
        Base of the logarithm; defaults to `e`.
    """
    entropy_expr = self._pyexpr.entropy(base)
    return wrap_expr(entropy_expr)

# Below are the namespaces defined. Keep these at the end of the definition of Expr, as to not confuse mypy with
# the type annotation `str` with the namespace "str"

Expand Down
18 changes: 12 additions & 6 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
except ImportError: # pragma: no cover
_PYARROW_AVAILABLE = False

import math

from polars import internals as pli
from polars.internals.construction import (
arrow_to_pyseries,
Expand Down Expand Up @@ -539,14 +541,11 @@ def all(self) -> "Series":
"""
return self.to_frame().select(pli.col(self.name).all()).to_series()

def log(self) -> "Series":
def log(self, base: float = math.e) -> "Series":
"""
Natural logarithm, element-wise.
The natural logarithm log is the inverse of the exponential function, so that log(exp(x)) = x.
The natural logarithm is logarithm in base e.
Compute the logarithm to a given base
"""
return np.log(self) # type: ignore
return self.to_frame().select(pli.col(self.name).log(base)).to_series()

def log10(self) -> "Series":
"""
Expand Down Expand Up @@ -920,6 +919,13 @@ def unique_counts(self) -> "Series":
"""
return pli.select(pli.lit(self).unique_counts()).to_series()

def entropy(self, base: float = math.e) -> Optional[float]:
    """
    Compute the entropy as `-sum(pk * log(pk))`,
    where `pk` are discrete probabilities.

    Parameters
    ----------
    base
        Base of the logarithm; defaults to `e`.
    """
    # Evaluate the expression version on this Series and unwrap the single result.
    result = pli.select(pli.lit(self).entropy(base)).to_series()
    return result[0]

@property
def name(self) -> str:
"""
Expand Down
8 changes: 8 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1291,6 +1291,14 @@ impl PyExpr {
/// Clone the wrapped expression and call `struct_().rename_fields(names)` on it,
/// returning the result as a new `PyExpr`.
pub fn struct_rename_fields(&self, names: Vec<String>) -> PyExpr {
    let expr = self.inner.clone();
    expr.struct_().rename_fields(names).into()
}

/// Clone the wrapped expression and take its logarithm to the given `base`.
pub fn log(&self, base: f64) -> Self {
    let expr = self.inner.clone();
    expr.log(base).into()
}

/// Clone the wrapped expression and compute its entropy with logarithms to the given `base`.
pub fn entropy(&self, base: f64) -> Self {
    let expr = self.inner.clone();
    expr.entropy(base).into()
}
}

impl From<dsl::Expr> for PyExpr {
Expand Down
20 changes: 20 additions & 0 deletions py-polars/tests/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,23 @@ def test_unique_counts() -> None:
s = pl.Series("id", ["a", "b", "b", "c", "c", "c"])
expected = pl.Series("id", [1, 2, 3], dtype=pl.UInt32)
verify_series_and_expr_api(s, expected, "unique_counts")


def test_entropy() -> None:
    """
    Check that `Expr.entropy` agrees with the hand-written `-sum(pk * log(pk))`
    expression, and pin the value `Series.entropy` returns for raw input.
    """
    # Local import: this block cannot rely on `math` being imported at module level.
    import math

    df = pl.DataFrame({"id": [1, 1, 2, 2, 3]})
    # Relative frequency of each distinct id.
    pk = pl.col("id").unique_counts() / pl.count()

    rows = df.select(
        [
            (-(pk * pk.log()).sum()).alias("e0"),
            pk.entropy().alias("e1"),
        ]
    ).rows()
    assert len(rows) == 1
    e0, e1 = rows[0]

    # The two values come from different computation paths, so compare with a
    # tight tolerance instead of exact float equality.
    expected = 1.0549201679861442
    assert math.isclose(e0, expected, rel_tol=1e-12)
    assert math.isclose(e1, expected, rel_tol=1e-12)

    # The raw ids are not probabilities; this only pins the current output.
    raw_entropy = df["id"].entropy()
    assert raw_entropy is not None
    assert math.isclose(raw_entropy, -6.068425588244111, rel_tol=1e-12)

0 comments on commit ad2f32a

Please sign in to comment.