Skip to content

Commit

Permalink
polars-ops (#3212)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Apr 22, 2022
1 parent 4e83b40 commit 2e492df
Show file tree
Hide file tree
Showing 39 changed files with 600 additions and 217 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ members = [
"polars/polars-lazy",
"polars/polars-time",
"polars/polars-utils",
"polars/polars-ops",
"examples/read_csv",
"examples/read_parquet",
"examples/python_rust_compiled_function",
Expand Down
32 changes: 26 additions & 6 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ partition_by = ["polars-core/partition_by"]
semi_anti_join = ["polars-core/semi_anti_join"]
list_eval = ["polars-lazy/list_eval"]
chunked_ids = ["polars-core/chunked_ids", "polars-lazy/chunked_ids"]
to_dummies = ["polars-ops/to_dummies"]

test = [
"lazy",
Expand Down Expand Up @@ -149,21 +150,38 @@ dtype-slim = [
]

# opt-in datatypes for Series
dtype-date = ["polars-core/dtype-date", "polars-lazy/dtype-date", "polars-io/dtype-date", "polars-time/dtype-date"]
dtype-date = [
"polars-core/dtype-date",
"polars-lazy/dtype-date",
"polars-io/dtype-date",
"polars-time/dtype-date",
"polars-core/dtype-date",
]
dtype-datetime = [
"polars-core/dtype-datetime",
"polars-lazy/dtype-datetime",
"polars-io/dtype-datetime",
"polars-time/dtype-datetime",
"polars-ops/dtype-datetime",
]
dtype-duration = [
"polars-core/dtype-duration",
"polars-lazy/dtype-duration",
"polars-time/dtype-duration",
"polars-core/dtype-duration",
]
dtype-duration = ["polars-core/dtype-duration", "polars-lazy/dtype-duration", "polars-time/dtype-duration"]
dtype-time = ["polars-core/dtype-time", "polars-io/dtype-time", "polars-time/dtype-time"]
dtype-time = ["polars-core/dtype-time", "polars-io/dtype-time", "polars-time/dtype-time", "polars-ops/dtype-time"]
dtype-i8 = ["polars-core/dtype-i8", "polars-lazy/dtype-i8"]
dtype-i16 = ["polars-core/dtype-i16", "polars-lazy/dtype-i16"]
dtype-u8 = ["polars-core/dtype-u8", "polars-lazy/dtype-u8"]
dtype-u8 = ["polars-core/dtype-u8", "polars-lazy/dtype-u8", "polars-ops/dtype-u8"]
dtype-u16 = ["polars-core/dtype-u16", "polars-lazy/dtype-u16"]
dtype-categorical = ["polars-core/dtype-categorical", "polars-io/dtype-categorical", "polars-lazy/dtype-categorical"]
dtype-struct = ["polars-core/dtype-struct", "polars-lazy/dtype-struct"]
dtype-categorical = [
"polars-core/dtype-categorical",
"polars-io/dtype-categorical",
"polars-lazy/dtype-categorical",
"polars-ops/dtype-categorical",
]
dtype-struct = ["polars-core/dtype-struct", "polars-lazy/dtype-struct", "polars-ops/dtype-struct"]

docs-selection = [
"csv-file",
Expand Down Expand Up @@ -207,6 +225,7 @@ docs-selection = [
"dot_diagram",
"string_encoding",
"product",
"to_dummies",
]

bench = [
Expand All @@ -217,6 +236,7 @@ bench = [
polars-core = { version = "0.20.0", path = "./polars-core", features = ["docs", "private"], default-features = false }
polars-io = { version = "0.20.0", path = "./polars-io", features = ["private"], default-features = false, optional = true }
polars-lazy = { version = "0.20.0", path = "./polars-lazy", features = ["private"], default-features = false, optional = true }
polars-ops = { version = "0.20.0", path = "./polars-ops" }
polars-time = { version = "0.20.0", path = "./polars-time", default-features = false, optional = true }

[dev-dependencies]
Expand Down
4 changes: 3 additions & 1 deletion polars/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ check:
-p polars-io \
-p polars-lazy \
-p polars-arrow \
-p polars-time
-p polars-time \
-p polars-ops

clippy:
cargo clippy --all-features \
Expand All @@ -21,6 +22,7 @@ clippy:
-p polars-lazy \
-p polars-arrow \
-p polars-utils \
-p polars-ops \
-p polars-time

clippy-default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ impl CategoricalChunked {
self.logical.len()
}

pub(crate) fn name(&self) -> &str {
pub fn name(&self) -> &str {
self.logical.name()
}

/// Get a reference to the logical array (the categories).
pub(crate) fn logical(&self) -> &UInt32Chunked {
pub fn logical(&self) -> &UInt32Chunked {
&self.logical
}

Expand Down
8 changes: 0 additions & 8 deletions polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -499,14 +499,6 @@ pub trait ChunkUnique<T> {
}
}

/// Conversion of an array into a `DataFrame` of dummy (indicator) columns.
///
/// The provided method is the fallback for dtypes that have no dedicated
/// implementation: it always fails.
pub trait ToDummies<T>: ChunkUnique<T> {
    /// Build the dummy columns for `self`.
    ///
    /// # Errors
    ///
    /// The default implementation always returns
    /// `PolarsError::InvalidOperation`.
    fn to_dummies(&self) -> Result<DataFrame> {
        // The message names the operation that was actually requested.
        Err(PolarsError::InvalidOperation(
            "to_dummies is not implemented for this dtype".into(),
        ))
    }
}

#[derive(Default, Copy, Clone, Eq, PartialEq, Debug)]
#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))]
pub struct SortOptions {
Expand Down
91 changes: 3 additions & 88 deletions polars/polars-core/src/chunked_array/ops/unique/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ pub(crate) mod rank;
#[cfg(feature = "object")]
use crate::chunked_array::object::ObjectType;
use crate::datatypes::PlHashSet;
use crate::frame::groupby::{GroupsProxy, IntoGroupsProxy};
use crate::frame::groupby::GroupsProxy;
#[cfg(feature = "mode")]
use crate::frame::groupby::IntoGroupsProxy;
use crate::prelude::*;
use rayon::prelude::*;
use std::hash::Hash;

fn finish_is_unique_helper(
Expand Down Expand Up @@ -237,92 +238,6 @@ impl ChunkUnique<Utf8Type> for Utf8Chunked {
}
}

/// Build a `u8` indicator column called `name` with `len` rows: every row
/// whose index appears in `groups` is set to 1, all other rows stay 0.
#[cfg(feature = "dtype-u8")]
fn dummies_helper(mut groups: Vec<IdxSize>, len: usize, name: &str) -> UInt8Chunked {
    // Presumably sorted so the writes below walk the buffer in ascending order.
    groups.sort_unstable();

    let mut flags = vec![0u8; len];

    groups.into_iter().for_each(|row| {
        // SAFETY: relies on every index in `groups` being smaller than `len`;
        // this function does not verify it.
        unsafe {
            *flags.get_unchecked_mut(row as usize) = 1;
        }
    });

    ChunkedArray::from_vec(name, flags)
}

/// Fallback used when the `dtype-u8` feature is disabled: build an `i32`
/// indicator column called `name` with `len` rows, where every row whose
/// index appears in `groups` is set to 1 and all other rows stay 0.
#[cfg(not(feature = "dtype-u8"))]
fn dummies_helper(mut groups: Vec<IdxSize>, len: usize, name: &str) -> Int32Chunked {
    // Presumably sorted so the writes below walk the buffer in ascending order.
    groups.sort_unstable();

    let mut flags = vec![0i32; len];

    groups.into_iter().for_each(|row| {
        // SAFETY: relies on every index in `groups` being smaller than `len`;
        // this function does not verify it.
        unsafe {
            *flags.get_unchecked_mut(row as usize) = 1;
        }
    });

    ChunkedArray::from_vec(name, flags)
}

/// Return `columns` ordered by column name (plain string comparison).
fn sort_columns(mut columns: Vec<Series>) -> Vec<Series> {
    // Names are `&str`, which is totally ordered, so the comparison never fails.
    columns.sort_by(|lhs, rhs| lhs.name().cmp(rhs.name()));
    columns
}

impl ToDummies<Utf8Type> for Utf8Chunked {
    /// Create one indicator column per group of equal values in this array.
    ///
    /// Columns are named `<array name>_<value>` (or `<array name>_null` for
    /// the null group) and are returned sorted by name.
    fn to_dummies(&self) -> Result<DataFrame> {
        let col_name = self.name();
        let taker = self.take_rand();
        let n_rows = self.len();

        let columns = self
            .group_tuples(true, false)
            .into_idx()
            .into_par_iter()
            .map(|(first, members)| {
                // `first` is used to look up the value that labels this group.
                let label = unsafe { taker.get_unchecked(first as usize) };
                let name = if let Some(val) = label {
                    format!("{}_{}", col_name, val)
                } else {
                    format!("{}_null", col_name)
                };
                dummies_helper(members, n_rows, &name).into_series()
            })
            .collect();

        Ok(DataFrame::new_no_checks(sort_columns(columns)))
    }
}
impl<T> ToDummies<T> for ChunkedArray<T>
where
    T: PolarsIntegerType + Sync,
    T::Native: Hash + Eq,
    ChunkedArray<T>: ChunkOps + ChunkCompare<T::Native> + ChunkUnique<T>,
{
    /// Create one indicator column per group of equal values in this integer
    /// array.
    ///
    /// Columns are named `<array name>_<value>` (or `<array name>_null` for
    /// the null group) and are returned sorted by name.
    fn to_dummies(&self) -> Result<DataFrame> {
        let col_name = self.name();
        let taker = self.take_rand();
        let n_rows = self.len();

        let columns = self
            .group_tuples(true, false)
            .into_idx()
            .into_par_iter()
            .map(|(first, members)| {
                // `first` is used to look up the value that labels this group.
                let label = unsafe { taker.get_unchecked(first as usize) };
                let name = label.map_or_else(
                    || format!("{}_null", col_name),
                    |val| format!("{}_{}", col_name, val),
                );
                dummies_helper(members, n_rows, &name).into_series()
            })
            .collect();

        Ok(DataFrame::new_no_checks(sort_columns(columns)))
    }
}

// Float arrays get no dedicated implementation: these empty impls fall back to
// the method provided by the `ToDummies` trait itself.
impl ToDummies<Float32Type> for Float32Chunked {}
impl ToDummies<Float64Type> for Float64Chunked {}

impl ChunkUnique<BooleanType> for BooleanChunked {
fn unique(&self) -> Result<Self> {
// can be None, Some(true), Some(false)
Expand Down
84 changes: 1 addition & 83 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use rayon::prelude::*;

use crate::chunked_array::ops::unique::is_unique_helper;
use crate::prelude::*;
use crate::utils::{accumulate_dataframes_horizontal, get_supertype, split_ca, split_df, NoNull};
use crate::utils::{get_supertype, split_ca, split_df, NoNull};

#[cfg(feature = "dataframe_arithmetic")]
mod arithmetic;
Expand Down Expand Up @@ -2659,62 +2659,6 @@ impl DataFrame {
f(self, args)
}

/// Create dummy variables.
///
/// # Example
///
/// ```ignore
///
/// # #[macro_use] extern crate polars_core;
/// # fn main() {
///
/// use polars_core::prelude::*;
///
/// let df = df! {
/// "id" => &[1, 2, 3, 1, 2, 3, 1, 1],
/// "type" => &["A", "B", "B", "B", "C", "C", "C", "B"],
/// "code" => &["X1", "X2", "X3", "X3", "X2", "X2", "X1", "X1"]
/// }.unwrap();
///
/// let dummies = df.to_dummies().unwrap();
/// dbg!(dummies);
/// # }
/// ```
/// Outputs:
/// ```text
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | id_1 | id_3 | id_2 | type_A | type_B | type_C | code_X1 | code_X2 | code_X3 |
/// | --- | --- | --- | --- | --- | --- | --- | --- | --- |
/// | u8 | u8 | u8 | u8 | u8 | u8 | u8 | u8 | u8 |
/// +======+======+======+========+========+========+=========+=========+=========+
/// | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// ```
pub fn to_dummies(&self) -> Result<Self> {
let cols = POOL.install(|| {
self.columns
.par_iter()
.map(|s| s.to_dummies())
.collect::<Result<Vec<_>>>()
})?;

accumulate_dataframes_horizontal(cols)
}

/// Drop duplicate rows from a `DataFrame`.
/// *This fails when there is a column of type List in DataFrame*
///
Expand Down Expand Up @@ -3166,32 +3110,6 @@ mod test {
assert_eq!(sliced_df.shape(), (2, 2));
}

/// `to_dummies` on a small frame yields the expected `id_1` indicator column.
#[test]
#[cfg(feature = "dtype-u8")]
#[cfg_attr(miri, ignore)]
fn get_dummies() {
    let df = df! {
        "id" => &[1, 2, 3, 1, 2, 3, 1, 1],
        "type" => &["A", "B", "B", "B", "C", "C", "C", "B"],
        "code" => &["X1", "X2", "X3", "X3", "X2", "X2", "X1", "X1"]
    }
    .unwrap();

    let dummies = df.to_dummies().unwrap();
    let id_1 = dummies.column("id_1").unwrap().u8().unwrap();

    // Rows 0, 3, 6 and 7 of `id` hold the value 1.
    let expected: [Option<u8>; 8] = [
        Some(1),
        Some(0),
        Some(0),
        Some(1),
        Some(0),
        Some(0),
        Some(1),
        Some(1),
    ];
    assert_eq!(Vec::from(id_1), &expected);
}

#[test]
fn test_duplicate_column() {
let mut df = df! {
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/dates_time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -392,10 +392,6 @@ macro_rules! impl_dyn_series {
}
}

/// Create dummy columns by delegating to the wrapped array's `to_dummies`.
fn to_dummies(&self) -> Result<DataFrame> {
self.0.to_dummies()
}

/// Return the value at `index` as an `AnyValue`, delegating to the wrapped
/// array's `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,6 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
}
}

/// Create dummy columns by delegating to the wrapped datetime array's
/// `to_dummies`.
fn to_dummies(&self) -> Result<DataFrame> {
self.0.to_dummies()
}

/// Return the value at `index` as an `AnyValue`, delegating to the wrapped
/// datetime array's `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/duration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -377,10 +377,6 @@ impl SeriesTrait for SeriesWrap<DurationChunked> {
self.0.cast(data_type)
}

/// Create dummy columns by delegating to the wrapped duration array's
/// `to_dummies`.
fn to_dummies(&self) -> Result<DataFrame> {
self.0.to_dummies()
}

/// Return the value at `index` as an `AnyValue`, delegating to the wrapped
/// duration array's `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,10 +406,6 @@ macro_rules! impl_dyn_series {
self.0.cast(data_type)
}

/// Create dummy columns through the `ToDummies` implementation of the
/// wrapped array (called with explicit trait syntax).
fn to_dummies(&self) -> Result<DataFrame> {
ToDummies::to_dummies(&self.0)
}

/// Return the value at `index` as an `AnyValue`, delegating to the wrapped
/// array's `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -575,10 +575,6 @@ macro_rules! impl_dyn_series {
self.0.cast(data_type)
}

/// Create dummy columns through the `ToDummies` implementation of the
/// wrapped array (called with explicit trait syntax).
fn to_dummies(&self) -> Result<DataFrame> {
ToDummies::to_dummies(&self.0)
}

/// Return the value at `index` as an `AnyValue`, delegating to the wrapped
/// array's `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down

0 comments on commit 2e492df

Please sign in to comment.