Skip to content

Commit

Permalink
feat[rust, python]: add columns parameter to dummy functions (#4531)
Browse files Browse the repository at this point in the history
  • Loading branch information
matteosantama authored and ritchie46 committed Aug 22, 2022
1 parent 0c9086c commit 133f22d
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 7 deletions.
18 changes: 17 additions & 1 deletion polars/polars-ops/src/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,28 @@ pub trait DataFrameOps: IntoDf {
/// ```
#[cfg(feature = "to_dummies")]
fn to_dummies(&self) -> Result<DataFrame> {
// `None` means no subset was requested; `_to_dummies` then falls back to
// every column name in the frame.
self._to_dummies(None)
}

/// Create dummy variables for only the columns named in `columns`.
///
/// Forwards to `_to_dummies` with `Some(columns)`. In the visible part of that
/// helper, a column is dummy-encoded when its name is contained in `columns`;
/// every other column is cloned and carried over as-is.
///
/// NOTE(review): a name in `columns` that matches no column of the frame never
/// satisfies the membership check, so it appears to be ignored rather than
/// reported as an error — confirm this is intended.
#[cfg(feature = "to_dummies")]
fn columns_to_dummies(&self, columns: Vec<&str>) -> Result<DataFrame> {
self._to_dummies(Some(columns))
}

#[cfg(feature = "to_dummies")]
fn _to_dummies(&self, columns: Option<Vec<&str>>) -> Result<DataFrame> {
let df = self.to_df();

let set: PlHashSet<&str> =
PlHashSet::from_iter(columns.unwrap_or_else(|| df.get_column_names()));

let cols = POOL.install(|| {
df.get_columns()
.par_iter()
.map(|s| s.to_ops().to_dummies())
.map(|s| match set.contains(s.name()) {
true => s.to_ops().to_dummies(),
false => Ok(s.clone().into_frame()),
})
.collect::<Result<Vec<_>>>()
})?;

Expand Down
10 changes: 8 additions & 2 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5364,10 +5364,16 @@ def quantile(
"""
return self._from_pydf(self._df.quantile(quantile, interpolation))

def to_dummies(self: DF) -> DF:
def to_dummies(self: DF, *, columns: list[str] | None = None) -> DF:
"""
Get one hot encoded dummy variables.
Parameters
----------
columns:
A subset of columns to convert to dummy variables. ``None`` means
"all columns".
Examples
--------
>>> df = pl.DataFrame(
Expand All @@ -5392,7 +5398,7 @@ def to_dummies(self: DF) -> DF:
└───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
"""
return self._from_pydf(self._df.to_dummies())
return self._from_pydf(self._df.to_dummies(columns))

def unique(
self: DF,
Expand Down
9 changes: 7 additions & 2 deletions py-polars/polars/internals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,22 @@
from polars.internals.type_aliases import ClosedWindow, ConcatMethod, TimeUnit


def get_dummies(df: pli.DataFrame) -> pli.DataFrame:
def get_dummies(
df: pli.DataFrame, *, columns: list[str] | None = None
) -> pli.DataFrame:
"""
Convert categorical variables into dummy/indicator variables.
Parameters
----------
df
DataFrame to convert.
columns
A subset of columns to convert to dummy variables. ``None`` means
"all columns".
"""
return df.to_dummies()
return df.to_dummies(columns=columns)


@overload
Expand Down
10 changes: 8 additions & 2 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1228,8 +1228,14 @@ impl PyDataFrame {
Ok(df.into())
}

pub fn to_dummies(&self) -> PyResult<Self> {
let df = self.df.to_dummies().map_err(PyPolarsErr::from)?;
pub fn to_dummies(&self, columns: Option<Vec<String>>) -> PyResult<Self> {
    // With an explicit subset, only those columns are dummy-encoded;
    // without one, the whole frame is.
    let encoded = if let Some(names) = columns {
        // The frame-level API takes borrowed names, so view the owned
        // strings as `&str` for the duration of the call.
        let borrowed: Vec<&str> = names.iter().map(String::as_str).collect();
        self.df.columns_to_dummies(borrowed)
    } else {
        self.df.to_dummies()
    };
    // Translate the polars error into the Python-facing error type.
    let df = encoded.map_err(PyPolarsErr::from)?;
    Ok(df.into())
}

Expand Down
15 changes: 15 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,21 @@ def test_get_dummies() -> None:
).with_columns(pl.all().cast(pl.UInt8))
assert res.frame_equal(expected)

df = pl.DataFrame(
{"i": [1, 2, 3], "category": ["dog", "cat", "cat"]},
columns={"i": pl.Int32, "category": pl.Categorical},
)
expected = pl.DataFrame(
{
"i": [1, 2, 3],
"category_cat": [0, 1, 1],
"category_dog": [1, 0, 0],
},
columns={"i": pl.Int32, "category_cat": pl.UInt8, "category_dog": pl.UInt8},
)
result = pl.get_dummies(df, columns=["category"])
assert result.frame_equal(expected)


def test_to_pandas(df: pl.DataFrame) -> None:
# pyarrow cannot deal with unsigned dictionary integer yet.
Expand Down

0 comments on commit 133f22d

Please sign in to comment.