Skip to content

Commit

Permalink
fix categorical to pandas (#2456)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 24, 2022
1 parent c649a20 commit 4ff603b
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 1 deletion.
13 changes: 13 additions & 0 deletions polars/polars-core/src/series/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,19 @@ impl Series {
let values = values.as_any().downcast_ref::<LargeStringArray>().unwrap();
(keys, values.clone())
}
(IntegerType::Int64, ArrowDataType::LargeUtf8) => {
let arr = arr.as_any().downcast_ref::<DictionaryArray<i64>>().unwrap();
let keys = arr.keys();
let keys = cast(keys, &ArrowDataType::UInt32)
.unwrap()
.as_any()
.downcast_ref::<PrimitiveArray<u32>>()
.unwrap()
.clone();
let values = arr.values();
let values = values.as_any().downcast_ref::<LargeStringArray>().unwrap();
(keys, values.clone())
}
(IntegerType::UInt32, ArrowDataType::LargeUtf8) => {
let arr = arr.as_any().downcast_ref::<DictionaryArray<u32>>().unwrap();
let keys = arr.keys();
Expand Down
4 changes: 3 additions & 1 deletion py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,9 @@ def to_pandas(
<class 'pandas.core.frame.DataFrame'>
"""
return self.to_arrow().to_pandas(*args, date_as_object=date_as_object, **kwargs)
record_batches = self._df.to_pandas()
tbl = pa.Table.from_batches(record_batches)
return tbl.to_pandas(*args, date_as_object=date_as_object, **kwargs)

def to_csv(
self,
Expand Down
47 changes: 47 additions & 0 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::apply::dataframe::{
use crate::conversion::{ObjectValue, Wrap};
use crate::file::get_mmap_bytes_reader;
use crate::lazy::dataframe::PyLazyFrame;
use crate::prelude::Context::Default;
use crate::prelude::{dicts_to_rows, str_to_null_strategy};
use crate::utils::str_to_polarstype;
use crate::{
Expand All @@ -21,8 +22,11 @@ use crate::{
series::{to_pyseries_collection, to_series_collection, PySeries},
};
use polars::frame::row::{rows_to_schema, Row};
use polars_core::export::arrow::datatypes::IntegerType;
use polars_core::frame::groupby::PivotAgg;
use polars_core::frame::ArrowChunk;
use polars_core::prelude::QuantileInterpolOptions;
use polars_core::utils::arrow::compute::cast::CastOptions;
use polars_core::utils::get_supertype;

#[pyclass]
Expand Down Expand Up @@ -432,6 +436,49 @@ impl PyDataFrame {
Ok(rbs)
}

/// Export the frame as a list of pyarrow record batches, one per chunk,
/// for the Python side to assemble into a table.
///
/// Every column whose dtype is `DataType::Categorical` is first cast to an
/// Arrow dictionary array with `Int64` keys and `LargeUtf8` values; all
/// other columns are handed over untouched.
/// NOTE(review): presumably the `Int64` key type is what pyarrow/pandas
/// needs to produce a `category` column — confirm against the pyarrow docs.
///
/// # Errors
/// Returns the Python error raised when importing `pyarrow` fails or when
/// `arrow_interop::to_py::to_py_rb` fails for a chunk; conversion stops at
/// the first failing chunk.
///
/// # Panics
/// Panics if the Arrow cast of a categorical column fails.
pub fn to_pandas(&self) -> PyResult<Vec<PyObject>> {
    use polars_core::export::arrow::array::ArrayRef;

    let gil = Python::acquire_gil();
    let py = gil.python();
    let pyarrow = py.import("pyarrow")?;
    let names = self.df.get_column_names();

    // Positions of the categorical columns; these are the only ones recast.
    let categorical_idx: Vec<usize> = self
        .df
        .get_columns()
        .iter()
        .enumerate()
        .filter_map(|(idx, column)| {
            if column.dtype() == &DataType::Categorical {
                Some(idx)
            } else {
                None
            }
        })
        .collect();

    // The cast target is the same for every categorical column in every
    // chunk, so it is built once up front.
    let dict_dtype = ArrowDataType::Dictionary(
        IntegerType::Int64,
        Box::new(ArrowDataType::LargeUtf8),
        false,
    );

    let mut batches = Vec::new();
    for chunk in self.df.iter_chunks() {
        let mut arrays = chunk.into_arrays();
        for &idx in &categorical_idx {
            let slot = arrays.get_mut(idx).unwrap();
            let recast = polars_core::export::arrow::compute::cast::cast(
                &**slot,
                &dict_dtype,
                CastOptions::default(),
            )
            .unwrap();
            // Swap the original array for its dictionary-encoded version.
            *slot = Arc::from(recast) as ArrayRef;
        }
        let chunk = ArrowChunk::new(arrays);

        // `?` stops at the first chunk that fails to convert.
        batches.push(arrow_interop::to_py::to_py_rb(&chunk, &names, py, pyarrow)?);
    }
    Ok(batches)
}

pub fn add(&self, s: &PySeries) -> PyResult<Self> {
let df = (&self.df + &s.series).map_err(PyPolarsEr::from)?;
Ok(df.into())
Expand Down
7 changes: 7 additions & 0 deletions py-polars/tests/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,10 @@ def test_no_rechunk() -> None:
assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
# chunked array
assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2


def test_cat_to_pandas() -> None:
    """A Categorical column must reach pandas with a ``category`` dtype."""
    frame = pl.DataFrame({"a": ["best", "test"]}).with_columns(
        pl.all().cast(pl.Categorical)
    )
    pandas_frame = frame.to_pandas()
    # Compare on the dtype's string form, which contains "category".
    assert "category" in str(pandas_frame["a"].dtype)

0 comments on commit 4ff603b

Please sign in to comment.