Move coerce_arrow into internals (#1846)
To remove the public api dependency in utils.py
zundertj committed Nov 21, 2021
1 parent bf44119 commit 3a2fad9
Showing 2 changed files with 33 additions and 45 deletions.
34 changes: 33 additions & 1 deletion py-polars/polars/internals/construction.py
@@ -25,7 +25,6 @@
    _DOCUMENTING = False
except ImportError:
    _DOCUMENTING = True

from polars.utils import coerce_arrow

if TYPE_CHECKING:
import pandas as pd
@@ -417,3 +416,36 @@ def pandas_to_pydf(
    }
    arrow_table = pa.table(arrow_dict)
    return arrow_to_pydf(arrow_table, columns=columns, rechunk=rechunk)


def coerce_arrow(array: "pa.Array") -> "pa.Array":
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.timestamp("ms"))
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to large list,
            # so we convert to large list ourselves and do the re-alloc on the
            # polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pli.Series._from_arrow("", arr).to_arrow())
            array = pa.chunked_array(chunks)

        array = array.combine_chunks()
    return array
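
For context, a quick sketch of what the relocated helper does to a timezone-aware input. This example is illustrative only, not part of the commit; it assumes pyarrow is installed and imports the function from its new home shown above.

import pyarrow as pa

from polars.internals.construction import coerce_arrow

# A timezone-aware timestamp array; the value is an epoch in milliseconds
# (2021-01-01T00:00:00 UTC).
tz_aware = pa.array([1_609_459_200_000], type=pa.timestamp("ms", tz="UTC"))

# Emits the UserWarning from the function above, then returns a naive
# timestamp[ms] array: the tz annotation is dropped, the values are kept.
naive = coerce_arrow(tz_aware)
print(naive.type)  # timestamp[ms]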
44 changes: 0 additions & 44 deletions py-polars/polars/utils.py
@@ -1,53 +1,9 @@
import ctypes
import typing as tp
import warnings
from typing import Any, Dict, Tuple, Union

import numpy as np

try:
    import pyarrow as pa
    import pyarrow.compute

    _PYARROW_AVAILABLE = True
except ImportError:
    _PYARROW_AVAILABLE = False

import polars as pl # TODO: this is public API


def coerce_arrow(array: "pa.Array") -> "pa.Array":
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.timestamp("ms"))
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to large list,
            # so we convert to large list ourselves and do the re-alloc on the
            # polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)

        array = array.combine_chunks()
    return array


def _process_null_values(
    null_values: Union[None, str, tp.List[str], Dict[str, str]] = None,
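
The moved body is otherwise verbatim; the one substantive edit is the chunk re-allocation call, which previously went through the public pl.from_arrow and now calls the internal constructor directly. A hedged sketch of the equivalence follows (illustrative, not from the commit; it assumes the internals namespace is importable under the pli alias used in construction.py).

import pyarrow as pa

import polars as pl
from polars import internals as pli

arr = pa.array(["a", "b"])

# Old path (utils.py): routed through the public API, which is exactly the
# dependency this commit removes from utils.py.
via_public = pl.from_arrow(arr).to_arrow()

# New path (internals/construction.py): the internal constructor, called
# directly; "" is the (empty) series name.
via_internal = pli.Series._from_arrow("", arr).to_arrow()

assert via_public.equals(via_internal)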
