Skip to content

Commit

Permalink
Enhanced column typedef/inference support for DataFrame init (#3633)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jun 9, 2022
1 parent 06ba4f3 commit 9adb25f
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 66 deletions.
91 changes: 76 additions & 15 deletions py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import ctypes
from datetime import datetime, timedelta
from typing import Any, Dict, Optional, Sequence, Type
from datetime import date, datetime, time, timedelta
from typing import Any, Dict, Optional, Sequence, Type, Union

try:
import pyarrow as pa
Expand Down Expand Up @@ -30,6 +30,9 @@ def __repr__(self) -> str:
return dtype_str_repr(self)


# A Polars dtype may be supplied either as a DataType subclass or as an
# instance of one; this alias accepts both forms.
PolarsDataType = Union[Type[DataType], DataType]


class Int8(DataType):
"""8-bit signed integer type"""

Expand Down Expand Up @@ -284,7 +287,7 @@ def __hash__(self) -> int:
return hash(Struct)


_DTYPE_TO_FFINAME: Dict[Type[DataType], str] = {
_DTYPE_TO_FFINAME: Dict[PolarsDataType, str] = {
Int8: "i8",
Int16: "i16",
Int32: "i32",
Expand All @@ -307,7 +310,7 @@ def __hash__(self) -> int:
Struct: "struct",
}

_DTYPE_TO_CTYPE = {
_DTYPE_TO_CTYPE: Dict[PolarsDataType, Any] = {
UInt8: ctypes.c_uint8,
UInt16: ctypes.c_uint16,
UInt32: ctypes.c_uint32,
Expand All @@ -325,15 +328,21 @@ def __hash__(self) -> int:
}


_PY_TYPE_TO_DTYPE = {
# Maps plain Python types to the Polars dtype class used to represent them.
_PY_TYPE_TO_DTYPE: Dict[type, Type[DataType]] = {
    float: Float64,
    int: Int64,
    str: Utf8,
    bool: Boolean,
    # temporal types from the stdlib `datetime` module
    date: Date,
    datetime: Datetime,
    timedelta: Duration,
    time: Time,
    # both built-in sequence types map to the same List dtype
    list: List,
    tuple: List,
}


_DTYPE_TO_PY_TYPE = {
_DTYPE_TO_PY_TYPE: Dict[PolarsDataType, type] = {
Float64: float,
Float32: float,
Int64: int,
Expand All @@ -346,47 +355,87 @@ def __hash__(self) -> int:
UInt32: int,
UInt64: int,
Boolean: bool,
Duration: timedelta,
Datetime: datetime,
Date: date,
Time: time,
}

if _PYARROW_AVAILABLE:
_PY_TYPE_TO_ARROW_TYPE = {
# Maps plain Python types to Arrow data types. Sits under the
# `_PYARROW_AVAILABLE` guard, so it is only defined when that flag is truthy.
_PY_TYPE_TO_ARROW_TYPE: Dict[type, "pa.lib.DataType"] = {
    float: pa.float64(),
    int: pa.int64(),
    str: pa.large_utf8(),
    bool: pa.bool_(),
    # temporal types: every entry that takes a unit uses microseconds ("us")
    date: pa.date32(),
    time: pa.time64("us"),
    datetime: pa.timestamp("us"),
    timedelta: pa.duration("us"),
}

# Maps Polars dtypes to Arrow data types. Keys are either a dtype class on
# its own or a `(dtype class, unit string)` tuple for unit-specific lookups.
_DTYPE_TO_ARROW_TYPE = {
    Int8: pa.int8(),
    Int16: pa.int16(),
    Int32: pa.int32(),
    Int64: pa.int64(),
    UInt8: pa.uint8(),
    UInt16: pa.uint16(),
    UInt32: pa.uint32(),
    UInt64: pa.uint64(),
    Float32: pa.float32(),
    Float64: pa.float64(),
    Boolean: pa.bool_(),
    Utf8: pa.large_utf8(),
    Date: pa.date32(),
    # handle temporal types that require units
    # (the bare class key maps to the same Arrow type as its "us" tuple key)
    Datetime: pa.timestamp("us"),
    (Datetime, "ms"): pa.timestamp("ms"),
    (Datetime, "us"): pa.timestamp("us"),
    (Datetime, "ns"): pa.timestamp("ns"),
    Duration: pa.duration("us"),
    (Duration, "ms"): pa.duration("ms"),
    (Duration, "us"): pa.duration("us"),
    (Duration, "ns"): pa.duration("ns"),
    Time: pa.time64("us"),
    # note: the "ms" entry uses pa.time32, the "us"/"ns" entries pa.time64
    (Time, "ms"): pa.time32("ms"),
    (Time, "us"): pa.time64("us"),
    (Time, "ns"): pa.time64("ns"),
}


def dtype_to_ctype(dtype: Type[DataType]) -> Type[_SimpleCData]:
def dtype_to_ctype(dtype: PolarsDataType) -> Type[_SimpleCData]:
    """
    Look up the ctypes type registered for a Polars dtype.

    Parameters
    ----------
    dtype
        Polars dtype used as the key into ``_DTYPE_TO_CTYPE``.

    Returns
    -------
    The ctypes class stored for ``dtype``.

    Raises
    ------
    NotImplementedError
        If ``dtype`` has no entry in ``_DTYPE_TO_CTYPE``.
    """
    try:
        return _DTYPE_TO_CTYPE[dtype]
    except KeyError:  # pragma: no cover
        # Name the offending dtype in the error; the internal KeyError carries
        # no extra information for the caller, so it is suppressed.
        raise NotImplementedError(
            f"Conversion of polars data type {dtype} to C-type is not implemented."
        ) from None


def dtype_to_ffiname(dtype: Type[DataType]) -> str:
def dtype_to_ffiname(dtype: PolarsDataType) -> str:
    """
    Look up the FFI name string registered for a Polars dtype.

    Parameters
    ----------
    dtype
        Polars dtype used as the key into ``_DTYPE_TO_FFINAME``.

    Returns
    -------
    The short name stored for ``dtype`` (for example ``"i8"`` for ``Int8``).

    Raises
    ------
    NotImplementedError
        If ``dtype`` has no entry in ``_DTYPE_TO_FFINAME``.
    """
    try:
        return _DTYPE_TO_FFINAME[dtype]
    except KeyError:  # pragma: no cover
        # Name the offending dtype in the error; the internal KeyError carries
        # no extra information for the caller, so it is suppressed.
        raise NotImplementedError(
            f"Conversion of polars data type {dtype} to FFI name is not implemented."
        ) from None


def dtype_to_py_type(dtype: Type[DataType]) -> Type:
def dtype_to_py_type(dtype: PolarsDataType) -> Type:
    """
    Look up the Python type registered for a Polars dtype.

    Parameters
    ----------
    dtype
        Polars dtype used as the key into ``_DTYPE_TO_PY_TYPE``.

    Returns
    -------
    The Python type stored for ``dtype`` (for example ``float`` for ``Float64``).

    Raises
    ------
    NotImplementedError
        If ``dtype`` has no entry in ``_DTYPE_TO_PY_TYPE``.
    """
    try:
        return _DTYPE_TO_PY_TYPE[dtype]
    except KeyError:  # pragma: no cover
        # Name the offending dtype in the error; the internal KeyError carries
        # no extra information for the caller, so it is suppressed.
        raise NotImplementedError(
            f"Conversion of polars data type {dtype} to Python type is not implemented."
        ) from None


def py_type_to_dtype(data_type: Type[Any]) -> Type[DataType]:
# when the passed in is already a Polars datatype, return that
if (
def is_polars_dtype(data_type: Any) -> bool:
return (
type(data_type) is type
and issubclass(data_type, DataType)
or isinstance(data_type, DataType)
):
return data_type
)


def py_type_to_dtype(data_type: Any) -> Type[DataType]:
# when the passed in is already a Polars datatype, return that
if is_polars_dtype(data_type):
return data_type
try:
return _PY_TYPE_TO_DTYPE[data_type]
except KeyError: # pragma: no cover
Expand All @@ -403,6 +452,18 @@ def py_type_to_arrow_type(dtype: Type[Any]) -> "pa.lib.DataType":
raise ValueError(f"Cannot parse dtype {dtype} into Arrow dtype.")


def dtype_to_arrow_type(dtype: PolarsDataType) -> "pa.lib.DataType":
    """
    Translate a Polars dtype into the matching Arrow dtype.

    A dtype exposing a non-None ``tu`` attribute is looked up in
    ``_DTYPE_TO_ARROW_TYPE`` as the pair ``(dtype, tu)``; any other dtype is
    looked up on its own. A missing entry is reported as a ``ValueError``.
    """
    try:
        time_unit = getattr(dtype, "tu", None)
        if time_unit is None:
            key = dtype
        else:
            # unit-carrying dtypes are keyed together with their unit
            key = (dtype, time_unit)
        arrow_type = _DTYPE_TO_ARROW_TYPE[key]
    except KeyError:  # pragma: no cover
        raise ValueError(f"Cannot parse dtype {dtype} into Arrow dtype.")
    return arrow_type


def maybe_cast(
el: Type[DataType], dtype: Type, time_unit: Optional[str] = None
) -> Type[DataType]:
Expand Down
8 changes: 4 additions & 4 deletions py-polars/polars/datatypes_constructor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from typing import Any, Callable, Sequence, Type
from typing import Any, Callable, Dict, Sequence, Type

import numpy as np

from polars.datatypes import (
Boolean,
Categorical,
DataType,
Date,
Datetime,
Duration,
Expand All @@ -16,6 +15,7 @@
Int32,
Int64,
Object,
PolarsDataType,
Time,
UInt8,
UInt16,
Expand All @@ -33,7 +33,7 @@


if not _DOCUMENTING:
_POLARS_TYPE_TO_CONSTRUCTOR = {
_POLARS_TYPE_TO_CONSTRUCTOR: Dict[PolarsDataType, Callable] = {
Float32: PySeries.new_opt_f32,
Float64: PySeries.new_opt_f64,
Int8: PySeries.new_opt_i8,
Expand All @@ -56,7 +56,7 @@


def polars_type_to_constructor(
dtype: Type[DataType],
dtype: PolarsDataType,
) -> Callable[[str, Sequence[Any], bool], "PySeries"]:
"""
Get the right PySeries constructor for the given Polars dtype.
Expand Down

0 comments on commit 9adb25f

Please sign in to comment.