Skip to content

Commit

Permalink
feat[rust, python]: integrate missing us timeunit functionality for…
Browse files Browse the repository at this point in the history
… duration/timedelta (#4584)
  • Loading branch information
alexander-beedie committed Aug 27, 2022
1 parent d718f6a commit 828944d
Show file tree
Hide file tree
Showing 18 changed files with 207 additions and 63 deletions.
10 changes: 10 additions & 0 deletions polars/polars-core/src/chunked_array/logical/duration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ impl LogicalType for DurationChunked {
.into_duration(TimeUnit::Microseconds)
.into_series())
}
(Duration(TimeUnit::Microseconds), Duration(TimeUnit::Milliseconds)) => {
Ok((self.0.as_ref() / 1_000i64)
.into_duration(TimeUnit::Milliseconds)
.into_series())
}
(Duration(TimeUnit::Microseconds), Duration(TimeUnit::Nanoseconds)) => {
Ok((self.0.as_ref() * 1_000i64)
.into_duration(TimeUnit::Nanoseconds)
.into_series())
}
(Duration(TimeUnit::Nanoseconds), Duration(TimeUnit::Milliseconds)) => {
Ok((self.0.as_ref() / 1_000_000i64)
.into_duration(TimeUnit::Milliseconds)
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-core/src/datatypes/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ pub enum SerializableDataType {
/// in days (32 bits).
Date,
/// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in milliseconds (64 bits).
/// in the given ms/us/ns TimeUnit (64 bits).
Datetime(TimeUnit, Option<TimeZone>),
// 64-bit integer representing difference between times in milliseconds or nanoseconds
// 64-bit integer representing difference between times in milli|micro|nano seconds
Duration(TimeUnit),
/// A 64-bit time representing the elapsed time since midnight in nanoseconds
/// A 64-bit time representing the elapsed time since midnight in nanoseconds (`Time` carries no `TimeUnit`).
Time,
List(Box<SerializableDataType>),
Null,
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-lazy/src/dsl/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub struct StrpTimeOptions {
impl Default for StrpTimeOptions {
fn default() -> Self {
StrpTimeOptions {
date_dtype: DataType::Datetime(TimeUnit::Milliseconds, None),
date_dtype: DataType::Datetime(TimeUnit::Microseconds, None),
fmt: None,
strict: false,
exact: false,
Expand Down
14 changes: 13 additions & 1 deletion polars/polars-time/src/chunkedarray/duration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ pub trait DurationMethods {
/// Extract the milliseconds from a `Duration`
fn milliseconds(&self) -> Int64Chunked;

/// Extract the microseconds from a `Duration`
fn microseconds(&self) -> Int64Chunked;

/// Extract the nanoseconds from a `Duration`
fn nanoseconds(&self) -> Int64Chunked;
}
Expand Down Expand Up @@ -69,7 +72,16 @@ impl DurationMethods for DurationChunked {
match self.time_unit() {
TimeUnit::Milliseconds => self.0.clone(),
TimeUnit::Microseconds => self.0.clone() / 1000,
TimeUnit::Nanoseconds => &self.0 / 1_000_000,
TimeUnit::Nanoseconds => &self.0 / NANOSECONDS_IN_MILLISECOND,
}
}

/// Extract the microseconds from a `Duration`
fn microseconds(&self) -> Int64Chunked {
    // Rescale the underlying integer values from the array's own time unit
    // to microseconds.
    let physical = &self.0;
    match self.time_unit() {
        // ns -> us: divide by 1000
        TimeUnit::Nanoseconds => physical / 1000,
        // already in microseconds: hand back a copy of the values
        TimeUnit::Microseconds => physical.clone(),
        // ms -> us: multiply by 1000
        TimeUnit::Milliseconds => physical * 1000,
    }
}

Expand Down
8 changes: 3 additions & 5 deletions py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,14 +472,12 @@ def dtype_to_py_type(dtype: PolarsDataType) -> type:


def is_polars_dtype(data_type: Any) -> bool:
return (
type(data_type) is type
and issubclass(data_type, DataType)
or isinstance(data_type, DataType)
return isinstance(data_type, DataType) or (
type(data_type) is type and issubclass(data_type, DataType)
)


def py_type_to_dtype(data_type: Any) -> type[DataType]:
def py_type_to_dtype(data_type: Any) -> PolarsDataType:
# when the passed in is already a Polars datatype, return that
if is_polars_dtype(data_type):
return data_type
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/datatypes_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
}
for tu in DTYPE_TEMPORAL_UNITS:
_POLARS_TYPE_TO_CONSTRUCTOR[Datetime(tu)] = PySeries.new_opt_i64
_POLARS_TYPE_TO_CONSTRUCTOR[Duration(tu)] = PySeries.new_opt_i64


def polars_type_to_constructor(
Expand Down
9 changes: 5 additions & 4 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Int16,
Int32,
Int64,
PolarsDataType,
UInt8,
UInt16,
UInt32,
Expand Down Expand Up @@ -545,7 +546,7 @@ def _read_csv(
comment_char: str | None = None,
quote_char: str | None = r'"',
skip_rows: int = 0,
dtypes: None | (Mapping[str, type[DataType]] | list[type[DataType]]) = None,
dtypes: None | (Mapping[str, PolarsDataType] | Sequence[PolarsDataType]) = None,
null_values: str | list[str] | dict[str, str] | None = None,
ignore_errors: bool = False,
parse_dates: bool = False,
Expand Down Expand Up @@ -584,14 +585,14 @@ def _read_csv(
if isinstance(file, StringIO):
file = file.getvalue().encode()

dtype_list: list[tuple[str, type[DataType]]] | None = None
dtype_slice: list[type[DataType]] | None = None
dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None
dtype_slice: Sequence[PolarsDataType] | None = None
if dtypes is not None:
if isinstance(dtypes, dict):
dtype_list = []
for k, v in dtypes.items():
dtype_list.append((k, py_type_to_dtype(v)))
elif isinstance(dtypes, list):
elif isinstance(dtypes, Sequence):
dtype_slice = dtypes
else:
raise ValueError("dtype arg should be list or dict")
Expand Down
52 changes: 52 additions & 0 deletions py-polars/polars/internals/expr/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,58 @@ def milliseconds(self) -> pli.Expr:
"""
return pli.wrap_expr(self._pyexpr.duration_milliseconds())

def microseconds(self) -> pli.Expr:
    """
    Extract the microseconds from a Duration type.

    Returns
    -------
    A series of dtype Int64

    Examples
    --------
    >>> from datetime import datetime
    >>> df = pl.DataFrame(
    ...     {
    ...         "date": pl.date_range(
    ...             datetime(2020, 1, 1), datetime(2020, 1, 1, 0, 0, 1, 0), "1ms"
    ...         ),
    ...     }
    ... )
    >>> df.select(
    ...     [
    ...         pl.col("date"),
    ...         pl.col("date").diff().dt.microseconds().alias("microseconds_diff"),
    ...     ]
    ... )
    shape: (1001, 2)
    ┌─────────────────────────┬───────────────────┐
    │ date                    ┆ microseconds_diff │
    │ ---                     ┆ ---               │
    │ datetime[ns]            ┆ i64               │
    ╞═════════════════════════╪═══════════════════╡
    │ 2020-01-01 00:00:00     ┆ null              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:00.001 ┆ 1000              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:00.002 ┆ 1000              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:00.003 ┆ 1000              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...                     ┆ ...               │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:00.997 ┆ 1000              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:00.998 ┆ 1000              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:00.999 ┆ 1000              │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2020-01-01 00:00:01     ┆ 1000              │
    └─────────────────────────┴───────────────────┘

    """
    return pli.wrap_expr(self._pyexpr.duration_microseconds())

def nanoseconds(self) -> pli.Expr:
"""
Extract the nanoseconds from a Duration type.
Expand Down
21 changes: 12 additions & 9 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,15 @@
from typing import TYPE_CHECKING, Any, Callable, Sequence

from polars import internals as pli
from polars.datatypes import DataType, Datetime, Float64, UInt32, py_type_to_dtype
from polars.datatypes import (
DataType,
Datetime,
Float64,
PolarsDataType,
UInt32,
is_polars_dtype,
py_type_to_dtype,
)
from polars.internals.expr.categorical import ExprCatNameSpace
from polars.internals.expr.datetime import ExprDateTimeNameSpace
from polars.internals.expr.list import ExprListNameSpace
Expand Down Expand Up @@ -590,12 +598,7 @@ def exclude(
columns = [columns]
return wrap_expr(self._pyexpr.exclude_dtype(columns))

if not all(
[
isinstance(a, str) or (type(a) is type and issubclass(a, DataType))
for a in columns
]
):
if not all((isinstance(a, str) or is_polars_dtype(a)) for a in columns):
raise ValueError("input should be all string or all DataType")

if isinstance(columns[0], str):
Expand Down Expand Up @@ -1634,7 +1637,7 @@ def mode(self) -> Expr:
"""
return wrap_expr(self._pyexpr.mode())

def cast(self, dtype: type[Any] | DataType, strict: bool = True) -> Expr:
def cast(self, dtype: PolarsDataType | type[Any], strict: bool = True) -> Expr:
"""
Cast between data types.
Expand Down Expand Up @@ -2916,7 +2919,7 @@ def where(self, predicate: Expr) -> Expr:
def map(
self,
f: Callable[[pli.Series], pli.Series | Any],
return_dtype: type[DataType] | None = None,
return_dtype: PolarsDataType | None = None,
agg_list: bool = False,
) -> Expr:
"""
Expand Down
5 changes: 3 additions & 2 deletions py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING

import polars.internals as pli
from polars.datatypes import DataType, Date, Datetime, Time
from polars.datatypes import DataType, Date, Datetime, Time, is_polars_dtype

if TYPE_CHECKING:
from polars.internals.type_aliases import TransferEncoding
Expand Down Expand Up @@ -81,8 +81,9 @@ def strptime(
└────────────┘
"""
if not issubclass(datatype, DataType): # pragma: no cover
if not is_polars_dtype(datatype): # pragma: no cover
raise ValueError(f"expected: {DataType} got: {datatype}")

if datatype == Date:
return pli.wrap_expr(self._pyexpr.str_parse_date(fmt, strict, exact))
elif datatype == Datetime:
Expand Down
29 changes: 9 additions & 20 deletions py-polars/polars/internals/lazy_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,10 @@
Datetime,
Duration,
PolarsDataType,
is_polars_dtype,
py_type_to_dtype,
)
from polars.utils import (
_datetime_to_pl_timestamp,
_timedelta_to_pl_timedelta,
timedelta_in_nanoseconds_window,
)
from polars.utils import _datetime_to_pl_timestamp, _timedelta_to_pl_timedelta

try:
from polars.polars import arange as pyarange
Expand Down Expand Up @@ -173,14 +170,10 @@ def col(
if isinstance(name, list):
if len(name) == 0 or isinstance(name[0], str):
return pli.wrap_expr(pycols(name))
elif (
isclass(name[0])
and issubclass(name[0], DataType)
or isinstance(name[0], DataType)
):
elif is_polars_dtype(name[0]):
return pli.wrap_expr(_dtype_cols(name))
else:
raise ValueError("did expect argument of List[str] or List[DataType]")
raise ValueError("Expected list values to be all `str` or all `DataType`")
return pli.wrap_expr(pycol(name))


Expand Down Expand Up @@ -644,19 +637,15 @@ def lit(value: Any, dtype: type[DataType] | None = None) -> pli.Expr:
if isinstance(value, datetime):
tu = "us"
return lit(_datetime_to_pl_timestamp(value, tu)).cast(Datetime(tu))
if isinstance(value, timedelta):
# TODO: python timedelta should also default to 'us' units.
# (needs some corresponding work on the Rust side first)
if timedelta_in_nanoseconds_window(value):
tu = "ns"
else:
tu = "ms"

elif isinstance(value, timedelta):
tu = "us"
return lit(_timedelta_to_pl_timedelta(value, tu)).cast(Duration(tu))

if isinstance(value, date):
elif isinstance(value, date):
return lit(datetime(value.year, value.month, value.day)).cast(Date)

if isinstance(value, pli.Series):
elif isinstance(value, pli.Series):
name = value.name
value = value._s
e = pli.wrap_expr(pylit(value))
Expand Down
6 changes: 3 additions & 3 deletions py-polars/polars/internals/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from polars import internals as pli
from polars.cfg import Config
from polars.datatypes import DataType, Schema, py_type_to_dtype
from polars.datatypes import DataType, PolarsDataType, Schema, py_type_to_dtype
from polars.internals.lazyframe.groupby import LazyGroupBy
from polars.internals.slice import LazyPolarsSlice
from polars.utils import (
Expand Down Expand Up @@ -88,7 +88,7 @@ def scan_csv(
comment_char: str | None = None,
quote_char: str | None = r'"',
skip_rows: int = 0,
dtypes: dict[str, type[DataType]] | None = None,
dtypes: dict[str, PolarsDataType] | None = None,
null_values: str | list[str] | dict[str, str] | None = None,
ignore_errors: bool = False,
cache: bool = True,
Expand All @@ -114,7 +114,7 @@ def scan_csv(
polars.io.scan_csv
"""
dtype_list: list[tuple[str, type[DataType]]] | None = None
dtype_list: list[tuple[str, PolarsDataType]] | None = None
if dtypes is not None:
dtype_list = []
for k, v in dtypes.items():
Expand Down
10 changes: 10 additions & 0 deletions py-polars/polars/internals/series/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,16 @@ def milliseconds(self) -> pli.Series:
"""

def microseconds(self) -> pli.Series:
    """
    Extract the microseconds from a Duration type.

    Returns
    -------
    A series of dtype Int64

    """

def nanoseconds(self) -> pli.Series:
"""
Extract the nanoseconds from a Duration type.
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
_PYARROW_AVAILABLE = False

from polars.convert import from_arrow
from polars.datatypes import DataType, Utf8
from polars.datatypes import DataType, PolarsDataType, Utf8
from polars.internals import DataFrame, LazyFrame, _scan_ds
from polars.internals.io import _prepare_file_arg

Expand Down Expand Up @@ -420,7 +420,7 @@ def scan_csv(
comment_char: str | None = None,
quote_char: str | None = r'"',
skip_rows: int = 0,
dtypes: dict[str, type[DataType]] | None = None,
dtypes: dict[str, PolarsDataType] | None = None,
null_values: str | list[str] | dict[str, str] | None = None,
ignore_errors: bool = False,
cache: bool = True,
Expand Down
5 changes: 0 additions & 5 deletions py-polars/polars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,6 @@ def in_nanoseconds_window(dt: datetime) -> bool:
return 1386 < dt.year < 2554


def timedelta_in_nanoseconds_window(td: timedelta) -> bool:
    """
    Check whether ``1970-01-01 + td`` falls inside the nanoseconds window.

    The timedelta is added to the Unix epoch and the resulting datetime is
    passed to ``in_nanoseconds_window``.
    """
    return in_nanoseconds_window(datetime(1970, 1, 1) + td)


def _datetime_to_pl_timestamp(dt: datetime, tu: TimeUnit | None) -> int:
"""Convert a python datetime to a timestamp in nanoseconds."""
if tu == "ns":
Expand Down

0 comments on commit 828944d

Please sign in to comment.