Skip to content

Commit

Permalink
Some import/module adjustments (#992)
Browse files Browse the repository at this point in the history
* Added __all__ dunders to the __init__.py files to solve the problem mentioned above.
This caused some issues with mypy, so I had to explicitly import Series, DataFrame, etc. in the top __init__.py (I filed a mypy issue for this... I believe there's a bug).
* Removed the dtype_to_int function in the datatypes module. It was unused internally, and I see no use for this function.
* I tried hiding the wrapping functions (wrap_s, wrap_df, etc.) from the main scope, as I believe users should never have to explicitly use these. But the Rust backend expects those functions to be there. I left them for now; something for the future maybe (possibly just rename them with a leading underscore or something).
* Renamed `lazy/expr_functions` to `lazy/functions` (now possible thanks to the fix to the first mentioned issue). This conforms to the syntax people know from pyspark: write `from polars.lazy import functions as F` and then use `F.col`, `F.sum`, etc.
* Split up `functions.py` into `io.py` (for all the read functions like `read_csv`, etc.) and `eager/functions.py` (for `concat`, `get_dummies`, etc.).
* Moved StringCache and toggle_string_cache to their own file.
  • Loading branch information
stinodego authored and ritchie46 committed Jul 20, 2021
1 parent bdab835 commit 7732483
Show file tree
Hide file tree
Showing 14 changed files with 690 additions and 657 deletions.
19 changes: 19 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
# flake8: noqa

# mypy needs these imported explicitly
from polars.eager.frame import DataFrame, wrap_df
from polars.eager.series import Series, wrap_s
from polars.lazy.expr import Expr, wrap_expr
from polars.lazy.frame import LazyFrame, wrap_ldf

from . import datatypes, eager, functions, io, lazy, string_cache
from .datatypes import *
from .eager import *
from .functions import *
from .io import *
from .lazy import *
from .string_cache import *

# during docs building the binary code is not yet available
try:
Expand All @@ -11,3 +21,12 @@
__version__ = version()
except ImportError:
pass

# Public names of the top-level package: the concatenation of the `__all__`
# sequences of each submodule imported above.
__all__ = (
datatypes.__all__
+ eager.__all__
+ functions.__all__
+ io.__all__
+ lazy.__all__
+ string_cache.__all__
)
10 changes: 0 additions & 10 deletions py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,16 +260,6 @@ def dtype_to_ctype(dtype: Type[DataType]) -> Type[_SimpleCData]: # noqa: F821
return ptr_type


def dtype_to_int(dtype: Type[DataType]) -> int:
    """
    Return the position of a data type within ``DTYPES``.

    Parameters
    ----------
    dtype
        Data type to look up.

    Returns
    -------
    Zero-based index of the first entry in ``DTYPES`` that compares equal to ``dtype``.

    Raises
    ------
    NotImplementedError
        If no entry in ``DTYPES`` compares equal to ``dtype``.
    """
    # enumerate keeps the index in step with the entry; no manual counter needed.
    for i, dt in enumerate(DTYPES):
        if dt == dtype:
            return i
    # Only reached when the loop finished without a match.
    raise NotImplementedError(f"{dtype} has no position in DTYPES")


def pytype_to_polars_type(data_type: Type[Any]) -> Type[DataType]:
polars_type: Type[DataType]
if data_type == int:
Expand Down
3 changes: 3 additions & 0 deletions py-polars/polars/eager/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# flake8: noqa
from . import frame, series
from .frame import *
from .series import *

__all__ = frame.__all__ + series.__all__
72 changes: 18 additions & 54 deletions py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import typing as tp
from io import BytesIO, StringIO
from pathlib import Path
from types import TracebackType
from typing import (
Any,
BinaryIO,
Expand Down Expand Up @@ -34,23 +33,18 @@

try:
from ..polars import PyDataFrame, PySeries
from ..polars import toggle_string_cache as pytoggle_string_cache
except ImportError:
import warnings

warnings.warn("binary files missing")


try:
import pandas as pd
except ImportError:
pass

__all__ = [
"DataFrame",
"wrap_df",
"StringCache",
"toggle_string_cache",
]


Expand Down Expand Up @@ -131,7 +125,7 @@ def _from_pydf(df: "PyDataFrame") -> "DataFrame":
@staticmethod
def from_rows(
rows: Sequence[Sequence[Any]],
column_names: Optional[tp.List[str]] = None,
column_names: Optional[Sequence[str]] = None,
column_name_mapping: Optional[Dict[int, str]] = None,
) -> "DataFrame":
"""
Expand All @@ -154,7 +148,7 @@ def from_rows(
self = DataFrame.__new__(DataFrame)
self._df = PyDataFrame.read_rows(rows)
if column_names is not None:
self.columns = column_names
self.columns = list(column_names)
if column_name_mapping is not None:
for i, name in column_name_mapping.items():
s = self[:, i]
Expand Down Expand Up @@ -608,7 +602,7 @@ def __getattr__(self, item: Any) -> "PySeries":
Access columns as attribute.
"""
try:
return pl.wrap_s(self._df.column(item))
return pl.eager.series.wrap_s(self._df.column(item))
except RuntimeError:
raise AttributeError(f"{item} not found")

Expand Down Expand Up @@ -673,7 +667,7 @@ def __getitem__(self, item: Any) -> Any:
# df[:, unknown]
series = self.__getitem__(col_selection)
# s[:]
pl.wrap_s(series[row_selection])
pl.eager.series.wrap_s(series[row_selection])

# df[2, :] (select row as df)
if isinstance(row_selection, int):
Expand Down Expand Up @@ -706,7 +700,7 @@ def __getitem__(self, item: Any) -> Any:
# select single column
# df["foo"]
if isinstance(item, str):
return pl.wrap_s(self._df.column(item))
return pl.eager.series.wrap_s(self._df.column(item))

# df[idx]
if isinstance(item, int):
Expand Down Expand Up @@ -1012,7 +1006,7 @@ def describe_cast(self: "DataFrame") -> "DataFrame":
columns.append(s)
return pl.DataFrame(columns)

summary = pl.concat(
summary = pl.functions.concat(
[
describe_cast(self.mean()),
describe_cast(self.std()),
Expand Down Expand Up @@ -1503,7 +1497,7 @@ def apply(
return_dtype
Output type of the operation. If none given, Polars tries to infer the type.
"""
return pl.wrap_s(self._df.apply(f, return_dtype))
return pl.eager.series.wrap_s(self._df.apply(f, return_dtype))

def with_column(self, column: Union["pl.Series", "pl.Expr"]) -> "DataFrame":
"""
Expand Down Expand Up @@ -1608,7 +1602,7 @@ def drop_in_place(self, name: str) -> "pl.Series":
name
Column to drop.
"""
return pl.wrap_s(self._df.drop_in_place(name))
return pl.eager.series.wrap_s(self._df.drop_in_place(name))

def select_at_idx(self, idx: int) -> "pl.Series":
"""
Expand All @@ -1619,7 +1613,7 @@ def select_at_idx(self, idx: int) -> "pl.Series":
idx
Location of selection.
"""
return pl.wrap_s(self._df.select_at_idx(idx))
return pl.eager.series.wrap_s(self._df.select_at_idx(idx))

def clone(self) -> "DataFrame":
"""
Expand All @@ -1631,7 +1625,7 @@ def get_columns(self) -> tp.List["pl.Series"]:
"""
Get the DataFrame as a List of Series.
"""
return list(map(lambda s: pl.wrap_s(s), self._df.get_columns()))
return list(map(lambda s: pl.eager.series.wrap_s(s), self._df.get_columns()))

def fill_none(self, strategy: Union[str, "pl.Expr"]) -> "DataFrame":
"""
Expand Down Expand Up @@ -1737,13 +1731,13 @@ def is_duplicated(self) -> "pl.Series":
"""
Get a mask of all duplicated rows in this DataFrame.
"""
return pl.wrap_s(self._df.is_duplicated())
return pl.eager.series.wrap_s(self._df.is_duplicated())

def is_unique(self) -> "pl.Series":
"""
Get a mask of all unique rows in this DataFrame.
"""
return pl.wrap_s(self._df.is_unique())
return pl.eager.series.wrap_s(self._df.is_unique())

def lazy(self) -> "pl.LazyFrame":
"""
Expand All @@ -1759,7 +1753,7 @@ def lazy(self) -> "pl.LazyFrame":
Lazy operations are advised because they allow for query optimization and more parallelization.
"""
return pl.wrap_ldf(self._df.lazy())
return pl.lazy.frame.wrap_ldf(self._df.lazy())

def select(
self, exprs: Union[str, "pl.Expr", Sequence[str], Sequence["pl.Expr"]]
Expand Down Expand Up @@ -1806,7 +1800,7 @@ def max(self, axis: int = 0) -> "DataFrame":
if axis == 0:
return wrap_df(self._df.max())
if axis == 1:
return pl.wrap_s(self._df.hmax()).to_frame()
return pl.eager.series.wrap_s(self._df.hmax()).to_frame()
raise ValueError("Axis should be 0 or 1.")

def min(self, axis: int = 0) -> "DataFrame":
Expand All @@ -1816,7 +1810,7 @@ def min(self, axis: int = 0) -> "DataFrame":
if axis == 0:
return wrap_df(self._df.min())
if axis == 1:
return pl.wrap_s(self._df.hmin()).to_frame()
return pl.eager.series.wrap_s(self._df.hmin()).to_frame()
raise ValueError("Axis should be 0 or 1.")

def sum(self, axis: int = 0) -> "DataFrame":
Expand All @@ -1826,7 +1820,7 @@ def sum(self, axis: int = 0) -> "DataFrame":
if axis == 0:
return wrap_df(self._df.sum())
if axis == 1:
return pl.wrap_s(self._df.hsum()).to_frame()
return pl.eager.series.wrap_s(self._df.hsum()).to_frame()
raise ValueError("Axis should be 0 or 1.")

def mean(self, axis: int = 0) -> "DataFrame":
Expand All @@ -1836,7 +1830,7 @@ def mean(self, axis: int = 0) -> "DataFrame":
if axis == 0:
return wrap_df(self._df.mean())
if axis == 1:
return pl.wrap_s(self._df.hmean()).to_frame()
return pl.eager.series.wrap_s(self._df.hmean()).to_frame()
raise ValueError("Axis should be 0 or 1.")

def std(self) -> "DataFrame":
Expand Down Expand Up @@ -2054,7 +2048,7 @@ def hash_rows(
k3
seed parameter
"""
return pl.wrap_s(self._df.hash_rows(k0, k1, k2, k3))
return pl.eager.series.wrap_s(self._df.hash_rows(k0, k1, k2, k3))


class GroupBy:
Expand Down Expand Up @@ -2551,33 +2545,3 @@ def apply(
df[name] = s

return df


class StringCache:
    """
    Context manager that lets data sources share the same categorical features.

    The string categories are cached for as long as the ``with`` block is active:
    entering the block calls ``pytoggle_string_cache(True)`` and leaving it calls
    ``pytoggle_string_cache(False)``.
    """

    def __init__(self) -> None:
        """Create the context manager; it holds no state of its own."""

    def __enter__(self) -> "StringCache":
        # Switch the cache on before handing the manager to the `with` body.
        pytoggle_string_cache(True)
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        # Always switches the cache off, whether or not the block raised.
        # Returning None lets any exception from the block propagate.
        pytoggle_string_cache(False)


def toggle_string_cache(toggle: bool) -> None:
    """
    Switch the global string cache on or off.

    With the cache enabled, casts to Categorical types end up with the same
    categories whenever the string values are equal.

    Parameters
    ----------
    toggle
        Forwarded unchanged to the backend toggle; ``True`` turns the cache on,
        ``False`` turns it off.
    """
    pytoggle_string_cache(toggle)
7 changes: 3 additions & 4 deletions py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@

__all__ = [
"Series",
"wrap_s",
]


Expand Down Expand Up @@ -527,7 +526,7 @@ def to_frame(self) -> "pl.DataFrame":
"""
Cast this Series to a DataFrame.
"""
return pl.wrap_df(PyDataFrame([self._s]))
return pl.eager.frame.wrap_df(PyDataFrame([self._s]))

@property
def dtype(self) -> Type[DataType]:
Expand Down Expand Up @@ -640,13 +639,13 @@ def to_dummies(self) -> "pl.DataFrame":
"""
Get dummy variables.
"""
return pl.wrap_df(self._s.to_dummies())
return pl.eager.frame.wrap_df(self._s.to_dummies())

def value_counts(self) -> "pl.DataFrame":
"""
Count the unique values in a Series.
"""
return pl.wrap_df(self._s.value_counts())
return pl.eager.frame.wrap_df(self._s.value_counts())

@property
def name(self) -> str:
Expand Down

0 comments on commit 7732483

Please sign in to comment.