Some import/module adjustments (#992)

* Added `__all__` dunders to the `__init__.py` files to solve the problem mentioned above. This caused some issues with mypy, so I had to explicitly import `Series`, `DataFrame`, etc. in the top-level `__init__.py` (I filed a mypy issue for this; I believe there's a bug).
* Removed the `dtype_to_int` function from the `datatypes` module. It was unused internally, and I see no use for this function.
* I tried hiding the wrapping functions (`wrap_s`, `wrap_df`, etc.) from the main scope, as I believe users should never have to use these explicitly. But the Rust backend expects those functions to be there, so I left them for now; something for the future, maybe (possibly just rename them with a leading underscore or something).
* Renamed `lazy/expr_functions` to `lazy/functions` (now possible thanks to the fix for the first-mentioned issue). This conforms to the syntax people know from pyspark: use `from polars.lazy import functions as F` and then use `F.col`, `F.sum`, etc.
* Split up `functions.py` into `io.py` (for all the read functions like `read_csv`, etc.) and `eager/functions.py` (for `concat`, `get_dummies`, etc.).
* Moved `StringCache` and `toggle_string_cache` to their own file.
pola-rs · Jul 20, 2021 · 7732483 · 7732483
1 parent bdab835
commit 7732483
Show file tree

Hide file tree

Showing 14 changed files with 690 additions and 657 deletions.
diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py
@@ -1,8 +1,18 @@
 # flake8: noqa
+
+# mypy needs these imported explicitly
+from polars.eager.frame import DataFrame, wrap_df
+from polars.eager.series import Series, wrap_s
+from polars.lazy.expr import Expr, wrap_expr
+from polars.lazy.frame import LazyFrame, wrap_ldf
+
+from . import datatypes, eager, functions, io, lazy, string_cache
 from .datatypes import *
 from .eager import *
 from .functions import *
+from .io import *
 from .lazy import *
+from .string_cache import *
 
 # during docs building the binary code is not yet available
 try:
@@ -11,3 +21,12 @@
     __version__ = version()
 except ImportError:
     pass
+
+__all__ = (
+    datatypes.__all__
+    + eager.__all__
+    + functions.__all__
+    + io.__all__
+    + lazy.__all__
+    + string_cache.__all__
+)
diff --git a/py-polars/polars/datatypes.py b/py-polars/polars/datatypes.py
@@ -260,16 +260,6 @@ def dtype_to_ctype(dtype: Type[DataType]) -> Type[_SimpleCData]:  # noqa: F821
     return ptr_type
 
 
-def dtype_to_int(dtype: Type[DataType]) -> int:
-    i = 0
-    for dt in DTYPES:
-        if dt == dtype:
-            return i
-        i += 1
-    else:
-        raise NotImplementedError
-
-
 def pytype_to_polars_type(data_type: Type[Any]) -> Type[DataType]:
     polars_type: Type[DataType]
     if data_type == int:

diff --git a/py-polars/polars/eager/__init__.py b/py-polars/polars/eager/__init__.py
@@ -1,3 +1,6 @@
 # flake8: noqa
+from . import frame, series
 from .frame import *
 from .series import *
+
+__all__ = frame.__all__ + series.__all__
diff --git a/py-polars/polars/eager/frame.py b/py-polars/polars/eager/frame.py
@@ -5,7 +5,6 @@
 import typing as tp
 from io import BytesIO, StringIO
 from pathlib import Path
-from types import TracebackType
 from typing import (
     Any,
     BinaryIO,
@@ -34,23 +33,18 @@
 
 try:
     from ..polars import PyDataFrame, PySeries
-    from ..polars import toggle_string_cache as pytoggle_string_cache
 except ImportError:
     import warnings
 
     warnings.warn("binary files missing")
 
-
 try:
     import pandas as pd
 except ImportError:
     pass
 
 __all__ = [
     "DataFrame",
-    "wrap_df",
-    "StringCache",
-    "toggle_string_cache",
 ]
 
 
@@ -131,7 +125,7 @@ def _from_pydf(df: "PyDataFrame") -> "DataFrame":
     @staticmethod
     def from_rows(
         rows: Sequence[Sequence[Any]],
-        column_names: Optional[tp.List[str]] = None,
+        column_names: Optional[Sequence[str]] = None,
         column_name_mapping: Optional[Dict[int, str]] = None,
     ) -> "DataFrame":
         """
@@ -154,7 +148,7 @@ def from_rows(
         self = DataFrame.__new__(DataFrame)
         self._df = PyDataFrame.read_rows(rows)
         if column_names is not None:
-            self.columns = column_names
+            self.columns = list(column_names)
         if column_name_mapping is not None:
             for i, name in column_name_mapping.items():
                 s = self[:, i]
@@ -608,7 +602,7 @@ def __getattr__(self, item: Any) -> "PySeries":
         Access columns as attribute.
         """
         try:
-            return pl.wrap_s(self._df.column(item))
+            return pl.eager.series.wrap_s(self._df.column(item))
         except RuntimeError:
             raise AttributeError(f"{item} not found")
 
@@ -673,7 +667,7 @@ def __getitem__(self, item: Any) -> Any:
                 # df[:, unknown]
                 series = self.__getitem__(col_selection)
                 # s[:]
-                pl.wrap_s(series[row_selection])
+                pl.eager.series.wrap_s(series[row_selection])
 
             # df[2, :] (select row as df)
             if isinstance(row_selection, int):
@@ -706,7 +700,7 @@ def __getitem__(self, item: Any) -> Any:
         # select single column
         # df["foo"]
         if isinstance(item, str):
-            return pl.wrap_s(self._df.column(item))
+            return pl.eager.series.wrap_s(self._df.column(item))
 
         # df[idx]
         if isinstance(item, int):
@@ -1012,7 +1006,7 @@ def describe_cast(self: "DataFrame") -> "DataFrame":
                     columns.append(s)
             return pl.DataFrame(columns)
 
-        summary = pl.concat(
+        summary = pl.functions.concat(
             [
                 describe_cast(self.mean()),
                 describe_cast(self.std()),
@@ -1503,7 +1497,7 @@ def apply(
         return_dtype
             Output type of the operation. If none given, Polars tries to infer the type.
         """
-        return pl.wrap_s(self._df.apply(f, return_dtype))
+        return pl.eager.series.wrap_s(self._df.apply(f, return_dtype))
 
     def with_column(self, column: Union["pl.Series", "pl.Expr"]) -> "DataFrame":
         """
@@ -1608,7 +1602,7 @@ def drop_in_place(self, name: str) -> "pl.Series":
         name
             Column to drop.
         """
-        return pl.wrap_s(self._df.drop_in_place(name))
+        return pl.eager.series.wrap_s(self._df.drop_in_place(name))
 
     def select_at_idx(self, idx: int) -> "pl.Series":
         """
@@ -1619,7 +1613,7 @@ def select_at_idx(self, idx: int) -> "pl.Series":
         idx
             Location of selection.
         """
-        return pl.wrap_s(self._df.select_at_idx(idx))
+        return pl.eager.series.wrap_s(self._df.select_at_idx(idx))
 
     def clone(self) -> "DataFrame":
         """
@@ -1631,7 +1625,7 @@ def get_columns(self) -> tp.List["pl.Series"]:
         """
         Get the DataFrame as a List of Series.
         """
-        return list(map(lambda s: pl.wrap_s(s), self._df.get_columns()))
+        return list(map(lambda s: pl.eager.series.wrap_s(s), self._df.get_columns()))
 
     def fill_none(self, strategy: Union[str, "pl.Expr"]) -> "DataFrame":
         """
@@ -1737,13 +1731,13 @@ def is_duplicated(self) -> "pl.Series":
         """
         Get a mask of all duplicated rows in this DataFrame.
         """
-        return pl.wrap_s(self._df.is_duplicated())
+        return pl.eager.series.wrap_s(self._df.is_duplicated())
 
     def is_unique(self) -> "pl.Series":
         """
         Get a mask of all unique rows in this DataFrame.
         """
-        return pl.wrap_s(self._df.is_unique())
+        return pl.eager.series.wrap_s(self._df.is_unique())
 
     def lazy(self) -> "pl.LazyFrame":
         """
@@ -1759,7 +1753,7 @@ def lazy(self) -> "pl.LazyFrame":
 
         Lazy operations are advised because they allow for query optimization and more parallelization.
         """
-        return pl.wrap_ldf(self._df.lazy())
+        return pl.lazy.frame.wrap_ldf(self._df.lazy())
 
     def select(
         self, exprs: Union[str, "pl.Expr", Sequence[str], Sequence["pl.Expr"]]
@@ -1806,7 +1800,7 @@ def max(self, axis: int = 0) -> "DataFrame":
         if axis == 0:
             return wrap_df(self._df.max())
         if axis == 1:
-            return pl.wrap_s(self._df.hmax()).to_frame()
+            return pl.eager.series.wrap_s(self._df.hmax()).to_frame()
         raise ValueError("Axis should be 0 or 1.")
 
     def min(self, axis: int = 0) -> "DataFrame":
@@ -1816,7 +1810,7 @@ def min(self, axis: int = 0) -> "DataFrame":
         if axis == 0:
             return wrap_df(self._df.min())
         if axis == 1:
-            return pl.wrap_s(self._df.hmin()).to_frame()
+            return pl.eager.series.wrap_s(self._df.hmin()).to_frame()
         raise ValueError("Axis should be 0 or 1.")
 
     def sum(self, axis: int = 0) -> "DataFrame":
@@ -1826,7 +1820,7 @@ def sum(self, axis: int = 0) -> "DataFrame":
         if axis == 0:
             return wrap_df(self._df.sum())
         if axis == 1:
-            return pl.wrap_s(self._df.hsum()).to_frame()
+            return pl.eager.series.wrap_s(self._df.hsum()).to_frame()
         raise ValueError("Axis should be 0 or 1.")
 
     def mean(self, axis: int = 0) -> "DataFrame":
@@ -1836,7 +1830,7 @@ def mean(self, axis: int = 0) -> "DataFrame":
         if axis == 0:
             return wrap_df(self._df.mean())
         if axis == 1:
-            return pl.wrap_s(self._df.hmean()).to_frame()
+            return pl.eager.series.wrap_s(self._df.hmean()).to_frame()
         raise ValueError("Axis should be 0 or 1.")
 
     def std(self) -> "DataFrame":
@@ -2054,7 +2048,7 @@ def hash_rows(
         k3
             seed parameter
         """
-        return pl.wrap_s(self._df.hash_rows(k0, k1, k2, k3))
+        return pl.eager.series.wrap_s(self._df.hash_rows(k0, k1, k2, k3))
 
 
 class GroupBy:
@@ -2551,33 +2545,3 @@ def apply(
             df[name] = s
 
         return df
-
-
-class StringCache:
-    """
-    Context manager that allows data sources to share the same categorical features.
-    This will temporarily cache the string categories until the context manager is finished.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def __enter__(self) -> "StringCache":
-        pytoggle_string_cache(True)
-        return self
-
-    def __exit__(
-        self,
-        exc_type: Optional[Type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> None:
-        pytoggle_string_cache(False)
-
-
-def toggle_string_cache(toggle: bool) -> None:
-    """
-    Turn on/off the global string cache. This ensures that casts to Categorical types have the categories when string
-    values are equal.
-    """
-    pytoggle_string_cache(toggle)
diff --git a/py-polars/polars/eager/series.py b/py-polars/polars/eager/series.py
@@ -49,7 +49,6 @@
 
 __all__ = [
     "Series",
-    "wrap_s",
 ]
 
 
@@ -527,7 +526,7 @@ def to_frame(self) -> "pl.DataFrame":
         """
         Cast this Series to a DataFrame.
         """
-        return pl.wrap_df(PyDataFrame([self._s]))
+        return pl.eager.frame.wrap_df(PyDataFrame([self._s]))
 
     @property
     def dtype(self) -> Type[DataType]:
@@ -640,13 +639,13 @@ def to_dummies(self) -> "pl.DataFrame":
         """
         Get dummy variables.
         """
-        return pl.wrap_df(self._s.to_dummies())
+        return pl.eager.frame.wrap_df(self._s.to_dummies())
 
     def value_counts(self) -> "pl.DataFrame":
         """
         Count the unique values in a Series.
         """
-        return pl.wrap_df(self._s.value_counts())
+        return pl.eager.frame.wrap_df(self._s.value_counts())
 
     @property
     def name(self) -> str: