Skip to content

Commit

Permalink
Many Python type hint fixes (#966)
Browse files Browse the repository at this point in the history
Changes:

    Fixed many type hints that were previously ignored with # type: ignore
    Some of these ignores were masking bugs/unwanted behaviour. Fixes included in this PR:
        DataFrame/LazyFrame.join now properly handles Expr arguments for left_on/right_on.
        Series.__setitem__ now uses .cast(UInt64) instead of the non-existent .cast_u64()
        GBSelection.apply now raises a TypeError when selection=None (result of call from select_all)
    Some lines require # type: ignore due to limitations in mypy. For these lines, the error code has been included, e.g. # type: ignore[override]. This means that unintended type errors are still caught.
    Removed return type hint from __init__ functions, as this is not required in the latest mypy versions (since it is always None).
    Unused private function _as_float_ndarray deleted from utils.py

To do in a future PR:

    Fix the last three nasty `# type: ignore` statements
    Properly type hint all the file IO functions; these appear to be incomplete at times
  • Loading branch information
stinodego authored and ritchie46 committed Jul 14, 2021
1 parent c0e5617 commit 17f1ce0
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 120 deletions.
7 changes: 4 additions & 3 deletions py-polars/polars/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(
elements: tp.List[str],
tag: str,
attributes: Optional[Dict[str, str]] = None,
) -> None:
):
self.tag = tag
self.elements = elements
self.attributes = attributes
Expand Down Expand Up @@ -130,10 +130,11 @@ def write_body(self) -> None:
def write(self, inner: str) -> None:
self.elements.append(inner)

def render(self) -> None:
def render(self) -> tp.List[str]:
with Tag(self.elements, "table", {"border": "1", "class": "dataframe"}):
self.write_header()
self.write_body()
return self.elements


class NotebookFormatter(HTMLFormatter):
Expand Down Expand Up @@ -166,7 +167,7 @@ def write_style(self) -> None:
template = dedent("\n".join((template_first, template_mid, template_last)))
self.write(template)

def render(self) -> tp.List[str]: # type: ignore
def render(self) -> tp.List[str]:
"""
Return the lines needed to render a HTML table.
"""
Expand Down
9 changes: 0 additions & 9 deletions py-polars/polars/ffi.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,3 @@ def _ptr_to_numpy(ptr: int, len: int, ptr_type: Any) -> np.ndarray:
"""
ptr_ctype = ctypes.cast(ptr, ctypes.POINTER(ptr_type))
return np.ctypeslib.as_array(ptr_ctype, (len,))


def _as_float_ndarray(ptr: int, size: int) -> np.ndarray:
"""
https://github.com/maciejkula/python-rustlearn
Turn a float* to a numpy array.
"""
return np.core.multiarray.int_asbuffer(ptr, size * np.float32.itemsize) # type: ignore
73 changes: 41 additions & 32 deletions py-polars/polars/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(
self,
data: Union[Dict[str, Sequence], tp.List[Series], np.ndarray],
nullable: bool = True,
) -> None:
):
"""
A DataFrame is a two dimensional data structure that represents data as a table with rows and columns.
"""
Expand Down Expand Up @@ -248,7 +248,7 @@ def read_csv(
for k, v in dtype.items():
dtype_list.append((k, pytype_to_polars_type(v)))

null_values = _process_null_values(null_values) # type: ignore
processed_null_values = _process_null_values(null_values)

self._df = PyDataFrame.read_csv(
file,
Expand All @@ -268,7 +268,7 @@ def read_csv(
dtype_list,
low_memory,
comment_char,
null_values,
processed_null_values,
)
return self

Expand Down Expand Up @@ -723,8 +723,8 @@ def __getitem__(self, item: Any) -> Any:
if isinstance(item, Sequence):
if isinstance(item[0], str):
return wrap_df(self._df.select(item))
if hasattr(item[0], "_pyexpr"):
return self.select(item) # type: ignore
elif isinstance(item[0], pl.Expr):
return self.select(item)

# select rows by mask or index
# df[[1, 2, 3]]
Expand Down Expand Up @@ -1366,8 +1366,8 @@ def downsample(self, by: Union[str, tp.List[str]], rule: str, n: int) -> "GroupB
def join(
self,
df: "DataFrame",
left_on: Optional[Union[str, tp.List[str], "Expr", tp.List["Expr"]]] = None,
right_on: Optional[Union[str, tp.List[str], "Expr", tp.List["Expr"]]] = None,
left_on: Optional[Union[str, "Expr", tp.List[str], tp.List["Expr"]]] = None,
right_on: Optional[Union[str, "Expr", tp.List[str], tp.List["Expr"]]] = None,
on: Optional[Union[str, tp.List[str]]] = None,
how: str = "inner",
) -> Union["DataFrame", "LazyFrame"]:
Expand Down Expand Up @@ -1446,23 +1446,32 @@ def join(
if how == "cross":
return wrap_df(self._df.join(df._df, [], [], how))

if isinstance(left_on, str):
left_on = [left_on]
if isinstance(right_on, str):
right_on = [right_on]
left_on_: Union[tp.List[str], tp.List[pl.Expr], None]
if isinstance(left_on, (str, pl.Expr)):
left_on_ = [left_on] # type: ignore[assignment]
else:
left_on_ = left_on

right_on_: Union[tp.List[str], tp.List[pl.Expr], None]
if isinstance(right_on, (str, pl.Expr)):
right_on_ = [right_on] # type: ignore[assignment]
else:
right_on_ = right_on

if isinstance(on, str):
left_on = [on]
right_on = [on]
left_on_ = [on]
right_on_ = [on]
elif isinstance(on, list):
left_on = on
right_on = on
if left_on is None or right_on is None:
left_on_ = on
right_on_ = on

if left_on_ is None or right_on_ is None:
raise ValueError("You should pass the column to join on as an argument.")
if isinstance(left_on[0], pl.Expr) or isinstance(right_on[0], pl.Expr): # type: ignore
return self.lazy().join(df.lazy(), left_on, right_on, how=how)

return wrap_df(self._df.join(df._df, left_on, right_on, how))
if isinstance(left_on_[0], pl.Expr) or isinstance(right_on_[0], pl.Expr):
return self.lazy().join(df.lazy(), left_on, right_on, how=how)
else:
return wrap_df(self._df.join(df._df, left_on_, right_on_, how))

def apply(
self,
Expand Down Expand Up @@ -1742,7 +1751,7 @@ def lazy(self) -> "LazyFrame":
return wrap_ldf(self._df.lazy())

def select(
self, exprs: Union[str, "Expr", tp.List[str], tp.List["Expr"]]
self, exprs: Union[str, "Expr", Sequence[str], Sequence["Expr"]]
) -> "DataFrame":
"""
Select columns from this DataFrame.
Expand Down Expand Up @@ -2033,7 +2042,7 @@ def __init__(
downsample: bool = False,
rule: Optional[str] = None,
downsample_n: int = 0,
) -> None:
):
self._df = df
self.by = by
self.downsample = downsample
Expand Down Expand Up @@ -2167,8 +2176,8 @@ def agg(
elif isinstance(column_to_agg, list):

if isinstance(column_to_agg[0], tuple):
column_to_agg = [ # type: ignore
(column, [agg] if isinstance(agg, str) else agg) # type: ignore
column_to_agg = [ # type: ignore[misc]
(column, [agg] if isinstance(agg, str) else agg) # type: ignore[misc]
for (column, agg) in column_to_agg
]

Expand All @@ -2177,7 +2186,7 @@ def agg(
wrap_df(self._df)
.lazy()
.groupby(self.by)
.agg(column_to_agg) # type: ignore
.agg(column_to_agg) # type: ignore[arg-type]
.collect(no_optimization=True, string_cache=False)
)

Expand Down Expand Up @@ -2316,7 +2325,7 @@ def __init__(
by: Union[str, tp.List[str]],
pivot_column: str,
values_column: str,
) -> None:
):
self._df = df
self.by = by
self.pivot_column = pivot_column
Expand Down Expand Up @@ -2392,7 +2401,7 @@ def __init__(
downsample: bool = False,
rule: Optional[str] = None,
downsample_n: int = 0,
) -> None:
):
self._df = df
self.by = by
self.selection = selection
Expand Down Expand Up @@ -2491,18 +2500,18 @@ def agg_list(self) -> DataFrame:

def apply(
self,
func: Union[Callable[[Any], Any], Callable[[Any], Any]],
func: Callable[[Any], Any],
return_dtype: Optional[Type[DataType]] = None,
) -> DataFrame:
"""
Apply a function over the groups.
"""
df = self.agg_list()
if isinstance(self.selection, str):
selection = [self.selection]
else:
selection = self.selection
for name in selection: # type: ignore
if self.selection is None:
raise TypeError(
"apply not available for Groupby.select_all(). Use select() instead."
)
for name in self.selection:
s = df.drop_in_place(name + "_agg_list").apply(func, return_dtype)
s.rename(name, in_place=True)
df[name] = s
Expand Down
30 changes: 15 additions & 15 deletions py-polars/polars/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def read_csv(
[f"column_{int(column[1:]) + 1}" for column in tbl.column_names]
)

return from_arrow(tbl, rechunk) # type: ignore
return from_arrow(tbl, rechunk) # type: ignore[return-value]

with _prepare_file_arg(file, **storage_options) as data:
df = DataFrame.read_csv(
Expand Down Expand Up @@ -414,11 +414,11 @@ def read_ipc(
"""
storage_options = storage_options or {}
with _prepare_file_arg(file, **storage_options) as data:
return DataFrame.read_ipc(data, use_pyarrow) # type: ignore
return DataFrame.read_ipc(data, use_pyarrow)


def read_parquet(
source: Union[str, BinaryIO, Path, tp.List[str]],
source: Union[str, BinaryIO],
stop_after_n_rows: Optional[int] = None,
memory_map: bool = True,
columns: Optional[tp.List[str]] = None,
Expand Down Expand Up @@ -453,8 +453,10 @@ def read_parquet(
storage_options = storage_options or {}
with _prepare_file_arg(source, **storage_options) as source_prep:
if stop_after_n_rows is not None:
return DataFrame.read_parquet(source_prep, stop_after_n_rows=stop_after_n_rows) # type: ignore
return from_arrow( # type: ignore
return DataFrame.read_parquet(
source_prep, stop_after_n_rows=stop_after_n_rows
)
return from_arrow( # type: ignore[return-value]
pa.parquet.read_table(
source_prep, memory_map=memory_map, columns=columns, **kwargs
)
Expand Down Expand Up @@ -533,7 +535,7 @@ def _from_pandas_helper(a: pd.Series) -> pa.Array: # noqa: F821
def from_pandas(
df: Union[pd.DataFrame, pd.Series, pd.DatetimeIndex],
rechunk: bool = True, # noqa: F821
) -> DataFrame:
) -> Union[Series, DataFrame]:
"""
Convert from a pandas DataFrame to a polars DataFrame.
Expand All @@ -549,7 +551,7 @@ def from_pandas(
A Polars DataFrame
"""
if isinstance(df, pd.Series) or isinstance(df, pd.DatetimeIndex):
return from_arrow(_from_pandas_helper(df)) # type: ignore
return from_arrow(_from_pandas_helper(df))

# Note: we first tried to infer the schema via pyarrow and then modify the schema if needed.
# However arrow 3.0 determines the type of a string like this:
Expand All @@ -562,10 +564,10 @@ def from_pandas(
data[name] = _from_pandas_helper(s)

table = pa.table(data)
return from_arrow(table, rechunk) # type: ignore
return from_arrow(table, rechunk)


def concat(dfs: tp.List[DataFrame], rechunk: bool = True) -> DataFrame:
def concat(dfs: Sequence[DataFrame], rechunk: bool = True) -> DataFrame:
"""
Aggregate all the Dataframes in a List of DataFrames to a single DataFrame.
Expand All @@ -580,7 +582,7 @@ def concat(dfs: tp.List[DataFrame], rechunk: bool = True) -> DataFrame:
df = dfs[0].clone()
for i in range(1, len(dfs)):
try:
df = df.vstack(dfs[i], in_place=False) # type: ignore
df = df.vstack(dfs[i], in_place=False) # type: ignore[assignment]
# could have a double borrow (one mutable one ref)
except RuntimeError:
df.vstack(dfs[i].clone(), in_place=True)
Expand Down Expand Up @@ -613,9 +615,7 @@ def repeat(val: Union[int, float, str], n: int, name: Optional[str] = None) -> S
return Series.from_arrow(name, pa.repeat(val, n))


def read_json(
source: Union[str, StringIO, Path],
) -> DataFrame:
def read_json(source: Union[str, BytesIO]) -> DataFrame:
"""
Read into a DataFrame from JSON format.
Expand All @@ -624,7 +624,7 @@ def read_json(
source
Path to a file or a file like object.
"""
return DataFrame.read_json(source) # type: ignore
return DataFrame.read_json(source)


def from_rows(
Expand Down Expand Up @@ -682,7 +682,7 @@ def read_sql(sql: str, engine: Any) -> DataFrame:
# conversion from pandas to arrow is very cheap compared to db driver
import pandas as pd

return from_pandas(pd.read_sql(sql, engine))
return from_pandas(pd.read_sql(sql, engine)) # type: ignore[return-value]
except ImportError:
from sqlalchemy import text

Expand Down

0 comments on commit 17f1ce0

Please sign in to comment.