Tighten mypy config (#2197)

* Tighten mypy config

  Two new settings:
  * `warn_unused_ignores = true` => flags every `# type: ignore` comment in the code that is no longer needed
  * `show_error_codes = true` => prints the error code with each message, so we can define more narrowly what to ignore in the future, if needed

* Fix typing discrepancies between py3.7 and py3.10

* Break out the list comprehension into several small helper functions

* Revert to isclass() and issubclass() calls

  typing.cast calls are needed to resolve the differences between py3.7 and py3.10
pola-rs · Dec 28, 2021 · e66d7c9 · e66d7c9
1 parent 3008586
commit e66d7c9
Show file tree

Hide file tree

Showing 10 changed files with 43 additions and 35 deletions.
diff --git a/py-polars/polars/internals/expr.py b/py-polars/polars/internals/expr.py
@@ -339,7 +339,7 @@ def exclude(
 
         """
         if isinstance(columns, str):
-            columns = [columns]  # type: ignore
+            columns = [columns]
             return wrap_expr(self._pyexpr.exclude(columns))
         elif not isinstance(columns, list) and issubclass(columns, DataType):  # type: ignore
             columns = [columns]  # type: ignore

diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py
@@ -1174,7 +1174,7 @@ def _pos_idx(self, idx: int, dim: int) -> int:
     # __getitem__() mostly returns a dataframe. The major exception is when a string is passed in. Note that there are
     # more subtle cases possible where a non-string value leads to a Series.
     @overload
-    def __getitem__(self, item: str) -> "pli.Series":  # type: ignore
+    def __getitem__(self, item: str) -> "pli.Series":
         ...
 
     @overload
@@ -1272,7 +1272,7 @@ def __getitem__(
                     series_list = [self.to_series(i) for i in col_selection]
                     df = DataFrame(series_list)
                     return df[row_selection]
-            df = self.__getitem__(col_selection)  # type: ignore
+            df = self.__getitem__(col_selection)
             return df.__getitem__(row_selection)
 
         # select single column
@@ -1292,7 +1292,7 @@ def __getitem__(
         if isinstance(item, slice):
             # special case df[::-1]
             if item.start is None and item.stop is None and item.step == -1:
-                return self.select(pli.col("*").reverse())  # type: ignore
+                return self.select(pli.col("*").reverse())
 
             if getattr(item, "end", False):
                 raise ValueError("A slice with steps larger than 1 is not supported.")
@@ -1312,7 +1312,7 @@ def __getitem__(
             else:
                 # df[start:stop:step]
                 return self.select(
-                    pli.col("*").slice(start, length).take_every(item.step)  # type: ignore
+                    pli.col("*").slice(start, length).take_every(item.step)
                 )
 
         # select rows by numpy mask or index
@@ -1743,10 +1743,10 @@ def describe_cast(self: "DataFrame") -> "DataFrame":
                 describe_cast(self.median()),
             ]
         )
-        summary.insert_at_idx(  # type: ignore
+        summary.insert_at_idx(
             0, pli.Series("describe", ["mean", "std", "min", "max", "median"])
         )
-        return summary  # type: ignore
+        return summary
 
     def replace_at_idx(self, index: int, series: "pli.Series") -> None:
         """
@@ -4218,20 +4218,24 @@ def agg(
         └─────┴─────┘
 
         """
+
+        # a single list comprehension would be cleaner, but mypy complains on different
+        # lines for py3.7 vs py3.10 about typing errors, so this is the same logic,
+        # but broken down into two small functions
+        def _str_to_list(y: Any) -> Any:
+            return [y] if isinstance(y, str) else y
+
+        def _wrangle(x: Any) -> list:
+            return [(xi[0], _str_to_list(xi[1])) for xi in x]
+
         if isinstance(column_to_agg, pli.Expr):
             column_to_agg = [column_to_agg]
         if isinstance(column_to_agg, dict):
-            column_to_agg = [
-                (column, [agg] if isinstance(agg, str) else agg)
-                for (column, agg) in column_to_agg.items()
-            ]
+            column_to_agg = _wrangle(column_to_agg.items())
         elif isinstance(column_to_agg, list):
 
             if isinstance(column_to_agg[0], tuple):
-                column_to_agg = [  # type: ignore[misc]
-                    (column, [agg] if isinstance(agg, str) else agg)  # type: ignore[misc]
-                    for (column, agg) in column_to_agg
-                ]
+                column_to_agg = _wrangle(column_to_agg)
 
             elif isinstance(column_to_agg[0], pli.Expr):
                 return (

diff --git a/py-polars/polars/internals/lazy_functions.py b/py-polars/polars/internals/lazy_functions.py
@@ -1,6 +1,6 @@
 from datetime import date, datetime, timezone
 from inspect import isclass
-from typing import Any, Callable, List, Optional, Sequence, Type, Union, overload
+from typing import Any, Callable, List, Optional, Sequence, Type, Union, cast, overload
 
 import numpy as np
 
@@ -126,8 +126,11 @@ def col(
     if isinstance(name, pli.Series):
         name = name.to_list()  # type: ignore
 
-    if isclass(name) and issubclass(name, DataType):  # type: ignore
-        name = [name]  # type: ignore
+    # note: we need the typing.cast call here twice to make mypy happy under Python 3.7
+    # On Python 3.10, it is not needed. We use cast as it works across versions, ignoring
+    # the typing error would lead to unneeded ignores under Python 3.10.
+    if isclass(name) and issubclass(cast(type, name), DataType):
+        name = [cast(type, name)]
 
     if isinstance(name, list):
         if len(name) == 0 or isinstance(name[0], str):
@@ -949,13 +952,13 @@ def _datetime(
     day_expr = pli.expr_to_lit_or_expr(day, str_to_lit=False)
 
     if hour is not None:
-        hour = pli.expr_to_lit_or_expr(hour, str_to_lit=False)._pyexpr  # type: ignore
+        hour = pli.expr_to_lit_or_expr(hour, str_to_lit=False)._pyexpr
     if minute is not None:
-        minute = pli.expr_to_lit_or_expr(minute, str_to_lit=False)._pyexpr  # type: ignore
+        minute = pli.expr_to_lit_or_expr(minute, str_to_lit=False)._pyexpr
     if second is not None:
-        second = pli.expr_to_lit_or_expr(second, str_to_lit=False)._pyexpr  # type: ignore
+        second = pli.expr_to_lit_or_expr(second, str_to_lit=False)._pyexpr
     if millisecond is not None:
-        millisecond = pli.expr_to_lit_or_expr(millisecond, str_to_lit=False)._pyexpr  # type: ignore
+        millisecond = pli.expr_to_lit_or_expr(millisecond, str_to_lit=False)._pyexpr
     return pli.wrap_expr(
         py_datetime(
             year_expr._pyexpr,

diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py
@@ -3531,7 +3531,7 @@ def concat(self, other: Union[List[Series], Series]) -> "Series":
         names.insert(0, s.name)
         df = pli.DataFrame(other)
         df.insert_at_idx(0, s)
-        return df.select(pli.concat_list(names))[s.name]  # type: ignore
+        return df.select(pli.concat_list(names))[s.name]
 
     def get(self, index: int) -> "Series":
         """

diff --git a/py-polars/polars/io.py b/py-polars/polars/io.py
@@ -292,9 +292,9 @@ def read_csv(
             if not has_header:
                 # Convert 'column_1', 'column_2', ... column names to 'f0', 'f1', ... column names for pyarrow,
                 # if CSV file does not contain a header.
-                include_columns = [f"f{int(column[7:]) - 1}" for column in columns]  # type: ignore
+                include_columns = [f"f{int(column[7:]) - 1}" for column in columns]
             else:
-                include_columns = columns  # type: ignore
+                include_columns = columns
 
         if not columns and projection:
             # Convert column indices from projection to 'f0', 'f1', ... column names for pyarrow.

diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
@@ -23,6 +23,8 @@ profile = "black"
 
 [tool.mypy]
 disallow_untyped_defs = true
+warn_unused_ignores = true
+show_error_codes = true
 files = ["polars", "tests"]
 
 [[tool.mypy.overrides]]

diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py
@@ -743,8 +743,8 @@ def test_from_pandas_nan_to_none() -> None:
             "nulls": [None, np.nan, np.nan],
         }
     )
-    out_true: pl.DataFrame = pl.from_pandas(df)  # type: ignore
-    out_false: pl.DataFrame = pl.from_pandas(df, nan_to_none=False)  # type: ignore
+    out_true = pl.from_pandas(df)
+    out_false = pl.from_pandas(df, nan_to_none=False)
     df.loc[2, "nulls"] = pd.NA
     assert all(val is None for val in out_true["nulls"])
     assert all(np.isnan(val) for val in out_false["nulls"][1:])
@@ -785,7 +785,7 @@ def test_concat() -> None:
     assert a.shape == (2, 2)
 
     with pytest.raises(ValueError):
-        _ = pl.concat([])  # type: ignore
+        _ = pl.concat([])
 
     with pytest.raises(ValueError):
         pl.concat([df, df], how="rubbish")
@@ -1059,8 +1059,7 @@ def test_rename(df: pl.DataFrame) -> None:
 def test_to_json(df: pl.DataFrame) -> None:
     # text based conversion loses time info
     df = df.select(pl.all().exclude(["cat", "time"]))
-    s: str = df.to_json(to_string=True)  # type: ignore
-    # TODO add overload on to_json()
+    s = df.to_json(to_string=True)
     out = pl.read_json(s)
     assert df.frame_equal(out, null_equal=True)
 
@@ -1124,7 +1123,7 @@ def test_join_dates() -> None:
     )
     dts = (
         pl.from_pandas(date_times)
-        .apply(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))  # type: ignore
+        .apply(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))
         .cast(pl.Datetime)
     )
 

diff --git a/py-polars/tests/test_interop.py b/py-polars/tests/test_interop.py
@@ -108,7 +108,7 @@ def test_from_arrow() -> None:
 
     # if not a PyArrow type, raise a ValueError
     with pytest.raises(ValueError):
-        _ = pl.from_arrow([1, 2])  # type: ignore
+        _ = pl.from_arrow([1, 2])
 
 
 def test_from_pandas_dataframe() -> None:

diff --git a/py-polars/tests/test_io.py b/py-polars/tests/test_io.py
@@ -48,7 +48,7 @@ def test_select_columns_and_projection_from_buffer() -> None:
     df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]})
     expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]})
     for to_fn, from_fn in zip(
-        [df.to_parquet, df.to_ipc], [pl.read_parquet, pl.read_ipc]  # type: ignore
+        [df.to_parquet, df.to_ipc], [pl.read_parquet, pl.read_ipc]
     ):
         f = io.BytesIO()
         to_fn(f)  # type: ignore
@@ -58,7 +58,7 @@ def test_select_columns_and_projection_from_buffer() -> None:
         assert df_1.frame_equal(expected)
 
     for to_fn, from_fn in zip(
-        [df.to_parquet, df.to_ipc], [pl.read_parquet, pl.read_ipc]  # type: ignore
+        [df.to_parquet, df.to_ipc], [pl.read_parquet, pl.read_ipc]
     ):
         f = io.BytesIO()
         to_fn(f)  # type: ignore

diff --git a/py-polars/tests/test_series.py b/py-polars/tests/test_series.py
@@ -929,7 +929,7 @@ def test_abs() -> None:
     )
     testing.assert_series_equal(
         pl.select(pl.lit(s).abs()).to_series(), pl.Series([1.0, 2.0, 3.0, 4.0])
-    )  # type: ignore
+    )
 
 
 def test_to_dummies() -> None: