Skip to content

Commit

Permalink
Roll-out type checking with mypy (#171)
Browse files Browse the repository at this point in the history
Roll-out mypy ...

Co-authored-by: Amirhessam Tahmassebi <admin@slickml.com>
  • Loading branch information
amirhessam88 and Amirhessam Tahmassebi committed Nov 28, 2022
1 parent 1f91289 commit 664df08
Show file tree
Hide file tree
Showing 33 changed files with 429 additions and 355 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
autoapi_python_use_implicit_namespaces = False
autoapi_prepare_jinja_env = None
autoapi_keep_files = False
suppress_warnings = []
suppress_warnings = [] # type: ignore

# -- Options for View-Code -------------------------------------------------
viewcode_follow_imported_members = True
Expand Down
8 changes: 3 additions & 5 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# More details on how to update this config file: https://mypy.readthedocs.io/en/stable/config_file.html
# TODO(amir): Still a lot of advanced options have not been added here!
# TODO(amir): Currently we use `ignore_errors = True` which would ignore all non-fatal errors
# gradually, we have to turn on `ignore_missing_imports = False` and `strict = True`
# More details on how to update this config file: https://mypy.readthedocs.io/en/stable/config_file.html
# TODO(amir): gradually, we have to turn on `strict = True`
# More details on strategies on how to use `mypy` in large code-base:
# - https://blog.wolt.com/engineering/2021/09/30/professional-grade-mypy-configuration/
# - https://dropbox.tech/application/our-journey-to-type-checking-4-million-lines-of-python
Expand All @@ -18,7 +16,7 @@ disallow_untyped_defs = True
no_implicit_optional = True

# --- errors ---
ignore_errors = True
ignore_errors = False

# --- imports ----
ignore_missing_imports = True
Expand Down
58 changes: 50 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@ flake8-type-checking = "^2.3"
flake8-typing-imports = "^1.12"
flake8-use-fstring = "^1.4"
pep8-naming = "^0.13"

# --- type-checking ---
mypy = "^0.991"
pandas-stubs = "^1.5"
data-science-types = "^0.2"

# --- unit-testing ---
pytest = "^7.2"
Expand Down
10 changes: 5 additions & 5 deletions src/slickml/base/_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ class BaseXGBoostEstimator(ABC, BaseEstimator):
.. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
"""

num_boost_round: int
sparse_matrix: bool
scale_mean: bool
scale_std: bool
importance_type: str
num_boost_round: Optional[int]
sparse_matrix: Optional[bool]
scale_mean: Optional[bool]
scale_std: Optional[bool]
importance_type: Optional[str]
params: Optional[Dict[str, Union[str, float, int]]] = None

def __post_init__(self) -> None:
Expand Down
6 changes: 3 additions & 3 deletions src/slickml/classification/_xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ def plot_shap_waterfall(
return_fig=return_fig,
)

def get_params(self) -> Dict[str, Union[str, float, int]]:
def get_params(self) -> Optional[Dict[str, Union[str, float, int]]]:
"""Returns the final set of train parameters.
The default set of parameters will be updated with the new ones that passed to ``params``.
Expand Down Expand Up @@ -653,7 +653,7 @@ def _model(self) -> xgb.Booster:
return xgb.train(
params=self.params,
dtrain=self.dtrain_,
num_boost_round=self.num_boost_round - 1,
num_boost_round=self.num_boost_round, # type: ignore
)

def _explainer(self) -> None:
Expand Down Expand Up @@ -682,7 +682,7 @@ def _imp_to_df(self) -> pd.DataFrame:
-------
pd.DataFrame
"""
data = {
data: Dict[str, List[float]] = {
"feature": [],
f"{self.importance_type}": [],
}
Expand Down
61 changes: 32 additions & 29 deletions src/slickml/classification/_xgboostcv.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class XGBoostCVClassifier(XGBoostClassifier):
scale_mean: Optional[bool] = False
scale_std: Optional[bool] = False
importance_type: Optional[str] = "total_gain"
params: Optional[Dict] = None
params: Optional[Dict[str, Union[str, float, int]]] = None
verbose: Optional[bool] = True
callbacks: Optional[bool] = False

Expand Down Expand Up @@ -414,7 +414,9 @@ def _callbacks(self) -> None:
None
"""
if self.callbacks:
self.callbacks = [
# TODO(amir): we receive bool from user and define callbacks; so mypy complains
# we prolly need to use type overloads here
self.callbacks = [ # type: ignore
xgb.callback.EvaluationMonitor(
rank=0,
period=1,
Expand All @@ -437,30 +439,31 @@ def _verbose_log(self) -> None:
-------
None
"""
print(
str(Colors.BOLD)
+ "*-* "
+ str(Colors.GREEN)
+ f"Best Boosting Round = {len(self.cv_results_) - 1}"
+ str(Colors.END)
+ str(Colors.BOLD)
+ " -*- "
+ str(Colors.F_Red)
+ f"{self.n_splits}-Folds CV {self.metrics.upper()}: "
+ str(Colors.END)
+ str(Colors.BOLD)
+ str(Colors.B_Blue)
+ f"Train = {self.cv_results_.iloc[-1][0]:.3f}"
+ " +/- "
+ f"{self.cv_results_.iloc[-1][1]:.3f}"
+ str(Colors.END)
+ str(Colors.BOLD)
+ " -*- "
+ str(Colors.B_Magenta)
+ f"Test = {self.cv_results_.iloc[-1][2]:.3f}"
+ " +/- "
+ f"{self.cv_results_.iloc[-1][3]:.3f}"
+ str(Colors.END)
+ str(Colors.BOLD)
+ " *-*",
)
if self.metrics is not None:
print(
str(Colors.BOLD)
+ "*-* "
+ str(Colors.GREEN)
+ f"Best Boosting Round = {len(self.cv_results_) - 1}"
+ str(Colors.END)
+ str(Colors.BOLD)
+ " -*- "
+ str(Colors.F_Red)
+ f"{self.n_splits}-Folds CV {self.metrics.upper()}: "
+ str(Colors.END)
+ str(Colors.BOLD)
+ str(Colors.B_Blue)
+ f"Train = {self.cv_results_.iloc[-1][0]:.3f}"
+ " +/- "
+ f"{self.cv_results_.iloc[-1][1]:.3f}"
+ str(Colors.END)
+ str(Colors.BOLD)
+ " -*- "
+ str(Colors.B_Magenta)
+ f"Test = {self.cv_results_.iloc[-1][2]:.3f}"
+ " +/- "
+ f"{self.cv_results_.iloc[-1][3]:.3f}"
+ str(Colors.END)
+ str(Colors.BOLD)
+ " *-*",
)
5 changes: 3 additions & 2 deletions src/slickml/metrics/_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class BinaryClassificationMetrics:
precision_digits: Optional[int] = 3
display_df: Optional[bool] = True

def __post_init__(self):
def __post_init__(self) -> None:
"""Post instantiation validations and assignments."""
check_var(
self.y_true,
Expand Down Expand Up @@ -243,7 +243,8 @@ def __post_init__(self):
dtypes=bool,
)
# TODO(amir): add `values_between` option to `check_var()`
if self.threshold < 0.0 or self.threshold > 1.0:

if self.threshold is not None and (self.threshold < 0.0 or self.threshold > 1.0):
raise ValueError("The input threshold must have a value between 0.0 and 1.0.")

# TODO(amir): how can we pull off special cases like this?
Expand Down
11 changes: 6 additions & 5 deletions src/slickml/metrics/_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ class RegressionMetrics:
precision_digits: Optional[int] = 3
display_df: Optional[bool] = True

def __post_init__(self):
def __post_init__(self) -> None:
"""Post instantiation validations and assignments."""
check_var(
self.y_true,
Expand Down Expand Up @@ -390,10 +390,10 @@ def _rec_curve(self) -> Tuple[np.ndarray, np.ndarray, float]:
interval = 0.01
accuracy = []
deviation = np.arange(begin, end, interval)
# this would prolly break mypy since it cannot understand that the list is alrady cast to
# TODO(amir): this would probably break mypy since it cannot understand that the list is already cast to
# np.ndarray; so np.array() or np.linalg.norm() should be used
norms = np.abs(self.y_true - self.y_pred) / np.sqrt(
self.y_true**2 + self.y_pred**2,
norms = np.abs(self.y_true - self.y_pred) / np.sqrt( # type: ignore
self.y_true**2 + self.y_pred**2, # type: ignore
)

# main loop to count the number of times that the calculated norm is less than deviation
Expand All @@ -417,7 +417,8 @@ def _ratio_hist(self) -> Tuple[np.ndarray, float, float, float]:
-------
Tuple[np.ndarray, float, float, float]
"""
y_ratio = self.y_pred / self.y_true
# TODO(amir): self.y_pred is already np.ndarray and mypy does not infer it
y_ratio = self.y_pred / self.y_true # type: ignore
mean_y_ratio = np.mean(y_ratio)
std_y_ratio = np.std(y_ratio)
cv_y_ratio = std_y_ratio / mean_y_ratio
Expand Down
7 changes: 4 additions & 3 deletions src/slickml/optimization/_bayesianopt.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,8 @@ def __post_init__(self) -> None:
var_name="verbose",
dtypes=bool,
)
self.verbose = self._verbose()
# TODO(amir): use type overload
self.verbose = self._verbose() # type: ignore
check_var(
self.objective,
var_name="objective",
Expand Down Expand Up @@ -377,7 +378,7 @@ def _xgb_eval(

return None

def get_params_bounds(self) -> Dict[str, Tuple[Union[int, float], Union[int, float]]]:
def get_params_bounds(self) -> Optional[Dict[str, Tuple[Union[int, float], Union[int, float]]]]:
"""Returns the hyper-parameters boundaries for the tuning process.
Returns
Expand Down Expand Up @@ -488,7 +489,7 @@ def _inner_params(
gamma: float,
reg_alpha: float,
reg_lambda: float,
) -> Dict[str, Union[str, float, int]]:
) -> Dict[str, Union[str, float, int, None]]:
"""Default set of parameters passed in inner evaluation.
Notes
Expand Down

0 comments on commit 664df08

Please sign in to comment.