[FIX] Remove redundant categorical imputation (automl#375)

* Remove categorical strategy from simple imputer
* Fix tests
* Address comments from Eddie
* Fix flake8 and mypy errors
* Fix test cases for imputation
ravinkohli · Apr 12, 2022 · d9fa7b2
1 parent e6efce8
commit d9fa7b2
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 149 deletions.
diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json
@@ -1,7 +1,6 @@
 [{"data_loader:batch_size": 60,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedMLPBackbone",
@@ -32,7 +31,6 @@
  {"data_loader:batch_size": 255,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -66,7 +64,6 @@
  {"data_loader:batch_size": 165,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -97,7 +94,6 @@
  {"data_loader:batch_size": 299,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -129,7 +125,6 @@
  {"data_loader:batch_size": 183,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -163,7 +158,6 @@
  {"data_loader:batch_size": 21,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedMLPBackbone",
@@ -192,7 +186,6 @@
  {"data_loader:batch_size": 159,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "TruncatedSVD",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedMLPBackbone",
@@ -222,7 +215,6 @@
  {"data_loader:batch_size": 442,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "TruncatedSVD",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -255,7 +247,6 @@
  {"data_loader:batch_size": 140,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "TruncatedSVD",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -288,7 +279,6 @@
  {"data_loader:batch_size": 48,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedMLPBackbone",
@@ -316,7 +306,6 @@
  {"data_loader:batch_size": 168,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -349,7 +338,6 @@
  {"data_loader:batch_size": 21,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedMLPBackbone",
@@ -378,7 +366,6 @@
  {"data_loader:batch_size": 163,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -411,7 +398,6 @@
  {"data_loader:batch_size": 150,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",
@@ -445,7 +431,6 @@
  {"data_loader:batch_size": 151,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "TruncatedSVD",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedMLPBackbone",
@@ -475,7 +460,6 @@
  {"data_loader:batch_size": 42,
  "encoder:__choice__": "OneHotEncoder",
  "feature_preprocessor:__choice__": "TruncatedSVD",
- "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
  "lr_scheduler:__choice__": "CosineAnnealingLR",
  "network_backbone:__choice__": "ShapedResNetBackbone",

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
+from sklearn.base import BaseEstimator
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import make_pipeline
 
@@ -49,18 +50,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         """
 
         self.check_requirements(X, y)
-        numerical_pipeline = 'passthrough'
-        categorical_pipeline = 'passthrough'
 
         preprocessors = get_tabular_preprocessers(X)
-        if len(X['dataset_properties']['numerical_columns']):
+        column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
+        if len(preprocessors['numerical']) > 0:
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        if len(X['dataset_properties']['categorical_columns']):
+            column_transformers.append(
+                ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
+            )
+        if len(preprocessors['categorical']) > 0:
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
-
-        self.preprocessor = ColumnTransformer([
-            ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
-            ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
+            column_transformers.append(
+                ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
+            )
+
+        # in case the preprocessing steps are disabled
+        # i.e, NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        self.preprocessor = ColumnTransformer(
+            column_transformers,
             remainder='passthrough'
         )
 

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -13,70 +13,42 @@
 
 
 class SimpleImputer(BaseImputer):
-    """An imputer for categorical and numerical columns
-
-    Impute missing values for categorical columns with 'constant_!missing!'
-
-    Note:
-        In case of numpy data, the constant value is set to -1, under the assumption
-        that categorical data is fit with an Ordinal Scaler.
+    """
+    An imputer for numerical columns
 
     Attributes:
         random_state (Optional[np.random.RandomState]):
             The random state to use for the imputer.
         numerical_strategy (str: default='mean'):
             The strategy to use for imputing numerical columns.
             Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
-        categorical_strategy (str: default='most_frequent')
-            The strategy to use for imputing categorical columns.
-            Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
     """
 
     def __init__(
         self,
         random_state: Optional[np.random.RandomState] = None,
         numerical_strategy: str = 'mean',
-        categorical_strategy: str = 'most_frequent'
     ):
-        """
-        Note:
-            'constant' as numerical_strategy uses 0 as the default fill_value while
-            'constant_!missing!' uses a fill_value of -1.
-            This behaviour should probably be fixed.
-        """
         super().__init__()
         self.random_state = random_state
         self.numerical_strategy = numerical_strategy
-        self.categorical_strategy = categorical_strategy
 
     def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
-        """ Fits the underlying model and returns the transformed array.
+        """
+        Builds the preprocessor based on the given fit dictionary 'X'.
 
         Args:
-            X (np.ndarray):
-                The input features to fit on
-            y (Optional[np.ndarray]):
-                The labels for the input features `X`
+            X (Dict[str, Any]):
+                The fit dictionary
+            y (Optional[Any]):
+                Not Used -- to comply with API
 
         Returns:
-            SimpleImputer:
-                returns self
+            self:
+                returns an instance of self.
         """
         self.check_requirements(X, y)
 
-        # Choose an imputer for any categorical columns
-        categorical_columns = X['dataset_properties']['categorical_columns']
-
-        if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
-            if self.categorical_strategy == 'constant_!missing!':
-                # Train data is numpy as of this point, where an Ordinal Encoding is used
-                # for categoricals. Only Numbers are allowed for `fill_value`
-                imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
-                self.preprocessor['categorical'] = imputer
-            else:
-                imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
-                self.preprocessor['categorical'] = imputer
-
         # Choose an imputer for any numerical columns
         numerical_columns = X['dataset_properties']['numerical_columns']
 
@@ -98,11 +70,6 @@ def get_hyperparameter_search_space(
             value_range=("mean", "median", "most_frequent", "constant_zero"),
             default_value="mean",
         ),
-        categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
-            hyperparameter='categorical_strategy',
-            value_range=("most_frequent", "constant_!missing!"),
-            default_value="most_frequent"
-        )
     ) -> ConfigurationSpace:
         """Get the hyperparameter search space for the SimpleImputer
 
@@ -112,8 +79,6 @@ def get_hyperparameter_search_space(
                 Note: Not actually Optional, just adhering to its supertype
             numerical_strategy (HyperparameterSearchSpace: default = ...)
                 The strategy to use for numerical imputation
-            caterogical_strategy (HyperparameterSearchSpace: default = ...)
-                The strategy to use for categorical imputation
 
         Returns:
             ConfigurationSpace
@@ -132,12 +97,6 @@ def get_hyperparameter_search_space(
         ):
             add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)
 
-        if (
-            isinstance(dataset_properties['categorical_columns'], List)
-            and len(dataset_properties['categorical_columns'])
-        ):
-            add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)
-
         return cs
 
     @staticmethod

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
@@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
     def __init__(self) -> None:
         super().__init__()
         self.add_fit_requirements([
-            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
+            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
@@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             (Dict[str, Any]): the updated 'X' dictionary
         """
-        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
+        if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
         X.update({'imputer': self.preprocessor})