Skip to content

Commit

Permalink
[FIX] Remove redundant categorical imputation (automl#375)
Browse files Browse the repository at this point in the history
* remove categorical strategy from simple imputer

* fix tests

* address comments from eddie

* fix flake and mypy error

* fix test cases for imputation
  • Loading branch information
ravinkohli committed Apr 12, 2022
1 parent e6efce8 commit d9fa7b2
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 149 deletions.
16 changes: 0 additions & 16 deletions autoPyTorch/configs/greedy_portfolio.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[{"data_loader:batch_size": 60,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -32,7 +31,6 @@
{"data_loader:batch_size": 255,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -66,7 +64,6 @@
{"data_loader:batch_size": 165,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -97,7 +94,6 @@
{"data_loader:batch_size": 299,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -129,7 +125,6 @@
{"data_loader:batch_size": 183,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -163,7 +158,6 @@
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -192,7 +186,6 @@
{"data_loader:batch_size": 159,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -222,7 +215,6 @@
{"data_loader:batch_size": 442,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -255,7 +247,6 @@
{"data_loader:batch_size": 140,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -288,7 +279,6 @@
{"data_loader:batch_size": 48,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -316,7 +306,6 @@
{"data_loader:batch_size": 168,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -349,7 +338,6 @@
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -378,7 +366,6 @@
{"data_loader:batch_size": 163,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -411,7 +398,6 @@
{"data_loader:batch_size": 150,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -445,7 +431,6 @@
{"data_loader:batch_size": 151,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -475,7 +460,6 @@
{"data_loader:batch_size": 42,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

Expand Down Expand Up @@ -49,18 +50,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
"""

self.check_requirements(X, y)
numerical_pipeline = 'passthrough'
categorical_pipeline = 'passthrough'

preprocessors = get_tabular_preprocessers(X)
if len(X['dataset_properties']['numerical_columns']):
column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
if len(preprocessors['numerical']) > 0:
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
if len(X['dataset_properties']['categorical_columns']):
column_transformers.append(
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
)
if len(preprocessors['categorical']) > 0:
categorical_pipeline = make_pipeline(*preprocessors['categorical'])

self.preprocessor = ColumnTransformer([
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
column_transformers.append(
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
)

# In case the preprocessing steps are disabled,
# e.g., NoEncoder for categorical columns, we want to
# let the data in those columns pass through unchanged
self.preprocessor = ColumnTransformer(
column_transformers,
remainder='passthrough'
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,70 +13,42 @@


class SimpleImputer(BaseImputer):
"""An imputer for categorical and numerical columns
Impute missing values for categorical columns with 'constant_!missing!'
Note:
In case of numpy data, the constant value is set to -1, under the assumption
that categorical data is fit with an Ordinal Scaler.
"""
An imputer for numerical columns
Attributes:
random_state (Optional[np.random.RandomState]):
The random state to use for the imputer.
numerical_strategy (str: default='mean'):
The strategy to use for imputing numerical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
categorical_strategy (str: default='most_frequent')
The strategy to use for imputing categorical columns.
Can be one of ['most_frequent', 'constant_!missing!']
"""

def __init__(
self,
random_state: Optional[np.random.RandomState] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'
):
"""
Note:
'constant' as numerical_strategy uses 0 as the default fill_value while
'constant_!missing!' uses a fill_value of -1.
This behaviour should probably be fixed.
"""
super().__init__()
self.random_state = random_state
self.numerical_strategy = numerical_strategy
self.categorical_strategy = categorical_strategy

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
""" Fits the underlying model and returns the transformed array.
"""
Builds the preprocessor based on the given fit dictionary 'X'.
Args:
X (np.ndarray):
The input features to fit on
y (Optional[np.ndarray]):
The labels for the input features `X`
X (Dict[str, Any]):
The fit dictionary
y (Optional[Any]):
Not Used -- to comply with API
Returns:
SimpleImputer:
returns self
self:
returns an instance of self.
"""
self.check_requirements(X, y)

# Choose an imputer for any categorical columns
categorical_columns = X['dataset_properties']['categorical_columns']

if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
if self.categorical_strategy == 'constant_!missing!':
# Train data is numpy as of this point, where an Ordinal Encoding is used
# for categoricals. Only Numbers are allowed for `fill_value`
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
self.preprocessor['categorical'] = imputer
else:
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
self.preprocessor['categorical'] = imputer

# Choose an imputer for any numerical columns
numerical_columns = X['dataset_properties']['numerical_columns']

Expand All @@ -98,11 +70,6 @@ def get_hyperparameter_search_space(
value_range=("mean", "median", "most_frequent", "constant_zero"),
default_value="mean",
),
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='categorical_strategy',
value_range=("most_frequent", "constant_!missing!"),
default_value="most_frequent"
)
) -> ConfigurationSpace:
"""Get the hyperparameter search space for the SimpleImputer
Expand All @@ -112,8 +79,6 @@ def get_hyperparameter_search_space(
Note: Not actually Optional, just adhering to its supertype
numerical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for numerical imputation
categorical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for categorical imputation
Returns:
ConfigurationSpace
Expand All @@ -132,12 +97,6 @@ def get_hyperparameter_search_space(
):
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)

if (
isinstance(dataset_properties['categorical_columns'], List)
and len(dataset_properties['categorical_columns'])
):
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)

return cs

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
Expand All @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
raise ValueError("cant call transform on {} without fitting first."
.format(self.__class__.__name__))
X.update({'imputer': self.preprocessor})
Expand Down

0 comments on commit d9fa7b2

Please sign in to comment.