diff --git a/examples/README.md b/examples/README.md
index fa71afaa..31db0d77 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -14,3 +14,4 @@
 12. `demo11.py` - AutoML NLP preset usage for tabular datasets with text columns
 13. `demo12.py` - AutoML tabular preset usage with custom validation scheme and multiprocessed inference
 14. `demo13.py` - AutoML TS preset usage with lag and diff transformers' parameters selection
+15. `demo15.py` - Groupby features (using TabularAutoML preset and custom pipeline)
diff --git a/examples/demo15.py b/examples/demo15.py
new file mode 100644
index 00000000..39dbc78d
--- /dev/null
+++ b/examples/demo15.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import numpy as np
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+
+from lightautoml.automl.base import AutoML
+from lightautoml.automl.presets.tabular_presets import TabularAutoML
+from lightautoml.dataset.roles import CategoryRole
+from lightautoml.dataset.roles import NumericRole
+from lightautoml.ml_algo.boost_lgbm import BoostLGBM
+from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline
+from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
+from lightautoml.pipelines.ml.base import MLPipeline
+from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector
+from lightautoml.pipelines.selection.importance_based import (
+    ModelBasedImportanceEstimator,
+)
+from lightautoml.reader.base import PandasToPandasReader
+from lightautoml.tasks import Task
+
+
+################################
+# Features:
+# - group_by transformer
+################################
+
+N_FOLDS = 3  # number of folds for cross-validation inside AutoML
+RANDOM_STATE = 42  # fixed random state for reproducibility
+N_THREADS = 4  # threads cnt for lgbm and linear models
+TIMEOUT = 100  # time limit for AutoML training, in seconds
+USED_COLS = ["SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER", "AMT_INCOME_TOTAL", "DAYS_BIRTH"]
+TARGET = "TARGET"
+
+# load data
+data = pd.read_csv("./data/sampled_app_train.csv")
+data = data[USED_COLS]
+train, test = train_test_split(data, test_size=2000, random_state=RANDOM_STATE)
+
+# Using TabularAutoML preset
+task = Task("binary")
+roles = {
+    "target": TARGET,
+    CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "CODE_GENDER"],
+    NumericRole(np.float32): ["AMT_INCOME_TOTAL"],
+}
+
+# specify groupby triplets: [("group_col", "feature", "transform_type"),]
+groupby_triplets = [
+    ("CODE_GENDER", "AMT_INCOME_TOTAL", "max"),
+    ("NAME_CONTRACT_TYPE", "CODE_GENDER", "mode"),
+    ("NAME_CONTRACT_TYPE", "AMT_INCOME_TOTAL", "delta_mean"),
+]
+
+print(f"Try TabularAutoML with the following groupby_triplets:\n{groupby_triplets}")
+
+automl = TabularAutoML(
+    task=task,
+    timeout=TIMEOUT,
+    cpu_limit=N_THREADS,
+    reader_params={"n_jobs": N_THREADS, "cv": N_FOLDS, "random_state": RANDOM_STATE},
+    general_params={"use_algos": [["lgb"]]},
+    gbm_pipeline_params={"use_groupby": True, "groupby_triplets": groupby_triplets},
+)
+automl.fit_predict(train, roles=roles)
+
+feature_scores = automl.levels[0][0].ml_algos[0].get_features_score()
+
+print(f"Feature importances of BoostLGBM model. Pay attention to groupby features:\n{feature_scores}")
+
+# Custom pipeline with groupby features defined by importance
+print("\nTry custom pipeline with groupby features defined by importance:\n")
+
+task = Task("binary")
+reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)
+model0 = BoostLGBM(default_params={"learning_rate": 0.1, "num_leaves": 64, "seed": 42, "num_threads": N_THREADS})
+pie = ModelBasedImportanceEstimator()
+selector = ImportanceCutoffSelector(LGBSimpleFeatures(), model0, pie, cutoff=-9999)
+
+
+pipe = LGBAdvancedPipeline(
+    use_groupby=True, feats_imp=selector, groupby_types=["delta_median", "std"], groupby_top_based_on="importance"
+)
+
+model = BoostLGBM(
+    default_params={
+        "learning_rate": 0.05,
+        "num_leaves": 128,
+        "seed": 1,
+        "num_threads": N_THREADS,
+    }
+)
+
+pipeline = MLPipeline([model], pre_selection=selector, features_pipeline=pipe, post_selection=None)
+
+automl = AutoML(
+    reader,
+    [[pipeline]],
+    skip_conn=False,
+)
+
+oof_pred = automl.fit_predict(
+    train,
+    roles={"target": TARGET},
+)
+
+print(f"Features used by BoostLGBM model. Pay attention to groupby features:\n{pipe.output_features}")
diff --git a/lightautoml/automl/presets/image_config.yml b/lightautoml/automl/presets/image_config.yml
old mode 100644
new mode 100755
index 7d4d17dc..6842bc0a
--- a/lightautoml/automl/presets/image_config.yml
+++ b/lightautoml/automl/presets/image_config.yml
@@ -128,8 +128,21 @@ gbm_pipeline_params:
   auto_unique_co: 10
   # n_classes to use target encoding for multiclass task
   multiclass_te_co: 3
+  # use groupby features
+  use_groupby: False
+  # groupby types used in feature engineering
+  groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
+  # top features are chosen by cardinality or feature importance
+  groupby_top_based_on: 'cardinality'
+  # top categorical features to use in groupby transformer
+  groupby_top_categorical: 3
+  # top numerical features to use in groupby transformer
+  groupby_top_numerical: 3
+  # list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
+  # disables groupby_types, groupby_top_based_on and other groupby parameters if defined
+  groupby_triplets: [ ]
   # text_features in gbm feature pipeline. embed or simple
-  cv_features: "simple"
+  cv_features: 'simple'
 
 linear_pipeline_params:
   # max number of categories to generate intersections
@@ -141,8 +154,21 @@ linear_pipeline_params:
   auto_unique_co: 50
   # n_classes to use target encoding for multiclass task
   multiclass_te_co: 3
+  # use groupby features
+  use_groupby: False
+  # groupby types used in feature engineering
+  groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
+  # top features are chosen by cardinality or feature importance
+  groupby_top_based_on: 'cardinality'
+  # top categorical features to use in groupby transformer
+  groupby_top_categorical: 3
+  # top numerical features to use in groupby transformer
+  groupby_top_numerical: 3
+  # list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
+  # disables groupby_types, groupby_top_based_on and other groupby parameters if defined
+  groupby_triplets: [ ]
   # text_features in linear feature pipeline. embed or simple
-  cv_features: "embed"
+  cv_features: 'embed'
 
 timing_params:
   # select timing mode:
diff --git a/lightautoml/automl/presets/image_presets.py b/lightautoml/automl/presets/image_presets.py
old mode 100644
new mode 100755
index c33fb75f..977ec1d0
--- a/lightautoml/automl/presets/image_presets.py
+++ b/lightautoml/automl/presets/image_presets.py
@@ -231,7 +231,7 @@ def get_gbms(
     ):
         """Get gbm pipeline."""
         cv_gbm_feats = self.get_cv_pipe(self.gbm_pipeline_params["cv_features"])
-        gbm_feats = LGBAdvancedPipeline(output_categories=False, **self.gbm_pipeline_params)
+        gbm_feats = LGBAdvancedPipeline(feats_imp=pre_selector, output_categories=False, **self.gbm_pipeline_params)
 
         if cv_gbm_feats is not None:
             gbm_feats.append(cv_gbm_feats)
diff --git a/lightautoml/automl/presets/tabular_config.yml b/lightautoml/automl/presets/tabular_config.yml
old mode 100644
new mode 100755
index f7e7aa55..0560d046
--- a/lightautoml/automl/presets/tabular_config.yml
+++ b/lightautoml/automl/presets/tabular_config.yml
@@ -244,6 +244,19 @@ gbm_pipeline_params:
   multiclass_te_co: 3
   # DEV feature: output categorical features as categories (if True, can totally overfit your model - be careful!)
   output_categories: False
+  # use groupby features
+  use_groupby: False
+  # groupby types used in feature engineering
+  groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
+  # top features are chosen by cardinality or feature importance
+  groupby_top_based_on: 'cardinality'
+  # top categorical features to use in groupby transformer
+  groupby_top_categorical: 3
+  # top numerical features to use in groupby transformer
+  groupby_top_numerical: 3
+  # list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
+  # disables groupby_types, groupby_top_based_on and other groupby parameters if defined
+  groupby_triplets: [ ]
 
 linear_pipeline_params:
   # max number of categories to generate intersections
@@ -255,6 +268,19 @@ linear_pipeline_params:
   auto_unique_co: 50
   # n_classes to use target encoding for multiclass task
   multiclass_te_co: 3
+  # use groupby features
+  use_groupby: False
+  # groupby types used in feature engineering
+  groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
+  # top features are chosen by cardinality or feature importance
+  groupby_top_based_on: 'cardinality'
+  # top categorical features to use in groupby transformer
+  groupby_top_categorical: 3
+  # top numerical features to use in groupby transformer
+  groupby_top_numerical: 3
+  # list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
+  # disables groupby_types, groupby_top_based_on and other groupby parameters if defined
+  groupby_triplets: [ ]
 
 nn_pipeline_params:
   # use quantile transformer for numerical columns
diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py
old mode 100644
new mode 100755
index 8d208c39..38c0b170
--- a/lightautoml/automl/presets/tabular_presets.py
+++ b/lightautoml/automl/presets/tabular_presets.py
@@ -268,11 +268,12 @@ def infer_auto_params(self, train_data: DataFrame, multilevel_avail: bool = Fals
         )
         self.reader_params["n_jobs"] = min(self.reader_params["n_jobs"], cpu_cnt)
 
-    def get_feature_pipeline(self, model):
+    def get_feature_pipeline(self, model, **kwargs):
         """Get LGBSeqSimpleFeatures pipeline if task is the time series prediction.
 
         Args:
             model: one from ["gbm", "linear_l2", "rf", "nn"].
+            kwargs: Arbitrary keyword arguments.
 
         Returns:
             appropriate features pipeline.
@@ -285,9 +286,11 @@ def get_feature_pipeline(self, model):
         if model == "linear_l2":
             return LinearFeatures(output_categories=True, **self.linear_pipeline_params)
         if model == "gbm":
-            return LGBAdvancedPipeline(**self.gbm_pipeline_params)
+            return LGBAdvancedPipeline(**self.gbm_pipeline_params, **kwargs)
         if model == "rf":
-            return LGBAdvancedPipeline(**self.gbm_pipeline_params, fill_na=True)
+            if "fill_na" in kwargs:
+                return LGBAdvancedPipeline(**self.gbm_pipeline_params, **kwargs)
+            return LGBAdvancedPipeline(**self.gbm_pipeline_params, fill_na=True, **kwargs)
 
 
 def get_time_score(self, n_level: int, model_type: str, nested: Optional[bool] = None):
@@ -471,7 +474,7 @@ def get_gbms(
         pre_selector: Optional[SelectionPipeline] = None,
     ):
 
-        gbm_feats = self.get_feature_pipeline(model="gbm")
+        gbm_feats = self.get_feature_pipeline(model="gbm", feats_imp=pre_selector)
 
         ml_algos = []
         force_calc = []
@@ -506,7 +509,7 @@ def get_gbms(
 
     def get_rfs(self, keys: Sequence[str], n_level: int = 1, pre_selector: Optional[SelectionPipeline] = None):
 
-        rf_feats = self.get_feature_pipeline(model="rf")
+        rf_feats = self.get_feature_pipeline(model="rf", feats_imp=pre_selector, fill_na=True)
         ml_algos = []
         force_calc = []
         for key, force in zip(keys, [True, False]):
diff --git a/lightautoml/automl/presets/text_config.yml b/lightautoml/automl/presets/text_config.yml
old mode 100644
new mode 100755
index 9f4d990f..0d3b9d37
--- a/lightautoml/automl/presets/text_config.yml
+++ b/lightautoml/automl/presets/text_config.yml
@@ -187,6 +187,19 @@ gbm_pipeline_params:
   auto_unique_co: 10
   # n_classes to use target encoding for multiclass task
   multiclass_te_co: 3
+  # use groupby features
+  use_groupby: False
+  # groupby types used in feature engineering
+  groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
+  # top features are chosen by cardinality or feature importance
+  groupby_top_based_on: 'cardinality'
+  # top categorical features to use in groupby transformer
+  groupby_top_categorical: 3
+  # top numerical features to use in groupby transformer
+  groupby_top_numerical: 3
+  # list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
+  # disables groupby_types, groupby_top_based_on and other groupby parameters if defined
+  groupby_triplets: [ ]
   # text_features in gbm feature pipeline. embed or tfidf
   text_features: "embed"
 
@@ -200,6 +213,19 @@ linear_pipeline_params:
   auto_unique_co: 50
   # n_classes to use target encoding for multiclass task
   multiclass_te_co: 3
+  # use groupby features
+  use_groupby: False
+  # groupby types used in feature engineering
+  groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
+  # top features are chosen by cardinality or feature importance
+  groupby_top_based_on: 'cardinality'
+  # top categorical features to use in groupby transformer
+  groupby_top_categorical: 3
+  # top numerical features to use in groupby transformer
+  groupby_top_numerical: 3
+  # list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
+  # disables groupby_types, groupby_top_based_on and other groupby parameters if defined
+  groupby_triplets: [ ]
   # text_features in linear feature pipeline. 
embed or tfidf text_features: "tfidf" diff --git a/lightautoml/automl/presets/text_presets.py b/lightautoml/automl/presets/text_presets.py old mode 100644 new mode 100755 index 3b529a8c..d79fdbb4 --- a/lightautoml/automl/presets/text_presets.py +++ b/lightautoml/automl/presets/text_presets.py @@ -351,7 +351,7 @@ def get_gbms( ): text_gbm_feats = self.get_nlp_pipe(self.gbm_pipeline_params["text_features"]) - gbm_feats = LGBAdvancedPipeline(output_categories=False, **self.gbm_pipeline_params) + gbm_feats = LGBAdvancedPipeline(feats_imp=pre_selector, output_categories=False, **self.gbm_pipeline_params) if text_gbm_feats is not None: gbm_feats.append(text_gbm_feats) diff --git a/lightautoml/pipelines/features/base.py b/lightautoml/pipelines/features/base.py old mode 100644 new mode 100755 index 15c6fb14..63e3d428 --- a/lightautoml/pipelines/features/base.py +++ b/lightautoml/pipelines/features/base.py @@ -1,5 +1,7 @@ """Basic classes for features generation.""" +import logging + from copy import copy from copy import deepcopy from typing import Any @@ -17,6 +19,7 @@ from ...dataset.base import LAMLDataset from ...dataset.np_pd_dataset import NumpyDataset from ...dataset.np_pd_dataset import PandasDataset +from ...dataset.roles import CategoryRole from ...dataset.roles import ColumnRole from ...dataset.roles import NumericRole from ...transformers.base import ChangeRoles @@ -34,6 +37,9 @@ from ...transformers.categorical import TargetEncoder from ...transformers.datetime import BaseDiff from ...transformers.datetime import DateSeasons +from ...transformers.groupby import GroupByTransformer +from ...transformers.numeric import FillInf +from ...transformers.numeric import FillnaMedian from ...transformers.numeric import QuantileBinning from ..utils import get_columns_by_role from ..utils import map_pipeline_names @@ -41,6 +47,8 @@ NumpyOrPandas = Union[PandasDataset, NumpyDataset] +logger = logging.getLogger(__name__) + class FeaturesPipeline: """Abstract class. @@ -214,6 +222,12 @@ def __init__(self, **kwargs: Any): self.max_bin_count = 10 self.sparse_ohe = "auto" + self.groupby_types = ["delta_median", "delta_mean", "min", "max", "std", "mode", "is_mode"] + self.groupby_triplets = [] + self.groupby_top_based_on = "cardinality" + self.groupby_top_categorical = 3 + self.groupby_top_numerical = 3 + for k in kwargs: self.__dict__[k] = kwargs[k] @@ -482,8 +496,10 @@ def get_categorical_intersections( if len(categories) <= 1: return + if self.max_intersection_depth <= 1 or self.top_intersections <= 1: + return elif len(categories) > self.top_intersections: - feats_to_select = self.get_top_categories(train, self.top_intersections) + feats_to_select = self.get_top_categories(train, mode="cat_intersections", top_n=self.top_intersections) elif len(feats_to_select) <= 1: return @@ -526,7 +542,7 @@ def get_uniques_cnt(self, train: NumpyOrPandas, feats: List[str]) -> Series: return Series(uns, index=feats, dtype="int") - def get_top_categories(self, train: NumpyOrPandas, top_n: int = 5) -> List[str]: + def get_top_categories(self, train: NumpyOrPandas, mode: str, top_n: int = 5) -> List[str]: """Get top categories by importance. If feature importance is not defined, @@ -536,34 +552,145 @@ def get_top_categories(self, train: NumpyOrPandas, top_n: int = 5) -> List[str]: Args: train: Dataset with train data. + mode: What feature generation mode is used. Can be "cat_intersections" or "groupby". top_n: Number of top categories. Returns: List. 
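+
+        Example:
+            A rough sketch (assumes ``train`` is a ``PandasDataset`` with
+            categorical columns; with no ``feats_imp`` set, sorting falls
+            back to cardinality)::
+
+                top_cats = self.get_top_categories(train, mode="groupby", top_n=3)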
""" - if self.max_intersection_depth <= 1 or self.top_intersections <= 1: - return [] - + assert mode in ["cat_intersections", "groupby"] cats = get_columns_by_role(train, "Category") - if len(cats) == 0: + if len(cats) == 0 or top_n == 0: return [] - + elif len(cats) <= top_n: + return cats df = DataFrame({"importance": 0, "cardinality": 0}, index=cats) # importance if defined if self.feats_imp is not None: feats_imp = Series(self.feats_imp.get_features_score()).sort_values(ascending=False) df["importance"] = feats_imp[feats_imp.index.isin(cats)] df["importance"].fillna(-np.inf) - # check for cardinality df["cardinality"] = self.get_uniques_cnt(train, cats) # sort - df = df.sort_values( - by=["importance", "cardinality"], - ascending=[False, self.ascending_by_cardinality], - ) + if mode == "groupby" and self.groupby_top_based_on == "cardinality" or self.feats_imp is None: + df = df.sort_values(by="cardinality", ascending=self.ascending_by_cardinality) + else: + df = df.sort_values(by="importance", ascending=False) # get top n top = list(df.index[:top_n]) + return top + + def get_top_numeric(self, train: NumpyOrPandas, top_n: int = 5) -> List[str]: + """Get top numeric features by importance. + If feature importance is not defined, + or feats has same importance - sort it by unique values counts. + In second case init param ``ascending_by_cardinality`` + defines how - asc or desc. + + Args: + train: Dataset with train data. + top_n: Number of top numeric features. + + Returns: + List. + """ + nums = get_columns_by_role(train, "Numeric") + if len(nums) == 0 or top_n == 0: + return [] + elif len(nums) <= top_n: + return nums + df = DataFrame({"importance": 0, "cardinality": 0}, index=nums) + # importance if defined + if self.feats_imp is not None: + feats_imp = Series(self.feats_imp.get_features_score()).sort_values(ascending=False) + df["importance"] = feats_imp[feats_imp.index.isin(nums)] + df["importance"].fillna(-np.inf) + # check for cardinality + df["cardinality"] = -self.get_uniques_cnt(train, nums) + # sort + if self.groupby_top_based_on == "cardinality" or self.feats_imp is None: + df = df.sort_values(by="cardinality", ascending=self.ascending_by_cardinality) + else: + df = df.sort_values(by="importance", ascending=False) + # get top n + top = list(df.index[:top_n]) return top + + def get_groupby(self, train: NumpyOrPandas) -> Optional[LAMLTransformer]: + """Get transformer that calculates group by features. + + Amount of features is limited to ``self.top_group_by_categorical`` and ``self.top_group_by_numerical`` fields. + + Args: + train: Dataset with train data. + + Returns: + Transformer. 
+        """
+        categorical_names = get_columns_by_role(train, "Category")
+        numerical_names = get_columns_by_role(train, "Numeric")
+
+        groupby_transformations = []
+        if len(self.groupby_triplets) > 0:
+            for group_col, feat_name, trans in self.groupby_triplets:
+                categorical_cols = [feat_name] if feat_name in categorical_names else []
+                numeric_cols = [feat_name] if feat_name in numerical_names else []
+                if len(categorical_cols) + len(numeric_cols) == 0:
+                    logger.info2("Feature is incorrect or dropped by preselector: {}".format(feat_name))
+                    continue
+                if group_col not in categorical_names:
+                    logger.info2("Groupby column is incorrect or dropped by preselector: {}".format(group_col))
+                    continue
+                new_transformation = {
+                    "group_col": group_col,
+                    "categorical_cols": categorical_cols,
+                    "numeric_cols": numeric_cols,
+                    "used_transforms": [trans],
+                }
+                groupby_transformations.append(new_transformation)
+        else:
+            cat_feats_to_select = self.get_top_categories(train, "groupby", self.groupby_top_categorical)
+            num_feats_to_select = self.get_top_numeric(train, self.groupby_top_numerical)
+            # need at least two categoricals, or one categorical and one numeric
+            if len(cat_feats_to_select) < 1:
+                return
+            if len(cat_feats_to_select) == 1 and len(num_feats_to_select) < 1:
+                return
+            # collect groupby_transformations
+            for i, group_col in enumerate(cat_feats_to_select):
+                new_transformation = {
+                    "group_col": group_col,
+                    "categorical_cols": cat_feats_to_select[:i] + cat_feats_to_select[i + 1 :],
+                    "numeric_cols": num_feats_to_select,
+                    "used_transforms": self.groupby_types,
+                }
+                groupby_transformations.append(new_transformation)
+
+        groupby_processing = [
+            SequentialTransformer(
+                [
+                    UnionTransformer(
+                        [
+                            SequentialTransformer(
+                                [
+                                    ColumnsSelector(keys=[trans["group_col"]] + trans["categorical_cols"]),
+                                    LabelEncoder(subs=None, random_state=42),
+                                    ChangeRoles(CategoryRole(int)),
+                                ]
+                            ),
+                            SequentialTransformer(
+                                [ColumnsSelector(keys=trans["numeric_cols"]), FillInf(), FillnaMedian()]
+                            ),
+                        ]
+                    ),
+                    GroupByTransformer(**trans),
+                ]
+            )
+            for trans in groupby_transformations
+        ]
+        groupby_processing = UnionTransformer(groupby_processing)
+
+        return groupby_processing
diff --git a/lightautoml/pipelines/features/lgb_pipeline.py b/lightautoml/pipelines/features/lgb_pipeline.py
old mode 100644
new mode 100755
index 2f5576c6..22023e56
--- a/lightautoml/pipelines/features/lgb_pipeline.py
+++ b/lightautoml/pipelines/features/lgb_pipeline.py
@@ -1,5 +1,6 @@
 """Pipeline for tree based models."""
 
+from typing import List
 from typing import Optional
 from typing import Union
 
@@ -29,6 +30,7 @@
 from ...transformers.seq import SeqNumCountsTransformer
 from ...transformers.seq import SeqStatisticsTransformer
 from ..selection.base import ImportanceEstimator
+from ..selection.base import SelectionPipeline
 from ..utils import get_columns_by_role
 from .base import FeaturesPipeline
 from .base import TabularDataFeatures
@@ -488,14 +490,21 @@ class LGBAdvancedPipeline(FeaturesPipeline, TabularDataFeatures):
 
     def __init__(
         self,
-        feats_imp: Optional[ImportanceEstimator] = None,
+        feats_imp: Optional[Union[ImportanceEstimator, SelectionPipeline]] = None,
         top_intersections: int = 5,
         max_intersection_depth: int = 3,
        subsample: Optional[Union[int, float]] = None,
         multiclass_te_co: int = 3,
         auto_unique_co: int = 10,
         output_categories: bool = False,
-        fill_na=False,
+        fill_na: bool = False,
+        ascending_by_cardinality: bool = False,
+        use_groupby: bool = False,
+        groupby_types: List[str] = ["delta_median", "delta_mean", "min", "max",
"std", "mode", "is_mode"], + groupby_triplets: list = [], + groupby_top_based_on: str = "cardinality", + groupby_top_categorical: int = 3, + groupby_top_numerical: int = 3, **kwargs ): super().__init__( @@ -506,9 +515,15 @@ def __init__( feats_imp=feats_imp, auto_unique_co=auto_unique_co, output_categories=output_categories, - ascending_by_cardinality=False, + ascending_by_cardinality=ascending_by_cardinality, + groupby_types=groupby_types, + groupby_triplets=groupby_triplets, + groupby_top_based_on=groupby_top_based_on, + groupby_top_categorical=groupby_top_categorical, + groupby_top_numerical=groupby_top_numerical, ) self.fill_na = fill_na + self.use_groupby = use_groupby def create_pipeline(self, train: NumpyOrPandas) -> LAMLTransformer: """Create tree pipeline. @@ -593,6 +608,9 @@ def create_pipeline(self, train: NumpyOrPandas) -> LAMLTransformer: # add datetime seasonality transformer_list.append(self.get_datetime_seasons(train, NumericRole(np.float32))) + if self.use_groupby: + transformer_list.append(self.get_groupby(train)) + # final pipeline union_all = UnionTransformer([x for x in transformer_list if x is not None]) if self.fill_na: diff --git a/lightautoml/pipelines/features/linear_pipeline.py b/lightautoml/pipelines/features/linear_pipeline.py old mode 100644 new mode 100755 index 0ab1a3ef..c1e2dc1e --- a/lightautoml/pipelines/features/linear_pipeline.py +++ b/lightautoml/pipelines/features/linear_pipeline.py @@ -1,5 +1,6 @@ """Linear models features.""" +from typing import List from typing import Optional from typing import Union @@ -22,6 +23,7 @@ from ...transformers.numeric import NaNFlags from ...transformers.numeric import StandardScaler from ..selection.base import ImportanceEstimator +from ..selection.base import SelectionPipeline from ..utils import get_columns_by_role from .base import FeaturesPipeline from .base import TabularDataFeatures @@ -62,7 +64,7 @@ class LinearFeatures(FeaturesPipeline, TabularDataFeatures): def __init__( self, - feats_imp: Optional[ImportanceEstimator] = None, + feats_imp: Optional[Union[ImportanceEstimator, SelectionPipeline]] = None, top_intersections: int = 5, max_bin_count: int = 10, max_intersection_depth: int = 3, @@ -71,6 +73,12 @@ def __init__( auto_unique_co: int = 50, output_categories: bool = True, multiclass_te_co: int = 3, + use_groupby: bool = False, + groupby_types: List[str] = ["delta_median", "delta_mean", "min", "max", "std", "mode", "is_mode"], + groupby_triplets: list = [], + groupby_top_based_on: str = "cardinality", + groupby_top_categorical: int = 3, + groupby_top_numerical: int = 3, **kwargs ): assert max_bin_count is None or max_bin_count > 1, "Max bin count should be >= 2 or None" @@ -87,7 +95,13 @@ def __init__( max_bin_count=max_bin_count, sparse_ohe=sparse_ohe, multiclass_te_co=multiclass_te_co, + groupby_types=groupby_types, + groupby_triplets=groupby_triplets, + groupby_top_based_on=groupby_top_based_on, + groupby_top_categorical=groupby_top_categorical, + groupby_top_numerical=groupby_top_numerical, ) + self.use_groupby = use_groupby def create_pipeline(self, train: NumpyOrPandas) -> LAMLTransformer: """Create linear pipeline. 
@@ -167,6 +181,9 @@
         # add difference with base date
         dense_list.append(self.get_datetime_diffs(train))
 
+        if self.use_groupby:
+            dense_list.append(self.get_groupby(train))
+
         # combine it all together
         # handle probs if exists
         probs_list = [x for x in probs_list if x is not None]
diff --git a/lightautoml/transformers/__init__.py b/lightautoml/transformers/__init__.py
index 834956f3..53fd9384 100644
--- a/lightautoml/transformers/__init__.py
+++ b/lightautoml/transformers/__init__.py
@@ -1,3 +1,3 @@
 """Basic feature generation steps and helper utils."""
 
-__all__ = ["base", "categorical", "datetime", "numeric"]
+__all__ = ["base", "categorical", "datetime", "numeric", "composite", "utils"]
diff --git a/lightautoml/transformers/composite.py b/lightautoml/transformers/composite.py
new file mode 100644
index 00000000..11595d5a
--- /dev/null
+++ b/lightautoml/transformers/composite.py
@@ -0,0 +1,177 @@
+"""GroupBy (categorical/numerical) features transformer."""
+
+from ..dataset.roles import NumericRole
+from ..pipelines.utils import get_columns_by_role
+from .base import LAMLTransformer
+from .utils import GroupByCatIsMode
+from .utils import GroupByCatMode
+from .utils import GroupByFactory
+from .utils import GroupByNumDeltaMean
+from .utils import GroupByNumDeltaMedian
+from .utils import GroupByNumMax
+from .utils import GroupByNumMin
+from .utils import GroupByNumStd
+from .utils import GroupByProcessor
+
+
+class GroupByTransformer(LAMLTransformer):
+    """Transformer that calculates group_by features.
+
+    Types of group_by features:
+    - Group by categorical:
+        - Numerical features:
+            - Difference with group mean.
+            - Difference with group median.
+            - Group min.
+            - Group max.
+            - Group std.
+        - Categorical features:
+            - Group mode.
+            - Is current value equal to group mode.
+
+    Attributes:
+        features list(str): generated features names.
+
+    """
+
+    _fit_checks = ()
+    _transform_checks = ()
+    _fname_prefix = "grb"
+
+    @property
+    def features(self):
+        """Features list."""
+        return self._features
+
+    def __init__(self, num_groups=None, use_cat_groups=True, **kwargs):
+        """Initialize.
+
+        Args:
+            num_groups: IDs of functions to use for numeric features.
+            use_cat_groups: flag to enable groupby transforms for category features.
+            **kwargs: additional params.
+        """
+        super().__init__()
+
+        self.num_groups = (
+            num_groups
+            if num_groups is not None
+            else [
+                GroupByNumDeltaMean.class_kind,
+                GroupByNumDeltaMedian.class_kind,
+                GroupByNumMin.class_kind,
+                GroupByNumMax.class_kind,
+                GroupByNumStd.class_kind,
+            ]
+        )
+        self.use_cat_groups = use_cat_groups
+        self.dicts = {}
+
+    def fit(self, dataset):
+        """Fit transformer and return its instance.
+
+        Args:
+            dataset: Dataset to fit on.
+
+        Returns:
+            self.
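+
+        Example:
+            A minimal sketch (assumes ``ds`` is a ``PandasDataset`` with
+            category and numeric columns; names are illustrative)::
+
+                gbt = GroupByTransformer(num_groups=["max", "std"])
+                gbt.fit(ds)
+                print(gbt.features)  # e.g. ['grb__gender__max__income', ...]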
+
+        """
+        # set transformer names and add checks
+        for check_func in self._fit_checks:
+            check_func(dataset)
+
+        # convert to accepted dtype and get attributes
+        dataset = dataset.to_pandas()
+
+        # set transformer features
+        cat_cols = get_columns_by_role(dataset, "Category")
+        num_cols = get_columns_by_role(dataset, "Numeric")
+        feats = []
+        for group_column in cat_cols:
+            group_values = dataset.data[group_column].to_numpy()
+            group_by_processor = GroupByProcessor(group_values)
+
+            for feature_column in num_cols:
+                for kind in self.num_groups:
+                    feature = f"{self._fname_prefix}__{group_column}__{kind}__{feature_column}"
+                    self.dicts[feature] = {
+                        "group_column": group_column,
+                        "feature_column": feature_column,
+                        "groups": GroupByFactory.get_GroupBy(kind).fit(
+                            data=dataset.data,
+                            group_by_processor=group_by_processor,
+                            feature_column=feature_column,
+                        ),
+                        "kind": kind,
+                    }
+                    feats.append(feature)
+
+            if self.use_cat_groups:
+                for feature_column in cat_cols:
+                    if group_column != feature_column:
+                        kind = GroupByCatMode.class_kind
+
+                        # group results are the same for "cat_mode" and "cat_is_mode"
+                        groups_1 = GroupByFactory.get_GroupBy(kind).fit(
+                            data=dataset.data,
+                            group_by_processor=group_by_processor,
+                            feature_column=feature_column,
+                        )
+
+                        feature1 = f"{self._fname_prefix}__{group_column}__{kind}__{feature_column}"
+                        self.dicts[feature1] = {
+                            "group_column": group_column,
+                            "feature_column": feature_column,
+                            "groups": groups_1,
+                            "kind": kind,
+                        }
+
+                        kind = GroupByCatIsMode.class_kind
+
+                        # group results are the same for "cat_mode" and "cat_is_mode"
+                        groups_2 = GroupByFactory.get_GroupBy(kind)
+                        groups_2.set_dict(groups_1.get_dict())
+
+                        feature2 = f"{self._fname_prefix}__{group_column}__{kind}__{feature_column}"
+                        self.dicts[feature2] = {
+                            "group_column": group_column,
+                            "feature_column": feature_column,
+                            "groups": groups_2,
+                            "kind": kind,
+                        }
+                        feats.extend([feature1, feature2])
+
+        self._features = feats
+
+        return self
+
+    def transform(self, dataset):
+        """Calculate groups statistics.
+
+        Args:
+            dataset: Numpy or Pandas dataset with category and numeric columns.
+
+        Returns:
+            NumpyDataset of calculated group features (numeric).
+        """
+        # checks here
+        super().transform(dataset)
+
+        # convert to accepted dtype and get attributes
+        dataset = dataset.to_pandas()
+
+        # transform
+        roles = NumericRole()
+        outputs = []
+
+        for feat, value in self.dicts.items():
+
+            new_arr = value["groups"].transform(data=dataset.data, value=value)
+
+            output = dataset.empty().to_numpy()
+            output.set_data(new_arr, [feat], roles)
+            outputs.append(output)
+
+        # create resulting dataset
+        return dataset.empty().to_numpy().concat(outputs)
diff --git a/lightautoml/transformers/groupby.py b/lightautoml/transformers/groupby.py
new file mode 100644
index 00000000..c273b2fb
--- /dev/null
+++ b/lightautoml/transformers/groupby.py
@@ -0,0 +1,205 @@
+"""GroupBy (categorical/numerical) features transformer."""
+
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+
+from scipy.stats import mode as get_mode
+
+from ..dataset.base import LAMLDataset
+from ..dataset.roles import NumericRole
+from .base import LAMLTransformer
+
+
+_transform_types_numeric = ["delta_median", "delta_mean", "min", "max", "std"]
+_transform_types_categorical = ["mode", "is_mode"]
+
+
+class GroupByTransformer(LAMLTransformer):
+    """Transformer that calculates groupby features.
+
+    Types of group_by transformations:
+    - Numerical features:
+        - delta_median: Difference with group median.
+        - delta_mean: Difference with group mean.
+        - min: Group min.
+        - max: Group max.
+        - std: Group std.
+    - Categorical features:
+        - mode: Group mode.
+        - is_mode: Is current value equal to group mode.
+
+    Attributes:
+        features list(str): generated features names.
+
+    """
+
+    _fname_prefix = "grb"
+
+    @property
+    def features(self):
+        """Features list."""
+        return self._features
+
+    def __init__(
+        self,
+        group_col: str,
+        numeric_cols: Optional[List[str]] = None,
+        categorical_cols: Optional[List[str]] = None,
+        used_transforms: Optional[List[str]] = None,
+    ):
+        """Initialize transformer.
+
+        Args:
+            group_col: Name of categorical variable for grouping.
+            numeric_cols: List of numeric variables to calculate groupby with respect to the 'group_column'.
+            categorical_cols: List of categorical variables to calculate groupby with respect to the 'group_column'.
+            used_transforms: List of used transformation types, for example ["std", "mode", "delta_mean"].
+                If not specified, all available transformations are used.
+        """
+        # assert set(used_transforms).issubset(_transform_types_numeric + _transform_types_categorical), \
+        #     f"Only these transformation types supported: {_transform_types_numeric + _transform_types_categorical}"
+
+        super().__init__()
+        self.group_col = group_col
+        self.numeric_cols = numeric_cols if numeric_cols is not None else []
+        self.categorical_cols = categorical_cols if categorical_cols is not None else []
+        self._feat_idx = self._set_feature_indices()
+        self.used_transforms = (
+            used_transforms if used_transforms else _transform_types_numeric + _transform_types_categorical
+        )
+        self.numeric_transforms = [t for t in self.used_transforms if t in _transform_types_numeric]
+        self.categorical_transforms = [t for t in self.used_transforms if t in _transform_types_categorical]
+
+    def _set_feature_indices(self):
+        feat_idx = dict()
+        feat_idx[self.group_col] = 0
+        i = 1
+        for fc in self.categorical_cols:
+            feat_idx[fc] = i
+            i += 1
+        for fn in self.numeric_cols:
+            feat_idx[fn] = i
+            i += 1
+        return feat_idx
+
+    def fit(self, dataset: LAMLDataset):
+        """Fit transformer and return its instance.
+
+        Args:
+            dataset: Dataset to fit on.
+
+        Returns:
+            self.
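+
+        Example:
+            A sketch with column names from the demo; the input dataset is
+            expected to contain exactly the group column followed by the
+            declared categorical and numeric columns::
+
+                gbt = GroupByTransformer(
+                    group_col="CODE_GENDER",
+                    numeric_cols=["AMT_INCOME_TOTAL"],
+                    categorical_cols=["NAME_CONTRACT_TYPE"],
+                    used_transforms=["max", "delta_mean", "mode"],
+                )
+                gbt.fit(ds)  # ds: LAMLDataset with these three columns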
+ + """ + # checksum + assert dataset.shape[1] == len(self.categorical_cols) + len(self.numeric_cols) + 1 + self._roles = dataset.roles + dataset = dataset.to_pandas() + + # list of pairs ('feat_name', 'transform_type') + self.transformations_list = [] + self.transformations_list.extend([(f, t) for f in self.numeric_cols for t in self.numeric_transforms]) + self.transformations_list.extend([(f, t) for f in self.categorical_cols for t in self.categorical_transforms]) + # transformed feature names + self._features = [f"{self._fname_prefix}__{self.group_col}__{t}__{f}" for f, t in self.transformations_list] + self._features_mapping = {self._features[i]: k for i, k in enumerate(self.transformations_list)} + + self._group_ids_dict = self._calculate_group_ids(dataset) + self._group_stats_dict = self._calculate_group_stats(dataset) + + return self + + def _calculate_group_ids(self, dataset: LAMLDataset) -> dict: + """Extract unique values from group_col and make a dict with indices corresponding to each value.""" + group_values = dataset.data.iloc[:, self._feat_idx[self.group_col]].to_numpy() + group_ids_dict = dict() + for i, k in enumerate(group_values): + if k not in group_ids_dict: + group_ids_dict[k] = [i] + else: + group_ids_dict[k].append(i) + return {k: np.array(v) for k, v in group_ids_dict.items()} + + def _calculate_group_stats(self, dataset: LAMLDataset) -> dict: + """Calculate statistics for each transformed feature, corresponding to each pair (feature, 'transform_type').""" + group_stats = dict() + dataset = dataset.to_pandas() + for feature_name in self._features: + feat, trans = self._features_mapping[feature_name] + feature_vals = dataset.data.iloc[:, self._feat_idx[feat]].to_numpy() + group_stats[feature_name] = { + k: self._feature_stats(feature_vals[idx], trans) for k, idx in self._group_ids_dict.items() + } + return group_stats + + def _feature_stats(self, vals: np.ndarray, trans: str) -> Union[str, int, float]: + """Calculate statistics for vals vector according to 'trans' type.""" + return getattr(self, trans)(vals) + + def transform(self, dataset: LAMLDataset): + """Calculate groups statistics. + + Args: + dataset: Numpy or Pandas dataset with category and numeric columns. + + Returns: + NumpyDataset of calculated group features (numeric). 
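+
+        Example:
+            Continuing the sketch from ``fit``::
+
+                out = gbt.transform(ds)
+                # NumpyDataset with one float32 column per generated feature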
+        """
+        feats_block = []
+        dataset = dataset.to_pandas()
+        group_vals = dataset.data.iloc[:, self._feat_idx[self.group_col]].to_numpy()
+        for feature_name in self._features:
+            feat, trans = self._features_mapping[feature_name]
+            feature_vals = dataset.data.iloc[:, self._feat_idx[feat]].to_numpy()
+            stats_from_fit = np.vectorize(self._group_stats_dict[feature_name].get)(group_vals)
+            new_feature_vals = self._transform_one(stats_from_fit, feature_vals, trans)
+            feats_block.append(new_feature_vals[:, np.newaxis])
+        feats_block = np.concatenate(feats_block, axis=1)
+        # create resulting dataset
+        output = dataset.empty().to_numpy()
+        output.set_data(feats_block, self.features, NumericRole(dtype=np.float32))
+        return output
+
+    def _transform_one(self, stats_from_fit, feature_vals, transform_type):
+        """Calculate transformation for one pair (feature, 'transform_type')."""
+        if transform_type in ["min", "max", "std", "mode"]:
+            return stats_from_fit
+        elif transform_type in ["delta_mean", "delta_median"]:
+            return feature_vals - stats_from_fit
+        elif transform_type == "is_mode":
+            return feature_vals == stats_from_fit
+        else:
+            raise ValueError(f"Unknown transformation type: {transform_type}")
+
+    def delta_median(self, vals: np.ndarray) -> float:
+        """Alias for numpy median function. Needs subtraction from the feature value to get the 'delta_median' transformation."""
+        return np.nanmedian(vals)
+
+    def delta_mean(self, vals: np.ndarray) -> float:
+        """Alias for numpy mean function. Needs subtraction from the feature value to get the 'delta_mean' transformation."""
+        return np.nanmean(vals)
+
+    def min(self, vals: np.ndarray) -> float:
+        """Alias for numpy min function."""
+        return np.nanmin(vals)
+
+    def max(self, vals: np.ndarray) -> float:
+        """Alias for numpy max function."""
+        return np.nanmax(vals)
+
+    def std(self, vals: np.ndarray) -> float:
+        """Alias for numpy std function."""
+        return np.nanstd(vals)
+
+    def mode(self, vals: np.ndarray) -> float:
+        """Calculate the mode of a categorical variable."""
+        return get_mode(vals, keepdims=True)[0][0]
+
+    def is_mode(self, vals: np.ndarray) -> float:
+        """Calculate the mode of a categorical variable.
+        Needs comparison with the initial feature value to get the 'is_mode' transformation."""
+        return get_mode(vals, keepdims=True)[0][0]
diff --git a/lightautoml/transformers/utils.py b/lightautoml/transformers/utils.py
new file mode 100644
index 00000000..a2493c53
--- /dev/null
+++ b/lightautoml/transformers/utils.py
@@ -0,0 +1,269 @@
+"""Utils for transformers."""
+
+import numpy as np
+
+from scipy.stats import mode
+
+
+def get_mode(x):
+    """Helper function to calculate mode."""
+    return mode(x)[0][0]
+
+
+class GroupByProcessor:
+    """Helper class to calculate group_by features."""
+
+    def __init__(self, keys):
+        super().__init__()
+
+        assert keys is not None
+
+        self.index, self.keys_as_int = np.unique(keys, return_inverse=True)
+        self.n_keys = max(self.keys_as_int) + 1
+        self.set_indices()
+
+    def set_indices(self):
+        """Sets indices for keys."""
+        self.indices = [[] for i in range(self.n_keys)]
+        for i, k in enumerate(self.keys_as_int):
+            self.indices[k].append(i)
+        self.indices = [np.array(elt) for elt in self.indices]
+
+    def apply(self, functions, vectors):
+        """Applies functions to vectors."""
+        assert functions is not None
+        assert vectors is not None
+
+        if isinstance(functions, list):
+            return [[fun(vec[idx].tolist()) for fun, vec in zip(functions, vectors)] for idx in (self.indices)]
+        else:
+            return [functions(vectors[idx].tolist()) for idx in (self.indices)]
+
+
+class GroupByFactory:
+    """Factory to create group_by classes."""
+
+    @staticmethod
+    def get_GroupBy(kind):
+        """Uses string identifiers to locate the appropriate implementation.
+
+        Args:
+            kind: groupby feature transformation class.
+
+        Example:
+            GroupByFactory.get_GroupBy('delta_mean')
+
+        Returns:
+            Object of GroupByBase implementing the selected feature.
+
+        Raises:
+            ValueError: if identifier is not found.
+        """
+        assert kind is not None
+
+        available_classes = [
+            GroupByNumDeltaMean,
+            GroupByNumDeltaMedian,
+            GroupByNumMin,
+            GroupByNumMax,
+            GroupByNumStd,
+            GroupByCatMode,
+            GroupByCatIsMode,
+        ]
+
+        for class_name in available_classes:
+            if kind == class_name.class_kind:
+                return class_name(
+                    class_name.class_kind,
+                    class_name.class_fit_func,
+                    class_name.class_transform_func,
+                )
+
+        raise ValueError(
+            f"Unsupported kind: {kind}, available={[class_name.class_kind for class_name in available_classes]}"
+        )
+
+
+class GroupByBase:
+    """Base class for all group_by features.
+
+    Note:
+        Typically is created from GroupByFactory.
+
+    Args:
+        kind (string): Id of group_by feature.
+        fit_func (function): function to calculate groups.
+        transform_func (function): function to calculate statistics based on fitted groups.
+
+    Example:
+        GroupByBase(GroupByNumDeltaMean.class_kind, GroupByNumDeltaMean.class_fit_func, GroupByNumDeltaMean.class_transform_func)
+
+    """
+
+    def __init__(self, kind, fit_func, transform_func):
+        super().__init__()
+
+        self.kind = kind
+        self.fit_func = fit_func
+        self.transform_func = transform_func
+
+        self._dict = None
+
+    def get_dict(self):
+        """Gets dict with features' statistics."""
+        return self._dict
+
+    def set_dict(self, dict):
+        """Sets dict with features' statistics."""
+        self._dict = dict
+
+    def fit(self, data, group_by_processor, feature_column):
+        """Calculate groups.
+
+        Note:
+            GroupByProcessor must be initialized before the call to this function.
+
+        Args:
+            data (dataset): input data to extract ``feature_column``.
+            group_by_processor (GroupByProcessor): processor containing groups.
+            feature_column (string): name of column to calculate statistics.
+ + Returns: + self + + """ + assert data is not None + assert group_by_processor is not None + assert feature_column is not None + + assert self.fit_func is not None + + feature_values = data[feature_column].to_numpy() + self._dict = dict( + zip( + group_by_processor.index, + group_by_processor.apply(self.fit_func, feature_values), + ) + ) + + assert self._dict is not None + + return self + + def transform(self, data, value): + """Calculate features statistics. + + Note: + ``fit`` function must be called before ``transform``. + + Args: + data (dataset): input data to extract ``value['group_column']`` and ``value['feature_column']``. + value (dict): column names. + + Returns: + transformed data + + """ + assert data is not None + assert value is not None + + assert self.transform_func is not None + + group_values = data[value["group_column"]].to_numpy() + feature_values = data[value["feature_column"]].to_numpy() + result = self.transform_func( + tuple( + [ + np.nan_to_num(np.array(np.vectorize(self._dict.get)(group_values), dtype=float)), + feature_values, + ] + ) + ).reshape(-1, 1) + + assert result is not None + return result + + +class GroupByNumDeltaMean(GroupByBase): + """Groupby delta mean class.""" + + class_kind = "delta_mean" + class_fit_func = np.nanmean + + @staticmethod + def class_transform_func(values): + """Get difference between feature values inside group and mean value.""" + return values[1] - values[0] + + +class GroupByNumDeltaMedian(GroupByBase): + """Groupby delta median class.""" + + class_kind = "delta_median" + class_fit_func = np.nanmedian + + @staticmethod + def class_transform_func(values): + """Get difference between feature values inside group and median value.""" + return values[1] - values[0] + + +class GroupByNumMin(GroupByBase): + """Groupby min class.""" + + class_kind = "min" + class_fit_func = np.nanmin + + @staticmethod + def class_transform_func(values): + """Get min value inside each group.""" + return values[0] + + +class GroupByNumMax(GroupByBase): + """Groupby max class.""" + + class_kind = "max" + class_fit_func = np.nanmax + + @staticmethod + def class_transform_func(values): + """Get max value inside each group.""" + return values[0] + + +class GroupByNumStd(GroupByBase): + """Groupby std class.""" + + class_kind = "std" + class_fit_func = np.nanstd + + @staticmethod + def class_transform_func(values): + """Get std value inside each group.""" + return values[0] + + +class GroupByCatMode(GroupByBase): + """Groupby cat mode class.""" + + class_kind = "mode" + class_fit_func = get_mode + + @staticmethod + def class_transform_func(values): + """Get category mode inside each group.""" + return values[0] + + +class GroupByCatIsMode(GroupByBase): + """Groupby is mode class.""" + + class_kind = "is_mode" + class_fit_func = get_mode + + @staticmethod + def class_transform_func(values): + """Check if category value is mode inside each group.""" + return values[0] == values[1]