Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/groupby2 #58

Merged
merged 23 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
12. `demo11.py` - AutoML NLP preset usage for tabular datasets with text columns
13. `demo12.py` - AutoML tabular preset usage with custom validation scheme and multiprocessed inference
14. `demo13.py` - AutoML TS preset usage with lag and diff transformers' parameters selection
15. `demo14.py` - Groupby features (using TabularAutoML preset and custom pipeline)
109 changes: 109 additions & 0 deletions examples/demo14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python
# coding: utf-8

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from lightautoml.automl.base import AutoML
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import CategoryRole
from lightautoml.dataset.roles import NumericRole
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector
from lightautoml.pipelines.selection.importance_based import (
ModelBasedImportanceEstimator,
)
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task


################################
# Features:
# - group_by transformer
################################

N_FOLDS = 3  # number of folds for cross-validation inside AutoML
RANDOM_STATE = 42  # fixed random state for various reasons
N_THREADS = 4  # threads cnt for lgbm and linear models
TIMEOUT = 100  # passed as `timeout` to the TabularAutoML preset below
USED_COLS = ["SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER", "AMT_INCOME_TOTAL", "DAYS_BIRTH"]
TARGET = "TARGET"

# load data (only the columns listed in USED_COLS are kept)
data = pd.read_csv("./data/sampled_app_train.csv")
data = data[USED_COLS]
# Reuse RANDOM_STATE rather than repeating the literal, so the hold-out split
# and the reader's CV split are controlled by the same constant.
train, test = train_test_split(data, test_size=2000, random_state=RANDOM_STATE)

# Using TabularAutoML preset
task = Task("binary")
# Roles are given explicitly for the columns that take part in the groupby
# triplets below; the remaining columns are left for the reader to handle.
roles = {
    "target": TARGET,
    CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "CODE_GENDER"],
    NumericRole(np.float32): ["AMT_INCOME_TOTAL"],
}

# specify groupby triplets: [("group_col", "feature", "transform_type"),]
groupby_triplets = [
    ("CODE_GENDER", "AMT_INCOME_TOTAL", "max"),
    ("NAME_CONTRACT_TYPE", "CODE_GENDER", "mode"),
    ("NAME_CONTRACT_TYPE", "AMT_INCOME_TOTAL", "delta_mean"),
]

print(f"Try TabularAutoML with the following groupby_triplets:\n{groupby_triplets}")

automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={"n_jobs": N_THREADS, "cv": N_FOLDS, "random_state": RANDOM_STATE},
    # Restrict the preset to a single LightGBM model on one level.
    general_params={"use_algos": [["lgb"]]},
    # Switch groupby features on and hand over the explicit triplets.
    # NOTE(review): per the preset config comments, a non-empty triplet list
    # takes precedence over the automatic groupby_* selection parameters.
    gbm_pipeline_params={"use_groupby": True, "groupby_triplets": groupby_triplets},
)
# The return value is not needed here; the fitted `automl` object is what the
# rest of the demo inspects.
automl.fit_predict(train, roles=roles)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add comment for feature_scores

# Walk into the fitted preset step by step: first entry of the first level,
# then its first algorithm, and ask that algorithm for its feature scores.
# NOTE(review): presumably this is the BoostLGBM model, because only "lgb"
# was requested in `use_algos` - confirm if the preset configuration changes.
first_level_pipeline = automl.levels[0][0]
first_algo = first_level_pipeline.ml_algos[0]
feature_scores = first_algo.get_features_score()

print(f"Feature importances of BoostLGBM model. Pay attention to groupby features:\n{feature_scores}")

# Second part of the demo: a custom pipeline with groupby features defined by importance
print("\nTry custom pipeline with groupby features defined by importance:\n")

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add comment for custom pipeline

# Custom pipeline, step 1: reader plus an importance-based selector.
task = Task("binary")
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

# Auxiliary LightGBM model that is handed to the selector (not the final model).
selector_model_params = {
    "learning_rate": 0.1,
    "num_leaves": 64,
    "seed": 42,
    "num_threads": N_THREADS,
}
model0 = BoostLGBM(default_params=selector_model_params)
pie = ModelBasedImportanceEstimator()
# NOTE(review): cutoff=-9999 presumably keeps every feature, so the selector
# serves as a source of importances rather than as a filter - confirm.
selector = ImportanceCutoffSelector(LGBSimpleFeatures(), model0, pie, cutoff=-9999)


pipe = LGBAdvancedPipeline(
use_groupby=True, pre_selector=selector, groupby_types=["delta_median", "std"], groupby_top_based_on="importance"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feats_imp

)

# Custom pipeline, step 2: the model that is trained on the generated features.
final_model_params = {
    "learning_rate": 0.05,
    "num_leaves": 128,
    "seed": 1,
    "num_threads": N_THREADS,
}
model = BoostLGBM(default_params=final_model_params)

# The same selector object is used for pre-selection here and was also given
# to the feature pipeline `pipe` when it was constructed.
pipeline = MLPipeline(
    [model],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

# One level containing one pipeline.
automl = AutoML(reader, [[pipeline]], skip_conn=False)

# Only the target role is specified; other roles are left to the reader.
oof_pred = automl.fit_predict(train, roles={"target": TARGET})

print(f"Feature used by BoostLGBM model. Pay attention to groupby features:\n{pipe.output_features}")
30 changes: 28 additions & 2 deletions lightautoml/automl/presets/image_config.yml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,21 @@ gbm_pipeline_params:
auto_unique_co: 10
# n_classes to use target encoding for multiclass task
multiclass_te_co: 3
# use groupby features
use_groupby: False
# groupby types used in feature engineering
groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
# top features are chosen by cardinality or feature importance
groupby_top_based_on: 'cardinality'
# top categorical features to use in groupby transformer
groupby_top_categorical: 3
# top numerical features to use in groupby transformer
groupby_top_numerical: 3
# list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
# disables groupby_types, groupby_top_based_on and other groupby parameters if defined
groupby_triplets: [ ]
# text_features in gbm feature pipeline. embed or simple
cv_features: "simple"
cv_features: 'simple'

linear_pipeline_params:
# max number of categories to generate intersections
Expand All @@ -141,8 +154,21 @@ linear_pipeline_params:
auto_unique_co: 50
# n_classes to use target encoding for multiclass task
multiclass_te_co: 3
# use groupby features
use_groupby: False
# groupby types used in feature engineering
groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
# top features are chosen by cardinality or feature importance
groupby_top_based_on: 'cardinality'
# top categorical features to use in groupby transformer
groupby_top_categorical: 3
# top numerical features to use in groupby transformer
groupby_top_numerical: 3
# list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
# disables groupby_types, groupby_top_based_on and other groupby parameters if defined
groupby_triplets: [ ]
# text_features in linear feature pipeline. embed or simple
cv_features: "embed"
cv_features: 'embed'

timing_params:
# select timing mode:
Expand Down
2 changes: 1 addition & 1 deletion lightautoml/automl/presets/image_presets.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def get_gbms(
):
"""Get gbm pipeline."""
cv_gbm_feats = self.get_cv_pipe(self.gbm_pipeline_params["cv_features"])
gbm_feats = LGBAdvancedPipeline(output_categories=False, **self.gbm_pipeline_params)
gbm_feats = LGBAdvancedPipeline(feats_imp=pre_selector, output_categories=False, **self.gbm_pipeline_params)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feats_imp=pre_selector in get_linear and get_rfs

if cv_gbm_feats is not None:
gbm_feats.append(cv_gbm_feats)

Expand Down
26 changes: 26 additions & 0 deletions lightautoml/automl/presets/tabular_config.yml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,19 @@ gbm_pipeline_params:
multiclass_te_co: 3
# DEV feature: output categorical features as categories (if True, can totally overfit your model - be careful!)
output_categories: False
# use groupby features
use_groupby: False
# groupby types used in feature engineering
groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
# top features are chosen by cardinality or feature importance
groupby_top_based_on: 'cardinality'
# top categorical features to use in groupby transformer
groupby_top_categorical: 3
# top numerical features to use in groupby transformer
groupby_top_numerical: 3
# list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
# disables groupby_types, groupby_top_based_on and other groupby parameters if defined
groupby_triplets: [ ]

linear_pipeline_params:
# max number of categories to generate intersections
Expand All @@ -255,6 +268,19 @@ linear_pipeline_params:
auto_unique_co: 50
# n_classes to use target encoding for multiclass task
multiclass_te_co: 3
# use groupby features
use_groupby: False
# groupby types used in feature engineering
groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
# top features are chosen by cardinality or feature importance
groupby_top_based_on: 'cardinality'
# top categorical features to use in groupby transformer
groupby_top_categorical: 3
# top numerical features to use in groupby transformer
groupby_top_numerical: 3
# list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
# disables groupby_types, groupby_top_based_on and other groupby parameters if defined
groupby_triplets: [ ]

nn_pipeline_params:
# use quantile transformer for numerical columns
Expand Down
4 changes: 2 additions & 2 deletions lightautoml/automl/presets/tabular_presets.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def get_gbms(
pre_selector: Optional[SelectionPipeline] = None,
):

gbm_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params)
gbm_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params, feats_imp=pre_selector)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add pre_selector to linear_l2_feats init in get_linear


ml_algos = []
force_calc = []
Expand Down Expand Up @@ -464,7 +464,7 @@ def get_gbms(

def get_rfs(self, keys: Sequence[str], n_level: int = 1, pre_selector: Optional[SelectionPipeline] = None):

rf_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params, fill_na=True)
rf_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params, feats_imp=pre_selector, fill_na=True)

ml_algos = []
force_calc = []
Expand Down
26 changes: 26 additions & 0 deletions lightautoml/automl/presets/text_config.yml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,19 @@ gbm_pipeline_params:
auto_unique_co: 10
# n_classes to use target encoding for multiclass task
multiclass_te_co: 3
# use groupby features
use_groupby: False
# groupby types used in feature engineering
groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
# top features are chosen by cardinality or feature importance
groupby_top_based_on: 'cardinality'
# top categorical features to use in groupby transformer
groupby_top_categorical: 3
# top numerical features to use in groupby transformer
groupby_top_numerical: 3
# list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
# disables groupby_types, groupby_top_based_on and other groupby parameters if defined
groupby_triplets: [ ]
# text_features in gbm feature pipeline. embed or tfidf
text_features: "embed"

Expand All @@ -200,6 +213,19 @@ linear_pipeline_params:
auto_unique_co: 50
# n_classes to use target encoding for multiclass task
multiclass_te_co: 3
# use groupby features
use_groupby: False
# groupby types used in feature engineering
groupby_types: [ 'delta_median', 'delta_mean', 'min', 'max', 'std', 'mode', 'is_mode' ]
# top features are chosen by cardinality or feature importance
groupby_top_based_on: 'cardinality'
# top categorical features to use in groupby transformer
groupby_top_categorical: 3
# top numerical features to use in groupby transformer
groupby_top_numerical: 3
# list of groupby triplets ("group_col", "feat", "groupby_type") for manual setting
# disables groupby_types, groupby_top_based_on and other groupby parameters if defined
groupby_triplets: [ ]
# text_features in linear feature pipeline. embed or tfidf
text_features: "tfidf"

Expand Down
2 changes: 1 addition & 1 deletion lightautoml/automl/presets/text_presets.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def get_gbms(
):

text_gbm_feats = self.get_nlp_pipe(self.gbm_pipeline_params["text_features"])
gbm_feats = LGBAdvancedPipeline(output_categories=False, **self.gbm_pipeline_params)
gbm_feats = LGBAdvancedPipeline(feats_imp=pre_selector, output_categories=False, **self.gbm_pipeline_params)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need feats_imp=pre_selector in get_linear?

if text_gbm_feats is not None:
gbm_feats.append(text_gbm_feats)

Expand Down
Loading