-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/groupby2 #58
Feature/groupby2 #58
Changes from 21 commits
791465f
58908ef
4d7f3d3
52eb953
0ad86ec
23dd0f9
ec95486
8ecd887
dd14c90
75a8e55
5517731
55de5a7
07fa43f
cc5e886
c8b3e1a
ebd9012
9bf257d
22d8425
9aec794
a4001e4
8133380
06af4a8
97bb508
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#!/usr/bin/env python | ||
# coding: utf-8 | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from sklearn.model_selection import train_test_split | ||
|
||
from lightautoml.automl.base import AutoML | ||
from lightautoml.automl.presets.tabular_presets import TabularAutoML | ||
from lightautoml.dataset.roles import CategoryRole | ||
from lightautoml.dataset.roles import NumericRole | ||
from lightautoml.ml_algo.boost_lgbm import BoostLGBM | ||
from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline | ||
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures | ||
from lightautoml.pipelines.ml.base import MLPipeline | ||
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector | ||
from lightautoml.pipelines.selection.importance_based import ( | ||
ModelBasedImportanceEstimator, | ||
) | ||
from lightautoml.reader.base import PandasToPandasReader | ||
from lightautoml.tasks import Task | ||
|
||
|
||
################################ | ||
# Features: | ||
# - group_by transformer | ||
################################ | ||
|
||
N_FOLDS = 3  # number of folds for cross-validation inside AutoML
RANDOM_STATE = 42  # fixed random state for various reasons
N_THREADS = 4  # threads cnt for lgbm and linear models
TIMEOUT = 100  # time budget passed to TabularAutoML below (presumably seconds — confirm against lightautoml docs)
USED_COLS = ["SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER", "AMT_INCOME_TOTAL", "DAYS_BIRTH"]
TARGET = "TARGET"

# Load the sampled application data and keep only the demo columns.
data = pd.read_csv("./data/sampled_app_train.csv")
data = data[USED_COLS]
# Fix: use the RANDOM_STATE constant instead of a duplicated literal 42 so the
# seed is controlled from a single place (same value, so behavior is unchanged).
train, test = train_test_split(data, test_size=2000, random_state=RANDOM_STATE)
|
||
# ----- Part 1: TabularAutoML preset with groupby features -----
task = Task("binary")

# Column roles: the target plus explicitly typed category / numeric columns.
roles = {
    "target": TARGET,
    CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "CODE_GENDER"],
    NumericRole(np.float32): ["AMT_INCOME_TOTAL"],
}

# Groupby triplets have the form ("group_col", "feature", "transform_type").
groupby_triplets = [
    ("CODE_GENDER", "AMT_INCOME_TOTAL", "max"),
    ("NAME_CONTRACT_TYPE", "CODE_GENDER", "mode"),
    ("NAME_CONTRACT_TYPE", "AMT_INCOME_TOTAL", "delta_mean"),
]

print(f"Try TabularAutoML with the following groupby_triplets:\n{groupby_triplets}")

# Named parameter dicts keep the constructor call readable.
reader_settings = {"n_jobs": N_THREADS, "cv": N_FOLDS, "random_state": RANDOM_STATE}
algo_settings = {"use_algos": [["lgb"]]}
gbm_settings = {"use_groupby": True, "groupby_triplets": groupby_triplets}

automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=reader_settings,
    general_params=algo_settings,
    gbm_pipeline_params=gbm_settings,
)
automl.fit_predict(train, roles=roles)

# Importances of the single LGBM model at level 0 of the fitted AutoML.
feature_scores = automl.levels[0][0].ml_algos[0].get_features_score()

print(f"Feature importances of BoostLGBM model. Pay attention to groupby features:\n{feature_scores}")

# ----- Part 2: custom pipeline with groupby features defined by importance -----
print("\nTry custom pipeline with groupby features defined by importance:\n")
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review note: add a comment for the custom pipeline. |
||
# Build the custom pipeline: a selector trained on simple features supplies
# the feature importances that drive groupby feature generation.
task = Task("binary")
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

# Helper LGBM used only to estimate feature importances for the selector.
model0 = BoostLGBM(default_params={"learning_rate": 0.1, "num_leaves": 64, "seed": 42, "num_threads": N_THREADS})
pie = ModelBasedImportanceEstimator()
# cutoff=-9999 keeps every feature: the selector serves only as an
# importance source here, not as an actual feature filter.
selector = ImportanceCutoffSelector(LGBSimpleFeatures(), model0, pie, cutoff=-9999)

# Fix: groupby_top_based_on="importance" requires feature importances, which
# LGBAdvancedPipeline receives via `feats_imp` (the presets in this same
# changeset pass feats_imp=pre_selector). The original keyword
# `pre_selector=selector` appears to be wrong — confirm against the
# LGBAdvancedPipeline signature.
pipe = LGBAdvancedPipeline(
    use_groupby=True,
    feats_imp=selector,
    groupby_types=["delta_median", "std"],
    groupby_top_based_on="importance",
)
|
||
# Final LGBM trained on the advanced (groupby-augmented) feature set.
booster_params = {
    "learning_rate": 0.05,
    "num_leaves": 128,
    "seed": 1,
    "num_threads": N_THREADS,
}
final_model = BoostLGBM(default_params=booster_params)

# Single ML pipeline: selector as pre-selection, advanced features, no
# post-selection step.
ml_pipeline = MLPipeline(
    [final_model],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

# One-level AutoML over that single pipeline; no skip connections.
automl = AutoML(reader, [[ml_pipeline]], skip_conn=False)

oof_pred = automl.fit_predict(train, roles={"target": TARGET})

print(f"Feature used by BoostLGBM model. Pay attention to groupby features:\n{pipe.output_features}")
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -231,7 +231,7 @@ def get_gbms( | |
): | ||
"""Get gbm pipeline.""" | ||
cv_gbm_feats = self.get_cv_pipe(self.gbm_pipeline_params["cv_features"]) | ||
gbm_feats = LGBAdvancedPipeline(output_categories=False, **self.gbm_pipeline_params) | ||
gbm_feats = LGBAdvancedPipeline(feats_imp=pre_selector, output_categories=False, **self.gbm_pipeline_params) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. feats_imp=pre_selector in get_linear and get_rfs |
||
if cv_gbm_feats is not None: | ||
gbm_feats.append(cv_gbm_feats) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -429,7 +429,7 @@ def get_gbms( | |
pre_selector: Optional[SelectionPipeline] = None, | ||
): | ||
|
||
gbm_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params) | ||
gbm_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params, feats_imp=pre_selector) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add pre_selector to linear_l2_feats init in get_linear |
||
|
||
ml_algos = [] | ||
force_calc = [] | ||
|
@@ -464,7 +464,7 @@ def get_gbms( | |
|
||
def get_rfs(self, keys: Sequence[str], n_level: int = 1, pre_selector: Optional[SelectionPipeline] = None): | ||
|
||
rf_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params, fill_na=True) | ||
rf_feats = LGBAdvancedPipeline(**self.gbm_pipeline_params, feats_imp=pre_selector, fill_na=True) | ||
|
||
ml_algos = [] | ||
force_calc = [] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -348,7 +348,7 @@ def get_gbms( | |
): | ||
|
||
text_gbm_feats = self.get_nlp_pipe(self.gbm_pipeline_params["text_features"]) | ||
gbm_feats = LGBAdvancedPipeline(output_categories=False, **self.gbm_pipeline_params) | ||
gbm_feats = LGBAdvancedPipeline(feats_imp=pre_selector, output_categories=False, **self.gbm_pipeline_params) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review note: do we need feats_imp=pre_selector in get_linear? |
||
if text_gbm_feats is not None: | ||
gbm_feats.append(text_gbm_feats) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add comment for feature_scores