Make hyperparameters flexible (#65)
mirand863 committed Nov 28, 2022
1 parent 78b6462 commit b225ef3
Showing 5 changed files with 91 additions and 194 deletions.
benchmarks/consumer_complaints/README.md (2 changes: 1 addition & 1 deletion)
````diff
@@ -49,7 +49,7 @@ n_estimators: 1
 criterion: 1
 ```
 
-The intervals for testing can be defined with the functions `range` or `choice`, as described on [Hydra's documentation](https://hydra.cc/docs/plugins/optuna_sweeper/). If you wish to add more parameters for testing, you can simply add the parameter name inside the `params` field and at the end of the file set it to 1 in order to enable its usage in Hydra. Additionally, you would need to modify one of the functions `configure_lightgbm`, `configure_logistic_regression` or `configure_random_forest` (whichever is appropriate) inside the script [tune.py](scripts/tune.py) to enable the new hyperparameter.
+The intervals for testing can be defined with the functions `range` or `choice`, as described on [Hydra's documentation](https://hydra.cc/docs/plugins/optuna_sweeper/). If you wish to add more parameters for testing, you can simply add the parameter name inside the `params` field and at the end of the file set it to 1 in order to enable its usage in Hydra.
 
 ## Running locally
 
````
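The sentence removed above is now obsolete: after this commit, any key that survives the new `delete_non_hyperparameters` filter in [tune.py](scripts/tune.py) is forwarded to the estimator through scikit-learn's generic `set_params`, so no per-classifier `configure_*` function has to be edited. A minimal sketch of that pass-through pattern (the config keys and values below are made up for illustration):

```python
# Sketch of the pass-through pattern introduced by this commit.
# The config dict and its values are hypothetical examples.
from sklearn.ensemble import RandomForestClassifier

config = {
    "model": "flat",                # bookkeeping key, not a hyperparameter
    "classifier": "random_forest",  # bookkeeping key, not a hyperparameter
    "n_estimators": 200,            # hyperparameter, forwarded as-is
    "criterion": "entropy",         # hyperparameter, forwarded as-is
}
non_hyperparameters = ["model", "classifier"]

# Strip the bookkeeping keys and hand everything else to the estimator.
hyperparameters = {k: v for k, v in config.items() if k not in non_hyperparameters}
classifier = RandomForestClassifier()
classifier.set_params(**hyperparameters)

print(classifier.n_estimators, classifier.criterion)  # 200 entropy
```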
benchmarks/consumer_complaints/scripts/tune.py (124 changes: 45 additions & 79 deletions)
```diff
@@ -15,7 +15,6 @@
 from lightgbm import LGBMClassifier
 from numpy.core._exceptions import _ArrayMemoryError
 from omegaconf import DictConfig, OmegaConf
-from sklearn.base import BaseEstimator
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.linear_model import LogisticRegression
@@ -30,89 +29,55 @@
 )
 from hiclass.metrics import f1
 
 
 log = logging.getLogger("TUNE")
 
 
-def configure_lightgbm(cfg: DictConfig) -> BaseEstimator:
-    """
-    Configure LightGBM with parameters passed as argument.
-
-    Parameters
-    ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
-
-    Returns
-    -------
-    classifier : BaseEstimator
-        Estimator with hyper-parameters configured.
-    """
-    classifier = LGBMClassifier(
-        n_jobs=cfg.n_jobs,
-        num_leaves=cfg.num_leaves,
-        n_estimators=cfg.n_estimators,
-        min_child_samples=cfg.min_child_samples,
-    )
-    return classifier
-
-
-def configure_logistic_regression(cfg: DictConfig) -> BaseEstimator:
-    """
-    Configure LogisticRegression with parameters passed as argument.
-
-    Parameters
-    ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
-
-    Returns
-    -------
-    classifier : BaseEstimator
-        Estimator with hyper-parameters configured.
-    """
-    classifier = LogisticRegression(
-        n_jobs=cfg.n_jobs,
-        solver=cfg.solver,
-        max_iter=cfg.max_iter,
-    )
-    return classifier
-
-
-def configure_random_forest(cfg: DictConfig) -> BaseEstimator:
-    """
-    Configure RandomForest with parameters passed as argument.
-
-    Parameters
-    ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
-
-    Returns
-    -------
-    classifier : BaseEstimator
-        Estimator with hyper-parameters configured.
-    """
-    classifier = RandomForestClassifier(
-        n_jobs=cfg.n_jobs,
-        n_estimators=cfg.n_estimators,
-        criterion=cfg.criterion,
-    )
-    return classifier
-
-
 configure_flat = {
-    "lightgbm": configure_lightgbm,
-    "logistic_regression": configure_logistic_regression,
-    "random_forest": configure_random_forest,
+    "lightgbm": LGBMClassifier(),
+    "logistic_regression": LogisticRegression(),
+    "random_forest": RandomForestClassifier(),
 }
 
 configure_hierarchical = {
     "local_classifier_per_node": LocalClassifierPerNode(),
     "local_classifier_per_parent_node": LocalClassifierPerParentNode(),
     "local_classifier_per_level": LocalClassifierPerLevel(),
 }
+
+non_hyperparameters = [
+    "model",
+    "classifier",
+    "n_jobs",
+    "x_train",
+    "y_train",
+    "output_dir",
+    "mem_gb",
+    "n_splits",
+]
+
+
+def delete_non_hyperparameters(cfg: OmegaConf) -> dict:
+    """
+    Delete non-hyperparameters from the dictionary.
+
+    Parameters
+    ----------
+    cfg : OmegaConf
+        Dictionary to delete non-hyperparameters from.
+
+    Returns
+    -------
+    hyperparameters : dict
+        Dictionary containing only hyperparameters.
+    """
+    hyperparameters = OmegaConf.to_container(cfg)
+    for key in non_hyperparameters:
+        if key in hyperparameters:
+            del hyperparameters[key]
+    return hyperparameters
```
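`OmegaConf.to_container` converts the Hydra `DictConfig` into a plain `dict`, after which the bookkeeping keys are dropped. A usage sketch of the new helper, assuming `delete_non_hyperparameters` as defined above is in scope (the config values are invented):

```python
# Usage sketch for delete_non_hyperparameters (invented example values).
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "model": "flat",           # filtered out
        "classifier": "lightgbm",  # filtered out
        "n_jobs": 8,               # filtered out
        "num_leaves": 31,          # LightGBM hyperparameter, kept
        "min_child_samples": 20,   # LightGBM hyperparameter, kept
    }
)

print(delete_non_hyperparameters(cfg))
# {'num_leaves': 31, 'min_child_samples': 20}
```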
```diff
@@ -130,9 +95,11 @@ def configure_pipeline(cfg: DictConfig) -> Pipeline:
         Pipeline with hyper-parameters configured.
     """
     if cfg.model == "flat":
-        classifier = configure_flat[cfg.classifier](cfg)
+        classifier = configure_flat[cfg.classifier]
+        classifier.set_params(**delete_non_hyperparameters(cfg))
     else:
-        local_classifier = configure_flat[cfg.classifier](cfg)
+        local_classifier = configure_flat[cfg.classifier]
+        local_classifier.set_params(**delete_non_hyperparameters(cfg))
         local_classifier.set_params(n_jobs=1)
         classifier = configure_hierarchical[cfg.model]
         classifier.set_params(
```
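In the hierarchical branch, the configured flat estimator is nested inside the HiClass wrapper, and its `n_jobs` is pinned to 1, presumably so parallelism is owned by the hierarchical classifier rather than by every local model at once. A sketch of that wiring, assuming HiClass's usual `local_classifier` and `n_jobs` parameters and made-up hyperparameter values (the arguments elided from the hunk above are not reproduced):

```python
# Sketch of the hierarchical wiring (hypothetical hyperparameter values).
from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerNode

local_classifier = LogisticRegression()
local_classifier.set_params(solver="saga", max_iter=1000)
local_classifier.set_params(n_jobs=1)  # hierarchical wrapper owns parallelism

classifier = LocalClassifierPerNode()
classifier.set_params(local_classifier=local_classifier, n_jobs=8)
```

One consequence of replacing the `configure_*` factories with dictionaries of ready-made instances is that `set_params` now reconfigures the same shared estimator object in place on every trial instead of building a fresh one.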
```diff
@@ -148,25 +115,22 @@ def configure_pipeline(cfg: DictConfig) -> Pipeline:
     return pipeline
 
 
-def compute_md5(cfg: DictConfig) -> str:
+def compute_md5(cfg: dict) -> str:
     """
     Compute MD5 hash of configuration.
 
     Parameters
     ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
+    cfg : dict
+        Dictionary containing hyperparameters.
 
     Returns
     -------
     md5 : str
         MD5 hash of configuration.
     """
-    dictionary = OmegaConf.to_object(cfg)
-    md5 = hashlib.md5(
-        json.dumps(dictionary, sort_keys=True).encode("utf-8")
-    ).hexdigest()
+    md5 = hashlib.md5(json.dumps(cfg, sort_keys=True).encode("utf-8")).hexdigest()
     return md5
 
```
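Because `compute_md5` now receives only the filtered hyperparameters, two runs that differ solely in bookkeeping settings (paths, job counts, fold counts) hash to the same cache file. The key itself is the standard sorted-keys-JSON recipe; a self-contained sketch with made-up values:

```python
# The cache key: MD5 over a canonical JSON encoding (illustrative values).
import hashlib
import json

hyperparameters = {"criterion": "entropy", "n_estimators": 200}

md5 = hashlib.md5(
    json.dumps(hyperparameters, sort_keys=True).encode("utf-8")
).hexdigest()
print(md5)  # stable across runs and key orderings, thanks to sort_keys=True
```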
```diff
@@ -181,10 +145,11 @@ def save_trial(cfg: DictConfig, scores: List[float]) -> None:
     scores : List[float]
         List of scores for each fold.
     """
-    md5 = compute_md5(cfg)
+    hyperparameters = delete_non_hyperparameters(cfg)
+    md5 = compute_md5(hyperparameters)
     filename = f"{cfg.output_dir}/{md5}.sav"
     with open(filename, "wb") as file:
-        pickle.dump((cfg, scores), file)
+        pickle.dump((hyperparameters, scores), file)
 
```
```diff
@@ -201,7 +166,8 @@ def load_trial(cfg: DictConfig) -> List[float]:
     scores : List[float]
         The cross-validation scores or empty list if file does not exist.
     """
-    md5 = compute_md5(cfg)
+    hyperparameters = delete_non_hyperparameters(cfg)
+    md5 = compute_md5(hyperparameters)
     filename = f"{cfg.output_dir}/{md5}.sav"
     if os.path.exists(filename):
         (_, scores) = pickle.load(open(filename, "rb"))
```
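Together, `save_trial` and `load_trial` give the sweep a resumable cache keyed by that hash: if a previous run already scored the same hyperparameters, the stored scores are returned instead of refitting. A condensed sketch of the round trip, reusing `compute_md5` from this file (directory and scores are invented):

```python
# Condensed sketch of the trial cache round trip (hypothetical values).
import os
import pickle

output_dir = "optuna_cache"   # hypothetical directory
os.makedirs(output_dir, exist_ok=True)

hyperparameters = {"criterion": "entropy", "n_estimators": 200}
scores = [0.81, 0.79, 0.83]   # made-up k-fold F1 scores

filename = f"{output_dir}/{compute_md5(hyperparameters)}.sav"
with open(filename, "wb") as file:
    pickle.dump((hyperparameters, scores), file)

with open(filename, "rb") as file:
    _, cached_scores = pickle.load(file)
assert cached_scores == scores
```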
benchmarks/consumer_complaints/scripts/tune_table.py (33 changes: 3 additions & 30 deletions)
```diff
@@ -8,7 +8,6 @@
 from typing import Tuple, List
 
 import numpy as np
-from omegaconf import OmegaConf
 
 
 def parse_args(args: list) -> Namespace:
```
```diff
@@ -55,33 +54,6 @@ def parse_args(args: list) -> Namespace:
     return parser.parse_args(args)
 
 
-def delete_non_hyperparameters(hyperparameters: OmegaConf) -> dict:
-    """
-    Delete non-hyperparameters from the dictionary.
-
-    Parameters
-    ----------
-    hyperparameters : OmegaConf
-        Hyperparameters to delete non-hyperparameters from.
-
-    Returns
-    -------
-    hyperparameters : dict
-        Hyperparameters without non-hyperparameters.
-    """
-    hyperparameters = OmegaConf.to_container(hyperparameters)
-    del hyperparameters["model"]
-    del hyperparameters["classifier"]
-    del hyperparameters["n_jobs"]
-    del hyperparameters["x_train"]
-    del hyperparameters["y_train"]
-    del hyperparameters["output_dir"]
-    del hyperparameters["mem_gb"]
-    del hyperparameters["n_splits"]
-    return hyperparameters
-
-
 def compute(
     folder: str,
 ) -> Tuple[List[dict], List[list], List[np.ndarray], List[np.ndarray]]:
```
```diff
@@ -104,14 +76,15 @@ def compute(
     std : List[np.ndarray]
         Standard deviations of k-fold cross-validation.
     """
-    results = glob.glob(f"{folder}/[!trained_model]*.sav")
+    results = glob.glob(f"{folder}/*.sav")
+    if "{}/trained_model.sav".format(folder) in results:
+        results.remove(f"{folder}/trained_model.sav")
     hyperparameters = []
     scores = []
     avg = []
     std = []
     for result in results:
         parameters, s = pickle.load(open(result, "rb"))
-        parameters = delete_non_hyperparameters(parameters)
         hyperparameters.append(parameters)
         scores.append([round(i, 3) for i in s])
         avg.append(np.mean(s))
```
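The old glob pattern relied on a common misconception: `[!trained_model]` is a character class matching one character that is none of `t`, `r`, `a`, `i`, `n`, `e`, `d`, `_`, `m`, `o`, `l`, not a negation of the literal prefix `trained_model`, so any result file whose MD5 hash started with `a`, `d`, or `e` was silently skipped. Globbing everything and removing `trained_model.sav` explicitly fixes that. A quick demonstration with invented filenames:

```python
# Why the old pattern dropped valid result files: in glob syntax,
# [!...] negates single characters, not a literal string.
import fnmatch

files = ["b7f3aa.sav", "a1c2d3.sav", "trained_model.sav"]

old = [f for f in files if fnmatch.fnmatch(f, "[!trained_model]*.sav")]
print(old)  # ['b7f3aa.sav'] -- 'a1c2d3.sav' is lost because it starts
            # with 'a', one of the characters inside the class

new = [f for f in files if fnmatch.fnmatch(f, "*.sav")]
new.remove("trained_model.sav")
print(new)  # ['b7f3aa.sav', 'a1c2d3.sav']
```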
(Diffs for the remaining 2 changed files were not loaded on this page.)
