Make hyperparameters flexible (#65)
mirand863 committed Nov 28, 2022
1 parent 78b6462 commit b225ef3
Showing 5 changed files with 91 additions and 194 deletions.
benchmarks/consumer_complaints/README.md (2 changes: 1 addition & 1 deletion)
````diff
@@ -49,7 +49,7 @@ n_estimators: 1
 criterion: 1
 ```
 
-The intervals for testing can be defined with the functions `range` or `choice`, as described on [Hydra's documentation](https://hydra.cc/docs/plugins/optuna_sweeper/). If you wish to add more parameters for testing, you can simply add the parameter name inside the `params` field and at the end of the file set it to 1 in order to enable its usage in Hydra. Additionally, you would need to modify one of the functions `configure_lightgbm`, `configure_logistic_regression` or `configure_random_forest` (whichever is appropriate) inside the script [tune.py](scripts/tune.py) to enable the new hyperparameter.
+The intervals for testing can be defined with the functions `range` or `choice`, as described on [Hydra's documentation](https://hydra.cc/docs/plugins/optuna_sweeper/). If you wish to add more parameters for testing, you can simply add the parameter name inside the `params` field and at the end of the file set it to 1 in order to enable its usage in Hydra.
 
 ## Running locally
 
````
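The sentence removed above is now obsolete: after this commit, any key that survives the new `delete_non_hyperparameters` filter in [tune.py](scripts/tune.py) is forwarded to the estimator through scikit-learn's generic `set_params`, so no per-classifier `configure_*` function has to be edited. A minimal sketch of that pass-through pattern (the config keys and values below are made up for illustration):

```python
# Sketch of the pass-through pattern introduced by this commit.
# The config dict and its values are hypothetical examples.
from sklearn.ensemble import RandomForestClassifier

config = {
    "model": "flat",                # bookkeeping key, not a hyperparameter
    "classifier": "random_forest",  # bookkeeping key, not a hyperparameter
    "n_estimators": 200,            # hyperparameter, forwarded as-is
    "criterion": "entropy",         # hyperparameter, forwarded as-is
}
non_hyperparameters = ["model", "classifier"]

# Strip the bookkeeping keys and hand everything else to the estimator.
hyperparameters = {k: v for k, v in config.items() if k not in non_hyperparameters}
classifier = RandomForestClassifier()
classifier.set_params(**hyperparameters)

print(classifier.n_estimators, classifier.criterion)  # 200 entropy
```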
benchmarks/consumer_complaints/scripts/tune.py (124 changes: 45 additions & 79 deletions)
```diff
@@ -15,7 +15,6 @@
 from lightgbm import LGBMClassifier
 from numpy.core._exceptions import _ArrayMemoryError
 from omegaconf import DictConfig, OmegaConf
-from sklearn.base import BaseEstimator
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.linear_model import LogisticRegression
@@ -30,89 +29,55 @@
 )
 from hiclass.metrics import f1
 
 
 log = logging.getLogger("TUNE")
 
 
-def configure_lightgbm(cfg: DictConfig) -> BaseEstimator:
-    """
-    Configure LightGBM with parameters passed as argument.
-
-    Parameters
-    ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
-
-    Returns
-    -------
-    classifier : BaseEstimator
-        Estimator with hyper-parameters configured.
-    """
-    classifier = LGBMClassifier(
-        n_jobs=cfg.n_jobs,
-        num_leaves=cfg.num_leaves,
-        n_estimators=cfg.n_estimators,
-        min_child_samples=cfg.min_child_samples,
-    )
-    return classifier
-
-
-def configure_logistic_regression(cfg: DictConfig) -> BaseEstimator:
-    """
-    Configure LogisticRegression with parameters passed as argument.
-
-    Parameters
-    ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
-
-    Returns
-    -------
-    classifier : BaseEstimator
-        Estimator with hyper-parameters configured.
-    """
-    classifier = LogisticRegression(
-        n_jobs=cfg.n_jobs,
-        solver=cfg.solver,
-        max_iter=cfg.max_iter,
-    )
-    return classifier
-
-
-def configure_random_forest(cfg: DictConfig) -> BaseEstimator:
-    """
-    Configure RandomForest with parameters passed as argument.
-
-    Parameters
-    ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
-
-    Returns
-    -------
-    classifier : BaseEstimator
-        Estimator with hyper-parameters configured.
-    """
-    classifier = RandomForestClassifier(
-        n_jobs=cfg.n_jobs,
-        n_estimators=cfg.n_estimators,
-        criterion=cfg.criterion,
-    )
-    return classifier
-
-
 configure_flat = {
-    "lightgbm": configure_lightgbm,
-    "logistic_regression": configure_logistic_regression,
-    "random_forest": configure_random_forest,
+    "lightgbm": LGBMClassifier(),
+    "logistic_regression": LogisticRegression(),
+    "random_forest": RandomForestClassifier(),
 }
 
 configure_hierarchical = {
     "local_classifier_per_node": LocalClassifierPerNode(),
     "local_classifier_per_parent_node": LocalClassifierPerParentNode(),
     "local_classifier_per_level": LocalClassifierPerLevel(),
 }
+
+non_hyperparameters = [
+    "model",
+    "classifier",
+    "n_jobs",
+    "x_train",
+    "y_train",
+    "output_dir",
+    "mem_gb",
+    "n_splits",
+]
+
+
+def delete_non_hyperparameters(cfg: OmegaConf) -> dict:
+    """
+    Delete non-hyperparameters from the dictionary.
+
+    Parameters
+    ----------
+    cfg : OmegaConf
+        Dictionary to delete non-hyperparameters from.
+
+    Returns
+    -------
+    hyperparameters : dict
+        Dictionary containing only hyperparameters.
+    """
+    hyperparameters = OmegaConf.to_container(cfg)
+    for key in non_hyperparameters:
+        if key in hyperparameters:
+            del hyperparameters[key]
+    return hyperparameters
```
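`OmegaConf.to_container` converts the Hydra `DictConfig` into a plain `dict`, after which the bookkeeping keys are dropped. A usage sketch of the new helper, assuming `delete_non_hyperparameters` as defined above is in scope (the config values are invented):

```python
# Usage sketch for delete_non_hyperparameters (invented example values).
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "model": "flat",           # filtered out
        "classifier": "lightgbm",  # filtered out
        "n_jobs": 8,               # filtered out
        "num_leaves": 31,          # LightGBM hyperparameter, kept
        "min_child_samples": 20,   # LightGBM hyperparameter, kept
    }
)

print(delete_non_hyperparameters(cfg))
# {'num_leaves': 31, 'min_child_samples': 20}
```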
```diff
@@ -130,9 +95,11 @@ def configure_pipeline(cfg: DictConfig) -> Pipeline:
         Pipeline with hyper-parameters configured.
     """
     if cfg.model == "flat":
-        classifier = configure_flat[cfg.classifier](cfg)
+        classifier = configure_flat[cfg.classifier]
+        classifier.set_params(**delete_non_hyperparameters(cfg))
     else:
-        local_classifier = configure_flat[cfg.classifier](cfg)
+        local_classifier = configure_flat[cfg.classifier]
+        local_classifier.set_params(**delete_non_hyperparameters(cfg))
         local_classifier.set_params(n_jobs=1)
         classifier = configure_hierarchical[cfg.model]
         classifier.set_params(
```
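In the hierarchical branch, the configured flat estimator is nested inside the HiClass wrapper, and its `n_jobs` is pinned to 1, presumably so parallelism is owned by the hierarchical classifier rather than by every local model at once. A sketch of that wiring, assuming HiClass's usual `local_classifier` and `n_jobs` parameters and made-up hyperparameter values (the arguments elided from the hunk above are not reproduced):

```python
# Sketch of the hierarchical wiring (hypothetical hyperparameter values).
from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerNode

local_classifier = LogisticRegression()
local_classifier.set_params(solver="saga", max_iter=1000)
local_classifier.set_params(n_jobs=1)  # hierarchical wrapper owns parallelism

classifier = LocalClassifierPerNode()
classifier.set_params(local_classifier=local_classifier, n_jobs=8)
```

One consequence of replacing the `configure_*` factories with dictionaries of ready-made instances is that `set_params` now reconfigures the same shared estimator object in place on every trial instead of building a fresh one.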
```diff
@@ -148,25 +115,22 @@ def configure_pipeline(cfg: DictConfig) -> Pipeline:
     return pipeline
 
 
-def compute_md5(cfg: DictConfig) -> str:
+def compute_md5(cfg: dict) -> str:
     """
     Compute MD5 hash of configuration.
 
     Parameters
     ----------
-    cfg : DictConfig
-        Dictionary containing all configuration information.
+    cfg : dict
+        Dictionary containing hyperparameters.
 
     Returns
     -------
     md5 : str
         MD5 hash of configuration.
     """
-    dictionary = OmegaConf.to_object(cfg)
-    md5 = hashlib.md5(
-        json.dumps(dictionary, sort_keys=True).encode("utf-8")
-    ).hexdigest()
+    md5 = hashlib.md5(json.dumps(cfg, sort_keys=True).encode("utf-8")).hexdigest()
     return md5
 
```
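Because `compute_md5` now receives only the filtered hyperparameters, two runs that differ solely in bookkeeping settings (paths, job counts, fold counts) hash to the same cache file. The key itself is the standard sorted-keys-JSON recipe; a self-contained sketch with made-up values:

```python
# The cache key: MD5 over a canonical JSON encoding (illustrative values).
import hashlib
import json

hyperparameters = {"criterion": "entropy", "n_estimators": 200}

md5 = hashlib.md5(
    json.dumps(hyperparameters, sort_keys=True).encode("utf-8")
).hexdigest()
print(md5)  # stable across runs and key orderings, thanks to sort_keys=True
```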
```diff
@@ -181,10 +145,11 @@ def save_trial(cfg: DictConfig, scores: List[float]) -> None:
     scores : List[float]
         List of scores for each fold.
     """
-    md5 = compute_md5(cfg)
+    hyperparameters = delete_non_hyperparameters(cfg)
+    md5 = compute_md5(hyperparameters)
     filename = f"{cfg.output_dir}/{md5}.sav"
     with open(filename, "wb") as file:
-        pickle.dump((cfg, scores), file)
+        pickle.dump((hyperparameters, scores), file)
 
```
```diff
@@ -201,7 +166,8 @@ def load_trial(cfg: DictConfig) -> List[float]:
     scores : List[float]
         The cross-validation scores or empty list if file does not exist.
     """
-    md5 = compute_md5(cfg)
+    hyperparameters = delete_non_hyperparameters(cfg)
+    md5 = compute_md5(hyperparameters)
     filename = f"{cfg.output_dir}/{md5}.sav"
     if os.path.exists(filename):
         (_, scores) = pickle.load(open(filename, "rb"))
```
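Together, `save_trial` and `load_trial` give the sweep a resumable cache keyed by that hash: if a previous run already scored the same hyperparameters, the stored scores are returned instead of refitting. A condensed sketch of the round trip, reusing `compute_md5` from this file (directory and scores are invented):

```python
# Condensed sketch of the trial cache round trip (hypothetical values).
import os
import pickle

output_dir = "optuna_cache"   # hypothetical directory
os.makedirs(output_dir, exist_ok=True)

hyperparameters = {"criterion": "entropy", "n_estimators": 200}
scores = [0.81, 0.79, 0.83]   # made-up k-fold F1 scores

filename = f"{output_dir}/{compute_md5(hyperparameters)}.sav"
with open(filename, "wb") as file:
    pickle.dump((hyperparameters, scores), file)

with open(filename, "rb") as file:
    _, cached_scores = pickle.load(file)
assert cached_scores == scores
```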
benchmarks/consumer_complaints/scripts/tune_table.py (33 changes: 3 additions & 30 deletions)
```diff
@@ -8,7 +8,6 @@
 from typing import Tuple, List
 
 import numpy as np
-from omegaconf import OmegaConf
 
 
 def parse_args(args: list) -> Namespace:
```
```diff
@@ -55,33 +54,6 @@ def parse_args(args: list) -> Namespace:
     return parser.parse_args(args)
 
 
-def delete_non_hyperparameters(hyperparameters: OmegaConf) -> dict:
-    """
-    Delete non-hyperparameters from the dictionary.
-
-    Parameters
-    ----------
-    hyperparameters : OmegaConf
-        Hyperparameters to delete non-hyperparameters from.
-
-    Returns
-    -------
-    hyperparameters : dict
-        Hyperparameters without non-hyperparameters.
-    """
-    hyperparameters = OmegaConf.to_container(hyperparameters)
-    del hyperparameters["model"]
-    del hyperparameters["classifier"]
-    del hyperparameters["n_jobs"]
-    del hyperparameters["x_train"]
-    del hyperparameters["y_train"]
-    del hyperparameters["output_dir"]
-    del hyperparameters["mem_gb"]
-    del hyperparameters["n_splits"]
-    return hyperparameters
-
-
 def compute(
     folder: str,
 ) -> Tuple[List[dict], List[list], List[np.ndarray], List[np.ndarray]]:
```
```diff
@@ -104,14 +76,15 @@ def compute(
     std : List[np.ndarray]
         Standard deviations of k-fold cross-validation.
     """
-    results = glob.glob(f"{folder}/[!trained_model]*.sav")
+    results = glob.glob(f"{folder}/*.sav")
+    if "{}/trained_model.sav".format(folder) in results:
+        results.remove(f"{folder}/trained_model.sav")
     hyperparameters = []
     scores = []
     avg = []
     std = []
     for result in results:
         parameters, s = pickle.load(open(result, "rb"))
-        parameters = delete_non_hyperparameters(parameters)
         hyperparameters.append(parameters)
         scores.append([round(i, 3) for i in s])
         avg.append(np.mean(s))
```
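The old glob pattern relied on a common misconception: `[!trained_model]` is a character class matching one character that is none of `t`, `r`, `a`, `i`, `n`, `e`, `d`, `_`, `m`, `o`, `l`, not a negation of the literal prefix `trained_model`, so any result file whose MD5 hash started with `a`, `d`, or `e` was silently skipped. Globbing everything and removing `trained_model.sav` explicitly fixes that. A quick demonstration with invented filenames:

```python
# Why the old pattern dropped valid result files: in glob syntax,
# [!...] negates single characters, not a literal string.
import fnmatch

files = ["b7f3aa.sav", "a1c2d3.sav", "trained_model.sav"]

old = [f for f in files if fnmatch.fnmatch(f, "[!trained_model]*.sav")]
print(old)  # ['b7f3aa.sav'] -- 'a1c2d3.sav' is lost because it starts
            # with 'a', one of the characters inside the class

new = [f for f in files if fnmatch.fnmatch(f, "*.sav")]
new.remove("trained_model.sav")
print(new)  # ['b7f3aa.sav', 'a1c2d3.sav']
```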
(Diffs for the remaining 2 changed files were not loaded on this page.)
