From 541b5c3f4aacf0a731b741fc26754cc2ad1f1a31 Mon Sep 17 00:00:00 2001 From: HassnHamada <44922829+HassnHamada@users.noreply.github.com> Date: Thu, 21 Mar 2024 00:48:17 +0200 Subject: [PATCH 1/5] [ENH] forecasting tuners, `return_n_best_forecasters=-1` to return performances of all forecasters (#6031) #### Reference Issues/PRs Fixes #6027 #### What does this implement/fix? Explain your changes. If the user passes `return_n_best_forecasters=-1`, all forecasters are added to the `n_best_forecasters_` list. --- sktime/forecasting/model_selection/_tune.py | 16 +++- .../model_selection/tests/test_tune.py | 77 +++++++++++++++++++ 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/sktime/forecasting/model_selection/_tune.py b/sktime/forecasting/model_selection/_tune.py index e579cf8248b..a7afe38c0e7 100644 --- a/sktime/forecasting/model_selection/_tune.py +++ b/sktime/forecasting/model_selection/_tune.py @@ -267,7 +267,10 @@ def evaluate_candidates(candidate_params): # Select n best forecaster self.n_best_forecasters_ = [] self.n_best_scores_ = [] - for i in range(self.return_n_best_forecasters): + _forecasters_to_return = min(self.return_n_best_forecasters, len(results.index)) + if _forecasters_to_return == -1: + _forecasters_to_return = len(results.index) + for i in range(_forecasters_to_return): params = results["params"].iloc[i] rank = results[f"rank_{scoring_name}"].iloc[i] rank = str(int(rank)) @@ -445,7 +448,9 @@ class ForecastingGridSearchCV(BaseGridSearch): verbose: int, optional (default=0) return_n_best_forecasters : int, default=1 In case the n best forecaster should be returned, this value can be set - and the n best forecasters will be assigned to n_best_forecasters_ + and the n best forecasters will be assigned to n_best_forecasters_. + Set return_n_best_forecasters to -1 to return all forecasters. + error_score : numeric value or the str 'raise', optional (default=np.nan) The test score returned when a forecaster fails to be fitted. return_train_score : bool, optional (default=False) @@ -773,7 +778,9 @@ class ForecastingRandomizedSearchCV(BaseGridSearch): verbose : int, optional (default=0) return_n_best_forecasters: int, default=1 In case the n best forecaster should be returned, this value can be set - and the n best forecasters will be assigned to n_best_forecasters_ + and the n best forecasters will be assigned to n_best_forecasters_. + Set return_n_best_forecasters to -1 to return all forecasters. + random_state : int, RandomState instance or None, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. @@ -1037,7 +1044,8 @@ class ForecastingSkoptSearchCV(BaseGridSearch): FitFailedWarning is raised. return_n_best_forecasters: int, default=1 In case the n best forecaster should be returned, this value can be set - and the n best forecasters will be assigned to n_best_forecasters_ + and the n best forecasters will be assigned to n_best_forecasters_. + Set return_n_best_forecasters to -1 to return all forecasters. backend : {"dask", "loky", "multiprocessing", "threading"}, by default "loky". Runs parallel evaluate if specified and ``strategy`` is set as "refit".
diff --git a/sktime/forecasting/model_selection/tests/test_tune.py b/sktime/forecasting/model_selection/tests/test_tune.py index 7c1d604e21d..9dfb21a832b 100644 --- a/sktime/forecasting/model_selection/tests/test_tune.py +++ b/sktime/forecasting/model_selection/tests/test_tune.py @@ -4,6 +4,10 @@ __author__ = ["mloning", "fkiraly"] + +from functools import reduce +from typing import Dict, List, Union + import numpy as np import pytest from sklearn.model_selection import ParameterGrid, ParameterSampler @@ -348,3 +352,76 @@ def test_gscv_backends(backend_set): backend_params=backend_params, ) gscv.fit(y, X) + + +TEST_PARAMS_DICT = PIPE_GRID + +TEST_PARAMS_LIST = [ + { + "window_length": [1, 2, 3], + "strategy": ["last", "mean"], + "transformer__degree": [1, 2, 3], + "forecaster__strategy": ["last", "mean", "seasonal_last"], + }, + { + "window_length": [4, 5, 6], + "forecaster__strategy": ["last", "mean"], + }, +] + + +@pytest.mark.parametrize("return_n_best_forecasters", [-1, 0, 3]) +@pytest.mark.parametrize( + "Forecaster, kwargs", + [ + (ForecastingGridSearchCV, {"param_grid": TEST_PARAMS_DICT}), + (ForecastingGridSearchCV, {"param_grid": TEST_PARAMS_LIST}), + (ForecastingRandomizedSearchCV, {"param_distributions": TEST_PARAMS_LIST}), + ( + ForecastingRandomizedSearchCV, + {"param_distributions": TEST_PARAMS_LIST, "n_iter": 100}, + ), + ], +) +def test_return_n_best_forecasters(Forecaster, return_n_best_forecasters, kwargs): + y, X = load_longley() + searchCV = Forecaster( + forecaster=PIPE, + cv=CVs[0], + **kwargs, + return_n_best_forecasters=return_n_best_forecasters, + ) + searchCV.fit(y, X) + if return_n_best_forecasters == -1: + + def calculate_total_combinations(param_grid: Union[List[Dict], Dict]): + if isinstance(param_grid, dict): + return reduce(lambda x, y: x * y, [len(x) for x in param_grid.values()]) + elif isinstance(param_grid, list): + return sum(calculate_total_combinations(i) for i in param_grid) + else: + error_message = "`param_grid` must be a dict or a list[dict]" + raise ValueError(error_message) + + if "param_grid" in kwargs: + total_combinations = calculate_total_combinations(kwargs["param_grid"]) + assert len(searchCV.n_best_forecasters_) == total_combinations + else: + try: + assert len(searchCV.n_best_forecasters_) == searchCV.n_iter + except AssertionError: + total_combinations = calculate_total_combinations( + kwargs["param_distributions"] + ) + assert len(searchCV.n_best_forecasters_) == total_combinations + else: + try: + assert len(searchCV.n_best_forecasters_) == return_n_best_forecasters + except AssertionError: + key = ( + "param_distributions" + if "param_distributions" in kwargs + else "param_grid" + ) + total_combinations = calculate_total_combinations(kwargs[key]) + assert len(searchCV.n_best_forecasters_) == total_combinations From 2d698370ec3e62e406f43b5c9acebd8a306470e5 Mon Sep 17 00:00:00 2001 From: Felix Hirwa Nshuti Date: Thu, 21 Mar 2024 04:46:11 +0530 Subject: [PATCH 2/5] [BUG] Fix the `colalign` functionality to `ScipyDist` class as specified in the docstrings (#6110) This pull request adds the `colalign` functionality to the `ScipyDist` class as specified in the docstring, and corrects the formatting of the class docstring.
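As a usage sketch (illustrative data and expected behaviour only, not part of this diff or of the sktime test suite), the alignment options can be exercised on two DataFrames with partially overlapping columns:

```python
import pandas as pd

from sktime.dists_kernels import ScipyDist

# two toy tables sharing only column "b"
X = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
X2 = pd.DataFrame({"b": [5.0, 6.0], "c": [7.0, 8.0]})

# with colalign="intersect", the distance should be computed on the shared column "b" only
dist = ScipyDist(metric="euclidean", colalign="intersect")
D = dist.transform(X, X2)  # 2 x 2 distance matrix between rows of X and rows of X2

# with colalign="force-align", the differing column sets are expected to raise a ValueError
```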
#### Reference Issues/PRs Fixes https://github.com/sktime/sktime/issues/1942 --- sktime/dists_kernels/scipy_dist.py | 52 +++++++++++++++++++----------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/sktime/dists_kernels/scipy_dist.py b/sktime/dists_kernels/scipy_dist.py index 4c3646f0a5e..7704d529450 100644 --- a/sktime/dists_kernels/scipy_dist.py +++ b/sktime/dists_kernels/scipy_dist.py @@ -22,24 +22,25 @@ class ScipyDist(BasePairwiseTransformer): Parameters ---------- - metric : string or function, as in cdist; default = 'euclidean' - if string, one of: 'braycurtis', 'canberra', 'chebyshev', 'cityblock', - 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', - 'jensenshannon', - 'kulsinski' (< scipy 1.11) or 'kulczynski1' (from scipy 1.11), - 'mahalanobis', 'matching', 'minkowski', - 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', - 'sokalsneath', 'sqeuclidean', 'yule' + metric : string or function, as in cdist; default = ``euclidean`` + if string, one of: ``braycurtis``, ``canberra``, ``chebyshev``, ``cityblock``, + ``correlation``, ``cosine``, ``dice``, ``euclidean``, ``hamming``, + ``jaccard``, ``jensenshannon``, + ``kulsinski`` (< scipy 1.11) or ``kulczynski1`` (from scipy 1.11), + ``mahalanobis``, ``matching``, ``minkowski``, + ``rogerstanimoto``, ``russellrao``, ``seuclidean``, ``sokalmichener``, + ``sokalsneath``, ``sqeuclidean``, ``yule`` if function, should have signature 1D-np.array x 1D-np.array -> float - p: if metric='minkowski', the "p" in "p-norm", otherwise irrelevant - colalign : string, one of 'intersect' (default), 'force-align', 'none' + p: if metric=``minkowski``, the ``p`` in ``p-norm``, otherwise irrelevant + colalign : string, one of ``intersect`` (default), ``force-align``, ``none`` controls column alignment if X, X2 passed in fit are pd.DataFrame - columns between X and X2 are aligned via column names - if 'intersect', distance is computed on columns occurring both in X and X2, + columns between X and X2 are aligned via column names.
+ + if ``intersect``, distance is computed on columns occurring both in X and X2, other columns are discarded; column ordering in X2 is copied from X - if 'force-align', raises an error if the set of columns in X, X2 differs; + if ``force-align``, raises an error if the set of columns in X, X2 differs; column ordering in X2 is copied from X - if 'none', X and X2 are passed through unmodified (no columns are aligned) + if ``none``, X and X2 are passed through unmodified (no columns are aligned) note: this will potentially align "non-matching" columns var_weights : 1D np.array of float or None, default=None weight/scaling vector applied to variables in X/X2 @@ -47,7 +48,7 @@ class ScipyDist(BasePairwiseTransformer): if None, equivalent to all-ones vector metric_kwargs : dict, optional, default=None any kwargs passed to the metric in addition, i.e., to the function cdist - common kwargs: "w" : array-like, same length as X.columns, weights for metric + common kwargs: ``w`` : array-like, same length as X.columns, weights for metric refer to scipy.spatial.distance.dist for a documentation of other extra kwargs """ @@ -100,7 +101,21 @@ def _transform(self, X, X2=None): metric_kwargs = self.metric_kwargs if metric_kwargs is None: metric_kwargs = {} - + if isinstance(X, pd.DataFrame) and isinstance(X2, pd.DataFrame): + if self.colalign == "intersect": + common_cols = X.columns.intersection(X2.columns) + X = X[common_cols] + X2 = X2[common_cols] + # reordering X2 columns to match X + X2 = X2[X.columns] + elif self.colalign == "force-align": + if not X.columns.equals(X2.columns): + raise ValueError("X and X2 have different columns") + X2 = X2[X.columns] + elif self.colalign == "none": + pass + else: + raise ValueError("colalign must be one of intersect, force-align, none") if isinstance(X, pd.DataFrame): X = X.select_dtypes("number").to_numpy(dtype="float") @@ -147,6 +162,7 @@ def get_test_params(cls, parameter_set="default"): params1 = {} # using kwargs - params2 = {"metric": "minkowski", "p": 3} + params2 = {"metric": "minkowski", "p": 3, "colalign": "intersect"} + params3 = {"metric": "euclidean", "colalign": "force-align"} - return [params1, params2] + return [params1, params2, params3] From 533fe8c85b2852cecf45116983dc71039e7fa2b0 Mon Sep 17 00:00:00 2001 From: Xinyu Wu <57612792+Xinyu-Wu-0000@users.noreply.github.com> Date: Thu, 21 Mar 2024 19:34:34 +0800 Subject: [PATCH 3/5] [ENH][BUG] Second test parameter set for shapeDTW (#6093) Towards https://github.com/sktime/sktime/issues/3429 Adds a second test parameter set for shapeDTW --- .all-contributorsrc | 13 ++++++- .../distance_based/_shape_dtw.py | 34 +++++++++++++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index 1625508003b..3e39feadae5 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -2629,6 +2629,17 @@ "maintenance" ] }, + { + "login": "Xinyu-Wu-0000", + "name": "Xinyu Wu", + "avatar_url": "https://avatars.githubusercontent.com/u/57612792?v=4", + "profile": "https://github.com/Xinyu-Wu-0000", + "contributions": [ + "bug", + "code", + "test" + ] + }, { "login": "meraldoantonio", "name": "Meraldo Antonio", @@ -2636,7 +2647,7 @@ "profile": "https://github.com/meraldoantonio", "contributions": [ "doc" - ] + ] }, { "login": "memeo-pro", diff --git a/sktime/classification/distance_based/_shape_dtw.py b/sktime/classification/distance_based/_shape_dtw.py index 180b5ad9cac..3d9960f3b42 100644 --- a/sktime/classification/distance_based/_shape_dtw.py +++ 
b/sktime/classification/distance_based/_shape_dtw.py @@ -38,7 +38,7 @@ class ShapeDTW(BaseClassifier): Parameters ---------- - n_neighbours : int, int, set k for knn (default =1). + n_neighbors : int, int, set k for knn (default =1). subsequence_length : int, defines the length of the subsequences(default=sqrt(n_timepoints)). @@ -165,6 +165,8 @@ def _fit(self, X, y): if self.metric_params is None: self.metric_params = {} _reset = True + else: + _reset = False # If the shape descriptor is 'compound', # calculate the appropriate weighting_factor @@ -238,7 +240,7 @@ def _calculate_weighting_factor_value(self, X, y): grid = GridSearchCV( estimator=ShapeDTW( - n_neighbours=n, + n_neighbors=n, subsequence_length=sl, shape_descriptor_function=sdf, shape_descriptor_functions=sdfs, @@ -502,3 +504,31 @@ def _combine_data_frames(self, dataFrames, weighting_factor, col_names): colToAdd.append(pd.Series(inst)) df[col] = colToAdd return df + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return ``"default"`` set. + + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test + instance. + ``create_test_instance`` uses the first (or only) dictionary in ``params`` + """ + params1 = {} + params2 = { + "n_neighbors": 3, + "shape_descriptor_function": "compound", + "shape_descriptor_functions": ["paa", "dwt"], + } + return [params1, params2] From 5a6b01ce092eb0d62c9e7070ae63a977c0a80df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 21 Mar 2024 18:43:44 +0100 Subject: [PATCH 4/5] [DOC] minor clarifications in mtype descriptions (#6078) While refactoring `datatypes`, noticed some minor inconsistencies in the registry short descriptions of mtypes. Fixed in this PR. 
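Not part of the diff, but for context, a minimal sketch of where these short descriptions surface to users (assuming the aggregate `MTYPE_REGISTER` export from `sktime.datatypes`):

```python
from sktime.datatypes import MTYPE_REGISTER

# registry entries are (mtype, scitype, description) tuples;
# list the "Table" descriptions that this patch rewords
for entry in MTYPE_REGISTER:
    if entry[1] == "Table":
        print(f"{entry[0]}: {entry[2]}")
```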
--- sktime/datatypes/_table/_registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sktime/datatypes/_table/_registry.py b/sktime/datatypes/_table/_registry.py index 8e0b76b8969..f7f7974cf8f 100644 --- a/sktime/datatypes/_table/_registry.py +++ b/sktime/datatypes/_table/_registry.py @@ -11,9 +11,9 @@ MTYPE_REGISTER_TABLE = [ ("pd_DataFrame_Table", "Table", "pd.DataFrame representation of a data table"), - ("numpy1D", "Table", "1D np.narray representation of a univariate table"), - ("numpy2D", "Table", "2D np.narray representation of a univariate table"), - ("pd_Series_Table", "Table", "pd.Series representation of a data table"), + ("numpy1D", "Table", "1D np.narray representation of a univariate data table"), + ("numpy2D", "Table", "2D np.narray representation of a multivariate data table"), + ("pd_Series_Table", "Table", "pd.Series representation of a univariate data table"), ("list_of_dict", "Table", "list of dictionaries with primitive entries"), ("polars_eager_table", "Table", "polars.DataFrame representation of a data table"), ("polars_lazy_table", "Table", "polars.LazyFrame representation of a data table"), From b6671117336a6d8af521226318cf78329be3cb01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 21 Mar 2024 19:29:33 +0100 Subject: [PATCH 5/5] [MNT] bound `temporian<0.8.0` (#6184) `temporian` is currently causing install failures on `main`, likely due to version 0.8.0 released today. This PR adds a bound `temporian<0.8.0` to `pyproject.toml`. FYI @ianspektor, @achoum --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7fdcce89014..e9d480d7417 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ all_extras = [ "statsmodels>=0.12.1", 'stumpy>=1.5.1; python_version < "3.11"', 'tbats>=1.1; python_version < "3.12"', - 'temporian>=0.7.0; python_version < "3.12" and sys_platform != "win32"', + 'temporian<0.8.0,>=0.7.0; python_version < "3.12" and sys_platform != "win32"', 'tensorflow<2.17,>=2; python_version < "3.12"', 'tsfresh>=0.17; python_version < "3.12"', 'tslearn<0.7.0,!=0.6.0,>=0.5.2; python_version < "3.11"', @@ -146,7 +146,7 @@ all_extras_pandas2 = [ "statsmodels>=0.12.1", 'stumpy>=1.5.1; python_version < "3.11"', 'tbats>=1.1; python_version < "3.12"', - 'temporian>=0.7.0; python_version < "3.12" and sys_platform != "win32"', + 'temporian<0.8.0,>=0.7.0; python_version < "3.12" and sys_platform != "win32"', 'tensorflow<2.17,>=2; python_version < "3.12"', 'tsfresh>=0.17; python_version < "3.12"', 'tslearn<0.7.0,!=0.6.0,>=0.5.2; python_version < "3.11"', @@ -212,7 +212,7 @@ transformations = [ "pykalman-bardo<0.10,>=0.9.7", "statsmodels<0.15,>=0.12.1", 'stumpy<1.13,>=1.5.1; python_version < "3.12"', - 'temporian>=0.7.0; python_version < "3.12" and sys_platform != "win32"', + 'temporian<0.8.0,>=0.7.0; python_version < "3.12" and sys_platform != "win32"', 'tsfresh<0.21,>=0.17; python_version < "3.12"', ]
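Not part of the diff: a quick, standard-library-only check that a given environment resolves `temporian` within the new bound (absence is also fine, since it is a soft dependency):

```python
from importlib.metadata import PackageNotFoundError, version

try:
    # under the new constraint, any installed temporian should be a 0.7.x release
    print("temporian", version("temporian"))
except PackageNotFoundError:
    # soft dependency: not installed on Windows or on Python >= 3.12 per the markers above
    print("temporian not installed")
```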