Merge branch 'main' into tsbootstrap_adapter

sktime · Mar 22, 2024 · 9755bc3 · 9755bc3
2 parents ce0ef73 + b667111
commit 9755bc3
Show file tree

Hide file tree

Showing 7 changed files with 173 additions and 31 deletions.
diff --git a/.all-contributorsrc b/.all-contributorsrc
@@ -2629,14 +2629,25 @@
         "maintenance"
       ]
     },
+    {
+      "login": "Xinyu-Wu-0000",
+      "name": "Xinyu Wu",
+      "avatar_url": "https://avatars.githubusercontent.com/u/57612792?v=4",
+      "profile": "https://github.com/Xinyu-Wu-0000",
+      "contributions": [
+        "bug",
+        "code",
+        "test"
+      ]
+    },
     {
       "login": "meraldoantonio",
       "name": "Meraldo Antonio",
       "avatar_url": "https://avatars.githubusercontent.com/u/37468543?v=4",
       "profile": "https://github.com/meraldoantonio",
       "contributions": [
         "doc"
-       ]
+      ]
     },
     {
       "login": "memeo-pro",

diff --git a/pyproject.toml b/pyproject.toml
@@ -108,7 +108,7 @@ all_extras = [
   "statsmodels>=0.12.1",
   'stumpy>=1.5.1; python_version < "3.11"',
   'tbats>=1.1; python_version < "3.12"',
-  'temporian>=0.7.0; python_version < "3.12" and sys_platform != "win32"',
+  'temporian<0.8.0,>=0.7.0; python_version < "3.12" and sys_platform != "win32"',
   'tensorflow<2.17,>=2; python_version < "3.12"',
   'tsbootstrap<0.2,>=0.1.0',
   'tsfresh>=0.17; python_version < "3.12"',
@@ -147,7 +147,7 @@ all_extras_pandas2 = [
   "statsmodels>=0.12.1",
   'stumpy>=1.5.1; python_version < "3.11"',
   'tbats>=1.1; python_version < "3.12"',
-  'temporian>=0.7.0; python_version < "3.12" and sys_platform != "win32"',
+  'temporian<0.8.0,>=0.7.0; python_version < "3.12" and sys_platform != "win32"',
   'tensorflow<2.17,>=2; python_version < "3.12"',
   'tsbootstrap<0.2,>=0.1.0',
   'tsfresh>=0.17; python_version < "3.12"',
@@ -214,7 +214,7 @@ transformations = [
   "pykalman-bardo<0.10,>=0.9.7",
   "statsmodels<0.15,>=0.12.1",
   'stumpy<1.13,>=1.5.1; python_version < "3.12"',
-  'temporian>=0.7.0; python_version < "3.12" and sys_platform != "win32"',
+  'temporian<0.8.0,>=0.7.0; python_version < "3.12" and sys_platform != "win32"',
   'tsbootstrap<0.2,>=0.1.0',
   'tsfresh<0.21,>=0.17; python_version < "3.12"',
 ]

diff --git a/sktime/classification/distance_based/_shape_dtw.py b/sktime/classification/distance_based/_shape_dtw.py
@@ -38,7 +38,7 @@ class ShapeDTW(BaseClassifier):
 
     Parameters
     ----------
-    n_neighbours                : int, int, set k for knn (default =1).
+    n_neighbors                : int, set k for knn (default=1).
     subsequence_length          : int, defines the length of the
                                   subsequences(default=sqrt(n_timepoints)).
 
@@ -165,6 +165,8 @@ def _fit(self, X, y):
         if self.metric_params is None:
             self.metric_params = {}
             _reset = True
+        else:
+            _reset = False
 
         # If the shape descriptor is 'compound',
         # calculate the appropriate weighting_factor
@@ -238,7 +240,7 @@ def _calculate_weighting_factor_value(self, X, y):
 
             grid = GridSearchCV(
                 estimator=ShapeDTW(
-                    n_neighbours=n,
+                    n_neighbors=n,
                     subsequence_length=sl,
                     shape_descriptor_function=sdf,
                     shape_descriptor_functions=sdfs,
@@ -502,3 +504,31 @@ def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
                 colToAdd.append(pd.Series(inst))
             df[col] = colToAdd
         return df
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return parameter settings used to build test instances.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the requested set of test parameters. The value is not
+            inspected here: the same settings are returned for every name.
+
+        Returns
+        -------
+        params : list of dict
+            Two dictionaries of constructor keyword arguments, intended to be
+            used as ``cls(**params[i])``: first the all-defaults setting, then
+            a ``"compound"`` shape descriptor built from ``"paa"`` and
+            ``"dwt"`` with ``n_neighbors=3``.
+        """
+        default_setting = {}
+        compound_setting = dict(
+            n_neighbors=3,
+            shape_descriptor_function="compound",
+            shape_descriptor_functions=["paa", "dwt"],
+        )
+        return [default_setting, compound_setting]
diff --git a/sktime/datatypes/_table/_registry.py b/sktime/datatypes/_table/_registry.py
@@ -11,9 +11,9 @@
 
 MTYPE_REGISTER_TABLE = [
     ("pd_DataFrame_Table", "Table", "pd.DataFrame representation of a data table"),
-    ("numpy1D", "Table", "1D np.narray representation of a univariate table"),
-    ("numpy2D", "Table", "2D np.narray representation of a univariate table"),
-    ("pd_Series_Table", "Table", "pd.Series representation of a data table"),
+    ("numpy1D", "Table", "1D np.narray representation of a univariate data table"),
+    ("numpy2D", "Table", "2D np.narray representation of a multivariate data table"),
+    ("pd_Series_Table", "Table", "pd.Series representation of a univariate data table"),
     ("list_of_dict", "Table", "list of dictionaries with primitive entries"),
     ("polars_eager_table", "Table", "polars.DataFrame representation of a data table"),
     ("polars_lazy_table", "Table", "polars.LazyFrame representation of a data table"),

diff --git a/sktime/dists_kernels/scipy_dist.py b/sktime/dists_kernels/scipy_dist.py
@@ -22,32 +22,33 @@ class ScipyDist(BasePairwiseTransformer):
 
     Parameters
     ----------
-    metric : string or function, as in cdist; default = 'euclidean'
-        if string, one of: 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
-            'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
-            'jensenshannon',
-            'kulsinski' (< scipy 1.11) or 'kulczynski1' (from scipy 1.11),
-            'mahalanobis', 'matching', 'minkowski',
-            'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
-            'sokalsneath', 'sqeuclidean', 'yule'
+    metric : string or function, as in cdist; default = ``euclidean``
+        if string, one of: ``braycurtis``, ``canberra``, ``chebyshev``, ``cityblock``,
+            ``correlation``, ``cosine``, ``dice``, ``euclidean``, ``hamming``,
+            ``jaccard``, ``jensenshannon``,
+            ``kulsinski`` (< scipy 1.11) or ``kulczynski1`` (from scipy 1.11),
+            ``mahalanobis``, ``matching``, ``minkowski``,
+            ``rogerstanimoto``, ``russellrao``, ``seuclidean``, ``sokalmichener``,
+            ``sokalsneath``, ``sqeuclidean``, ``yule``
         if function, should have signature 1D-np.array x 1D-np.array -> float
-    p:  if metric='minkowski', the "p" in "p-norm", otherwise irrelevant
-    colalign : string, one of 'intersect' (default), 'force-align', 'none'
+    p:  if metric=``minkowski``, the ``p`` in ``p-norm``, otherwise irrelevant
+    colalign : string, one of ``intersect`` (default), ``force-align``, ``none``
         controls column alignment if X, X2 passed in fit are pd.DataFrame
-        columns between X and X2 are aligned via column names
-        if 'intersect', distance is computed on columns occurring both in X and X2,
+        columns between X and X2 are aligned via column names.
+
+        if ``intersect``, distance is computed on columns occurring both in X and X2,
             other columns are discarded; column ordering in X2 is copied from X
-        if 'force-align', raises an error if the set of columns in X, X2 differs;
+        if ``force-align``, raises an error if the set of columns in X, X2 differs;
             column ordering in X2 is copied from X
-        if 'none', X and X2 are passed through unmodified (no columns are aligned)
+        if ``none``, X and X2 are passed through unmodified (no columns are aligned)
             note: this will potentially align "non-matching" columns
     var_weights : 1D np.array of float or None, default=None
         weight/scaling vector applied to variables in X/X2
         before being passed to cdist, i-th col of X/X2 is multiplied by var_weights[i]
         if None, equivalent to all-ones vector
     metric_kwargs : dict, optional, default=None
         any kwargs passed to the metric in addition, i.e., to the function cdist
-        common kwargs: "w" : array-like, same length as X.columns, weights for metric
+        common kwargs: ``w`` : array-like, same length as X.columns, weights for metric
         refer to scipy.spatial.distance.cdist for documentation of other extra kwargs
     """
 
@@ -100,7 +101,21 @@ def _transform(self, X, X2=None):
         metric_kwargs = self.metric_kwargs
         if metric_kwargs is None:
             metric_kwargs = {}
-
+        if isinstance(X, pd.DataFrame) and isinstance(X2, pd.DataFrame):
+            if self.colalign == "intersect":
+                common_cols = X.columns.intersection(X2.columns)
+                X = X[common_cols]
+                X2 = X2[common_cols]
+                # reordering X2 columns to match X
+                X2 = X2[X.columns]
+            elif self.colalign == "force-align":
+                if not X.columns.equals(X2.columns):
+                    raise ValueError("X and X2 have different columns")
+                X2 = X2[X.columns]
+            elif self.colalign == "none":
+                pass
+            else:
+                raise ValueError("colalign must be one of intersect, force-align, none")
         if isinstance(X, pd.DataFrame):
             X = X.select_dtypes("number").to_numpy(dtype="float")
 
@@ -147,6 +162,7 @@ def get_test_params(cls, parameter_set="default"):
         params1 = {}
 
         # using kwargs
-        params2 = {"metric": "minkowski", "p": 3}
+        params2 = {"metric": "minkowski", "p": 3, "colalign": "intersect"}
+        params3 = {"metric": "euclidean", "colalign": "force-align"}
 
-        return [params1, params2]
+        return [params1, params2, params3]
diff --git a/sktime/forecasting/model_selection/_tune.py b/sktime/forecasting/model_selection/_tune.py
@@ -267,7 +267,10 @@ def evaluate_candidates(candidate_params):
         # Select n best forecaster
         self.n_best_forecasters_ = []
         self.n_best_scores_ = []
-        for i in range(self.return_n_best_forecasters):
+        _forecasters_to_return = min(self.return_n_best_forecasters, len(results.index))
+        if _forecasters_to_return == -1:
+            _forecasters_to_return = len(results.index)
+        for i in range(_forecasters_to_return):
             params = results["params"].iloc[i]
             rank = results[f"rank_{scoring_name}"].iloc[i]
             rank = str(int(rank))
@@ -445,7 +448,9 @@ class ForecastingGridSearchCV(BaseGridSearch):
     verbose: int, optional (default=0)
     return_n_best_forecasters : int, default=1
         In case the n best forecaster should be returned, this value can be set
-        and the n best forecasters will be assigned to n_best_forecasters_
+        and the n best forecasters will be assigned to n_best_forecasters_.
+        Set return_n_best_forecasters to -1 to return all forecasters.
+
     error_score : numeric value or the str 'raise', optional (default=np.nan)
         The test score returned when a forecaster fails to be fitted.
     return_train_score : bool, optional (default=False)
@@ -773,7 +778,9 @@ class ForecastingRandomizedSearchCV(BaseGridSearch):
     verbose : int, optional (default=0)
     return_n_best_forecasters: int, default=1
         In case the n best forecaster should be returned, this value can be set
-        and the n best forecasters will be assigned to n_best_forecasters_
+        and the n best forecasters will be assigned to n_best_forecasters_.
+        Set return_n_best_forecasters to -1 to return all forecasters.
+
     random_state : int, RandomState instance or None, default=None
         Pseudo random number generator state used for random uniform sampling
         from lists of possible values instead of scipy.stats distributions.
@@ -1037,7 +1044,8 @@ class ForecastingSkoptSearchCV(BaseGridSearch):
         FitFailedWarning is raised.
     return_n_best_forecasters: int, default=1
         In case the n best forecaster should be returned, this value can be set
-        and the n best forecasters will be assigned to n_best_forecasters_
+        and the n best forecasters will be assigned to n_best_forecasters_.
+        Set return_n_best_forecasters to -1 to return all forecasters.
 
     backend : {"dask", "loky", "multiprocessing", "threading"}, by default "loky".
         Runs parallel evaluate if specified and ``strategy`` is set as "refit".

diff --git a/sktime/forecasting/model_selection/tests/test_tune.py b/sktime/forecasting/model_selection/tests/test_tune.py
@@ -4,6 +4,10 @@
 
 __author__ = ["mloning", "fkiraly"]
 
+
+from functools import reduce
+from typing import Dict, List, Union
+
 import numpy as np
 import pytest
 from sklearn.model_selection import ParameterGrid, ParameterSampler
@@ -348,3 +352,76 @@ def test_gscv_backends(backend_set):
         backend_params=backend_params,
     )
     gscv.fit(y, X)
+
+
+TEST_PARAMS_DICT = PIPE_GRID
+
+# list of dict grid: 3 * 2 * 3 * 3 = 54 candidates from the first dict plus
+# 3 * 2 = 6 candidates from the second dict, i.e., 60 candidates in total
+TEST_PARAMS_LIST = [
+    {
+        "window_length": [1, 2, 3],
+        "strategy": ["last", "mean"],
+        "transformer__degree": [1, 2, 3],
+        "forecaster__strategy": ["last", "mean", "seasonal_last"],
+    },
+    {
+        "window_length": [4, 5, 6],
+        "forecaster__strategy": ["last", "mean"],
+    },
+]
+
+
+def _count_grid_candidates(param_grid: Union[List[Dict], Dict]) -> int:
+    """Count the parameter combinations described by a parameter grid.
+
+    Parameters
+    ----------
+    param_grid : dict or list of dict
+        Grid mapping parameter names to lists of candidate values; a list of
+        grids is counted as the sum over its members.
+
+    Returns
+    -------
+    int
+        Number of parameter combinations in ``param_grid``.
+
+    Raises
+    ------
+    ValueError
+        If ``param_grid`` is neither a dict nor a list.
+    """
+    if isinstance(param_grid, dict):
+        # initial value 1 keeps an empty grid from raising in ``reduce``
+        return reduce(lambda x, y: x * y, (len(v) for v in param_grid.values()), 1)
+    if isinstance(param_grid, list):
+        return sum(_count_grid_candidates(grid) for grid in param_grid)
+    raise ValueError("`param_grid` must be a dict or a list[dict]")
+
+
+@pytest.mark.parametrize("return_n_best_forecasters", [-1, 0, 3])
+@pytest.mark.parametrize(
+    "Forecaster, kwargs",
+    [
+        (ForecastingGridSearchCV, {"param_grid": TEST_PARAMS_DICT}),
+        (ForecastingGridSearchCV, {"param_grid": TEST_PARAMS_LIST}),
+        (ForecastingRandomizedSearchCV, {"param_distributions": TEST_PARAMS_LIST}),
+        (
+            ForecastingRandomizedSearchCV,
+            {"param_distributions": TEST_PARAMS_LIST, "n_iter": 100},
+        ),
+    ],
+)
+def test_return_n_best_forecasters(Forecaster, return_n_best_forecasters, kwargs):
+    """Check that ``n_best_forecasters_`` has the requested length.
+
+    A request of -1 must return every evaluated candidate; any other request
+    must return that many forecasters, capped at the number of candidates.
+    """
+    y, X = load_longley()
+    searchCV = Forecaster(
+        forecaster=PIPE,
+        cv=CVs[0],
+        **kwargs,
+        return_n_best_forecasters=return_n_best_forecasters,
+    )
+    searchCV.fit(y, X)
+
+    if "param_grid" in kwargs:
+        n_candidates = _count_grid_candidates(kwargs["param_grid"])
+    else:
+        # randomized search evaluates ``n_iter`` candidates, but never more than
+        # the full grid when all distributions are lists (sampling is then
+        # without replacement, see sklearn ``ParameterSampler``)
+        grid_size = _count_grid_candidates(kwargs["param_distributions"])
+        n_candidates = min(searchCV.n_iter, grid_size)
+
+    # -1 requests all evaluated candidates; larger requests are capped
+    if return_n_best_forecasters == -1:
+        expected = n_candidates
+    else:
+        expected = min(return_n_best_forecasters, n_candidates)
+
+    assert len(searchCV.n_best_forecasters_) == expected