Skip to content

Commit

Permalink
[ENH] speed up BaseTransformer checks and conversion boilerplate (#5036)
Browse files Browse the repository at this point in the history

This PR speeds up `BaseTransformer` checks and conversion boilerplate:

* replacing some `convert_to` calls by more specific `convert` calls
that ensure we do not repeat mtype checks
* specifying explicitly the needed metadata in an instance where all
metadata was returned, to avoid costly metadata queries that are
discarded
* refactoring `convert` and `convert_to`, moving logic that determines
the target mtype into a separate function, and allowing `convert` to
have `list` as `to_type`
* adding an option to `convert_to_scitype` to return the mtype of the
converted-to output. This is passed as an input to `convert` in the
`BaseTransformer` boilerplate to avoid checks to obtain this
information.
  • Loading branch information
fkiraly committed Aug 11, 2023
1 parent f6fea29 commit 0094f86
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 48 deletions.
74 changes: 49 additions & 25 deletions sktime/datatypes/_convert.py
Expand Up @@ -101,8 +101,9 @@ def convert(
obj : object to convert - any type, should comply with mtype spec for as_scitype
from_type : str - the mtype to convert "obj" from, a valid mtype string
valid mtype strings, with explanation, are in datatypes.MTYPE_REGISTER
to_type : str - the type to convert "obj" to, a valid mtype string
valid mtype strings, with explanation, are in datatypes.MTYPE_REGISTER
to_type : str - the mtype to convert "obj" to, a valid mtype string
or list of str, this specifies admissible types for conversion to;
if list and from_type is on the list, no conversion is done;
otherwise, will convert to first mtype on the list of the same scitype as from_type
as_scitype : str, optional - name of scitype the object "obj" is considered as
default = inferred from from_type
valid scitype strings, with explanation, are in datatypes.SCITYPE_REGISTER
Expand All @@ -127,9 +128,14 @@ def convert(
if obj is None:
return None

# if to_type is a list, we do the following:
# if on the list, then don't do a conversion (convert to from_type)
# if not on the list, we find and convert to first mtype that has same scitype
to_type = _get_first_mtype_of_same_scitype(
from_mtype=from_type, to_mtypes=to_type, varname="to_type"
)

# input type checks
if not isinstance(to_type, str):
raise TypeError("to_type must be a str")
if not isinstance(from_type, str):
raise TypeError("from_type must be a str")
if as_scitype is None:
Expand Down Expand Up @@ -186,8 +192,9 @@ def convert_to(
Parameters
----------
obj : object to convert - any type, should comply with mtype spec for as_scitype
to_type : str - the type to convert "obj" to, a valid mtype string
or list of str, this specifies admissible types for conversion to
to_type : str - the mtype to convert "obj" to, a valid mtype string
or list of str, this specifies admissible types for conversion to;
if list, will convert to first mtype of the same scitype as obj
valid mtype strings, with explanation, are in datatypes.MTYPE_REGISTER
as_scitype : str, optional - name of scitype the object "obj" is considered as
pre-specifying the scitype reduces the number of checks done in type inference
Expand Down Expand Up @@ -240,25 +247,6 @@ def convert_to(
from_type = infer_mtype(obj=obj, as_scitype=as_scitype)
as_scitype = mtype_to_scitype(from_type)

# if to_type is a list, we do the following:
# if on the list, then don't do a conversion (convert to from_type)
# if not on the list, we find and convert to first mtype that has same scitype
if isinstance(to_type, list):
# no conversion of from_type is in the list
if from_type in to_type:
to_type = from_type
# otherwise convert to first element of same scitype
else:
same_scitype_mtypes = [
mtype for mtype in to_type if mtype_to_scitype(mtype) == as_scitype
]
if len(same_scitype_mtypes) == 0:
raise TypeError(
"to_type contains no mtype compatible with the scitype of obj,"
f"which is {as_scitype}"
)
to_type = same_scitype_mtypes[0]

converted_obj = convert(
obj=obj,
from_type=from_type,
Expand All @@ -271,6 +259,42 @@ def convert_to(
return converted_obj


def _get_first_mtype_of_same_scitype(from_mtype, to_mtypes, varname="to_mtypes"):
"""Return first mtype in list mtypes that has same scitype as from_mtype.
Parameters
----------
from_mtype : str - mtype of object to convert from
to_mtypes : list of str - mtypes to convert to
varname : str - name of variable to_mtypes, for error message
Returns
-------
to_type : str - first mtype in to_mtypes that has same scitype as from_mtype
"""
if isinstance(to_mtypes, str):
return to_mtypes

if not isinstance(to_mtypes, list):
raise TypeError(f"{varname} must be a str or a list of str")

# no conversion of from_type is in the list
if from_mtype in to_mtypes:
return from_mtype
# otherwise convert to first element of same scitype
scitype = mtype_to_scitype(from_mtype)
same_scitype_mtypes = [
mtype for mtype in to_mtypes if mtype_to_scitype(mtype) == scitype
]
if len(same_scitype_mtypes) == 0:
raise TypeError(
f"{varname} contains no mtype compatible with the scitype of obj,"
f"which is {scitype}"
)
to_type = same_scitype_mtypes[0]
return to_type


def _conversions_defined(scitype: str):
"""Return an indicator matrix which conversions are defined for scitype.
Expand Down
98 changes: 81 additions & 17 deletions sktime/datatypes/_series_as_panel/_convert.py
Expand Up @@ -21,7 +21,7 @@
from sktime.datatypes import convert_to, scitype


def convert_Series_to_Panel(obj, store=None):
def convert_Series_to_Panel(obj, store=None, return_to_mtype=False):
"""Convert series to a single-series panel.
Adds a dummy dimension to the series.
Expand All @@ -35,6 +35,10 @@ def convert_Series_to_Panel(obj, store=None):
Parameters
----------
obj: an object of scitype Series, of mtype pd.DataFrame, pd.Series, or np.ndarray.
store: dict, optional
converter store for back-conversion
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -46,7 +50,10 @@ def convert_Series_to_Panel(obj, store=None):
obj = pd.DataFrame(obj)

if isinstance(obj, pd.DataFrame):
return [obj]
if return_to_mtype:
return [obj], "df-list"
else:
return [obj]

if isinstance(obj, np.ndarray):
if len(obj.shape) == 2:
Expand All @@ -55,18 +62,23 @@ def convert_Series_to_Panel(obj, store=None):
# numpy3D = (instances, variables, time)
obj = np.expand_dims(obj, 0)
obj = np.swapaxes(obj, 1, 2)
obj_mtype = "numpy3D"
elif len(obj.shape) == 1:
# from numpy1D to numpy3D
# numpy1D = (time)
# numpy3D = (instances, variables, time)
obj = np.expand_dims(obj, (0, 1))
obj_mtype = "numpy3D"
else:
raise ValueError("if obj is np.ndarray, must be of dim 1 or 2")

return obj
if return_to_mtype:
return obj, obj_mtype
else:
return obj


def convert_Panel_to_Series(obj, store=None):
def convert_Panel_to_Series(obj, store=None, return_to_mtype=False):
"""Convert single-series panel to a series.
Removes panel index from the single-series panel to obtain a series.
Expand All @@ -78,6 +90,10 @@ def convert_Panel_to_Series(obj, store=None):
Parameters
----------
obj: an object of scitype Panel, of mtype pd-multiindex, numpy3d, or df-list.
store: dict, optional
converter store for back-conversion
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -86,12 +102,16 @@ def convert_Panel_to_Series(obj, store=None):
"""
if isinstance(obj, list):
if len(obj) == 1:
return obj[0]
if return_to_mtype:
return obj[0], "pd.DataFrame"
else:
return obj[0]
else:
raise ValueError("obj must be of length 1")

if isinstance(obj, pd.DataFrame):
obj.index = obj.index.droplevel(level=0)
obj_mtype = "pd.DataFrame"

if isinstance(obj, np.ndarray):
if obj.ndim != 3 or obj.shape[0] != 1:
Expand All @@ -101,11 +121,15 @@ def convert_Panel_to_Series(obj, store=None):
# numpy3D = (instances, variables, time)
obj = np.reshape(obj, (obj.shape[1], obj.shape[2]))
obj = np.swapaxes(obj, 0, 1)
obj_mtype = "np.ndarray"

return obj
if return_to_mtype:
return obj, obj_mtype
else:
return obj


def convert_Series_to_Hierarchical(obj, store=None):
def convert_Series_to_Hierarchical(obj, store=None, return_to_mtype=False):
"""Convert series to a single-series hierarchical object.
Adds two dimensions to the series to obtain a 3-level MultiIndex, 2 levels added.
Expand All @@ -117,6 +141,10 @@ def convert_Series_to_Hierarchical(obj, store=None):
Parameters
----------
obj: an object of scitype Series, of mtype pd.DataFrame, pd.Series, or np.ndarray.
store: dict, optional
converter store for back-conversion
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -128,10 +156,14 @@ def convert_Series_to_Hierarchical(obj, store=None):
obj_df["__level2"] = 0
obj_df = obj_df.set_index(["__level1", "__level2"], append=True)
obj_df = obj_df.reorder_levels([1, 2, 0])
return obj_df

if return_to_mtype:
return obj_df, "pd_multiindex_hier"
else:
return obj_df


def convert_Hierarchical_to_Series(obj, store=None):
def convert_Hierarchical_to_Series(obj, store=None, return_to_mtype=False):
"""Convert single-series hierarchical object to a series.
Removes two dimensions to obtain a series, by removing 2 levels from MultiIndex.
Expand All @@ -143,6 +175,10 @@ def convert_Hierarchical_to_Series(obj, store=None):
Parameters
----------
obj: an object of scitype Hierarchical.
store: dict, optional
converter store for back-conversion
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -151,10 +187,14 @@ def convert_Hierarchical_to_Series(obj, store=None):
obj_df = convert_to(obj, to_type="pd_multiindex_hier", as_scitype="Hierarchical")
obj_df = obj_df.copy()
obj_df.index = obj_df.index.get_level_values(-1)
return obj_df

if return_to_mtype:
return obj_df, "pd.DataFrame"
else:
return obj_df

def convert_Panel_to_Hierarchical(obj, store=None):

def convert_Panel_to_Hierarchical(obj, store=None, return_to_mtype=False):
"""Convert panel to a single-panel hierarchical object.
Adds a dimension to the panel to obtain a 3-level MultiIndex; 1 level is added.
Expand All @@ -166,6 +206,10 @@ def convert_Panel_to_Hierarchical(obj, store=None):
Parameters
----------
obj: an object of scitype Panel.
store: dict, optional
converter store for back-conversion
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -176,10 +220,14 @@ def convert_Panel_to_Hierarchical(obj, store=None):
obj_df["__level2"] = 0
obj_df = obj_df.set_index(["__level2"], append=True)
obj_df = obj_df.reorder_levels([2, 0, 1])
return obj_df

if return_to_mtype:
return obj_df, "pd_multiindex_hier"
else:
return obj_df


def convert_Hierarchical_to_Panel(obj, store=None):
def convert_Hierarchical_to_Panel(obj, store=None, return_to_mtype=False):
"""Convert single-panel hierarchical object to a panel.
Removes one dimension to obtain a panel, by removing 1 level from MultiIndex.
Expand All @@ -190,7 +238,11 @@ def convert_Hierarchical_to_Panel(obj, store=None):
Parameters
----------
obj: an object of scitype Hierarchical.
obj: an object of scitype Hierarchical
store: dict, optional
converter store for back-conversion
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -199,10 +251,20 @@ def convert_Hierarchical_to_Panel(obj, store=None):
obj_df = convert_to(obj, to_type="pd_multiindex_hier", as_scitype="Hierarchical")
obj_df = obj_df.copy()
obj_df.index = obj_df.index.get_level_values([-2, -1])
return obj_df

if return_to_mtype:
return obj_df, "pd-multiindex"
else:
return obj_df


def convert_to_scitype(obj, to_scitype, from_scitype=None, store=None):
def convert_to_scitype(
obj,
to_scitype,
from_scitype=None,
store=None,
return_to_mtype=False,
):
"""Convert single-series or single-panel between mtypes.
Assumes input is conformant with one of the mtypes
Expand All @@ -218,6 +280,8 @@ def convert_to_scitype(obj, to_scitype, from_scitype=None, store=None):
scitype that obj is of, and being converted from
if avoided, function will skip type inference from obj
store : dict, optional. Converter store for back-conversion.
return_to_mtype: bool, optional (default=False)
if True, also returns the str of the mtype converted to
Returns
-------
Expand All @@ -239,4 +303,4 @@ def convert_to_scitype(obj, to_scitype, from_scitype=None, store=None):
func_name = f"convert_{from_scitype}_to_{to_scitype}"
func = eval(func_name)

return func(obj, store=store)
return func(obj, store=store, return_to_mtype=return_to_mtype)

0 comments on commit 0094f86

Please sign in to comment.