Fixes #97 and adds additional tests

sktime · Jul 24, 2019 · f1ea2ce · f1ea2ce
1 parent f62e23e
commit f1ea2ce
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 61 deletions.
diff --git a/sktime/pipeline.py b/sktime/pipeline.py
@@ -1,5 +1,6 @@
 from sklearn.pipeline import Pipeline as skPipeline
 from sklearn.pipeline import FeatureUnion as skFeatureUnion
+from sklearn.pipeline import _fit_one, _transform_one, _fit_transform_one
 from sklearn.utils._joblib import Parallel, delayed
 import pandas as pd
 import numpy as np
@@ -88,27 +89,39 @@ def check_input(self, check_input):
                     step[1].set_params(**{'check_input': self.check_input})
 
 
-def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
-    return transformer.fit(X, y)
-
-
-def _transform_one(transformer, X, y, weight, **fit_params):
-    res = X.apply(transformer.transform)
-    # if we have a weight for this transformer, multiply output
-    if weight is None:
-        return res
-    return res * weight
-
-
-def _fit_transform_one(transformer, X, y, weight, **fit_params):
-    if hasattr(transformer, 'fit_transform'):
-        res = X.apply(transformer.fit_transform, **fit_params)
-    else:
-        res = X.apply(transformer.fit(X, y, **fit_params).transform)
-    # if we have a weight for this transformer, multiply output
-    if weight is None:
-        return res, transformer
-    return res * weight, transformer
+# def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
+#     return transformer.fit(X, y)
+#
+#
+# def _transform_one(transformer, X, y, weight, **fit_params):
+#     # res = X.apply(transformer.transform)
+#     # res = pd.concat([pd.Series(col.apply(transformer.transform, **fit_params))
+#     #                  for _, col in X.items()], axis=1)
+#
+#     res = transformer.transform(X, y)
+#
+#     # if we have a weight for this transformer, multiply output
+#     if weight is None:
+#         return res
+#     return res * weight
+#
+#
+# def _fit_transform_one(transformer, X, y, weight, **fit_params):
+#     if hasattr(transformer, 'fit_transform'):
+#         # res = X.apply(transformer.fit_transform, **fit_params)
+#         # res = pd.concat([pd.Series(col.apply(transformer.fit_transform, **fit_params))
+#         #                  for _, col in X.items()], axis=1)
+#         res = transformer.fit_transform(X, y, **fit_params)
+#     else:
+#         # res = X.apply(transformer.fit(X, y, **fit_params).transform)
+#         # res = pd.concat([pd.Series(col.apply(transformer.fit(X, y, **fit_params).transform))
+#         #                  for _, col in X.items()], axis=1)
+#         res = transformer.fit(X, y, **fit_params).transform(X, y)
+#
+#     # if we have a weight for this transformer, multiply output
+#     if weight is None:
+#         return res, transformer
+#     return res * weight, transformer
 
 
 class FeatureUnion(skFeatureUnion):

diff --git a/sktime/tests/test_pipeline.py b/sktime/tests/test_pipeline.py
@@ -10,6 +10,7 @@
 from sktime.transformers.series_to_series import RandomIntervalSegmenter
 from sktime.transformers.series_to_tabular import RandomIntervalFeatureExtractor
 
+
 # load data
 X_train, y_train = load_gunpoint("TRAIN", return_X_y=True)
 X_train = pd.concat([X_train, X_train], axis=1)
@@ -106,3 +107,15 @@ def test_Pipeline_check_input():
     for step in pipe.steps:
         assert step[1].check_input == ci
         assert step[1].get_params()['check_input'] == ci
+
+
+def test_FeatureUnion():
+    """Check FeatureUnion output shape: same rows, one block of columns per transformer."""
+    X, y = load_gunpoint(return_X_y=True)
+    union = FeatureUnion([
+        (name, RowwiseTransformer(FunctionTransformer(func=func, validate=False)))
+        for name, func in (('mean', np.mean), ('std', np.std))
+    ])
+    Xt = union.fit_transform(X, y)
+    n_rows, n_cols = X.shape
+    assert Xt.shape == (n_rows, n_cols * len(union.transformer_list))
diff --git a/sktime/transformers/compose.py b/sktime/transformers/compose.py
@@ -184,6 +184,8 @@ def fit(self, X, y=None):
 
         # check the validity of input
         X = check_ts_array(X)
+        if not isinstance(X, pd.DataFrame):
+            raise ValueError(f"Input must be pandas DataFrame, but found: {type(X)}")
 
         # fitting - this transformer needs no fitting
         self.is_fitted_ = True
@@ -206,18 +208,22 @@ def transform(self, X):
         """
         # check the validity of input
         X = check_ts_array(X)
+        if not isinstance(X, pd.DataFrame):
+            raise ValueError(f"Input must be pandas DataFrame, but found: {type(X)}")
         check_is_fitted(self, 'is_fitted_')
 
         # Works on single column, but on multiple columns only if columns have equal-length series.
-        try:
-            Xt = X.apply(self.transformer.fit_transform)
-
-        # Otherwise call apply on each column separately.
-        except ValueError as e:
-            if str(e) == 'arrays must all be same length':
-                Xt = pd.concat([pd.Series(col.apply(self.transformer.fit_transform)) for _, col in X.items()], axis=1)
-            else:
-                raise
+        # try:
+        #     Xt = X.apply(self.transformer.fit_transform)
+        #
+        # # Otherwise call apply on each column separately.
+        # except ValueError as e:
+        #     if str(e) == 'arrays must all be same length':
+        #         Xt = pd.concat([pd.Series(col.apply(self.transformer.fit_transform)) for _, col in X.items()], axis=1)
+        #     else:
+        #         raise
+        Xt = pd.concat([pd.Series(col.apply(self.transformer.fit_transform))
+                        for _, col in X.items()], axis=1)
 
         return Xt
 

diff --git a/sktime/transformers/tests/test_compose.py b/sktime/transformers/tests/test_compose.py
@@ -6,28 +6,32 @@
 from sktime.pipeline import Pipeline
 from sktime.tests.test_pipeline import X_train, y_train, X_test, y_test
 from sktime.transformers.compose import ColumnTransformer, Tabulariser, RowwiseTransformer
-from sktime.datasets import load_gunpoint
+from sktime.datasets import load_basic_motions
+from sktime.utils.transformations import tabularise
 
 # load data
-X_train, y_train = load_gunpoint("TRAIN", return_X_y=True)
-X_train = pd.concat([X_train, X_train], axis=1)
-X_train.columns = ['ts', 'ts_copy']
+X_train, y_train = load_basic_motions("TRAIN", return_X_y=True)
+X_test, y_test = load_basic_motions("TEST", return_X_y=True)
 
-X_test, y_test = load_gunpoint("TEST", return_X_y=True)
-X_test = pd.concat([X_test, X_test], axis=1)
-X_test.columns = ['ts', 'ts_copy']
 
+def test_Rowwise_transformer():
+    """RowwiseTransformer wrapping np.mean must return output with the input's shape."""
+    X, y = load_basic_motions(return_X_y=True)
+    mean_transformer = FunctionTransformer(func=np.mean, validate=False)
+    Xt = RowwiseTransformer(mean_transformer).fit_transform(X, y)
+    assert Xt.shape == X.shape
 
 def test_ColumnTransformer_pipeline():
     # using Identity function transformers (transform series to series)
     id_func = lambda X: X
-    column_transformer = ColumnTransformer(
-        [('ts', FunctionTransformer(func=id_func, validate=False), 'ts'),
-         ('ts_copy', FunctionTransformer(func=id_func, validate=False), 'ts_copy')])
+    column_transformer = ColumnTransformer([
+        ('id0', FunctionTransformer(func=id_func, validate=False), ['dim_0']),
+        ('id1', FunctionTransformer(func=id_func, validate=False), ['dim_1'])
+    ])
     steps = [
-        ('feature_extract', column_transformer),
+        ('extract', column_transformer),
         ('tabularise', Tabulariser()),
-        ('rfestimator', RandomForestClassifier(n_estimators=2))]
+        ('classify', RandomForestClassifier(n_estimators=2))]
     model = Pipeline(steps=steps)
     model.fit(X_train, y_train)
     y_pred = model.predict(X_test)
@@ -36,30 +40,47 @@ def test_ColumnTransformer_pipeline():
 
 
 def test_RowwiseTransformer_pipeline():
+
     # using pure sklearn
-    mean_func = lambda X: pd.DataFrame([np.mean(row) for row in X])
-    first_func = lambda X: pd.DataFrame([row[0] for row in X])
-    column_transformer = ColumnTransformer(
-        [('mean', FunctionTransformer(func=mean_func, validate=False), 'ts'),
-         ('first', FunctionTransformer(func=first_func, validate=False), 'ts_copy')])
+    def rowwise_mean(X):
+        """Apply np.mean to every cell of every column; a Series is treated as one column."""
+        frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X
+        means = [pd.Series(column.apply(np.mean))
+                 for _, column in frame.items()]
+        return pd.concat(means, axis=1)
+
+    def rowwise_first(X):
+        """Take column 0 of each tabularised column; a Series is treated as one column."""
+        frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X
+        firsts = [pd.Series(tabularise(column).iloc[:, 0])
+                  for _, column in frame.items()]
+        return pd.concat(firsts, axis=1)
+
+    # specify column as a list, otherwise pandas Series are selected and passed on to the transformers
+    transformer = ColumnTransformer([
+        ('mean', FunctionTransformer(func=rowwise_mean, validate=False), ['dim_0']),
+        ('first', FunctionTransformer(func=rowwise_first, validate=False), ['dim_1'])
+    ])
     estimator = RandomForestClassifier(n_estimators=2, random_state=1)
-    strategy = [
-        ('feature_extract', column_transformer),
-        ('rfestimator', estimator)]
-    model = Pipeline(steps=strategy)
+    steps = [
+        ('extract', transformer),
+        ('classify', estimator)
+    ]
+    model = Pipeline(steps=steps)
     model.fit(X_train, y_train)
     expected = model.predict(X_test)
 
     # using sktime with sklearn pipeline
-    first_func = lambda X: pd.DataFrame([row[0] for row in X])
-    column_transformer = ColumnTransformer(
-        [('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False)), 'ts'),
-         ('first', FunctionTransformer(func=first_func, validate=False), 'ts_copy')])
+    transformer = ColumnTransformer([
+        ('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False)), ['dim_0']),
+        ('first', FunctionTransformer(func=rowwise_first, validate=False), ['dim_1'])
+    ])
     estimator = RandomForestClassifier(n_estimators=2, random_state=1)
-    strategy = [
-        ('feature_extract', column_transformer),
-        ('rfestimator', estimator)]
-    model = Pipeline(steps=strategy)
+    steps = [
+        ('extract', transformer),
+        ('classify', estimator)
+    ]
+    model = Pipeline(steps=steps)
     model.fit(X_train, y_train)
-    got = model.predict(X_test)
-    np.testing.assert_array_equal(expected, got)
+    actual = model.predict(X_test)
+    np.testing.assert_array_equal(expected, actual)