Skip to content

Commit

Permalink
Fixes #97, added additional tests
Browse files: browse the repository at this point in the history
  • Loading branch information
mloning committed Jul 24, 2019
1 parent f62e23e commit f1ea2ce
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 61 deletions.
55 changes: 34 additions & 21 deletions sktime/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from sklearn.pipeline import Pipeline as skPipeline
from sklearn.pipeline import FeatureUnion as skFeatureUnion
from sklearn.pipeline import _fit_one, _transform_one, _fit_transform_one
from sklearn.utils._joblib import Parallel, delayed
import pandas as pd
import numpy as np
Expand Down Expand Up @@ -88,27 +89,39 @@ def check_input(self, check_input):
step[1].set_params(**{'check_input': self.check_input})


def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
return transformer.fit(X, y)


def _transform_one(transformer, X, y, weight, **fit_params):
res = X.apply(transformer.transform)
# if we have a weight for this transformer, multiply output
if weight is None:
return res
return res * weight


def _fit_transform_one(transformer, X, y, weight, **fit_params):
if hasattr(transformer, 'fit_transform'):
res = X.apply(transformer.fit_transform, **fit_params)
else:
res = X.apply(transformer.fit(X, y, **fit_params).transform)
# if we have a weight for this transformer, multiply output
if weight is None:
return res, transformer
return res * weight, transformer
# def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
# return transformer.fit(X, y)
#
#
# def _transform_one(transformer, X, y, weight, **fit_params):
# # res = X.apply(transformer.transform)
# # res = pd.concat([pd.Series(col.apply(transformer.transform, **fit_params))
# # for _, col in X.items()], axis=1)
#
# res = transformer.transform(X, y)
#
# # if we have a weight for this transformer, multiply output
# if weight is None:
# return res
# return res * weight
#
#
# def _fit_transform_one(transformer, X, y, weight, **fit_params):
# if hasattr(transformer, 'fit_transform'):
# # res = X.apply(transformer.fit_transform, **fit_params)
# # res = pd.concat([pd.Series(col.apply(transformer.fit_transform, **fit_params))
# # for _, col in X.items()], axis=1)
# res = transformer.fit_transform(X, y, **fit_params)
# else:
# # res = X.apply(transformer.fit(X, y, **fit_params).transform)
# # res = pd.concat([pd.Series(col.apply(transformer.fit(X, y, **fit_params).transform))
# # for _, col in X.items()], axis=1)
# res = transformer.fit(X, y, **fit_params).transform(X, y)
#
# # if we have a weight for this transformer, multiply output
# if weight is None:
# return res, transformer
# return res * weight, transformer


class FeatureUnion(skFeatureUnion):
Expand Down
13 changes: 13 additions & 0 deletions sktime/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sktime.transformers.series_to_series import RandomIntervalSegmenter
from sktime.transformers.series_to_tabular import RandomIntervalFeatureExtractor


# load data
X_train, y_train = load_gunpoint("TRAIN", return_X_y=True)
X_train = pd.concat([X_train, X_train], axis=1)
Expand Down Expand Up @@ -106,3 +107,15 @@ def test_Pipeline_check_input():
for step in pipe.steps:
assert step[1].check_input == ci
assert step[1].get_params()['check_input'] == ci


def test_FeatureUnion():
    """Check the output shape of a FeatureUnion of two row-wise transformers.

    The union holds a row-wise mean and a row-wise standard deviation; the
    result is expected to keep the row count of ``X`` and to have one block
    of ``X.shape[1]`` columns per transformer in the union.
    """
    X, y = load_gunpoint(return_X_y=True)

    mean_step = RowwiseTransformer(
        FunctionTransformer(func=np.mean, validate=False))
    std_step = RowwiseTransformer(
        FunctionTransformer(func=np.std, validate=False))
    fu = FeatureUnion([('mean', mean_step), ('std', std_step)])

    Xt = fu.fit_transform(X, y)

    expected_shape = (X.shape[0], X.shape[1] * len(fu.transformer_list))
    assert Xt.shape == expected_shape
24 changes: 15 additions & 9 deletions sktime/transformers/compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ def fit(self, X, y=None):

# check the validity of input
X = check_ts_array(X)
if not isinstance(X, pd.DataFrame):
raise ValueError(f"Input must be pandas DataFrame, but found: {type(X)}")

# fitting - this transformer needs no fitting
self.is_fitted_ = True
Expand All @@ -206,18 +208,22 @@ def transform(self, X):
"""
# check the validity of input
X = check_ts_array(X)
if not isinstance(X, pd.DataFrame):
raise ValueError(f"Input must be pandas DataFrame, but found: {type(X)}")
check_is_fitted(self, 'is_fitted_')

# Works on single column, but on multiple columns only if columns have equal-length series.
try:
Xt = X.apply(self.transformer.fit_transform)

# Otherwise call apply on each column separately.
except ValueError as e:
if str(e) == 'arrays must all be same length':
Xt = pd.concat([pd.Series(col.apply(self.transformer.fit_transform)) for _, col in X.items()], axis=1)
else:
raise
# try:
# Xt = X.apply(self.transformer.fit_transform)
#
# # Otherwise call apply on each column separately.
# except ValueError as e:
# if str(e) == 'arrays must all be same length':
# Xt = pd.concat([pd.Series(col.apply(self.transformer.fit_transform)) for _, col in X.items()], axis=1)
# else:
# raise
Xt = pd.concat([pd.Series(col.apply(self.transformer.fit_transform))
for _, col in X.items()], axis=1)

return Xt

Expand Down
83 changes: 52 additions & 31 deletions sktime/transformers/tests/test_compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,32 @@
from sktime.pipeline import Pipeline
from sktime.tests.test_pipeline import X_train, y_train, X_test, y_test
from sktime.transformers.compose import ColumnTransformer, Tabulariser, RowwiseTransformer
from sktime.datasets import load_gunpoint
from sktime.datasets import load_basic_motions
from sktime.utils.transformations import tabularise

# load data
X_train, y_train = load_gunpoint("TRAIN", return_X_y=True)
X_train = pd.concat([X_train, X_train], axis=1)
X_train.columns = ['ts', 'ts_copy']
X_train, y_train = load_basic_motions("TRAIN", return_X_y=True)
X_test, y_test = load_basic_motions("TEST", return_X_y=True)

X_test, y_test = load_gunpoint("TEST", return_X_y=True)
X_test = pd.concat([X_test, X_test], axis=1)
X_test.columns = ['ts', 'ts_copy']

def test_Rowwise_transformer():
    """Check that a row-wise mean transformer preserves the shape of ``X``."""
    X, y = load_basic_motions(return_X_y=True)

    rowwise_mean = RowwiseTransformer(
        FunctionTransformer(func=np.mean, validate=False))
    transformed = rowwise_mean.fit_transform(X, y)

    assert transformed.shape == X.shape

def test_ColumnTransformer_pipeline():
# using Identity function transformers (transform series to series)
id_func = lambda X: X
column_transformer = ColumnTransformer(
[('ts', FunctionTransformer(func=id_func, validate=False), 'ts'),
('ts_copy', FunctionTransformer(func=id_func, validate=False), 'ts_copy')])
column_transformer = ColumnTransformer([
('id0', FunctionTransformer(func=id_func, validate=False), ['dim_0']),
('id1', FunctionTransformer(func=id_func, validate=False), ['dim_1'])
])
steps = [
('feature_extract', column_transformer),
('extract', column_transformer),
('tabularise', Tabulariser()),
('rfestimator', RandomForestClassifier(n_estimators=2))]
('classify', RandomForestClassifier(n_estimators=2))]
model = Pipeline(steps=steps)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
Expand All @@ -36,30 +40,47 @@ def test_ColumnTransformer_pipeline():


def test_RowwiseTransformer_pipeline():

# using pure sklearn
mean_func = lambda X: pd.DataFrame([np.mean(row) for row in X])
first_func = lambda X: pd.DataFrame([row[0] for row in X])
column_transformer = ColumnTransformer(
[('mean', FunctionTransformer(func=mean_func, validate=False), 'ts'),
('first', FunctionTransformer(func=first_func, validate=False), 'ts_copy')])
def rowwise_mean(X):
if isinstance(X, pd.Series):
X = pd.DataFrame(X)
Xt = pd.concat([pd.Series(col.apply(np.mean))
for _, col in X.items()], axis=1)
return Xt

def rowwise_first(X):
if isinstance(X, pd.Series):
X = pd.DataFrame(X)
Xt = pd.concat([pd.Series(tabularise(col).iloc[:, 0])
for _, col in X.items()], axis=1)
return Xt

# specify column as a list, otherwise pandas Series are selected and passed on to the transformers
transformer = ColumnTransformer([
('mean', FunctionTransformer(func=rowwise_mean, validate=False), ['dim_0']),
('first', FunctionTransformer(func=rowwise_first, validate=False), ['dim_1'])
])
estimator = RandomForestClassifier(n_estimators=2, random_state=1)
strategy = [
('feature_extract', column_transformer),
('rfestimator', estimator)]
model = Pipeline(steps=strategy)
steps = [
('extract', transformer),
('classify', estimator)
]
model = Pipeline(steps=steps)
model.fit(X_train, y_train)
expected = model.predict(X_test)

# using sktime with sklearn pipeline
first_func = lambda X: pd.DataFrame([row[0] for row in X])
column_transformer = ColumnTransformer(
[('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False)), 'ts'),
('first', FunctionTransformer(func=first_func, validate=False), 'ts_copy')])
transformer = ColumnTransformer([
('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False)), ['dim_0']),
('first', FunctionTransformer(func=rowwise_first, validate=False), ['dim_1'])
])
estimator = RandomForestClassifier(n_estimators=2, random_state=1)
strategy = [
('feature_extract', column_transformer),
('rfestimator', estimator)]
model = Pipeline(steps=strategy)
steps = [
('extract', transformer),
('classify', estimator)
]
model = Pipeline(steps=steps)
model.fit(X_train, y_train)
got = model.predict(X_test)
np.testing.assert_array_equal(expected, got)
actual = model.predict(X_test)
np.testing.assert_array_equal(expected, actual)

0 comments on commit f1ea2ce

Please sign in to comment.