From da98204b0bd54cd36d1828d5dbc63e1b1fc19a3d Mon Sep 17 00:00:00 2001 From: rasbt Date: Sun, 17 Jul 2016 14:00:26 -0500 Subject: [PATCH] update densetransformer --- docs/sources/CHANGELOG.md | 3 +- .../preprocessing/DenseTransformer.ipynb | 14 ++-- mlxtend/__init__.py | 2 +- mlxtend/preprocessing/__init__.py | 4 +- .../{transformer.py => copy_transformer.py} | 2 +- mlxtend/preprocessing/dense_transformer.py | 25 ++++--- mlxtend/preprocessing/mean_centering.py | 3 +- mlxtend/preprocessing/shuffle.py | 53 +++++++++++++++ .../tests/test_copy_transformer.py | 68 +++++++++++++++++++ .../tests/test_dense_transformer.py | 42 ++++++++++++ 10 files changed, 193 insertions(+), 23 deletions(-) rename mlxtend/preprocessing/{transformer.py => copy_transformer.py} (93%) create mode 100644 mlxtend/preprocessing/tests/test_copy_transformer.py create mode 100644 mlxtend/preprocessing/tests/test_dense_transformer.py diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 2400e14b1..0941538ae 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -3,7 +3,7 @@ --- -### Version 0.4.2dev +### Version 0.4.2dev0 ##### Downloads @@ -22,6 +22,7 @@ - Fixed a bug in `classifier.SoftmaxRegression` where the mean values of the offsets were used to update the bias units rather than their sum - Fixed rare bug in MLP layer_mapping functions that caused a swap between the random number generation seed when initializing weights and biases - More rigorous type and shape checks in `evaluate.plot_decision_regions` +- Changes in `DenseTransformer` so that it doesn't fail if the input array is not sparse ### Version 0.4.1 (2016-05-01) diff --git a/docs/sources/user_guide/preprocessing/DenseTransformer.ipynb b/docs/sources/user_guide/preprocessing/DenseTransformer.ipynb index 61c12ff3c..6c8078df3 100644 --- a/docs/sources/user_guide/preprocessing/DenseTransformer.ipynb +++ b/docs/sources/user_guide/preprocessing/DenseTransformer.ipynb @@ -25,15 +25,15 @@ "output_type": "stream", "text": [ "Sebastian Raschka \n", - "last updated: 2016-01-30 \n", + "last updated: 2016-07-17 \n", "\n", "CPython 3.5.1\n", - "IPython 4.0.3\n", + "IPython 5.0.0\n", "\n", "matplotlib 1.5.1\n", - "numpy 1.10.2\n", - "scipy 0.16.1\n", - "mlxtend 0.3.0\n" + "numpy 1.11.0\n", + "scipy 0.17.1\n", + "mlxtend 0.4.2.dev0\n" ] } ], @@ -53,7 +53,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A simple transformer that converts a sparse into a dense numpy array, e.g., required for scikit-learn's `Pipeline` when, for example, `CountVectorizers` are used in combination with `RandomForest`s." + "A simple transformer that converts a sparse into a dense numpy array, e.g., required for scikit-learn's `Pipeline` when, for example, `CountVectorizers` are used in combination with estimators that are not compatible with sparse matrices." ] }, { @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": false }, diff --git a/mlxtend/__init__.py b/mlxtend/__init__.py index aa57d00d9..3915881d5 100644 --- a/mlxtend/__init__.py +++ b/mlxtend/__init__.py @@ -4,4 +4,4 @@ # # License: BSD 3 clause -__version__ = '0.4.2dev' +__version__ = '0.4.2dev0' diff --git a/mlxtend/preprocessing/__init__.py b/mlxtend/preprocessing/__init__.py index df0e53b6e..c5f0a92b3 100644 --- a/mlxtend/preprocessing/__init__.py +++ b/mlxtend/preprocessing/__init__.py @@ -4,14 +4,14 @@ # # License: BSD 3 clause -from .transformer import TransformerObj from .mean_centering import MeanCenterer from .shuffle import shuffle_arrays_unison from .scaling import minmax_scaling from .scaling import standardize from .dense_transformer import DenseTransformer +from .copy_transformer import CopyTransformer from .onehot import one_hot -__all__ = ["MeanCenterer", "shuffle_arrays_unison", "TransformerObj", +__all__ = ["MeanCenterer", "shuffle_arrays_unison", "CopyTransformer", "minmax_scaling", "standardize", "DenseTransformer", "one_hot"] diff --git a/mlxtend/preprocessing/transformer.py b/mlxtend/preprocessing/copy_transformer.py similarity index 93% rename from mlxtend/preprocessing/transformer.py rename to mlxtend/preprocessing/copy_transformer.py index cea8e0136..6a221df2e 100644 --- a/mlxtend/preprocessing/transformer.py +++ b/mlxtend/preprocessing/copy_transformer.py @@ -9,7 +9,7 @@ import numpy as np -class TransformerObj(object): +class CopyTransformer(object): def __init__(self): self.ary = None diff --git a/mlxtend/preprocessing/dense_transformer.py b/mlxtend/preprocessing/dense_transformer.py index d0e245318..9fb7e30ce 100644 --- a/mlxtend/preprocessing/dense_transformer.py +++ b/mlxtend/preprocessing/dense_transformer.py @@ -7,21 +7,28 @@ # License: BSD 3 clause -class DenseTransformer(object): +from sklearn.base import BaseEstimator +from scipy.sparse import issparse - """Convert a sparse matrix into a dense matrix.""" - def __init__(self, some_param=True): - pass +class DenseTransformer(BaseEstimator): + """Convert a sparse array into a dense array.""" + + def __init__(self, return_copy=True): + self.return_copy = return_copy + self.is_fitted = False def transform(self, X, y=None): - return X.toarray() + if issparse(X): + return X.toarray() + elif self.return_copy: + return X.copy() + else: + return X def fit(self, X, y=None): + self.is_fitted = True return self def fit_transform(self, X, y=None): - return X.toarray() - - def get_params(self, deep=True): - return {'some_param': True} + return self.transform(X=X, y=y) diff --git a/mlxtend/preprocessing/mean_centering.py b/mlxtend/preprocessing/mean_centering.py index 90cf76740..2048de54d 100644 --- a/mlxtend/preprocessing/mean_centering.py +++ b/mlxtend/preprocessing/mean_centering.py @@ -7,10 +7,9 @@ # License: BSD 3 clause import numpy as np -from .transformer import TransformerObj -class MeanCenterer(TransformerObj): +class MeanCenterer(object): """Column centering of vectors and matrices. diff --git a/mlxtend/preprocessing/shuffle.py b/mlxtend/preprocessing/shuffle.py index fd83b128d..54a7ec998 100644 --- a/mlxtend/preprocessing/shuffle.py +++ b/mlxtend/preprocessing/shuffle.py @@ -5,6 +5,7 @@ # License: BSD 3 clause import numpy as np +from mlxtend.utils import check_Xy def shuffle_arrays_unison(arrays, random_seed=None): @@ -40,3 +41,55 @@ def shuffle_arrays_unison(arrays, random_seed=None): assert(len(a) == n) idx = np.random.permutation(n) return [a[idx] for a in arrays] + + +def shuffled_split(X, y, shuffle=True, train_size=0.75, random_seed=None): + """Splits feature and target arrays into training and test subsets. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Initial dataset, where n_samples is the number of samples and + n_features is the number of features. + y : array-like, shape = [n_samples] + Target values. + shuffle : bool (default: True) + Doesn't shuffle the arrays if False + train_size : float (default: 0.75) + Proportion of data in the training arrays. For example, 0.75 will + put 75% of the data into the training array, and 25% of the data + into the test array. + random_seed : int (default: None) + Sets the random state. + + Returns + ---------- + X_train : array-like, shape = [n_samples * train_size, n_features] + Training dataset, where n_samples is the number of samples and + n_features is the number of features. + y_train : array-like, shape = [n_samples * train_size] + Training target values. + X_test : array-like, shape = [n_samples * (1-train_size), n_features] + Dataset for testing, where n_samples is the number of samples and + n_features is the number of features. + y_test : array-like, shape = [n_samples * (1-train_size)] + Target values for testing. + + """ + check_Xy(X, y, y_int=False) + + if train_size <= 0.0 or train_size >= 1.0: + raise ValueError('train_size must be a float in the range (0.0, 1.0)') + + if shuffle: + X_ary, y_ary = shuffle_arrays_unison(arrays=[X.copy(), y.copy()], + random_seed=random_seed) + else: + X_ary, y_ary = X.copy(), y.copy() + + idx = np.arange(0, y.shape[0]) + train_absize = round(train_size * y.shape[0]) + + X_train, y_train = X_ary[:train_absize], y_ary[:train_absize] + X_test, y_test = X_ary[train_absize:], y_ary[train_absize:] + return X_train, y_train, X_test, y_test diff --git a/mlxtend/preprocessing/tests/test_copy_transformer.py b/mlxtend/preprocessing/tests/test_copy_transformer.py new file mode 100644 index 000000000..a4aa03acc --- /dev/null +++ b/mlxtend/preprocessing/tests/test_copy_transformer.py @@ -0,0 +1,68 @@ +# Sebastian Raschka 2014-2016 +# mlxtend Machine Learning Library Extensions +# Author: Sebastian Raschka +# +# License: BSD 3 clause + +import numpy as np +from mlxtend.preprocessing import one_hot +from nose.tools import raises + + +def test_default(): + y = np.array([0, 1, 2, 3, 4, 2]) + expect = np.array([[1., 0., 0., 0., 0.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.], + [0., 0., 0., 1., 0.], + [0., 0., 0., 0., 1.], + [0., 0., 1., 0., 0.]], dtype='float') + out = one_hot(y) + np.testing.assert_array_equal(expect, out) + + +def test_autoguessing(): + y = np.array([0, 4, 0, 4]) + expect = np.array([[1., 0., 0., 0., 0.], + [0., 0., 0., 0., 1.], + [1., 0., 0., 0., 0.], + [0., 0., 0., 0., 1.]], dtype='float') + out = one_hot(y) + np.testing.assert_array_equal(expect, out) + + +def test_list(): + y = [0, 1, 2, 3, 4, 2] + expect = np.array([[1., 0., 0., 0., 0.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.], + [0., 0., 0., 1., 0.], + [0., 0., 0., 0., 1.], + [0., 0., 1., 0., 0.]], dtype='float') + out = one_hot(y) + np.testing.assert_array_equal(expect, out) + + +@raises(AttributeError) +def test_multidim_list(): + y = [[0, 1, 2, 3, 4, 2]] + one_hot(y) + + +@raises(AttributeError) +def test_multidim_array(): + y = np.array([[0], [1], [2], [3], [4], [2]]) + one_hot(y) + + +def test_oneclass(): + np.testing.assert_array_equal(one_hot([0]), + np.array([[0.]], dtype='float')) + + +def test_list_morelabels(): + y = [0, 1] + expect = np.array([[1., 0., 0.], + [0., 1., 0.]], dtype='float') + out = one_hot(y, num_labels=3) + np.testing.assert_array_equal(expect, out) diff --git a/mlxtend/preprocessing/tests/test_dense_transformer.py b/mlxtend/preprocessing/tests/test_dense_transformer.py new file mode 100644 index 000000000..973247f40 --- /dev/null +++ b/mlxtend/preprocessing/tests/test_dense_transformer.py @@ -0,0 +1,42 @@ +# Sebastian Raschka 2014-2016 +# mlxtend Machine Learning Library Extensions +# Author: Sebastian Raschka +# +# License: BSD 3 clause + +import numpy as np +from mlxtend.preprocessing import DenseTransformer +from sklearn.datasets import load_iris +from sklearn.pipeline import make_pipeline +from sklearn.grid_search import GridSearchCV +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.feature_extraction.text import TfidfTransformer +from scipy.sparse import issparse + + +iris = load_iris() +X, y = iris.data, iris.target + + +def test_dense_to_dense(): + todense = DenseTransformer(return_copy=False) + np.testing.assert_array_equal(X, todense.transform(X)) + + +def test_sparse_to_dense(): + todense = DenseTransformer() + tfidf = TfidfTransformer() + X_t = tfidf.fit_transform([[1, 2, 3]]) + assert issparse(X_t) + X_dense = todense.transform(X_t) + expect = np.array([[0.26726124, 0.53452248, 0.80178373]]) + assert np.allclose(X_dense, expect) + + +def test_pipeline(): + rf = RandomForestClassifier() + param_grid = [{'randomforestclassifier__n_estimators': [1, 5, 10]}] + pipe = make_pipeline(StandardScaler(), DenseTransformer(), rf) + grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1) + grid.fit(X, y)