update densetransformer
rasbt committed Jul 17, 2016
1 parent 0587634 commit da98204
Showing 10 changed files with 193 additions and 23 deletions.
3 changes: 2 additions & 1 deletion docs/sources/CHANGELOG.md
@@ -3,7 +3,7 @@
---


-### Version 0.4.2dev
+### Version 0.4.2dev0

##### Downloads

@@ -22,6 +22,7 @@
- Fixed a bug in `classifier.SoftmaxRegression` where the mean values of the offsets were used to update the bias units rather than their sum
- Fixed a rare bug in the MLP `layer_mapping` functions that swapped the random number generation seeds when initializing the weights and biases
- More rigorous type and shape checks in `evaluate.plot_decision_regions`
+- Changes in `DenseTransformer` so that it doesn't fail if the input array is not sparse

### Version 0.4.1 (2016-05-01)

14 changes: 7 additions & 7 deletions docs/sources/user_guide/preprocessing/DenseTransformer.ipynb
@@ -25,15 +25,15 @@
"output_type": "stream",
"text": [
"Sebastian Raschka \n",
"last updated: 2016-01-30 \n",
"last updated: 2016-07-17 \n",
"\n",
"CPython 3.5.1\n",
"IPython 4.0.3\n",
"IPython 5.0.0\n",
"\n",
"matplotlib 1.5.1\n",
"numpy 1.10.2\n",
"scipy 0.16.1\n",
"mlxtend 0.3.0\n"
"numpy 1.11.0\n",
"scipy 0.17.1\n",
"mlxtend 0.4.2.dev0\n"
]
}
],
@@ -53,7 +53,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"A simple transformer that converts a sparse into a dense numpy array, e.g., required for scikit-learn's `Pipeline` when, for example, `CountVectorizers` are used in combination with `RandomForest`s."
"A simple transformer that converts a sparse into a dense numpy array, e.g., required for scikit-learn's `Pipeline` when, for example, `CountVectorizers` are used in combination with estimators that are not compatible with sparse matrices."
]
},
{
@@ -157,7 +157,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"collapsed": false
},
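For illustration, a minimal sketch (not part of this commit) of the Pipeline use case the notebook describes. The toy corpus, the labels, and the choice of GaussianNB as a stand-in for a dense-only estimator are assumptions:

# Hypothetical example: DenseTransformer bridges a sparse-output vectorizer
# and an estimator that cannot handle sparse input inside a Pipeline.
from mlxtend.preprocessing import DenseTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

docs = ['the quick brown fox', 'jumped over the lazy dog',
        'the dog barked', 'a quick brown fox']
labels = [0, 1, 1, 0]

# CountVectorizer emits a sparse matrix; GaussianNB raises an error on
# sparse input, so DenseTransformer converts the features between steps.
pipe = make_pipeline(CountVectorizer(), DenseTransformer(), GaussianNB())
pipe.fit(docs, labels)
print(pipe.predict(['quick brown fox']))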
2 changes: 1 addition & 1 deletion mlxtend/__init__.py
@@ -4,4 +4,4 @@
#
# License: BSD 3 clause

-__version__ = '0.4.2dev'
+__version__ = '0.4.2dev0'
4 changes: 2 additions & 2 deletions mlxtend/preprocessing/__init__.py
@@ -4,14 +4,14 @@
#
# License: BSD 3 clause

-from .transformer import TransformerObj
from .mean_centering import MeanCenterer
from .shuffle import shuffle_arrays_unison
from .scaling import minmax_scaling
from .scaling import standardize
from .dense_transformer import DenseTransformer
+from .copy_transformer import CopyTransformer
from .onehot import one_hot


__all__ = ["MeanCenterer", "shuffle_arrays_unison", "TransformerObj",
__all__ = ["MeanCenterer", "shuffle_arrays_unison", "CopyTransformer",
"minmax_scaling", "standardize", "DenseTransformer", "one_hot"]
mlxtend/preprocessing/copy_transformer.py (renamed from mlxtend/preprocessing/transformer.py)
@@ -9,7 +9,7 @@
import numpy as np


-class TransformerObj(object):
+class CopyTransformer(object):
    def __init__(self):
        self.ary = None

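Only the class rename is visible in this hunk, so the following usage sketch of CopyTransformer is an assumption inferred from the new name rather than from the lines shown:

# Assumed behavior: CopyTransformer presumably returns a copy of the
# input array from transform(); this is inferred, not shown in the diff.
import numpy as np
from mlxtend.preprocessing import CopyTransformer

X = np.array([[1, 2], [3, 4]])
X_copy = CopyTransformer().transform(X)
print(X_copy is X)  # expected: False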
25 changes: 16 additions & 9 deletions mlxtend/preprocessing/dense_transformer.py
@@ -7,21 +7,28 @@
# License: BSD 3 clause


-class DenseTransformer(object):
+from sklearn.base import BaseEstimator
+from scipy.sparse import issparse

-    """Convert a sparse matrix into a dense matrix."""

-    def __init__(self, some_param=True):
-        pass
+class DenseTransformer(BaseEstimator):
+    """Convert a sparse array into a dense array."""

+    def __init__(self, return_copy=True):
+        self.return_copy = return_copy
+        self.is_fitted = False

    def transform(self, X, y=None):
-        return X.toarray()
+        if issparse(X):
+            return X.toarray()
+        elif self.return_copy:
+            return X.copy()
+        else:
+            return X

    def fit(self, X, y=None):
+        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
-        return X.toarray()
-
-    def get_params(self, deep=True):
-        return {'some_param': True}
+        return self.transform(X=X, y=y)
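As a quick illustration (not from the commit) of the fix described in the changelog, the rewritten transform densifies sparse input and passes dense input through, where the old version failed:

# Illustrative snippet: sparse input is densified, dense input is returned
# as a copy (return_copy=True) instead of raising an error.
import numpy as np
from scipy.sparse import csr_matrix
from mlxtend.preprocessing import DenseTransformer

todense = DenseTransformer()
print(todense.transform(csr_matrix(np.eye(3))))  # sparse -> dense via toarray()
print(todense.transform(np.eye(3)))              # dense -> copy, no failure
# The old implementation called X.toarray() unconditionally, which raises
# AttributeError for numpy arrays.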
3 changes: 1 addition & 2 deletions mlxtend/preprocessing/mean_centering.py
@@ -7,10 +7,9 @@
# License: BSD 3 clause

import numpy as np
-from .transformer import TransformerObj


-class MeanCenterer(TransformerObj):
+class MeanCenterer(object):

    """Column centering of vectors and matrices.
53 changes: 53 additions & 0 deletions mlxtend/preprocessing/shuffle.py
@@ -5,6 +5,7 @@
# License: BSD 3 clause

import numpy as np
+from mlxtend.utils import check_Xy


def shuffle_arrays_unison(arrays, random_seed=None):
@@ -40,3 +41,55 @@ def shuffle_arrays_unison(arrays, random_seed=None):
        assert(len(a) == n)
    idx = np.random.permutation(n)
    return [a[idx] for a in arrays]


+def shuffled_split(X, y, shuffle=True, train_size=0.75, random_seed=None):
+    """Splits feature and target arrays into training and test subsets.
+
+    Parameters
+    ----------
+    X : array-like, shape = [n_samples, n_features]
+        Initial dataset, where n_samples is the number of samples and
+        n_features is the number of features.
+    y : array-like, shape = [n_samples]
+        Target values.
+    shuffle : bool (default: True)
+        If False, the arrays are not shuffled.
+    train_size : float (default: 0.75)
+        Proportion of data in the training arrays. For example, 0.75 will
+        put 75% of the data into the training array, and 25% of the data
+        into the test array.
+    random_seed : int (default: None)
+        Sets the random state.
+
+    Returns
+    -------
+    X_train : array-like, shape = [n_samples * train_size, n_features]
+        Training dataset, where n_samples is the number of samples and
+        n_features is the number of features.
+    y_train : array-like, shape = [n_samples * train_size]
+        Training target values.
+    X_test : array-like, shape = [n_samples * (1 - train_size), n_features]
+        Dataset for testing, where n_samples is the number of samples and
+        n_features is the number of features.
+    y_test : array-like, shape = [n_samples * (1 - train_size)]
+        Target values for testing.
+
+    """
+    check_Xy(X, y, y_int=False)
+
+    if train_size <= 0.0 or train_size >= 1.0:
+        raise ValueError('train_size must be a float in the range (0.0, 1.0)')
+
+    if shuffle:
+        X_ary, y_ary = shuffle_arrays_unison(arrays=[X.copy(), y.copy()],
+                                             random_seed=random_seed)
+    else:
+        X_ary, y_ary = X.copy(), y.copy()
+
+    train_absize = int(round(train_size * y.shape[0]))
+
+    X_train, y_train = X_ary[:train_absize], y_ary[:train_absize]
+    X_test, y_test = X_ary[train_absize:], y_ary[train_absize:]
+    return X_train, y_train, X_test, y_test
68 changes: 68 additions & 0 deletions mlxtend/preprocessing/tests/test_copy_transformer.py
@@ -0,0 +1,68 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from mlxtend.preprocessing import one_hot
from nose.tools import raises


def test_default():
    y = np.array([0, 1, 2, 3, 4, 2])
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 1., 0., 0., 0.],
                       [0., 0., 1., 0., 0.],
                       [0., 0., 0., 1., 0.],
                       [0., 0., 0., 0., 1.],
                       [0., 0., 1., 0., 0.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)


def test_autoguessing():
    y = np.array([0, 4, 0, 4])
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 1.],
                       [1., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 1.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)


def test_list():
    y = [0, 1, 2, 3, 4, 2]
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 1., 0., 0., 0.],
                       [0., 0., 1., 0., 0.],
                       [0., 0., 0., 1., 0.],
                       [0., 0., 0., 0., 1.],
                       [0., 0., 1., 0., 0.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)


@raises(AttributeError)
def test_multidim_list():
    y = [[0, 1, 2, 3, 4, 2]]
    one_hot(y)


@raises(AttributeError)
def test_multidim_array():
    y = np.array([[0], [1], [2], [3], [4], [2]])
    one_hot(y)


def test_oneclass():
    np.testing.assert_array_equal(one_hot([0]),
                                  np.array([[0.]], dtype='float'))


def test_list_morelabels():
    y = [0, 1]
    expect = np.array([[1., 0., 0.],
                       [0., 1., 0.]], dtype='float')
    out = one_hot(y, num_labels=3)
    np.testing.assert_array_equal(expect, out)
42 changes: 42 additions & 0 deletions mlxtend/preprocessing/tests/test_dense_transformer.py
@@ -0,0 +1,42 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from mlxtend.preprocessing import DenseTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import issparse


iris = load_iris()
X, y = iris.data, iris.target


def test_dense_to_dense():
    todense = DenseTransformer(return_copy=False)
    np.testing.assert_array_equal(X, todense.transform(X))


def test_sparse_to_dense():
    todense = DenseTransformer()
    tfidf = TfidfTransformer()
    X_t = tfidf.fit_transform([[1, 2, 3]])
    assert issparse(X_t)
    X_dense = todense.transform(X_t)
    expect = np.array([[0.26726124, 0.53452248, 0.80178373]])
    assert np.allclose(X_dense, expect)


def test_pipeline():
    rf = RandomForestClassifier()
    param_grid = [{'randomforestclassifier__n_estimators': [1, 5, 10]}]
    pipe = make_pipeline(StandardScaler(), DenseTransformer(), rf)
    grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1)
    grid.fit(X, y)
