update densetransformer
rasbt committed Jul 17, 2016
1 parent 0587634 commit da98204
Showing 10 changed files with 193 additions and 23 deletions.
3 changes: 2 additions & 1 deletion docs/sources/CHANGELOG.md
@@ -3,7 +3,7 @@
---


-### Version 0.4.2dev
+### Version 0.4.2dev0

##### Downloads

@@ -22,6 +22,7 @@
- Fixed a bug in `classifier.SoftmaxRegression` where the mean values of the offsets were used to update the bias units rather than their sum
- Fixed a rare bug in the MLP `layer_mapping` functions that swapped the random number generation seeds when initializing the weights and biases
- More rigorous type and shape checks in `evaluate.plot_decision_regions`
+- Changes in `DenseTransformer` so that it doesn't fail if the input array is not sparse

### Version 0.4.1 (2016-05-01)

14 changes: 7 additions & 7 deletions docs/sources/user_guide/preprocessing/DenseTransformer.ipynb
@@ -25,15 +25,15 @@
"output_type": "stream",
"text": [
"Sebastian Raschka \n",
"last updated: 2016-01-30 \n",
"last updated: 2016-07-17 \n",
"\n",
"CPython 3.5.1\n",
"IPython 4.0.3\n",
"IPython 5.0.0\n",
"\n",
"matplotlib 1.5.1\n",
"numpy 1.10.2\n",
"scipy 0.16.1\n",
"mlxtend 0.3.0\n"
"numpy 1.11.0\n",
"scipy 0.17.1\n",
"mlxtend 0.4.2.dev0\n"
]
}
],
@@ -53,7 +53,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"A simple transformer that converts a sparse into a dense numpy array, e.g., required for scikit-learn's `Pipeline` when, for example, `CountVectorizers` are used in combination with `RandomForest`s."
"A simple transformer that converts a sparse into a dense numpy array, e.g., required for scikit-learn's `Pipeline` when, for example, `CountVectorizers` are used in combination with estimators that are not compatible with sparse matrices."
]
},
{
@@ -157,7 +157,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"collapsed": false
},
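For illustration, a minimal sketch (not part of this commit) of the Pipeline use case the notebook describes. The toy corpus, the labels, and the choice of GaussianNB as a stand-in for a dense-only estimator are assumptions:

# Hypothetical example: DenseTransformer bridges a sparse-output vectorizer
# and an estimator that cannot handle sparse input inside a Pipeline.
from mlxtend.preprocessing import DenseTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

docs = ['the quick brown fox', 'jumped over the lazy dog',
        'the dog barked', 'a quick brown fox']
labels = [0, 1, 1, 0]

# CountVectorizer emits a sparse matrix; GaussianNB raises an error on
# sparse input, so DenseTransformer converts the features between steps.
pipe = make_pipeline(CountVectorizer(), DenseTransformer(), GaussianNB())
pipe.fit(docs, labels)
print(pipe.predict(['quick brown fox']))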
2 changes: 1 addition & 1 deletion mlxtend/__init__.py
@@ -4,4 +4,4 @@
#
# License: BSD 3 clause

-__version__ = '0.4.2dev'
+__version__ = '0.4.2dev0'
4 changes: 2 additions & 2 deletions mlxtend/preprocessing/__init__.py
@@ -4,14 +4,14 @@
#
# License: BSD 3 clause

-from .transformer import TransformerObj
from .mean_centering import MeanCenterer
from .shuffle import shuffle_arrays_unison
from .scaling import minmax_scaling
from .scaling import standardize
from .dense_transformer import DenseTransformer
+from .copy_transformer import CopyTransformer
from .onehot import one_hot


__all__ = ["MeanCenterer", "shuffle_arrays_unison", "TransformerObj",
__all__ = ["MeanCenterer", "shuffle_arrays_unison", "CopyTransformer",
"minmax_scaling", "standardize", "DenseTransformer", "one_hot"]
mlxtend/preprocessing/copy_transformer.py (renamed from mlxtend/preprocessing/transformer.py)
@@ -9,7 +9,7 @@
import numpy as np


-class TransformerObj(object):
+class CopyTransformer(object):
    def __init__(self):
        self.ary = None

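Only the class rename is visible in this hunk, so the following usage sketch of CopyTransformer is an assumption inferred from the new name rather than from the lines shown:

# Assumed behavior: CopyTransformer presumably returns a copy of the
# input array from transform(); this is inferred, not shown in the diff.
import numpy as np
from mlxtend.preprocessing import CopyTransformer

X = np.array([[1, 2], [3, 4]])
X_copy = CopyTransformer().transform(X)
print(X_copy is X)  # expected: False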
25 changes: 16 additions & 9 deletions mlxtend/preprocessing/dense_transformer.py
@@ -7,21 +7,28 @@
# License: BSD 3 clause


-class DenseTransformer(object):
+from sklearn.base import BaseEstimator
+from scipy.sparse import issparse

-    """Convert a sparse matrix into a dense matrix."""

-    def __init__(self, some_param=True):
-        pass
+class DenseTransformer(BaseEstimator):
+    """Convert a sparse array into a dense array."""

+    def __init__(self, return_copy=True):
+        self.return_copy = return_copy
+        self.is_fitted = False

    def transform(self, X, y=None):
-        return X.toarray()
+        if issparse(X):
+            return X.toarray()
+        elif self.return_copy:
+            return X.copy()
+        else:
+            return X

    def fit(self, X, y=None):
+        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
-        return X.toarray()
-
-    def get_params(self, deep=True):
-        return {'some_param': True}
+        return self.transform(X=X, y=y)
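As a quick illustration (not from the commit) of the fix described in the changelog, the rewritten transform densifies sparse input and passes dense input through, where the old version failed:

# Illustrative snippet: sparse input is densified, dense input is returned
# as a copy (return_copy=True) instead of raising an error.
import numpy as np
from scipy.sparse import csr_matrix
from mlxtend.preprocessing import DenseTransformer

todense = DenseTransformer()
print(todense.transform(csr_matrix(np.eye(3))))  # sparse -> dense via toarray()
print(todense.transform(np.eye(3)))              # dense -> copy, no failure
# The old implementation called X.toarray() unconditionally, which raises
# AttributeError for numpy arrays.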
3 changes: 1 addition & 2 deletions mlxtend/preprocessing/mean_centering.py
@@ -7,10 +7,9 @@
# License: BSD 3 clause

import numpy as np
-from .transformer import TransformerObj


-class MeanCenterer(TransformerObj):
+class MeanCenterer(object):

    """Column centering of vectors and matrices.
53 changes: 53 additions & 0 deletions mlxtend/preprocessing/shuffle.py
@@ -5,6 +5,7 @@
# License: BSD 3 clause

import numpy as np
+from mlxtend.utils import check_Xy


def shuffle_arrays_unison(arrays, random_seed=None):
@@ -40,3 +41,55 @@ def shuffle_arrays_unison(arrays, random_seed=None):
        assert(len(a) == n)
    idx = np.random.permutation(n)
    return [a[idx] for a in arrays]


+def shuffled_split(X, y, shuffle=True, train_size=0.75, random_seed=None):
+    """Splits feature and target arrays into training and test subsets.
+
+    Parameters
+    ----------
+    X : array-like, shape = [n_samples, n_features]
+        Initial dataset, where n_samples is the number of samples and
+        n_features is the number of features.
+    y : array-like, shape = [n_samples]
+        Target values.
+    shuffle : bool (default: True)
+        If False, the arrays are not shuffled.
+    train_size : float (default: 0.75)
+        Proportion of data in the training arrays. For example, 0.75 will
+        put 75% of the data into the training array, and 25% of the data
+        into the test array.
+    random_seed : int (default: None)
+        Sets the random state.
+
+    Returns
+    -------
+    X_train : array-like, shape = [n_samples * train_size, n_features]
+        Training dataset, where n_samples is the number of samples and
+        n_features is the number of features.
+    y_train : array-like, shape = [n_samples * train_size]
+        Training target values.
+    X_test : array-like, shape = [n_samples * (1 - train_size), n_features]
+        Dataset for testing, where n_samples is the number of samples and
+        n_features is the number of features.
+    y_test : array-like, shape = [n_samples * (1 - train_size)]
+        Target values for testing.
+
+    """
+    check_Xy(X, y, y_int=False)
+
+    if train_size <= 0.0 or train_size >= 1.0:
+        raise ValueError('train_size must be a float in the range (0.0, 1.0)')
+
+    if shuffle:
+        X_ary, y_ary = shuffle_arrays_unison(arrays=[X.copy(), y.copy()],
+                                             random_seed=random_seed)
+    else:
+        X_ary, y_ary = X.copy(), y.copy()
+
+    train_absize = int(round(train_size * y.shape[0]))
+
+    X_train, y_train = X_ary[:train_absize], y_ary[:train_absize]
+    X_test, y_test = X_ary[train_absize:], y_ary[train_absize:]
+    return X_train, y_train, X_test, y_test
68 changes: 68 additions & 0 deletions mlxtend/preprocessing/tests/test_copy_transformer.py
@@ -0,0 +1,68 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from mlxtend.preprocessing import one_hot
from nose.tools import raises


def test_default():
    y = np.array([0, 1, 2, 3, 4, 2])
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 1., 0., 0., 0.],
                       [0., 0., 1., 0., 0.],
                       [0., 0., 0., 1., 0.],
                       [0., 0., 0., 0., 1.],
                       [0., 0., 1., 0., 0.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)


def test_autoguessing():
    y = np.array([0, 4, 0, 4])
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 1.],
                       [1., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 1.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)


def test_list():
    y = [0, 1, 2, 3, 4, 2]
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 1., 0., 0., 0.],
                       [0., 0., 1., 0., 0.],
                       [0., 0., 0., 1., 0.],
                       [0., 0., 0., 0., 1.],
                       [0., 0., 1., 0., 0.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)


@raises(AttributeError)
def test_multidim_list():
    y = [[0, 1, 2, 3, 4, 2]]
    one_hot(y)


@raises(AttributeError)
def test_multidim_array():
    y = np.array([[0], [1], [2], [3], [4], [2]])
    one_hot(y)


def test_oneclass():
    np.testing.assert_array_equal(one_hot([0]),
                                  np.array([[0.]], dtype='float'))


def test_list_morelabels():
    y = [0, 1]
    expect = np.array([[1., 0., 0.],
                       [0., 1., 0.]], dtype='float')
    out = one_hot(y, num_labels=3)
    np.testing.assert_array_equal(expect, out)
42 changes: 42 additions & 0 deletions mlxtend/preprocessing/tests/test_dense_transformer.py
@@ -0,0 +1,42 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from mlxtend.preprocessing import DenseTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import issparse


iris = load_iris()
X, y = iris.data, iris.target


def test_dense_to_dense():
    todense = DenseTransformer(return_copy=False)
    np.testing.assert_array_equal(X, todense.transform(X))


def test_sparse_to_dense():
    todense = DenseTransformer()
    tfidf = TfidfTransformer()
    X_t = tfidf.fit_transform([[1, 2, 3]])
    assert issparse(X_t)
    X_dense = todense.transform(X_t)
    expect = np.array([[0.26726124, 0.53452248, 0.80178373]])
    assert np.allclose(X_dense, expect)


def test_pipeline():
    rf = RandomForestClassifier()
    param_grid = [{'randomforestclassifier__n_estimators': [1, 5, 10]}]
    pipe = make_pipeline(StandardScaler(), DenseTransformer(), rf)
    grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1)
    grid.fit(X, y)
