Skip to content

Commit

Permalink
Merge pull request #48 from paulgb/deprecate-cv
Browse files Browse the repository at this point in the history
Deprecate custom CV shims in documentation and code. Refs #11.
  • Loading branch information
dukebody committed Jan 16, 2016
2 parents d3fb586 + 367f049 commit 85ad5df
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 15 deletions.
10 changes: 8 additions & 2 deletions README.rst
Expand Up @@ -7,7 +7,7 @@ This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/sta
In particular, it provides:

1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
2. A way to cross-validate a pipeline that takes a pandas ``DataFrame`` as input.
2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.

Installation
------------
Expand Down Expand Up @@ -174,7 +174,7 @@ The stacking of the sparse features is done without ever densifying them.
Cross-Validation
----------------

Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. Scikit-learn provides features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expected numpy data structures and did not work with ``DataFrameMapper``.

To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::

Expand All @@ -190,6 +190,12 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
Changelog
---------

Development
***********

* Deprecate custom cross-validation shim classes.


1.1.0 (2015-12-06)
*******************

Expand Down
15 changes: 15 additions & 0 deletions sklearn_pandas/cross_validation.py
@@ -1,13 +1,24 @@
import warnings
from sklearn import cross_validation
from sklearn import grid_search

# Warning text shared by the deprecated cross-validation shims in this module;
# each of them passes it to warnings.warn() with DeprecationWarning.
DEPRECATION_MSG = '''
Custom cross-validation compatibility shims are no longer needed for
scikit-learn>=0.16.0 and will be dropped in sklearn-pandas==2.0.
'''


def cross_val_score(model, X, *args, **kwargs):
    """
    Deprecated shim around ``sklearn.cross_validation.cross_val_score``.

    Emits a ``DeprecationWarning``, wraps ``X`` in a ``DataWrapper`` and
    forwards it, together with every other argument unchanged, to
    scikit-learn's own ``cross_val_score``. That call's result is returned
    as is.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than to
    # this module, so users can see which of their own calls is deprecated.
    warnings.warn(DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
    X = DataWrapper(X)
    return cross_validation.cross_val_score(model, X, *args, **kwargs)


class GridSearchCV(grid_search.GridSearchCV):
def __init__(self, *args, **kwargs):
    """Warn that this shim is deprecated, then defer to the parent class.

    All positional and keyword arguments are passed through unchanged to
    ``grid_search.GridSearchCV.__init__``.
    """
    # NOTE(review): scikit-learn's get_params()/clone() introspect the
    # __init__ signature and may reject varargs -- confirm this override
    # does not break cloning of the search object.
    # stacklevel=2 points the warning at the code instantiating the class
    # instead of at this module.
    warnings.warn(DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
    super(GridSearchCV, self).__init__(*args, **kwargs)

def fit(self, X, *params, **kwparams):
    """Fit the parent search on ``X`` wrapped in a ``DataWrapper``.

    Extra positional and keyword arguments are forwarded untouched, and the
    parent's return value is handed back to the caller.
    """
    parent_fit = super(GridSearchCV, self).fit
    return parent_fit(DataWrapper(X), *params, **kwparams)

Expand All @@ -17,6 +28,10 @@ def predict(self, X, *params, **kwparams):

try:
class RandomizedSearchCV(grid_search.RandomizedSearchCV):
def __init__(self, *args, **kwargs):
    """Warn that this shim is deprecated, then defer to the parent class.

    All positional and keyword arguments are passed through unchanged to
    ``grid_search.RandomizedSearchCV.__init__``.
    """
    # NOTE(review): scikit-learn's get_params()/clone() introspect the
    # __init__ signature and may reject varargs -- confirm this override
    # does not break cloning of the search object.
    # stacklevel=2 points the warning at the code instantiating the class
    # instead of at this module.
    warnings.warn(DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
    super(RandomizedSearchCV, self).__init__(*args, **kwargs)

def fit(self, X, *params, **kwparams):
    """Fit the parent search on ``X`` wrapped in a ``DataWrapper``.

    Extra positional and keyword arguments are forwarded untouched, and the
    parent's return value is handed back to the caller.
    """
    parent_fit = super(RandomizedSearchCV, self).fit
    return parent_fit(DataWrapper(X), *params, **kwparams)

Expand Down
25 changes: 25 additions & 0 deletions tests/test_dataframe_mapper.py
@@ -1,4 +1,5 @@
import pytest
from pkg_resources import parse_version

# In py3, mock is included with the unittest standard library
# In py2, it's a separate package
Expand All @@ -10,6 +11,8 @@
from pandas import DataFrame
import pandas as pd
from scipy import sparse
from sklearn import __version__ as sklearn_version
from sklearn.cross_validation import cross_val_score as sklearn_cv_score
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
Expand Down Expand Up @@ -277,3 +280,25 @@ def test_with_car_dataframe(cars_dataframe):
labels = cars_dataframe["model"]
scores = cross_val_score(pipeline, data, labels)
assert scores.mean() > 0.30


# The built-in marker is spelled ``skipif`` (all lowercase); ``skipIf`` is
# treated as an arbitrary custom mark and never skips anything. A boolean
# condition also needs an explicit ``reason``.
@pytest.mark.skipif(
    parse_version(sklearn_version) < parse_version('0.16'),
    reason="plain scikit-learn cross-validation only accepts DataFrames "
           "from scikit-learn 0.16 onwards")
def test_direct_cross_validation(iris_dataframe):
    """
    Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes.
    See https://github.com/paulgb/sklearn-pandas/issues/11

    The pipeline is scored with scikit-learn's own ``cross_val_score``
    (imported as ``sklearn_cv_score``), not the sklearn-pandas shim.
    """
    pipeline = Pipeline([
        ("preprocess", DataFrameMapper([
            ("petal length (cm)", None),
            ("petal width (cm)", None),
            ("sepal length (cm)", None),
            ("sepal width (cm)", None),
        ])),
        ("classify", SVC(kernel='linear'))
    ])
    data = iris_dataframe.drop("species", axis=1)
    labels = iris_dataframe["species"]
    scores = sklearn_cv_score(pipeline, data, labels)
    assert scores.mean() > 0.96
    assert (scores.std() * 2) < 0.04
25 changes: 12 additions & 13 deletions tox.ini
Expand Up @@ -3,18 +3,17 @@ envlist = py27, py34

[testenv]
deps =
pip==7.0.1
pytest==2.7.1
setuptools==16.0
wheel==0.24.0
flake8==2.4.1
py27: mock==1.3.0
pip==7.0.1
pytest==2.7.1
setuptools==16.0
wheel==0.24.0
flake8==2.4.1
py27: mock==1.3.0

commands =
pip install numpy --no-index
pip install scipy --no-index
pip install pandas --no-index
pip install scikit-learn --no-index
python setup.py develop
flake8 tests
py.test
pip install numpy --no-index
pip install scipy --no-index
pip install pandas --no-index
pip install scikit-learn --no-index
flake8 tests
py.test

0 comments on commit 85ad5df

Please sign in to comment.