Merge pull request #48 from paulgb/deprecate-cv

dukebody · dukebody · commit 85ad5dfea787 · 2016-01-16T11:39:25.000+01:00
Deprecate custom CV shims in documentation and code. Refs #11.
diff --git a/README.rst b/README.rst
@@ -7,7 +7,7 @@ This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/sta
 In particular, it provides:
 
 1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
-2. A way to cross-validate a pipeline that takes a pandas ``DataFrame`` as input.
+2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
 
 Installation
 ------------
@@ -174,7 +174,7 @@ The stacking of the sparse features is done without ever densifying them.
 Cross-Validation
 ----------------
 
-Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. Scikit-learn provides features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
+Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expected numpy data structures and did not work with ``DataFrameMapper``.
 
 To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::
 
@@ -190,6 +190,12 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
 Changelog
 ---------
 
+Development
+***********
+
+* Deprecate custom cross-validation shim classes.
+
+
 1.1.0 (2015-12-06)
 *******************
 
diff --git a/sklearn_pandas/cross_validation.py b/sklearn_pandas/cross_validation.py
@@ -1,13 +1,33 @@
+import warnings
 from sklearn import cross_validation
 from sklearn import grid_search
 
+# Shared warning text for every deprecated shim defined in this module.
+DEPRECATION_MSG = '''
+    Custom cross-validation compatibility shims are no longer needed for
+    scikit-learn>=0.16.0 and will be dropped in sklearn-pandas==2.0.
+'''
+
 
 def cross_val_score(model, X, *args, **kwargs):
+    """
+    Deprecated shim around ``sklearn.cross_validation.cross_val_score``.
+
+    Wraps ``X`` in a ``DataWrapper`` and forwards all other arguments
+    unchanged, emitting a ``DeprecationWarning`` first.
+    """
+    # stacklevel=2 attributes the warning to the caller, not to this shim.
+    warnings.warn(DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
     X = DataWrapper(X)
     return cross_validation.cross_val_score(model, X, *args, **kwargs)
 
 
 class GridSearchCV(grid_search.GridSearchCV):
+    def __init__(self, *args, **kwargs):
+        # stacklevel=2 attributes the warning to the instantiating code.
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
+        super(GridSearchCV, self).__init__(*args, **kwargs)
+
     def fit(self, X, *params, **kwparams):
         return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
 
@@ -17,6 +37,11 @@ def predict(self, X, *params, **kwparams):
 
 try:
     class RandomizedSearchCV(grid_search.RandomizedSearchCV):
+        def __init__(self, *args, **kwargs):
+            # stacklevel=2 attributes the warning to the instantiating code.
+            warnings.warn(DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
+            super(RandomizedSearchCV, self).__init__(*args, **kwargs)
+
         def fit(self, X, *params, **kwparams):
             return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
 
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -1,4 +1,5 @@
 import pytest
+from pkg_resources import parse_version
 
 # In py3, mock is included with the unittest standard library
 # In py2, it's a separate package
@@ -10,6 +11,8 @@
 from pandas import DataFrame
 import pandas as pd
 from scipy import sparse
+from sklearn import __version__ as sklearn_version
+from sklearn.cross_validation import cross_val_score as sklearn_cv_score
 from sklearn.datasets import load_iris
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
@@ -277,3 +280,29 @@ def test_with_car_dataframe(cars_dataframe):
     labels = cars_dataframe["model"]
     scores = cross_val_score(pipeline, data, labels)
     assert scores.mean() > 0.30
+
+
+@pytest.mark.skipif(
+    parse_version(sklearn_version) < parse_version('0.16'),
+    reason="scikit-learn<0.16 cannot cross-validate DataFrames directly",
+)
+def test_direct_cross_validation(iris_dataframe):
+    """
+    Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes.
+    See https://github.com/paulgb/sklearn-pandas/issues/11
+    """
+    pipeline = Pipeline([
+        ("preprocess", DataFrameMapper([
+            ("petal length (cm)", None),
+            ("petal width (cm)", None),
+            ("sepal length (cm)", None),
+            ("sepal width (cm)", None),
+        ])),
+        ("classify", SVC(kernel='linear'))
+    ])
+    data = iris_dataframe.drop("species", axis=1)
+    labels = iris_dataframe["species"]
+    # Call scikit-learn's own cross_val_score, not the sklearn-pandas shim.
+    scores = sklearn_cv_score(pipeline, data, labels)
+    assert scores.mean() > 0.96
+    assert (scores.std() * 2) < 0.04
diff --git a/tox.ini b/tox.ini
@@ -3,18 +3,17 @@ envlist = py27, py34
 
 [testenv]
 deps =
-     pip==7.0.1
-     pytest==2.7.1
-     setuptools==16.0
-     wheel==0.24.0
-     flake8==2.4.1
-     py27: mock==1.3.0
+    pip==7.0.1
+    pytest==2.7.1
+    setuptools==16.0
+    wheel==0.24.0
+    flake8==2.4.1
+    py27: mock==1.3.0
 
 commands =
-         pip install numpy --no-index
-         pip install scipy --no-index
-         pip install pandas --no-index
-         pip install scikit-learn --no-index
-         python setup.py develop
-         flake8 tests
-         py.test
+    pip install numpy --no-index
+    pip install scipy --no-index
+    pip install pandas --no-index
+    pip install scikit-learn --no-index
+    flake8 tests
+    py.test