[MRG] PyPy support #11010

Merged 64 commits on Jul 20, 2018
Changes from 52 commits
6e2f4df
Disable Cython modules using array.array and reimplement in pure Python
rlamy Apr 22, 2018
13c2469
Hack all_estimators() to avoid the modules that are broken on PyPy
rlamy Apr 22, 2018
be622fa
PyPy CI setup
rth Apr 22, 2018
5d7d25d
PyPy specific copy of _hashing.pyx and _svmlight_format.pyx
rth Apr 22, 2018
2857205
Fix _load_svmlight_file failures
rth Apr 22, 2018
cb02f29
Fix imports in _svmlight_format_pypy.py
rth Apr 22, 2018
cb56013
Use pypy2.7-5.10.0 on Travis CI
rth Apr 22, 2018
7f381fc
Use scipy==1.1.0rc1 in Travis CI
rth Apr 22, 2018
40b12e7
Update pip on Travis CI
rth Apr 22, 2018
f482cee
Install BLAS on Travis CI
rth Apr 22, 2018
85e4ee7
Change pip install verbosity to prevent CI timeout
rth Apr 23, 2018
dd3647d
Use pypy-wheels to speed up numpy install
rlamy Apr 23, 2018
7222ce7
Run PyPy tests on CircleCI
rth Apr 23, 2018
11cc6fb
Fix error in CircleCI
rth Apr 23, 2018
f86c60e
Another attempt to fix CircleCI
rth Apr 23, 2018
3d99fca
Factorize PyPy CI installation setup
rth Apr 23, 2018
37a0294
chmod a+x build_tools/circle/install_pypy.sh
rth Apr 23, 2018
8e90d56
Use the correct python executable
rth Apr 23, 2018
275c6d7
Install pytest
rth Apr 23, 2018
7e80717
Merge branch 'master' into pypy-testing
rth Apr 23, 2018
46878e2
Fix merge conflict resolution issue
rth Apr 24, 2018
4b239d6
Add __init__ to KernelCenterer
rth Apr 24, 2018
c4c2995
Use PyPy 6.0
rth Apr 27, 2018
456f94d
Skip standard _svmlight_format and _hashing imports tests
rth Apr 27, 2018
1b95ac9
Replace array.resize by np.resize in gradient boosting
rth Apr 27, 2018
02454b9
Revert CI
rth May 2, 2018
9a5d71e
Use a virtualenv in CI
rth May 2, 2018
5faf55e
Fix test_svmlight_format.py::test_load_compressed
rth May 2, 2018
b5fbe28
Fix check_no_attributes_set_in_init
rth May 2, 2018
5958a23
Disable PyPy2 in Circle CI
rth Jun 3, 2018
ef14a60
Merge branch 'master' into pypy-testing
rth Jun 3, 2018
2b98923
Remove duplicate pure python files and mark corresponding tests as xfail
rth Jun 3, 2018
0bc6753
Use scipy 1.1.0 for PyPy and fix PEP8
rth Jun 3, 2018
a300228
Use pre-compiled scipy for PyPy
rth Jun 3, 2018
9c29561
Use quotes for pip version specifications
rth Jun 3, 2018
ac8c286
Skip the check that segfaults with RO memmaps in AgglomerativeClustering
rth Jun 3, 2018
2faea96
Revert "Skip the check that segfaults with RO memmaps in Agglomerativ…
rth Jun 4, 2018
006ecc0
Revert to building scipy from sources
rth Jun 4, 2018
218c529
Use pytestmark to mark modules as xfail on PyPy
rth Jun 9, 2018
3634eaf
TST Check PyPy3 5.10
rth Jun 9, 2018
695585b
TST Check PyPy + numpy 1.13.3 and scipy 1.1.0
rth Jun 9, 2018
a7dd224
TST Check PyPy + numpy 1.14.4 and scipy 1.0.0
rth Jun 9, 2018
4636266
TST Check PyPy + numpy 1.14.0 and scipy 1.0.0
rth Jun 11, 2018
f3007a3
Add what's new
rth Jun 11, 2018
eb85b96
Remove Tempita from build_test_pypy.sh
rth Jun 12, 2018
9cc0970
Use scipy 1.1.0+ as minimal dependency with PyPy
rth Jun 13, 2018
7cdc822
Address Joel's comments (documentation and packaging)
rth Jun 13, 2018
0d971d4
Also run test-doc and test-sphinxext
rth Jun 14, 2018
6c95101
Merge branch 'master' into pypy-testing
rth Jun 14, 2018
5702aac
Add missing packages for doctests
rth Jun 14, 2018
82302d6
Skip tutorial/text_analytics/working_with_text_data.rst on PyPy
rth Jun 14, 2018
dbc192c
Merge branch 'master' into pypy-testing
rth Jul 15, 2018
4931d94
Olivier's comments
rth Jul 16, 2018
47f9f76
Cache pip and use ccache
rth Jul 16, 2018
b538e8e
Fix NUMPY_MIN_VERSION for PyPy
rth Jul 16, 2018
122f38b
Fix skipped modules in setup.py
rth Jul 17, 2018
71ec01c
Fix circle ci
rth Jul 17, 2018
9f9aebd
PEP8 and remove redundant line in setup.py
rth Jul 17, 2018
33f2f3a
Actually install ccache
rth Jul 17, 2018
66fcbe7
Update FAQ entry
rth Jul 17, 2018
40ada79
Improve FAQ entry formulation
rth Jul 17, 2018
b71e79f
Merge branch 'master' into pypy-testing
rth Jul 17, 2018
6390cf6
Merge branch 'master' into pypy-testing
rth Jul 19, 2018
ae76d01
PEP8 after merge
rth Jul 19, 2018
8 changes: 8 additions & 0 deletions .circleci/config.yml
@@ -65,6 +65,13 @@ jobs:
path: ~/log.txt
destination: log.txt

pypy3:
docker:
- image: pypy:3-6.0.0
steps:
- checkout
- run: ./build_tools/circle/build_test_pypy.sh

deploy:
docker:
- image: circleci/python:3.6.1
@@ -88,6 +95,7 @@ workflows:
jobs:
- python3
- python2
- pypy3
- deploy:
requires:
- python3
25 changes: 25 additions & 0 deletions build_tools/circle/build_test_pypy.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -x
set -e

apt-get -yq update
apt-get -yq install libatlas-dev libatlas-base-dev liblapack-dev gfortran

pip install virtualenv

if command -v pypy3; then
virtualenv -p $(command -v pypy3) pypy-env
elif command -v pypy; then
virtualenv -p $(command -v pypy) pypy-env
fi

source pypy-env/bin/activate

python --version
which python

pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy==1.14.4 Cython pytest
pip install "scipy>=1.1.0" sphinx numpydoc docutils
pip install -e .

make test
10 changes: 10 additions & 0 deletions conftest.py
@@ -5,13 +5,23 @@
# doc/modules/clustering.rst and use sklearn from the local folder rather than
# the one from site-packages.

import sys
from distutils.version import LooseVersion

import pytest
from _pytest.doctest import DoctestItem


def pytest_collection_modifyitems(config, items):

# FeatureHasher is not compatible with PyPy
if '__pypy__' in sys.modules:
Member:
platform.python_implementation() == 'PyPy'

skip_marker = pytest.mark.skip(
reason='FeatureHasher is not compatible with PyPy')
for item in items:
if item.name == 'sklearn.feature_extraction.hashing.FeatureHasher':
item.add_marker(skip_marker)

# numpy changed the str/repr formatting of numpy arrays in 1.14. We want to
# run doctests only for numpy >= 1.14.
skip_doctests = True
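The interpreter check used in this conftest.py hunk is easy to reproduce outside scikit-learn; the review comment above suggests `platform.python_implementation()` as an equivalent, arguably cleaner test. A minimal sketch of both checks, which should agree on any interpreter:

```python
import platform
import sys

# The check used in the PR's conftest.py: the '__pypy__' module
# only exists on the PyPy interpreter.
is_pypy = '__pypy__' in sys.modules

# The alternative suggested in review; equivalent on CPython and PyPy.
is_pypy_alt = platform.python_implementation() == 'PyPy'
```

The `sys.modules` form has the advantage of working in `setup.py` before any third-party imports are available.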
6 changes: 6 additions & 0 deletions doc/conftest.py
@@ -1,8 +1,10 @@
import os
from os.path import exists
from os.path import join

import numpy as np

from sklearn.utils import IS_PYPY
from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import check_skip_network
from sklearn.datasets import get_data_home
@@ -55,6 +57,8 @@ def setup_twenty_newsgroups():


def setup_working_with_text_data():
if IS_PYPY and os.environ.get('CI', None):
raise SkipTest('Skipping too slow test with PyPy on CI')
check_skip_network()
cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
if not exists(cache_path):
@@ -91,6 +95,8 @@ def pytest_runtest_setup(item):
setup_working_with_text_data()
elif fname.endswith('modules/compose.rst') or is_index:
setup_compose()
elif IS_PYPY and fname.endswith('modules/feature_extraction.rst'):
raise SkipTest('FeatureHasher is not compatible with PyPy')
elif fname.endswith('modules/impute.rst'):
setup_impute()

6 changes: 6 additions & 0 deletions doc/developers/advanced_installation.rst
@@ -38,6 +38,12 @@ Scikit-learn requires:
- NumPy (>= 1.8.2),
- SciPy (>= 0.13.3).

.. note::

For installing on PyPy, PyPy3-v5.10+, Numpy 1.14.0+, and scipy 1.1.0+
are required. For PyPy, only installation instructions with pip apply.


Building Scikit-learn also requires

- Cython >=0.23
2 changes: 1 addition & 1 deletion doc/developers/contributing.rst
@@ -352,7 +352,7 @@ and Cython optimizations.

* Travis is used for testing on Linux platforms
* Appveyor is used for testing on Windows platforms
* CircleCI is used to build the docs for viewing
* CircleCI is used to build the docs for viewing and for testing with PyPy on Linux

Please note that if one of the following markers appear in the latest commit
message, the following actions are taken.
6 changes: 6 additions & 0 deletions doc/install.rst
@@ -46,6 +46,12 @@ it as ``scikit-learn[alldeps]``. The most common use case for this is in a
application or a Docker image. This option is not intended for manual
installation from the command line.

.. note::

For installing on PyPy, PyPy3-v5.10+, Numpy 1.14.0+, and scipy 1.1.0+
are required.


For installation instructions for more distributions see
:ref:`other distributions <install_by_distribution>`.
For compiling the development version from source, or building the package
6 changes: 6 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -408,6 +408,12 @@ Miscellaneous
:issue:`9101` by :user:`alex-33 <alex-33>`
and :user:`Maskani Filali Mohamed <maskani-moh>`.

- Add almost complete PyPy 3 support. Known unsupported functionalities are
:func:`datasets.load_svmlight_file`, :class:`feature_extraction.FeatureHasher` and
:class:`feature_extraction.text.HashingVectorizer`. For running on PyPy, PyPy3-v5.10+,
Numpy 1.14.0+, and scipy 1.1.0+ are required.
:issue:`11010` by :user:`Ronan Lamy <rlamy>` and `Roman Yurchak`_.

Bug fixes
.........

12 changes: 10 additions & 2 deletions setup.py
@@ -41,8 +41,12 @@

VERSION = sklearn.__version__

SCIPY_MIN_VERSION = '0.13.3'
NUMPY_MIN_VERSION = '1.8.2'
if '__pypy__' in sys.modules:
SCIPY_MIN_VERSION = '1.1.0'
NUMPY_MIN_VERSION = '1.4.0'
@pv (Contributor), Jul 16, 2018:
1.14.0?

else:
SCIPY_MIN_VERSION = '0.13.3'
NUMPY_MIN_VERSION = '1.8.2'


# Optional setuptools features
@@ -185,6 +189,10 @@ def setup_package():
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
('Programming Language :: Python :: '
'Implementation :: CPython'),
('Programming Language :: Python :: '
'Implementation :: PyPy')
],
cmdclass=cmdclass,
install_requires=[
8 changes: 5 additions & 3 deletions sklearn/datasets/setup.py
@@ -1,6 +1,7 @@

import numpy
import os
import sys


def configuration(parent_package='', top_path=None):
@@ -10,9 +11,10 @@ def configuration(parent_package='', top_path=None):
config.add_data_dir('descr')
config.add_data_dir('images')
config.add_data_dir(os.path.join('tests', 'data'))
config.add_extension('_svmlight_format',
sources=['_svmlight_format.pyx'],
include_dirs=[numpy.get_include()])
if '__pypy__' not in sys.modules:
config.add_extension('_svmlight_format',
sources=['_svmlight_format.pyx'],
include_dirs=[numpy.get_include()])
config.add_subpackage('tests')
return config

10 changes: 8 additions & 2 deletions sklearn/datasets/svmlight_format.py
@@ -22,12 +22,18 @@
import numpy as np
import scipy.sparse as sp

from ._svmlight_format import _load_svmlight_file
from .. import __version__
from ..externals import six
from ..externals.six import u, b
from ..externals.six.moves import range, zip
from ..utils import check_array
from ..utils import check_array, IS_PYPY

if not IS_PYPY:
from ._svmlight_format import _load_svmlight_file
else:
def _load_svmlight_file(*args, **kwargs):
raise NotImplementedError('load_svmlight_file is currently not '
'compatible with PyPy.')


def load_svmlight_file(f, n_features=None, dtype=np.float64,
9 changes: 7 additions & 2 deletions sklearn/datasets/tests/test_svmlight_format.py
@@ -18,6 +18,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import fails_if_pypy
from sklearn.utils.fixes import sp_version

import sklearn
@@ -30,6 +31,8 @@
invalidfile = os.path.join(currdir, "data", "svmlight_invalid.txt")
invalidfile2 = os.path.join(currdir, "data", "svmlight_invalid_order.txt")

pytestmark = fails_if_pypy


def test_load_svmlight_file():
X, y = load_svmlight_file(datafile)
@@ -119,7 +122,8 @@ def test_load_compressed():
with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp:
tmp.close() # necessary under windows
with open(datafile, "rb") as f:
shutil.copyfileobj(f, gzip.open(tmp.name, "wb"))
with gzip.open(tmp.name, "wb") as fh_out:
shutil.copyfileobj(f, fh_out)
Member:
For the record, not using a context manager here results in an empty file when load_svmlight_file is called. Probably due to GC differences,

For files that are opened for writing, data can be left sitting in their output buffers for a while, making the on-disk file appear empty or truncated.

Xgz, ygz = load_svmlight_file(tmp.name)
# because we "close" it manually and write to it,
# we need to remove it manually.
@@ -130,7 +134,8 @@
with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp:
tmp.close() # necessary under windows
with open(datafile, "rb") as f:
shutil.copyfileobj(f, BZ2File(tmp.name, "wb"))
with BZ2File(tmp.name, "wb") as fh_out:
shutil.copyfileobj(f, fh_out)
Xbz, ybz = load_svmlight_file(tmp.name)
# because we "close" it manually and write to it,
# we need to remove it manually.
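The context-manager fix in this hunk matters because data written to a compressed stream can sit in its output buffers until the handle is closed; on PyPy, where finalizers run late, the file the reader sees may be empty or truncated. A minimal round-trip sketch (file names hypothetical):

```python
import gzip
import os
import shutil
import tempfile

payload = b"1 1:2.5 2:-1\n"

tmpdir = tempfile.mkdtemp()
src = os.path.join(tmpdir, "data.txt")
dst = os.path.join(tmpdir, "data.txt.gz")
with open(src, "wb") as f:
    f.write(payload)

# Closing the gzip handle (here via the context manager) flushes its
# buffers and writes the gzip trailer. Relying on garbage collection to
# close it, as the old code did, is only safe under CPython's prompt
# refcounting, not under PyPy's deferred GC.
with open(src, "rb") as f_in, gzip.open(dst, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

with gzip.open(dst, "rb") as f:
    roundtrip = f.read()

shutil.rmtree(tmpdir)
```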
8 changes: 5 additions & 3 deletions sklearn/ensemble/gradient_boosting.py
@@ -931,12 +931,14 @@ def _resize_state(self):
raise ValueError('resize with smaller n_estimators %d < %d' %
(total_n_estimators, self.estimators_[0]))

self.estimators_.resize((total_n_estimators, self.loss_.K))
self.train_score_.resize(total_n_estimators)
self.estimators_ = np.resize(self.estimators_,
(total_n_estimators, self.loss_.K))
@rth (Member), Jun 3, 2018:

I'm not sure the difference here between numpy.ndarray.resize and numpy.resize matters, particularly with respect to performance and

[numpy.resize docstring]
If the new array is larger than the original array, then the new array is filled with repeated copies of a. Note that this behavior is different from a.resize(new_shape) which fills with zeros instead of repeated copies of a.

Maybe @amueller or @glemaitre would have an opinion?

ndarray.reshape ndarray.resize is not supported on PyPy..

Comment:

Your comment mixes up reshape and resize... AFAIK reshape is totally supported on PyPy. resize is also supported, for the common case where the new shape has the same total size as the old shape. But if you try change the size of an array, that requires reallocated the underlying data, which is a pretty delicate operation – it swaps out the actual memory buffer underlying the array, so if there are any views on the array, they'll end up pointing into free'd memory and causing a segfault or worse.

Probably numpy should never have supported such a dangerous operation in the first place, but these things happen...

On CPython, numpy can use the refcount to check if the operation is even possibly-maybe safe, and only allows it if there are no other references to the array. On PyPy, there are no refcounts, so numpy is conservative and assumes views might exist.

If you're really sure that this is what you want to do, and are prepared to accept the risks, then you can pass refcheck=False. If you know that no-one is holding a view on self.estimators_, and that this will always be true in the future, then this should be safe... and if CPython would have allowed the operation, then passing refcheck=False on PyPy shouldn't be any less safe.

@rth (Member), Jun 6, 2018:

Thanks for the insightful comment!

Your comment mixes up reshape and resize

Yes, definitely I meant resize in my comment (edited it).

self.train_score_ = np.resize(self.train_score_, total_n_estimators)
if (self.subsample < 1 or hasattr(self, 'oob_improvement_')):
# if do oob resize arrays or create new if not available
if hasattr(self, 'oob_improvement_'):
self.oob_improvement_.resize(total_n_estimators)
self.oob_improvement_ = np.resize(self.oob_improvement_,
total_n_estimators)
else:
self.oob_improvement_ = np.zeros((total_n_estimators,),
dtype=np.float64)
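The behavioural difference discussed in the review thread can be seen in a few lines: `np.resize` returns a new array padded with repeated copies of the input, while `ndarray.resize` grows in place, pads with zeros, and refuses to run when other references might exist unless `refcheck=False` is passed — the safety check PyPy cannot perform, since it has no refcounts.

```python
import numpy as np

a = np.arange(4)
grown = np.resize(a, 6)      # new array, padded with repeated copies of a

b = np.arange(4)
b.resize(6, refcheck=False)  # in-place, padded with zeros; refcheck=False
# skips the reference-count check that always fails on PyPy, at the cost
# of segfault risk if a view on b exists — which is why the PR switched
# to the copying np.resize instead.
```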
10 changes: 8 additions & 2 deletions sklearn/feature_extraction/hashing.py
@@ -7,9 +7,15 @@
import numpy as np
import scipy.sparse as sp

from . import _hashing
from ..utils import IS_PYPY
from ..base import BaseEstimator, TransformerMixin

if not IS_PYPY:
from ._hashing import transform as _hashing_transform
else:
def _hashing_transform(*args, **kwargs):
raise NotImplementedError('FeatureHasher is not compatible with PyPy.')
Member:
I think it would help if this error message could also include the URL of a github issue that explains why this cannot be currently be implemented with the current versions of PyPy & Cython.



def _iteritems(d):
"""Like d.iteritems, but accepts any collections.Mapping."""
@@ -155,7 +161,7 @@ def transform(self, raw_X):
elif self.input_type == "string":
raw_X = (((f, 1) for f in x) for x in raw_X)
indices, indptr, values = \
_hashing.transform(raw_X, self.n_features, self.dtype,
_hashing_transform(raw_X, self.n_features, self.dtype,
self.alternate_sign)
n_samples = indptr.shape[0] - 1

10 changes: 6 additions & 4 deletions sklearn/feature_extraction/setup.py
@@ -1,4 +1,5 @@
import os
import sys


def configuration(parent_package='', top_path=None):
@@ -10,10 +11,11 @@ def configuration(parent_package='', top_path=None):
if os.name == 'posix':
libraries.append('m')

config.add_extension('_hashing',
sources=['_hashing.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
if "__pypy__" not in sys.modules:
Member:
should this be IS_PYPY

Member:

oh wait this is setup, never mind...

Member:

Still, the following is cleaner:

platform.python_implementation() == 'PyPy'

config.add_extension('_hashing',
sources=['_hashing.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage("tests")

return config
4 changes: 3 additions & 1 deletion sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -5,7 +5,9 @@

from sklearn.feature_extraction import FeatureHasher
from sklearn.utils.testing import (assert_raises, assert_true, assert_equal,
ignore_warnings)
ignore_warnings, fails_if_pypy)

pytestmark = fails_if_pypy


def test_feature_hasher_dicts():
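A module-level `pytestmark` assignment, as in this test file, applies its marker to every test collected from the module. `fails_if_pypy` is presumably an `xfail` marker conditioned on the interpreter; a self-contained approximation (the real helper lives in `sklearn.utils.testing`):

```python
import platform

import pytest

# Rough stand-in for sklearn.utils.testing.fails_if_pypy (assumption):
# mark tests as expected failures only when running under PyPy.
fails_if_pypy = pytest.mark.xfail(
    platform.python_implementation() == 'PyPy',
    reason='not compatible with PyPy')

# Assigning to the special name `pytestmark` applies the marker to
# every test function and class in this module.
pytestmark = fails_if_pypy
```

Using `xfail` rather than `skip` means the tests still run, so an accidental fix or regression under PyPy shows up as XPASS/XFAIL in the report.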
16 changes: 12 additions & 4 deletions sklearn/feature_extraction/tests/test_text.py
@@ -26,12 +26,13 @@
import numpy as np
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal
from sklearn.utils import IS_PYPY
from sklearn.utils.testing import (assert_equal, assert_false, assert_true,
assert_not_equal, assert_almost_equal,
assert_in, assert_less, assert_greater,
assert_warns_message, assert_raise_message,
clean_warning_registry, ignore_warnings,
SkipTest, assert_raises,
SkipTest, assert_raises, fails_if_pypy,
assert_allclose_dense_sparse)
from sklearn.utils.fixes import _Mapping as Mapping
from collections import defaultdict
@@ -502,6 +503,7 @@ def test_tfidf_vectorizer_setters():
assert_true(tv._tfidf.sublinear_tf)


@fails_if_pypy
@ignore_warnings(category=DeprecationWarning)
def test_hashing_vectorizer():
v = HashingVectorizer()
@@ -684,6 +686,7 @@ def test_count_binary_occurrences():
assert_equal(X_sparse.dtype, np.float32)


@fails_if_pypy
@ignore_warnings(category=DeprecationWarning)
def test_hashed_binary_occurrences():
# by default multiple occurrences are counted as longs
@@ -819,6 +822,7 @@ def test_vectorizer_pipeline_cross_validation():
assert_array_equal(cv_scores, [1., 1., 1.])


@fails_if_pypy
@ignore_warnings(category=DeprecationWarning)
def test_vectorizer_unicode():
# tests that the count vectorizer works with cyrillic.
@@ -886,9 +890,12 @@ def test_pickling_vectorizer():
copy = pickle.loads(s)
assert_equal(type(copy), orig.__class__)
assert_equal(copy.get_params(), orig.get_params())
assert_array_equal(
copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
orig.fit_transform(JUNK_FOOD_DOCS).toarray())
if IS_PYPY and isinstance(orig, HashingVectorizer):
continue
else:
assert_array_equal(
copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
orig.fit_transform(JUNK_FOOD_DOCS).toarray())


def test_countvectorizer_vocab_sets_when_pickling():
@@ -990,6 +997,7 @@ def test_non_unique_vocab():
assert_raises(ValueError, vect.fit, [])


@fails_if_pypy
def test_hashingvectorizer_nan_in_docs():
# np.nan can appear when using pandas to load text fields from a csv file
# with missing values.