MAINT compatibility scikit-learn 1.3 #999

Merged
38 commits merged on Jul 7, 2023

2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -3,7 +3,7 @@ version: 2
jobs:
doc:
docker:
- image: circleci/python:3.7.7-buster
- image: cimg/python:3.8.12
environment:
- USERNAME: "glemaitre"
- ORGANIZATION: "imbalanced-learn"
21 changes: 9 additions & 12 deletions .pre-commit-config.yaml
@@ -1,26 +1,23 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
rev: v4.3.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 23.3.0
hooks:
- id: black
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.0.272
hooks:
- id: flake8
types: [file, python]
- id: ruff
args: ["--fix", "--show-source"]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.782
rev: v1.3.0
hooks:
- id: mypy
files: sklearn/
files: imblearn/
additional_dependencies: [pytest==6.2.4]
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
- id: isort
2 changes: 1 addition & 1 deletion README.rst
@@ -29,7 +29,7 @@

.. |PythonMinVersion| replace:: 3.8
.. |NumPyMinVersion| replace:: 1.17.3
.. |SciPyMinVersion| replace:: 1.3.2
.. |SciPyMinVersion| replace:: 1.5.0
.. |ScikitLearnMinVersion| replace:: 1.0.2
.. |MatplotlibMinVersion| replace:: 3.1.2
.. |PandasMinVersion| replace:: 1.0.5
42 changes: 29 additions & 13 deletions azure-pipelines.yml
@@ -11,7 +11,7 @@ jobs:
- job: git_commit
displayName: Get Git Commit
pool:
vmImage: ubuntu-20.04
vmImage: ubuntu-22.04
steps:
- bash: |
set -ex
@@ -38,21 +38,21 @@ jobs:
)
displayName: Linting
pool:
vmImage: ubuntu-20.04
vmImage: ubuntu-22.04
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '3.9'
- bash: |
# Include pytest compatibility with mypy
pip install pytest flake8 mypy==0.782 black==22.3 isort
pip install flake8 pytest mypy==1.3.0 black==23.3 ruff==0.0.272
displayName: Install linters
- bash: |
black --check --diff .
displayName: Run black
- bash: |
isort --check --diff .
displayName: Run isort
ruff check --show-source .
displayName: Run ruff
- bash: |
./build_tools/azure/linting.sh
displayName: Run linting
@@ -63,7 +63,7 @@
- template: build_tools/azure/posix.yml
parameters:
name: Linux_Nightly
vmImage: ubuntu-20.04
vmImage: ubuntu-22.04
dependsOn: [git_commit, linting]
condition: |
and(
@@ -86,7 +86,7 @@
- template: build_tools/azure/posix.yml
parameters:
name: Linux_Runs
vmImage: ubuntu-20.04
vmImage: ubuntu-22.04
dependsOn: [git_commit]
condition: |
and(
@@ -125,7 +125,7 @@
- template: build_tools/azure/posix.yml
parameters:
name: Linux
vmImage: ubuntu-20.04
vmImage: ubuntu-22.04
dependsOn: [linting, git_commit]
condition: |
and(
@@ -144,7 +144,7 @@
THREADPOOLCTL_VERSION: 'min'
COVERAGE: 'false'
# Linux + Python 3.8 build with OpenBLAS and without SITE_JOBLIB
py38_conda_defaults_openblas:
py38_conda_conda_forge_openblas:
DISTRIB: 'conda'
CONDA_CHANNEL: 'conda-forge'
PYTHON_VERSION: '3.8'
@@ -170,6 +170,13 @@ jobs:
TEST_DOCSTRINGS: 'true'
CHECK_WARNINGS: 'false'
SKLEARN_VERSION: '1.1.3'
pylatest_pip_openblas_sklearn_intermediate_bis:
DISTRIB: 'conda-pip-latest'
PYTHON_VERSION: '3.10'
TEST_DOCS: 'true'
TEST_DOCSTRINGS: 'true'
CHECK_WARNINGS: 'false'
SKLEARN_VERSION: '1.2.2'
pylatest_pip_tensorflow:
DISTRIB: 'conda-pip-latest-tensorflow'
CONDA_CHANNEL: 'conda-forge'
@@ -263,12 +270,21 @@ jobs:
CONDA_CHANNEL: 'conda-forge'
CPU_COUNT: '3'
TEST_DOCS: 'true'
pylatest_conda_mkl_no_openmp:
# TODO: re-enable when we find out why MKL on defaults segfaults
# It seems that scikit-learn from defaults channel is built with LLVM/CLANG OMP
# while we use MKL OMP. This could be the cause of the segfaults.
# pylatest_conda_mkl_no_openmp:
# DISTRIB: 'conda'
# BLAS: 'mkl'
# SKLEARN_SKIP_OPENMP_TEST: 'true'
# CPU_COUNT: '3'
# TEST_DOCS: 'true'
conda_conda_forge_openblas:
DISTRIB: 'conda'
BLAS: 'mkl'
SKLEARN_SKIP_OPENMP_TEST: 'true'
CPU_COUNT: '3'
CONDA_CHANNEL: 'conda-forge'
BLAS: 'openblas'
TEST_DOCS: 'true'
CPU_COUNT: '3'

- template: build_tools/azure/windows.yml
parameters:
11 changes: 0 additions & 11 deletions build_tools/azure/linting.sh
@@ -4,9 +4,6 @@ set -e
# pipefail is necessary to propagate exit codes
set -o pipefail

flake8 --show-source .
echo -e "No problem detected by flake8\n"

# For docstrings and warnings of deprecated attributes to be rendered
# properly, the property decorator must come before the deprecated decorator
# (else they are treated as functions)
@@ -33,11 +30,3 @@ then
echo "$doctest_directive"
exit 1
fi

joblib_import="$(git grep -l -A 10 -E "joblib import.+delayed" -- "*.py" ":!sklearn/utils/_joblib.py" ":!sklearn/utils/fixes.py")"

if [ ! -z "$joblib_import" ]; then
echo "Use from sklearn.utils.fixes import delayed instead of joblib delayed. The following files contains imports to joblib.delayed:"
echo "$joblib_import"
exit 1
fi
3 changes: 3 additions & 0 deletions build_tools/azure/test_script.sh
@@ -12,6 +12,9 @@ mkdir -p $TEST_DIR
cp setup.cfg $TEST_DIR
cd $TEST_DIR

# python -c "import joblib; print(f'Number of cores (physical): \
# {joblib.cpu_count()} ({joblib.cpu_count(only_physical_cores=True)})')"
# python -c "import sklearn; sklearn.show_versions()"
python -c "import imblearn; imblearn.show_versions()"

if ! command -v conda &> /dev/null
13 changes: 7 additions & 6 deletions build_tools/circle/build_doc.sh
@@ -89,12 +89,13 @@ if [[ `type -t deactivate` ]]; then
deactivate
fi

# Install dependencies with miniconda
wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \
-O miniconda.sh
chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH
export PATH="$MINICONDA_PATH/bin:$PATH"
conda update --yes --quiet conda
MAMBAFORGE_PATH=$HOME/mambaforge
# Install dependencies with mamba
wget -q https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \
-O mambaforge.sh
chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MAMBAFORGE_PATH
export PATH="$MAMBAFORGE_PATH/bin:$PATH"
mamba update --yes --quiet conda

# imports get_dep
source build_tools/shared.sh
2 changes: 1 addition & 1 deletion doc/Makefile
@@ -2,7 +2,7 @@
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXOPTS = -v
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
4 changes: 0 additions & 4 deletions doc/_templates/breadcrumbs.html

This file was deleted.

16 changes: 7 additions & 9 deletions doc/common_pitfalls.rst
@@ -53,10 +53,9 @@ increase the effect of the wrongdoings::

Let's first check the balancing ratio on this dataset::

>>> y.value_counts(normalize=True)
<=50K 0.98801
>50K 0.01199
Name: class, dtype: float64
>>> from collections import Counter
>>> {key: value / len(y) for key, value in Counter(y).items()}
{'<=50K': 0.988..., '>50K': 0.011...}

To later highlight some of the issue, we will keep aside a left-out set that we
will not use for the evaluation of the model::
@@ -72,7 +71,6 @@ classifier, without any preprocessing to alleviate the bias toward the majority
class. We evaluate the generalization performance of the classifier via
cross-validation::

>>> from sklearn.experimental import enable_hist_gradient_boosting
>>> from sklearn.ensemble import HistGradientBoostingClassifier
>>> from sklearn.model_selection import cross_validate
>>> model = HistGradientBoostingClassifier(random_state=0)
@@ -130,9 +128,9 @@ cross-validation::
... f"{cv_results['test_score'].std():.3f}"
... )
Balanced accuracy mean +/- std. dev.: 0.724 +/- 0.042

The cross-validation performance looks good, but evaluating the classifiers
on the left-out data shows a different picture::

>>> scores = []
>>> for fold_id, cv_model in enumerate(cv_results["estimator"]):
@@ -147,7 +145,7 @@ on the left-out data shows a different picture::
... )
Balanced accuracy mean +/- std. dev.: 0.698 +/- 0.014

We see that the performance is now worse than the cross-validated performance.
Indeed, the data leakage gave us too optimistic results due to the reason
stated earlier in this section.
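
This pitfall is avoided by resampling inside each cross-validation fold, for
instance with an imbalanced-learn pipeline. A minimal, leak-free sketch (not
part of this diff; it reuses the imports above and assumes the training split
is named ``X_train``/``y_train``)::

    >>> from imblearn.pipeline import make_pipeline as make_imblearn_pipeline
    >>> from imblearn.under_sampling import RandomUnderSampler
    >>> model = make_imblearn_pipeline(
    ...     RandomUnderSampler(random_state=0),
    ...     HistGradientBoostingClassifier(random_state=0),
    ... )
    >>> cv_results = cross_validate(
    ...     model, X_train, y_train, scoring="balanced_accuracy",
    ...     return_estimator=True,
    ... )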

10 changes: 5 additions & 5 deletions doc/over_sampling.rst
@@ -38,10 +38,10 @@ randomly sampling with replacement the current available samples. The
The augmented data set should be used instead of the original data set to train
a classifier::

>>> from sklearn.svm import LinearSVC
>>> clf = LinearSVC()
>>> from sklearn.linear_model import LogisticRegression
>>> clf = LogisticRegression()
>>> clf.fit(X_resampled, y_resampled)
LinearSVC(...)
LogisticRegression(...)

In the figure below, we compare the decision functions of a classifier trained
using the over-sampled data set and the original data set.
@@ -108,11 +108,11 @@ the same manner::
>>> X_resampled, y_resampled = SMOTE().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]
>>> clf_smote = LinearSVC().fit(X_resampled, y_resampled)
>>> clf_smote = LogisticRegression().fit(X_resampled, y_resampled)
>>> X_resampled, y_resampled = ADASYN().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4673), (1, 4662), (2, 4674)]
>>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled)
>>> clf_adasyn = LogisticRegression().fit(X_resampled, y_resampled)

The figure below illustrates the major difference of the different
over-sampling methods.
1 change: 0 additions & 1 deletion doc/sphinxext/sphinx_issues.py
@@ -76,7 +76,6 @@ def cve_role(name, rawtext, text, lineno, inliner, options=None, content=None):


class IssueRole(object):

EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$")

def __init__(
1 change: 0 additions & 1 deletion examples/applications/plot_impact_imbalanced_classes.py
@@ -338,7 +338,6 @@
# classifier within a :class:`~imblearn.ensemble.BalancedBaggingClassifier`.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting # noqa

from imblearn.ensemble import BalancedBaggingClassifier

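Since scikit-learn 1.0, HistGradientBoostingClassifier is stable, so the
experimental import shown above could simply be dropped. A standalone sketch of
the pattern this example builds (a boosted-trees learner inside a balanced
bagging ensemble), with illustrative data and parameters; it also assumes a
recent imbalanced-learn release where the estimator parameter replaced
base_estimator:

from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

from imblearn.ensemble import BalancedBaggingClassifier

# Illustrative, heavily imbalanced data (not the dataset used in the example).
X, y = make_classification(n_samples=10_000, weights=[0.99, 0.01], random_state=42)

# Each bagging member is fitted on a balanced bootstrap drawn by the internal sampler.
model = BalancedBaggingClassifier(
    estimator=HistGradientBoostingClassifier(random_state=42),
    n_estimators=10,
    random_state=42,
)
model.fit(X, y)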
4 changes: 1 addition & 3 deletions examples/applications/plot_multi_class_under_sampling.py
@@ -43,9 +43,7 @@
print(f"Testing target statistics: {Counter(y_test)}")

# Create a pipeline
pipeline = make_pipeline(
NearMiss(version=2), StandardScaler(), LogisticRegression(random_state=RANDOM_STATE)
)
pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression())
pipeline.fit(X_train, y_train)

# Classify and report the results
4 changes: 2 additions & 2 deletions examples/combine/plot_comparison_combine.py
@@ -102,7 +102,7 @@ def plot_decision_function(X, y, clf, ax):
# :class:`~imblearn.combine.SMOTEENN` cleans more noisy data than
# :class:`~imblearn.combine.SMOTETomek`.

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from imblearn.combine import SMOTEENN, SMOTETomek

@@ -114,7 +114,7 @@ def plot_decision_function(X, y, clf, ax):

fig, axs = plt.subplots(3, 2, figsize=(15, 25))
for ax, sampler in zip(axs, samplers):
clf = make_pipeline(sampler, LinearSVC()).fit(X, y)
clf = make_pipeline(sampler, LogisticRegression()).fit(X, y)
plot_decision_function(X, y, clf, ax[0])
plot_resampling(X, y, sampler, ax[1])
fig.tight_layout()
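
To make the "cleans more noisy data" comparison above concrete, here is a small
sketch (synthetic data and illustrative values, not the example's exact dataset)
that prints the class counts left by each combined sampler:

from collections import Counter

from sklearn.datasets import make_classification

from imblearn.combine import SMOTEENN, SMOTETomek

# Illustrative three-class imbalanced problem.
X, y = make_classification(
    n_samples=5_000, n_classes=3, n_informative=4,
    weights=[0.01, 0.05, 0.94], random_state=0,
)
print("original:", sorted(Counter(y).items()))

for sampler in (SMOTEENN(random_state=0), SMOTETomek(random_state=0)):
    X_res, y_res = sampler.fit_resample(X, y)
    # SMOTEENN typically keeps fewer samples because ENN prunes more points
    # than the Tomek-links cleaning used by SMOTETomek.
    print(type(sampler).__name__, sorted(Counter(y_res).items()))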
4 changes: 2 additions & 2 deletions examples/evaluation/plot_classification_report.py
@@ -14,9 +14,9 @@


from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from imblearn import over_sampling as os
from imblearn import pipeline as pl
@@ -43,7 +43,7 @@
pipeline = pl.make_pipeline(
StandardScaler(),
os.SMOTE(random_state=RANDOM_STATE),
LinearSVC(max_iter=10_000, random_state=RANDOM_STATE),
LogisticRegression(max_iter=10_000),
)

# Split the data
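
For a runnable picture of what this example reports, a hedged sketch (dataset,
split and variable names are illustrative, not the file's exact code) built
around classification_report_imbalanced:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

X, y = make_classification(n_samples=2_000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = make_pipeline(
    StandardScaler(), SMOTE(random_state=0), LogisticRegression(max_iter=10_000)
)
model.fit(X_train, y_train)

# The imbalanced-learn report adds specificity, geometric mean and index
# balanced accuracy columns to the usual precision/recall/f1 table.
print(classification_report_imbalanced(y_test, model.predict(X_test)))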
7 changes: 4 additions & 3 deletions examples/evaluation/plot_metrics.py
@@ -48,10 +48,11 @@

# %% [markdown]
# We will create a pipeline made of a :class:`~imblearn.over_sampling.SMOTE`
# over-sampler followed by a :class:`~sklearn.svm.LinearSVC` classifier.
# over-sampler followed by a :class:`~sklearn.linear_model.LogisticRegression`
# classifier.

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from imblearn.over_sampling import SMOTE

@@ -61,7 +62,7 @@
model = make_pipeline(
StandardScaler(),
SMOTE(random_state=RANDOM_STATE),
LinearSVC(max_iter=10_000, random_state=RANDOM_STATE),
LogisticRegression(max_iter=10_000, random_state=RANDOM_STATE),
)

# %% [markdown]
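
Finally, a hedged sketch of imbalanced-learn's dedicated metrics, the kind of
evaluation an example like this typically performs (variable names and data are
illustrative; the example's own evaluation code is not shown in this diff):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from imblearn.metrics import (
    geometric_mean_score,
    sensitivity_score,
    specificity_score,
)

# Illustrative binary problem.
X, y = make_classification(n_samples=2_000, weights=[0.95, 0.05], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_pred = LogisticRegression(max_iter=10_000).fit(X_train, y_train).predict(X_test)

print(f"Sensitivity:    {sensitivity_score(y_test, y_pred):.3f}")
print(f"Specificity:    {specificity_score(y_test, y_pred):.3f}")
print(f"Geometric mean: {geometric_mean_score(y_test, y_pred):.3f}")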