From 729386e1c5385b9b4344c16867915b931a18d64b Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 20:22:00 +0200 Subject: [PATCH 1/8] REF: speed-up describe using numba --- momepy/functional/_diversity.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index deac7176..0d881c31 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -4,7 +4,10 @@ from pandas import DataFrame, Series from scipy import stats -from ..utils import limit_range +try: + from numba import njit +except ModuleNotFoundError: + from libpysal.common import jit as njit __all__ = ["describe"] @@ -47,9 +50,30 @@ def describe( A DataFrame with descriptive statistics. """ + @njit + def _mode(array): + """Custom mode function for numba.""" + array = np.sort(array.ravel()) + mask = np.empty(array.shape, dtype=np.bool_) + mask[:1] = True + mask[1:] = array[1:] != array[:-1] + unique = array[mask] + idx = np.nonzero(mask)[0] + idx = np.append(idx, mask.size) + counts = np.diff(idx) + return unique[np.argmax(counts)] + + @njit def _describe(values, q, include_mode=False): """Helper function to calculate average.""" - values = limit_range(values.values, q) + nan_tracker = np.isnan(values) + + if (len(values) > 2) and (not nan_tracker.all()): + if nan_tracker.any(): + lower, higher = np.nanpercentile(values, q) + else: + lower, higher = np.percentile(values, q) + values = values[(lower <= values) & (values <= higher)] results = [ np.mean(values), @@ -60,7 +84,7 @@ def _describe(values, q, include_mode=False): np.sum(values), ] if include_mode: - results.append(stats.mode(values, keepdims=False)[0]) + results.append(_mode(values)) return results if not isinstance(y, Series): @@ -75,7 +99,7 @@ def _describe(values, q, include_mode=False): if include_mode: stat_["mode"] = grouper.agg(lambda x: stats.mode(x, keepdims=False)[0]) else: - agg = 
grouper.agg(_describe, q=q, include_mode=include_mode) + agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode)) stat_ = DataFrame(zip(*agg, strict=True)).T cols = ["mean", "median", "std", "min", "max", "sum"] if include_mode: From ada1ac58bc4d3573dd4c9bdb12345a94d97b3d6f Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 20:35:57 +0200 Subject: [PATCH 2/8] remove scipy mode --- momepy/functional/_diversity.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index 0d881c31..530da630 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -2,7 +2,6 @@ from libpysal.graph import Graph from numpy.typing import NDArray from pandas import DataFrame, Series -from scipy import stats try: from numba import njit @@ -97,7 +96,7 @@ def _describe(values, q, include_mode=False): if q is None: stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"]) if include_mode: - stat_["mode"] = grouper.agg(lambda x: stats.mode(x, keepdims=False)[0]) + stat_["mode"] = grouper.agg(lambda x: _mode(x.values)) else: agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode)) stat_ = DataFrame(zip(*agg, strict=True)).T From 8d0d1d546ee02b0711b7b44f1775d3f9ab7c15d2 Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 22:08:55 +0200 Subject: [PATCH 3/8] warn about missing numba --- momepy/functional/_diversity.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index 530da630..5ee80bf9 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -1,13 +1,10 @@ +import warnings + import numpy as np from libpysal.graph import Graph from numpy.typing import NDArray from pandas import DataFrame, Series -try: - from numba import njit -except ModuleNotFoundError: 
- from libpysal.common import jit as njit - __all__ = ["describe"] @@ -28,6 +25,9 @@ def describe( The index of ``values`` must match the index along which the ``graph`` is built. + The numba package is used extensively in this function to accelerate the computation + of statistics. Without numba, these computations may become slow on large data. + Parameters ---------- y : NDArray[np.float_] | Series @@ -48,6 +48,17 @@ def describe( DataFrame A DataFrame with descriptive statistics. """ + try: + from numba import njit + except (ModuleNotFoundError, ImportError): + warnings.warn( + "The numba package is used extensively in this function to accelerate the" + " computation of statistics but it is not installed or cannot be imported." + " Without numba, these computations may become slow on large data.", + UserWarning, + stacklevel=2, + ) + from libpysal.common import jit as njit @njit def _mode(array): From 0ebed9739fb2330a6beefce5fa1ad8e6e71a56bb Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 22:09:11 +0200 Subject: [PATCH 4/8] test envs with and without numba --- .github/workflows/tests.yaml | 4 +++- ci/envs/310-latest.yaml | 1 + ci/envs/310-oldest.yaml | 1 + ci/envs/311-latest.yaml | 1 + ci/envs/312-dev.yaml | 1 + ci/envs/312-latest.yaml | 1 + ci/envs/312-min.yaml | 32 ++++++++++++++++++++++++++++++++ 7 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 ci/envs/312-min.yaml diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 8e46afc5..f3187e26 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -31,11 +31,13 @@ jobs: - ci/envs/312-dev.yaml include: - environment-file: ci/envs/312-latest.yaml - os: macos-latest + os: macos-13 # Intel - environment-file: ci/envs/312-latest.yaml os: macos-14 # Apple Silicon - environment-file: ci/envs/312-latest.yaml os: windows-latest + - environment-file: ci/envs/312-min.yaml + os: ubuntu-latest defaults: run: shell: bash -l {0} diff 
--git a/ci/envs/310-latest.yaml b/ci/envs/310-latest.yaml index ede9f2d2..d9bbeb39 100644 --- a/ci/envs/310-latest.yaml +++ b/ci/envs/310-latest.yaml @@ -13,6 +13,7 @@ dependencies: - shapely>=2 - esda - tqdm + - numba # testing - codecov - pytest diff --git a/ci/envs/310-oldest.yaml b/ci/envs/310-oldest.yaml index b546ebe1..b5d6077b 100644 --- a/ci/envs/310-oldest.yaml +++ b/ci/envs/310-oldest.yaml @@ -13,6 +13,7 @@ dependencies: - pandas>=1.4.0,!=1.5.0,<2 - shapely=2.0 - tqdm=4.63.0 + - numba # testing - codecov - pytest diff --git a/ci/envs/311-latest.yaml b/ci/envs/311-latest.yaml index 57d4feb9..5e70a7ae 100644 --- a/ci/envs/311-latest.yaml +++ b/ci/envs/311-latest.yaml @@ -13,6 +13,7 @@ dependencies: - shapely>=2 - esda - tqdm + - numba # testing - codecov - pytest diff --git a/ci/envs/312-dev.yaml b/ci/envs/312-dev.yaml index c8373b30..698f825f 100644 --- a/ci/envs/312-dev.yaml +++ b/ci/envs/312-dev.yaml @@ -13,6 +13,7 @@ dependencies: - pandas!=1.5.0 - shapely>=2 - tqdm + - numba # testing - codecov - pytest diff --git a/ci/envs/312-latest.yaml b/ci/envs/312-latest.yaml index 11dc0350..0a4d0ff8 100644 --- a/ci/envs/312-latest.yaml +++ b/ci/envs/312-latest.yaml @@ -14,6 +14,7 @@ dependencies: - shapely>=2 - esda - tqdm + - numba # testing - codecov - pytest diff --git a/ci/envs/312-min.yaml b/ci/envs/312-min.yaml new file mode 100644 index 00000000..11dc0350 --- /dev/null +++ b/ci/envs/312-min.yaml @@ -0,0 +1,32 @@ +name: test +channels: + - conda-forge +dependencies: + - python=3.12 + - geopandas + - inequality + - libpysal>=4.10.0 + - mapclassify + - networkx + - osmnx + - packaging + - pandas!=1.5.0 + - shapely>=2 + - esda + - tqdm + # testing + - codecov + - pytest + - pytest-cov + # user guide testing + - dask + - inequality + - jupyter + - matplotlib + - osmnx + - clustergram + - bokeh + - geopy + - ipywidgets + - Iprogress + From 8bfc8a984bbeb3150cea5935582de4cb00102945 Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 22:23:35 
+0200 Subject: [PATCH 5/8] numba everywhere --- docs/install.rst | 8 +++++++- environment.yml | 3 +-- pyproject.toml | 8 ++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/install.rst b/docs/install.rst index ccb67dc3..89609c5a 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -117,18 +117,24 @@ Some functions also depend on additional packages, which are optional: - `mapclassify`_ (>= 2.4.2) - `inequality`_ +- `numba`_ +- `esda`_ .. _geopandas: https://geopandas.org/ .. _mapclassify: http://pysal.org/mapclassify +.. _esda: http://pysal.org/esda + .. _libpysal: http://pysal.org/libpysal -.. _inequality: https://inequality.readthedocs.io +.. _inequality: http://pysal.org/inequality .. _networkx: http://networkx.github.io +.. _numba: https://numba.pydata.org + .. _tqdm: http://networkx.github.io .. _pysal: http://pysal.org diff --git a/environment.yml b/environment.yml index fbc34c7e..0cfa68b5 100644 --- a/environment.yml +++ b/environment.yml @@ -9,10 +9,9 @@ dependencies: - libpysal>=4.10.0 - mapclassify - matplotlib - - momepy - networkx - osmnx - pandas!=1.5.0 - shapely>=2 - - rtree - tqdm + - numba diff --git a/pyproject.toml b/pyproject.toml index dfe374d5..50e871b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,14 @@ dependencies = [ Home = "http://momepy.org" Repository = "https://github.com/pysal/momepy" +[project.optional-dependencies] +plus = [ + "numba", + "inequality", + "mapclassify", + "esda", +] + [tool.setuptools.packages.find] include = ["momepy", "momepy.*"] From 382f617e0063345342d77b0248bbcb26cb22dc5d Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 22:23:59 +0200 Subject: [PATCH 6/8] pin oldest numba --- ci/envs/310-oldest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/envs/310-oldest.yaml b/ci/envs/310-oldest.yaml index b5d6077b..1ee49498 100644 --- a/ci/envs/310-oldest.yaml +++ b/ci/envs/310-oldest.yaml @@ -13,7 +13,7 @@ dependencies: - 
pandas>=1.4.0,!=1.5.0,<2 - shapely=2.0 - tqdm=4.63.0 - - numba + - numba=0.55.1 # testing - codecov - pytest From 45378ae87be77465f944fa1e4c9cbde2728485f4 Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 22:31:15 +0200 Subject: [PATCH 7/8] esda to env --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 0cfa68b5..644812c7 100644 --- a/environment.yml +++ b/environment.yml @@ -15,3 +15,4 @@ dependencies: - shapely>=2 - tqdm - numba + - esda From 5b757429dd3eaeac6c6db2808ecb37fa443e9bbe Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 24 Apr 2024 22:32:08 +0200 Subject: [PATCH 8/8] numba compatible with oldest numpy --- ci/envs/310-oldest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/envs/310-oldest.yaml b/ci/envs/310-oldest.yaml index 1ee49498..d87bfc12 100644 --- a/ci/envs/310-oldest.yaml +++ b/ci/envs/310-oldest.yaml @@ -13,7 +13,7 @@ dependencies: - pandas>=1.4.0,!=1.5.0,<2 - shapely=2.0 - tqdm=4.63.0 - - numba=0.55.1 + - numba=0.56 # testing - codecov - pytest