diff --git a/.gitignore b/.gitignore index ad8665a..4b3bb25 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ mapclassify/tests/.ropeproject/ .DS_Store .vscode/settings.json __pycache__ +/notebooks/.ipynb_checkpoints/ diff --git a/mapclassify/_classify_API.py b/mapclassify/_classify_API.py index 034677f..27fa19c 100644 --- a/mapclassify/_classify_API.py +++ b/mapclassify/_classify_API.py @@ -51,6 +51,7 @@ def classify( initial=100, bins=None, lowest=None, + anchor=False, ): """ @@ -94,6 +95,9 @@ def classify( Scalar minimum value of lowest class. Default is to set the minimum to ``-inf`` if ``y.min()`` > first upper bound (which will override the default), otherwise minimum is set to ``y.min()``. + anchor : bool (default False) + Anchor upper bound of one class to the sample mean. + Returns @@ -182,7 +186,7 @@ def classify( classifier = _classifiers[scheme](y, pct) elif scheme == "stdmean": - classifier = _classifiers[scheme](y, multiples) + classifier = _classifiers[scheme](y, multiples, anchor) elif scheme == "jenkscaspallsampled": classifier = _classifiers[scheme](y, k, pct_sampled) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index 0c792ca..627d396 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -1520,12 +1520,14 @@ class StdMean(MapClassifier): Parameters ---------- - y : numpy.array - :math:`(n,1)`, values to classify. + :math:`(n,1)`, values to classify multiples : numpy.array (default [-2, -1, 1, 2]) The multiples of the standard deviation to add/subtract from - the sample mean to define the bins + the sample mean to define the bins. + anchor : bool (default False) + Anchor upper bound of one class to the sample mean. + Attributes ---------- @@ -1539,6 +1541,17 @@ class StdMean(MapClassifier): counts : numpy.array :math:`(k,1)`, the number of observations falling in each class. + Notes + ----- + + If anchor is True, one of the intervals will have its closed upper bound + equal to the mean of y. 
Intermediate intervals will have widths equal to + the standard deviation of y. The first interval will be closed on the + minimum value of y, and the last interval will be closed on the maximum of + y. The first and last intervals may have widths different from the + intermediate intervals. + + Examples -------- @@ -1562,11 +1575,20 @@ class StdMean(MapClassifier): >>> list(st3.counts) [0, 0, 57, 0, 1] - + >>> stda = mapclassify.StdMean(cal, anchor=True) + >>> stda.k + 9 + >>> stda.bins + array([ 125.92810345, 672.57333208, 1219.21856072, 1765.86378936, + 2312.50901799, 2859.15424663, 3405.79947527, 3952.4447039 , + 4111.45 ]) + >>> cal.mean(), cal.std(), cal.min(), cal.max() + (125.92810344827588, 546.6452286365233, 0.13, 4111.45) """ - def __init__(self, y, multiples=[-2, -1, 1, 2]): + def __init__(self, y, multiples=[-2, -1, 1, 2], anchor=False): self.multiples = multiples + self.anchor = anchor MapClassifier.__init__(self, y) self.name = "StdMean" @@ -1574,6 +1596,10 @@ def _set_bins(self): y = self.y s = y.std(ddof=1) m = y.mean() + if self.anchor: + min_z = int((y.min() - m) / s) + max_z = int((y.max() - m) / s) + 1 + self.multiples = list(range(min_z, max_z)) cuts = [m + s * w for w in self.multiples] y_max = y.max() if cuts[-1] < y_max: diff --git a/mapclassify/tests/test_mapclassify.py b/mapclassify/tests/test_mapclassify.py index 3a5b607..8e41b73 100644 --- a/mapclassify/tests/test_mapclassify.py +++ b/mapclassify/tests/test_mapclassify.py @@ -631,6 +631,30 @@ def test_UserDefined_lowest(self): assert ud.get_legend_classes() == classes +class TestStdMeanAnchor: + def setup_method(self): + self.V = load_example() + + def test_StdMeanAnchor(self): + sm = StdMean(self.V, anchor=True) + bins = numpy.array( + [ + 125.92810345, + 672.57333208, + 1219.21856072, + 1765.86378936, + 2312.50901799, + 2859.15424663, + 3405.79947527, + 3952.4447039, + 4111.45, + ] + ) + counts = numpy.array([50, 6, 1, 0, 0, 0, 0, 0, 1]) + 
numpy.testing.assert_array_almost_equal(sm.bins, bins) + numpy.testing.assert_array_almost_equal(sm.counts, counts) + + class TestMaxP: def setup_method(self): self.V = load_example() diff --git a/notebooks/06_api.ipynb b/notebooks/06_api.ipynb index ca6a3a0..1eae870 100644 --- a/notebooks/06_api.ipynb +++ b/notebooks/06_api.ipynb @@ -23,7 +23,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.167785Z", "start_time": "2022-11-05T15:10:14.404320Z" - } + }, + "tags": [] }, "outputs": [], "source": [ @@ -46,13 +47,14 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.182165Z", "start_time": "2022-11-05T15:10:19.171353Z" - } + }, + "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "'2.5.0+8.g34341a22.dirty'" + "'2.4.2+107.gb97c316a.dirty'" ] }, "execution_count": 2, @@ -71,7 +73,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.586837Z", "start_time": "2022-11-05T15:10:19.187232Z" - } + }, + "tags": [] }, "outputs": [ { @@ -295,7 +298,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.595711Z", "start_time": "2022-11-05T15:10:19.589037Z" - } + }, + "tags": [] }, "outputs": [ { @@ -339,7 +343,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.603460Z", "start_time": "2022-11-05T15:10:19.598526Z" - } + }, + "tags": [] }, "outputs": [ { @@ -374,7 +379,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.611996Z", "start_time": "2022-11-05T15:10:19.608075Z" - } + }, + "tags": [] }, "outputs": [ { @@ -399,7 +405,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.619168Z", "start_time": "2022-11-05T15:10:19.614412Z" - } + }, + "tags": [] }, "outputs": [ { @@ -440,7 +447,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.627988Z", "start_time": "2022-11-05T15:10:19.621853Z" - } + }, + "tags": [] }, "outputs": [ { @@ -474,7 +482,8 @@ "ExecuteTime": { "end_time": "2022-11-05T15:10:19.634396Z", "start_time": "2022-11-05T15:10:19.629847Z" - } + }, + "tags": [] }, "outputs": [ { @@ -508,7 +517,8 @@ "ExecuteTime": { "end_time": 
"2022-11-05T15:10:19.641115Z", "start_time": "2022-11-05T15:10:19.636017Z" - } + }, + "tags": [] }, "outputs": [ { @@ -537,258 +547,62 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2022-11-05T15:10:19.691302Z", "start_time": "2022-11-05T15:10:19.645124Z" - } + }, + "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0mmapclassify\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mscheme\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mpct\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m90\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m99\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mpct_sampled\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtruncate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mhinge\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mmultiples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mmindiff\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Classify your data with ``mapclassify.classify``.\n", - "Input parameters are dependent on classifier used.\n", - "\n", - "Parameters\n", - "----------\n", - "\n", - "y : numpy.array\n", - " :math:`(n,1)`, values to classify.\n", - "scheme : str\n", - " ``pysal.mapclassify`` classification scheme.\n", - "k : int (default 5)\n", - " The number of classes.\n", - "pct : numpy.array (default [1, 10, 50, 90, 99, 100])\n", - " Percentiles used for classification with ``percentiles``.\n", - "pct_sampled : float default (0.10)\n", - " The percentage of n that should form the sample\n", - " (``JenksCaspallSampled``, ``FisherJenksSampled``)\n", - " If ``pct`` is specified such that ``n*pct > 1000``, then ``pct=1000``.\n", - "truncate : bool (default True)\n", - " Truncate ``pct_sampled`` in cases where ``pct * n > 1000``.\n", - "hinge : float (default 1.5)\n", - " Multiplier for *IQR* when ``BoxPlot`` classifier used.\n", - "multiples : numpy.array (default [-2,-1,1,2])\n", - " The multiples of the standard deviation to add/subtract from\n", - " the sample mean to define the bins using ``std_mean``.\n", - "mindiff : float (default is 0)\n", - " The minimum difference between class breaks\n", - " if using ``maximum_breaks`` classifier.\n", - "initial : int (default 100)\n", - " 
Number of initial solutions to generate or number of runs when using\n", - " ``natural_breaks`` or ``max_p_classifier``. Setting initial to ``0``\n", - " will result in the quickest calculation of bins.\n", - "bins : numpy.array (default None)\n", - " :math:`(k,1)`, upper bounds of classes (have to be monotically\n", - " increasing) if using ``user_defined`` classifier.\n", - " Default is ``None``. For example: ``[20, max(y)]``.\n", + "StdMean\n", "\n", - "Returns\n", - "-------\n", - "classifier : mapclassify.classifiers.MapClassifier\n", - " Object containing bin ids for each observation (``.yb``),\n", - " upper bounds of each class (``.bins``), number of classes (``.k``)\n", - " and number of observations falling in each class (``.counts``).\n", - "\n", - "Notes\n", - "-----\n", - "\n", - "Supported classifiers include:\n", - "\n", - "* ``quantiles``\n", - "* ``box_plot``\n", - "* ``equal_interval``\n", - "* ``fisher_jenks``\n", - "* ``fisher_jenks_sampled``\n", - "* ``headtail_breaks``\n", - "* ``jenks_caspall``\n", - "* ``jenks_caspall_sampled``\n", - "* ``jenks_caspall_forced``\n", - "* ``max_p``\n", - "* ``maximum_breaks``\n", - "* ``natural_breaks``\n", - "* ``percentiles``\n", - "* ``std_mean``\n", - "* ``user_defined``\n", - "\n", - "Examples\n", - "--------\n", - "\n", - ">>> import libpysal\n", - ">>> import geopandas\n", - ">>> from mapclassify import classify\n", - "\n", - "Load example data.\n", - "\n", - ">>> link_to_data = libpysal.examples.get_path(\"columbus.shp\")\n", - ">>> gdf = geopandas.read_file(link_to_data)\n", - ">>> x = gdf['HOVAL'].values\n", - "\n", - "Classify values by quantiles.\n", - "\n", - ">>> quantiles = classify(x, \"quantiles\")\n", - "\n", - "Classify values by box_plot and set hinge to ``2``.\n", - "\n", - ">>> box_plot = classify(x, 'box_plot', hinge=2)\n", - ">>> box_plot\n", - "BoxPlot\n", - "\n", " Interval Count\n", "----------------------\n", - "( -inf, -9.50] | 0\n", - "(-9.50, 25.70] | 13\n", - "(25.70, 33.50] | 
12\n", - "(33.50, 43.30] | 12\n", - "(43.30, 78.50] | 9\n", - "(78.50, 96.40] | 3\n", - "\u001b[0;31mFile:\u001b[0m ~/Documents/p/pysal/src/subpackages/mapclassify/mapclassify/_classify_API.py\n", - "\u001b[0;31mType:\u001b[0m function\n" + "( -inf, 1.50] | 0\n", + "( 1.50, 19.97] | 5\n", + "(19.97, 56.90] | 37\n", + "(56.90, 75.37] | 3\n", + "(75.37, 96.40] | 4" ] }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "mapclassify.classify?" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2022-11-05T15:10:19.641115Z", - "start_time": "2022-11-05T15:10:19.636017Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "UserDefined\n", - "\n", - " Interval Count\n", - "------------------------\n", - "( -inf, 0.00] | 0\n", - "( 0.00, 50.00] | 40\n", - "( 50.00, 100.00] | 9" - ] - }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "mapclassify.classify(y, \"User_Defined\", bins=[0,50, 100])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finding bins for new data" + "mapclassify.classify(y, 'Std_Mean')" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": { "ExecuteTime": { - "end_time": "2022-11-05T15:10:19.641115Z", - "start_time": "2022-11-05T15:10:19.636017Z" - } + "end_time": "2022-10-26T03:01:45.977181Z", + "start_time": "2022-10-26T03:01:45.931234Z" + }, + "tags": [] }, - "outputs": [], - "source": [ - "r = mapclassify.classify(y, \"User_Defined\", bins=[0,50, 100])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_bin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;31mSource:\u001b[0m \n", - " \u001b[0;32mdef\u001b[0m \u001b[0mfind_bin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", - "\u001b[0;34m Sort input or inputs according to the current bin estimate.\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m Parameters\u001b[0m\n", - "\u001b[0;34m ----------\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m x : numpy.array, int, float\u001b[0m\n", - "\u001b[0;34m A value or array of values to fit within the estimated bins.\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m Returns\u001b[0m\n", - "\u001b[0;34m -------\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m right : numpy.array, int\u001b[0m\n", - "\u001b[0;34m A bin index or array of bin indices that classify the\u001b[0m\n", - "\u001b[0;34m input into one of the classifiers' bins.\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m Notes\u001b[0m\n", - "\u001b[0;34m -----\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m This differs from similar functionality in\u001b[0m\n", - "\u001b[0;34m ``numpy.digitize(x, classi.bins, right=True)``.\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m This will always provide the closest bin, so data \"outside\" the classifier,\u001b[0m\n", - "\u001b[0;34m above and below the max/min breaks, will be classified into the nearest bin.\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m ``numpy.digitize`` returns :math:`k+1` for data greater than the greatest bin,\u001b[0m\n", - "\u001b[0;34m but retains 0 for data below the lowest bin.\u001b[0m\n", - "\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mright\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdigitize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbins\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mright\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbins\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFile:\u001b[0m ~/Documents/p/pysal/src/subpackages/mapclassify/mapclassify/classifiers.py\n", - "\u001b[0;31mType:\u001b[0m method\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "r.find_bin??" 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([ 0, 50, 100]), array([ 0, 40, 9]))" + "StdMean\n", + "\n", + " Interval Count\n", + "----------------------\n", + "[17.90, 19.97] | 5\n", + "(19.97, 38.44] | 24\n", + "(38.44, 56.90] | 13\n", + "(56.90, 75.37] | 3\n", + "(75.37, 93.83] | 3\n", + "(93.83, 96.40] | 1" ] }, "execution_count": 15, @@ -797,18 +611,20 @@ } ], "source": [ - "r.bins, r.counts" + "mapclassify.classify(y, 'Std_Mean', anchor=True)" ] }, { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { "text/plain": [ - "array([1, 0, 2, 1])" + "(38.43622446938775, 18.466069465206047, 17.9, 96.400002)" ] }, "execution_count": 16, @@ -817,35 +633,15 @@ } ], "source": [ - "r.find_bin([7,0, 51, 33])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that `find_bin` does not recalibrate the classifier:" + "y.mean(), y.std(), y.min(), y.max()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 0, 50, 100]), array([ 0, 40, 9]))" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "r.bins, r.counts" - ] + "outputs": [], + "source": [] } ], "metadata": { @@ -864,7 +660,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/notebooks/07_std_anchor.ipynb b/notebooks/07_std_anchor.ipynb new file mode 100644 index 0000000..58ef19d --- /dev/null +++ b/notebooks/07_std_anchor.ipynb @@ -0,0 +1,522 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to mapclassify\n", + "\n", + "`mapclassify` implements a family of 
classification schemes for choropleth maps. \n", + "Its focus is on the determination of the number of classes, and the assignment of observations to those classes.\n", + "It is intended for use with upstream mapping and geovisualization packages (see [geopandas](https://geopandas.org/mapping.html) and [geoplot](https://residentmario.github.io/geoplot/user_guide/Customizing_Plots.html) for examples) that handle the rendering of the maps.\n", + "\n", + "In this notebook, the basic functionality of mapclassify is presented." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2022-10-26T02:53:25.104870Z", + "start_time": "2022-10-26T02:53:23.858480Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.4.2+55.g0155c6e6.dirty'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mapclassify as mc\n", + "\n", + "mc.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "gdf = gpd.read_file(\"data/nyc/nyc.shp\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "gdf.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "gdf.plot(column='rent2008')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StdMean \n", + "\n", + " Interval Count\n", + "--------------------------\n", + "[ 0.00, 68.04] | 3\n", + "( 68.04, 662.61] | 0\n", + "( 662.61, 1851.75] | 45\n", + "(1851.75, 2446.32] | 2\n", + "(2446.32, 2900.00] | 5" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "std = mc.StdMean(gdf.rent2008)\n", + "std" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StdMean \n", + "\n", + " Interval Count\n", + "--------------------------\n", + "[ 0.00, 68.04] | 3\n", + "( 68.04, 662.61] | 0\n", + "( 662.61, 1257.18] | 33\n", + "(1257.18, 1851.75] | 12\n", + "(1851.75, 2446.32] | 2\n", + "(2446.32, 2900.00] | 5" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stda = mc.StdMean(gdf.rent2008, anchor=True)\n", + "stda" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1257.1818181818182" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = gdf.rent2008\n", + "y.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "y1 = y.values\n", + "y1[0] = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StdMean \n", + "\n", + " Interval Count\n", + "--------------------------\n", + "( -inf, -227.42] | 0\n", + "(-227.42, 551.24] | 3\n", + "( 551.24, 2108.58] | 45\n", + "(2108.58, 2887.24] | 5\n", + "(2887.24, 5000.00] | 2" + ] + }, + "execution_count": 10, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mc.StdMean(y1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StdMean \n", + "\n", + " Interval Count\n", + "--------------------------\n", + "[ 0.00, 551.24] | 3\n", + "( 551.24, 1329.91] | 35\n", + "(1329.91, 2108.58] | 10\n", + "(2108.58, 2887.24] | 5\n", + "(2887.24, 3665.91] | 1\n", + "(3665.91, 4444.57] | 0\n", + "(4444.57, 5000.00] | 1" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mc.StdMean(y1, anchor=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1329.909090909091" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y1.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-2.1144409159637743" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "z.min()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "68.04306411189191" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.mean() - 2 * y.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0000007194253047" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(2446.321 - y.mean()) / y.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(-2.1144409159637743, 2.7630386718042783)" + ] + }, + "execution_count": 20, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "min(z), max(z)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1257.1818181818182" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-2, 2]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(map(int, (min(z), max(z))))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "ybar = y.mean()\n", + "ystd = y.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "bins = [ ybar + trim * ystd for trim in range(-2, 2+1) ]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[68.04306411189191,\n", + " 662.6124411468551,\n", + " 1257.1818181818182,\n", + " 1851.7511952167815,\n", + " 2446.3205722517446]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bins" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-2" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "int(-2.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "int(2.1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}