Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plot categorical data #100

Merged
merged 13 commits into from
Jun 3, 2019
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ Note that the top-most release is changes in the unreleased master branch on Git

[Keep a Changelog](https://keepachangelog.com/en/1.0.0/), [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.3.6dev] (Work In Progress)
### Added
- **Categories** rule with a plot showing unique values and count per field. By default, `report_all()` only includes fields which have 10 or fewer unique values. See https://arche.readthedocs.io/en/latest/nbs/Rules.html#Category-fields, #100
### Changed
- `Arche.report_all()` does not shorten report by default, added `short` parameter.
- `expand=True` which enables nested data flattening is more than 100x faster and consumes ~2x less memory than before, #94
Expand Down
78 changes: 75 additions & 3 deletions docs/source/nbs/Rules.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@
"# Rules"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook contains rules used in the library with examples. They are executed in `report_all()` with default parameters, unless stated otherwise.\n",
"\n",
"Some definitions here are used interchangeably:\n",
"* **df** - a dataframe which holds input data (from a job, collection or other source)\n",
"* Scrapy cloud item - a row in a **df**\n",
"* Items fields - columns in a **df**"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -41,7 +53,48 @@
"metadata": {},
"outputs": [],
"source": [
"arche.rules.duplicates.find_by(items.df, [\"title\", \"category\"]).show()"
"df = items.df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fields coverage on input data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"help(arche.rules.coverage.check_fields_coverage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arche.rules.coverage.check_fields_coverage(df).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Category fields"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"help(arche.rules.category.get_categories)"
]
},
{
Expand All @@ -50,15 +103,34 @@
"metadata": {},
"outputs": [],
"source": [
"arche.rules.coverage.check_fields_coverage(items.df).show()"
"arche.rules.category.get_categories(df, max_uniques=50).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find duplicates by columns (fields)\n",
"This rule is not included in `Arche.report_all()`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"help(arche.rules.duplicates.find_by)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"arche.rules.duplicates.find_by(df, [\"title\", \"category\"]).show()"
]
}
],
"metadata": {
Expand Down
1 change: 1 addition & 0 deletions src/arche/arche.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def run_general_rules(self):
df.drop(columns=df.columns[df.columns.str.startswith("_")])
)
)
self.save_result(category_rules.get_categories(df))

def validate_with_json_schema(self) -> None:
"""Run JSON schema check and output results. It will try to find all errors, but
Expand Down
15 changes: 13 additions & 2 deletions src/arche/readers/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,21 @@ def flat_df(self) -> pd.DataFrame:
def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a flattened items dataframe and return it.

    Empty objects (mainly lists and dicts, but also "" and other falsy
    containers) are replaced with ``np.nan``; real numbers are kept even
    when falsy (0, 0.0, False), since ``bool``/``int``/``float`` are all
    ``numbers.Real``. Low-cardinality columns are then converted to the
    ``category`` dtype in place by ``Items.categorize`` (which subsumes
    the previous unconditional ``_type`` -> category cast).

    Args:
        df: the raw flattened items dataframe

    Returns:
        The cleaned dataframe (same object, mutated by ``categorize``).
    """
    # clean empty objects - mainly lists and dicts, but keep everything else
    df = df.applymap(lambda x: x if x or isinstance(x, numbers.Real) else np.nan)
    Items.categorize(df)
    return df

@staticmethod
def categorize(df: pd.DataFrame, min_rows: int = 100, max_uniques: int = 10) -> None:
    """Convert low-cardinality columns of ``df`` to the ``category`` dtype, in place.

    The original annotation claimed a ``pd.DataFrame`` return, but the method
    mutates ``df`` and returns nothing - the annotation now says so.

    Args:
        df: the dataframe to modify in place
        min_rows: skip categorization entirely for frames shorter than this,
            where the memory savings are negligible (previously hard-coded 100)
        max_uniques: a column is converted only if its number of unique values,
            NaN included, does not exceed this (previously hard-coded 10)
    """
    if len(df) < min_rows:
        return
    for column in df.columns:
        try:
            if df[column].nunique(dropna=False) <= max_uniques:
                df[column] = df[column].astype("category")
        # ignore columns holding unhashable values (lists, dicts) -
        # nunique raises TypeError for them
        except TypeError:
            continue

def origin_column_name(self, new: str) -> str:
if new in self.df.columns:
return new
Expand Down
26 changes: 26 additions & 0 deletions src/arche/rules/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,29 @@ def get_coverage_per_category(df: pd.DataFrame, category_names: List[str]) -> Re
if not category_names:
result.add_info(Outcome.SKIPPED)
return result


def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns. A category column is a column which holds a
    limited number of possible values, including ``NaN``.

    Args:
        df: data
        max_uniques: filter which determines which columns to use. Only columns
            with the number of unique values less than or equal to `max_uniques`
            are category columns.

    Returns:
        A result with stats containing value counts of each category column.
    """
    result = Result("Categories")

    # Compute value_counts once per column; the filter and the kept value
    # share the same series instead of evaluating value_counts twice.
    counts = (df[c].value_counts(dropna=False) for c in df)
    result.stats = [vc for vc in counts if len(vc) <= max_uniques]
    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    return result
73 changes: 53 additions & 20 deletions src/arche/rules/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Dict, List, Optional, Union

import pandas as pd
from plotly.colors import DEFAULT_PLOTLY_COLORS
import plotly.graph_objs as go
import plotly.io as pio

Expand Down Expand Up @@ -99,8 +100,8 @@ def stats(self, value):

@property
def figures(self):
if not self._figures:
self._figures = Result.create_figures(self.stats)
if not self._figures and self.stats:
self._figures = Result.create_figures(self.stats, self.name)
return self._figures

def add_info(self, summary, detailed=None, errors=None):
Expand Down Expand Up @@ -162,32 +163,25 @@ def show(self, short: bool = False, keys_limit: int = 10):
pio.show(f)

@staticmethod
def create_figures(stats: List[Stat]) -> List[go.FigureWidget]:
def create_figures(stats: List[Stat], name: str) -> List[go.FigureWidget]:
if name == "Categories":
data = Result.build_stack_bar_data(stats)
layout = Result.get_layout("Category fields", len(stats))
layout.barmode = "stack"
return [go.FigureWidget(data, layout)]
figures = []
for stat in stats:
y = stat.index.values.astype(str)
if isinstance(stat, pd.Series):
data = [
go.Bar(
x=stat.values, y=stat.index.values.astype(str), orientation="h"
)
]
data = [go.Bar(x=stat.values, y=y, orientation="h", opacity=0.7)]
else:
data = [
go.Bar(
x=stat[c].values, y=stat.index.values, orientation="h", name=c
)
go.Bar(x=stat[c].values, y=y, orientation="h", opacity=0.7, name=c)
for c in stat.columns
]

layout = go.Layout(
title=stat.name,
bargap=0.1,
template="seaborn",
height=min(len(stat) * 20, 900),
hovermode="y",
margin=dict(l=200, t=35),
xaxis=go.layout.XAxis(range=[0, max(stat.values.max(), 1) * 1.05]),
)
layout = Result.get_layout(stat.name, len(stat))
layout.xaxis = go.layout.XAxis(range=[0, max(stat.values.max(), 1) * 1.05])
if stat.name.startswith("Coverage"):
layout.xaxis.tickformat = ".2p"
if stat.name == "Coverage for boolean fields":
Expand All @@ -198,6 +192,45 @@ def create_figures(stats: List[Stat]) -> List[go.FigureWidget]:
figures.append(go.FigureWidget(data, layout))
return figures

@staticmethod
def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]:
    """Create data for a plotly stacked bar chart with consistent colors
    between bars. Each bar's values have indexes unique to that bar, without
    any correlation to other bars.

    Args:
        values_counts: an array of value_counts series

    Returns:
        A list of Bar objects.
    """
    data = []
    for vc in values_counts:
        # extend() instead of repeated list concatenation - the old
        # `data = data + [...]` rebuilt the whole list on every series.
        data.extend(
            go.Bar(
                x=[counts],
                y=[vc.name],
                name=str(value)[:30],  # truncate long values for the legend
                orientation="h",
                opacity=0.6,
                legendgroup=vc.name,
                # cycle through the palette by its actual length rather
                # than the hard-coded 10
                marker_color=DEFAULT_PLOTLY_COLORS[i % len(DEFAULT_PLOTLY_COLORS)],
            )
            for i, (value, counts) in enumerate(vc.items())
        )
    return data

@staticmethod
def get_layout(name: str, rows_count: int) -> go.Layout:
    """Return the shared horizontal bar-chart layout used by rule figures."""
    settings = {
        "title": name,
        "bargap": 0.1,
        "template": "seaborn",
        # cap the figure height so very long stats don't produce endless plots
        "height": min(rows_count * 20, 900),
        "hovermode": "y",
        "margin": {"l": 200, "t": 35},
    }
    return go.Layout(**settings)

@staticmethod
def make_annotations(stat: pd.Series) -> List[Dict]:
annotations = []
Expand Down
8 changes: 7 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,14 @@ def pytest_assertrepr_compare(op, left, right):
):
if left_n == "_stats":
for left_stat, right_stat in zip_longest(left_v, right_v):
if not Result.tensors_equal(left_stat, right_stat):
try:
if isinstance(left_stat, pd.DataFrame):
pd.testing.assert_frame_equal(left_stat, right_stat)
else:
pd.testing.assert_series_equal(left_stat, right_stat)
except AssertionError as e:
assert_msgs.extend([f"{left_stat}", "!=", f"{right_stat}"])
assert_msgs.extend(str(e).split("\n"))
elif left_v != right_v:
assert_msgs.extend([f"{left_v}", "!=", f"{right_v}"])
return assert_msgs
Expand Down
29 changes: 28 additions & 1 deletion tests/readers/test_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,37 @@ def test_process_df():
pd.DataFrame([[dict(), list(), "NameItem"]], columns=["a", "b", "_type"])
)
exp_df = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=["a", "b", "_type"])
exp_df["_type"] = exp_df["_type"].astype("category")
pd.testing.assert_frame_equal(df, exp_df)


@pytest.mark.parametrize(
    "data, expected_cats",
    [
        (
            {
                # 100 unique values - too many to categorize
                "a": list(range(100)),
                # single unique value - becomes a category
                "b": [False] * 100,
                # lists are unhashable - column is skipped
                "c": [[False]] * 100,
                # sets are unhashable - column is skipped
                "d": [{0}] * 100,
            },
            ["b"],
        )
    ],
)
def test_categorize(data, expected_cats):
    df = pd.DataFrame(data)
    Items.categorize(df)
    np.testing.assert_array_equal(
        df.select_dtypes(["category"]).columns.values, expected_cats
    )


def test_no_categorize():
    # Frames shorter than 100 rows must be left untouched by categorize().
    df = pd.DataFrame({"a": list(range(99))})
    Items.categorize(df)
    assert df.select_dtypes(["category"]).empty


flat_df_inputs = [
(
[{"name": "Bob", "alive": True, "_key": 0, "following": None}],
Expand Down
46 changes: 46 additions & 0 deletions tests/rules/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,49 @@ def test_get_difference(source, target, categories, expected_messages, expected_
) == create_result(
"Category Coverage Difference", expected_messages, stats=expected_stats
)


@pytest.mark.parametrize(
    "data, expected_message",
    [
        # many unique floats - no column qualifies as a category
        (np.random.rand(100), "Categories were not found"),
        # empty dataframe - nothing to categorize
        (None, "Categories were not found"),
    ],
)
def test_get_no_categories(data, expected_message):
    outcome = c.get_categories(pd.DataFrame(data))
    expected = create_result("Categories", {Level.INFO: [(expected_message,)]})
    assert outcome == expected


@pytest.mark.parametrize(
    "data, max_uniques, expected_stats, expected_message",
    [
        # one constant float column
        (np.zeros(10), 2, [pd.Series(10, index=[0.0], name=0)], "1 category field(s)"),
        # NaN counts as a (single) category value
        (
            [[np.nan]] * 10,
            2,
            [pd.Series(10, index=[np.nan], name=0)],
            "1 category field(s)",
        ),
        # "b" has 10 uniques > max_uniques=2, so only "a" qualifies
        (
            {"a": [True] * 10, "b": list(range(10))},
            2,
            [pd.Series(10, index=[True], name="a")],
            "1 category field(s)",
        ),
        # with max_uniques=10 both columns qualify
        (
            {"b": list(range(10)), "c": [np.nan] * 10},
            10,
            [
                pd.Series([1] * 10, index=list(range(10))[::-1], name="b"),
                pd.Series(10, index=[np.nan], name="c"),
            ],
            "2 category field(s)",
        ),
    ],
)
def test_get_categories(data, max_uniques, expected_stats, expected_message):
    result = c.get_categories(pd.DataFrame(data), max_uniques)
    assert result == create_result(
        "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats
    )
Loading