Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plot categorical data #100

Merged
merged 13 commits into from
Jun 3, 2019
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ Note that the top-most release is changes in the unreleased master branch on Git

[Keep a Changelog](https://keepachangelog.com/en/1.0.0/), [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.3.6dev] (Work In Progress)
### Added
- **Categories** rule with a plot showing unique values and count per field. By default, `report_all()` only includes fields which have 10 or fewer unique values. See https://arche.readthedocs.io/en/latest/nbs/Rules.html#Category-fields, #100
### Changed
- `Arche.report_all()` does not shorten report by default, added `short` parameter.
- `expand=True` which enables nested data flattening is more than 100x faster and consumes ~2x less memory than before, #94
Expand Down
78 changes: 75 additions & 3 deletions docs/source/nbs/Rules.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@
"# Rules"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook contains rules used in the library with examples. They are executed in `report_all()` with default parameters, unless stated otherwise.\n",
"\n",
"Some definitions here are used interchangeably:\n",
"* **df** - a dataframe which holds input data (from a job, collection or other source)\n",
"* Scrapy cloud item - a row in a **df**\n",
"* Items fields - columns in a **df**"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -41,7 +53,48 @@
"metadata": {},
"outputs": [],
"source": [
"arche.rules.duplicates.find_by(items.df, [\"title\", \"category\"]).show()"
"df = items.df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fields coverage on input data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"help(arche.rules.coverage.check_fields_coverage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arche.rules.coverage.check_fields_coverage(df).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Category fields"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"help(arche.rules.category.get_categories)"
]
},
{
Expand All @@ -50,15 +103,34 @@
"metadata": {},
"outputs": [],
"source": [
"arche.rules.coverage.check_fields_coverage(items.df).show()"
"arche.rules.category.get_categories(df, max_uniques=50).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find duplicates by columns (fields)\n",
"This rule is not included in `Arche.report_all()`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"help(arche.rules.duplicates.find_by)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"arche.rules.duplicates.find_by(df, [\"title\", \"category\"]).show()"
]
}
],
"metadata": {
Expand Down
1 change: 1 addition & 0 deletions src/arche/arche.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def run_general_rules(self):
df.drop(columns=df.columns[df.columns.str.startswith("_")])
)
)
self.save_result(category_rules.get_categories(df))

def validate_with_json_schema(self) -> None:
"""Run JSON schema check and output results. It will try to find all errors, but
Expand Down
15 changes: 13 additions & 2 deletions src/arche/readers/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,21 @@ def flat_df(self) -> pd.DataFrame:
def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a flattened items dataframe and return it.

    Empty objects (mainly lists and dicts, but also "" and other falsy
    containers) are replaced with ``np.nan``; real numbers are kept even
    when falsy (0, 0.0, False), since ``bool``/``int``/``float`` are all
    ``numbers.Real``. Low-cardinality columns are then converted to the
    ``category`` dtype in place by ``Items.categorize`` (which subsumes
    the previous unconditional ``_type`` -> category cast).

    Args:
        df: the raw flattened items dataframe

    Returns:
        The cleaned dataframe (same object, mutated by ``categorize``).
    """
    # clean empty objects - mainly lists and dicts, but keep everything else
    df = df.applymap(lambda x: x if x or isinstance(x, numbers.Real) else np.nan)
    Items.categorize(df)
    return df

@staticmethod
def categorize(df: pd.DataFrame, min_rows: int = 100, max_uniques: int = 10) -> None:
    """Convert low-cardinality columns of ``df`` to the ``category`` dtype, in place.

    The original annotation claimed a ``pd.DataFrame`` return, but the method
    mutates ``df`` and returns nothing - the annotation now says so.

    Args:
        df: the dataframe to modify in place
        min_rows: skip categorization entirely for frames shorter than this,
            where the memory savings are negligible (previously hard-coded 100)
        max_uniques: a column is converted only if its number of unique values,
            NaN included, does not exceed this (previously hard-coded 10)
    """
    if len(df) < min_rows:
        return
    for column in df.columns:
        try:
            if df[column].nunique(dropna=False) <= max_uniques:
                df[column] = df[column].astype("category")
        # ignore columns holding unhashable values (lists, dicts) -
        # nunique raises TypeError for them
        except TypeError:
            continue

def origin_column_name(self, new: str) -> str:
if new in self.df.columns:
return new
Expand Down
26 changes: 26 additions & 0 deletions src/arche/rules/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,29 @@ def get_coverage_per_category(df: pd.DataFrame, category_names: List[str]) -> Re
if not category_names:
result.add_info(Outcome.SKIPPED)
return result


def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns. A category column is a column which holds a
    limited number of possible values, including ``NaN``.

    Args:
        df: data
        max_uniques: filter which determines which columns to use. Only columns
            with the number of unique values less than or equal to `max_uniques`
            are category columns.

    Returns:
        A result with stats containing value counts of each category column.
    """
    result = Result("Categories")

    # Compute value_counts once per column; the filter and the kept value
    # share the same series instead of evaluating value_counts twice.
    counts = (df[c].value_counts(dropna=False) for c in df)
    result.stats = [vc for vc in counts if len(vc) <= max_uniques]
    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    return result
73 changes: 53 additions & 20 deletions src/arche/rules/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Dict, List, Optional, Union

import pandas as pd
from plotly.colors import DEFAULT_PLOTLY_COLORS
import plotly.graph_objs as go
import plotly.io as pio

Expand Down Expand Up @@ -99,8 +100,8 @@ def stats(self, value):

@property
def figures(self):
if not self._figures:
self._figures = Result.create_figures(self.stats)
if not self._figures and self.stats:
self._figures = Result.create_figures(self.stats, self.name)
return self._figures

def add_info(self, summary, detailed=None, errors=None):
Expand Down Expand Up @@ -162,32 +163,25 @@ def show(self, short: bool = False, keys_limit: int = 10):
pio.show(f)

@staticmethod
def create_figures(stats: List[Stat]) -> List[go.FigureWidget]:
def create_figures(stats: List[Stat], name: str) -> List[go.FigureWidget]:
if name == "Categories":
data = Result.build_stack_bar_data(stats)
layout = Result.get_layout("Category fields", len(stats))
layout.barmode = "stack"
return [go.FigureWidget(data, layout)]
figures = []
for stat in stats:
y = stat.index.values.astype(str)
if isinstance(stat, pd.Series):
data = [
go.Bar(
x=stat.values, y=stat.index.values.astype(str), orientation="h"
)
]
data = [go.Bar(x=stat.values, y=y, orientation="h", opacity=0.7)]
else:
data = [
go.Bar(
x=stat[c].values, y=stat.index.values, orientation="h", name=c
)
go.Bar(x=stat[c].values, y=y, orientation="h", opacity=0.7, name=c)
for c in stat.columns
]

layout = go.Layout(
title=stat.name,
bargap=0.1,
template="seaborn",
height=min(len(stat) * 20, 900),
hovermode="y",
margin=dict(l=200, t=35),
xaxis=go.layout.XAxis(range=[0, max(stat.values.max(), 1) * 1.05]),
)
layout = Result.get_layout(stat.name, len(stat))
layout.xaxis = go.layout.XAxis(range=[0, max(stat.values.max(), 1) * 1.05])
if stat.name.startswith("Coverage"):
layout.xaxis.tickformat = ".2p"
if stat.name == "Coverage for boolean fields":
Expand All @@ -198,6 +192,45 @@ def create_figures(stats: List[Stat]) -> List[go.FigureWidget]:
figures.append(go.FigureWidget(data, layout))
return figures

@staticmethod
def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]:
    """Create data for a plotly stacked bar chart with consistent colors
    between bars. Each bar's values have indexes unique to that bar, without
    any correlation to other bars.

    Args:
        values_counts: an array of value_counts series

    Returns:
        A list of Bar objects.
    """
    data = []
    for vc in values_counts:
        # extend() instead of repeated list concatenation - the old
        # `data = data + [...]` rebuilt the whole list on every series.
        data.extend(
            go.Bar(
                x=[counts],
                y=[vc.name],
                name=str(value)[:30],  # truncate long values for the legend
                orientation="h",
                opacity=0.6,
                legendgroup=vc.name,
                # cycle through the palette by its actual length rather
                # than the hard-coded 10
                marker_color=DEFAULT_PLOTLY_COLORS[i % len(DEFAULT_PLOTLY_COLORS)],
            )
            for i, (value, counts) in enumerate(vc.items())
        )
    return data

@staticmethod
def get_layout(name: str, rows_count: int) -> go.Layout:
    """Return the shared horizontal bar-chart layout used by rule figures."""
    settings = {
        "title": name,
        "bargap": 0.1,
        "template": "seaborn",
        # cap the figure height so very long stats don't produce endless plots
        "height": min(rows_count * 20, 900),
        "hovermode": "y",
        "margin": {"l": 200, "t": 35},
    }
    return go.Layout(**settings)

@staticmethod
def make_annotations(stat: pd.Series) -> List[Dict]:
annotations = []
Expand Down
8 changes: 7 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,14 @@ def pytest_assertrepr_compare(op, left, right):
):
if left_n == "_stats":
for left_stat, right_stat in zip_longest(left_v, right_v):
if not Result.tensors_equal(left_stat, right_stat):
try:
if isinstance(left_stat, pd.DataFrame):
pd.testing.assert_frame_equal(left_stat, right_stat)
else:
pd.testing.assert_series_equal(left_stat, right_stat)
except AssertionError as e:
assert_msgs.extend([f"{left_stat}", "!=", f"{right_stat}"])
assert_msgs.extend(str(e).split("\n"))
elif left_v != right_v:
assert_msgs.extend([f"{left_v}", "!=", f"{right_v}"])
return assert_msgs
Expand Down
29 changes: 28 additions & 1 deletion tests/readers/test_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,37 @@ def test_process_df():
pd.DataFrame([[dict(), list(), "NameItem"]], columns=["a", "b", "_type"])
)
exp_df = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=["a", "b", "_type"])
exp_df["_type"] = exp_df["_type"].astype("category")
pd.testing.assert_frame_equal(df, exp_df)


@pytest.mark.parametrize(
    "data, expected_cats",
    [
        (
            {
                # 100 unique values - too many to categorize
                "a": list(range(100)),
                # single unique value - becomes a category
                "b": [False] * 100,
                # lists are unhashable - column is skipped
                "c": [[False]] * 100,
                # sets are unhashable - column is skipped
                "d": [{0}] * 100,
            },
            ["b"],
        )
    ],
)
def test_categorize(data, expected_cats):
    df = pd.DataFrame(data)
    Items.categorize(df)
    np.testing.assert_array_equal(
        df.select_dtypes(["category"]).columns.values, expected_cats
    )


def test_no_categorize():
    # Frames shorter than 100 rows must be left untouched by categorize().
    df = pd.DataFrame({"a": list(range(99))})
    Items.categorize(df)
    assert df.select_dtypes(["category"]).empty


flat_df_inputs = [
(
[{"name": "Bob", "alive": True, "_key": 0, "following": None}],
Expand Down
46 changes: 46 additions & 0 deletions tests/rules/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,49 @@ def test_get_difference(source, target, categories, expected_messages, expected_
) == create_result(
"Category Coverage Difference", expected_messages, stats=expected_stats
)


@pytest.mark.parametrize(
    "data, expected_message",
    [
        # many unique floats - no column qualifies as a category
        (np.random.rand(100), "Categories were not found"),
        # empty dataframe - nothing to categorize
        (None, "Categories were not found"),
    ],
)
def test_get_no_categories(data, expected_message):
    outcome = c.get_categories(pd.DataFrame(data))
    expected = create_result("Categories", {Level.INFO: [(expected_message,)]})
    assert outcome == expected


@pytest.mark.parametrize(
    "data, max_uniques, expected_stats, expected_message",
    [
        # one constant float column
        (np.zeros(10), 2, [pd.Series(10, index=[0.0], name=0)], "1 category field(s)"),
        # NaN counts as a (single) category value
        (
            [[np.nan]] * 10,
            2,
            [pd.Series(10, index=[np.nan], name=0)],
            "1 category field(s)",
        ),
        # "b" has 10 uniques > max_uniques=2, so only "a" qualifies
        (
            {"a": [True] * 10, "b": list(range(10))},
            2,
            [pd.Series(10, index=[True], name="a")],
            "1 category field(s)",
        ),
        # with max_uniques=10 both columns qualify
        (
            {"b": list(range(10)), "c": [np.nan] * 10},
            10,
            [
                pd.Series([1] * 10, index=list(range(10))[::-1], name="b"),
                pd.Series(10, index=[np.nan], name="c"),
            ],
            "2 category field(s)",
        ),
    ],
)
def test_get_categories(data, max_uniques, expected_stats, expected_message):
    result = c.get_categories(pd.DataFrame(data), max_uniques)
    assert result == create_result(
        "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats
    )
Loading