Skip to content

Commit

Permalink
Analyze tables options: --common-limit, --no-most, --no-least
Browse files Browse the repository at this point in the history
Closes #544
  • Loading branch information
simonw committed May 21, 2023
1 parent e047cc3 commit d2a7b15
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 40 deletions.
11 changes: 7 additions & 4 deletions docs/cli-reference.rst
Expand Up @@ -564,10 +564,13 @@ See :ref:`cli_analyze_tables`.
sqlite-utils analyze-tables data.db trees

Options:
-c, --column TEXT Specific columns to analyze
--save Save results to _analyze_tables table
--load-extension TEXT Path to SQLite extension, with optional :entrypoint
-h, --help Show this message and exit.
-c, --column TEXT Specific columns to analyze
--save Save results to _analyze_tables table
--common-limit INTEGER How many common values
--no-most Skip most common values
--no-least Skip least common values
--load-extension TEXT Path to SQLite extension, with optional :entrypoint
-h, --help Show this message and exit.


.. _cli_ref_convert:
Expand Down
10 changes: 7 additions & 3 deletions docs/cli.rst
Expand Up @@ -730,11 +730,15 @@ For each column this tool displays the number of null rows, the number of blank

If you do not specify any tables every table in the database will be analyzed::

$ sqlite-utils analyze-tables github.db
sqlite-utils analyze-tables github.db

If you wish to analyze one or more specific columns, use the ``-c`` option::

$ sqlite-utils analyze-tables github.db tags -c sha
sqlite-utils analyze-tables github.db tags -c sha

To show more than 10 common values, use ``--common-limit 20``. To skip the most common or least common value analysis, use ``--no-most`` or ``--no-least``::

sqlite-utils analyze-tables github.db tags --common-limit 20 --no-least

.. _cli_analyze_tables_save:

Expand All @@ -743,7 +747,7 @@ Saving the analyzed table details

``analyze-tables`` can take quite a while to run for large database files. You can save the results of the analysis to a database table called ``_analyze_tables_`` using the ``--save`` option::

$ sqlite-utils analyze-tables github.db --save
sqlite-utils analyze-tables github.db --save

The ``_analyze_tables_`` table has the following schema::

Expand Down
25 changes: 20 additions & 5 deletions docs/python-api.rst
Expand Up @@ -1115,7 +1115,26 @@ You can inspect the database to see the results like this::
Analyzing a column
==================

The ``table.analyze_column(column, common_limit=10, value_truncate=None)`` method is used by the :ref:`analyze-tables <cli_analyze_tables>` CLI command. It returns a ``ColumnDetails`` named tuple with the following fields:
The ``table.analyze_column(column)`` method is used by the :ref:`analyze-tables <cli_analyze_tables>` CLI command.

It takes the following arguments and options:

``column`` - required
The name of the column to analyze

``common_limit``
The number of most common values to return. Defaults to 10.

``value_truncate``
If set to an integer, values longer than this will be truncated to this length. Defaults to None.

``most_common``
If set to False, the ``most_common`` field of the returned ``ColumnDetails`` will be set to None. Defaults to True.

``least_common``
If set to False, the ``least_common`` field of the returned ``ColumnDetails`` will be set to None. Defaults to True.

And returns a ``ColumnDetails`` named tuple with the following fields:

``table``
The name of the table
Expand All @@ -1141,10 +1160,6 @@ The ``table.analyze_column(column, common_limit=10, value_truncate=None)`` metho
``least_common``
The ``N`` least common values as a list of ``(value, count)`` tuples, or ``None`` if the table is entirely distinct or if the number of distinct values is less than N (since they will already have been returned in ``most_common``)

``N`` defaults to 10, or you can pass a custom ``N`` using the ``common_limit`` parameter.

You can use the ``value_truncate`` parameter to truncate values in the ``most_common`` and ``least_common`` lists to a specified number of characters.

.. _python_api_add_column:

Adding columns
Expand Down
19 changes: 16 additions & 3 deletions sqlite_utils/cli.py
Expand Up @@ -2639,12 +2639,20 @@ def _content_text(p):
help="Specific columns to analyze",
)
@click.option("--save", is_flag=True, help="Save results to _analyze_tables table")
@click.option("--common-limit", type=int, default=10, help="How many common values")
@click.option("--no-most", is_flag=True, default=False, help="Skip most common values")
@click.option(
"--no-least", is_flag=True, default=False, help="Skip least common values"
)
@load_extension_option
def analyze_tables(
path,
tables,
columns,
save,
common_limit,
no_most,
no_least,
load_extension,
):
"""Analyze the columns in one or more tables
Expand All @@ -2656,10 +2664,10 @@ def analyze_tables(
"""
db = sqlite_utils.Database(path)
_load_extensions(db, load_extension)
_analyze(db, tables, columns, save)
_analyze(db, tables, columns, save, common_limit, no_most, no_least)


def _analyze(db, tables, columns, save):
def _analyze(db, tables, columns, save, common_limit=10, no_most=False, no_least=False):
if not tables:
tables = db.table_names()
todo = []
Expand All @@ -2672,7 +2680,12 @@ def _analyze(db, tables, columns, save):
# Now we know how many we need to do
for i, (table, column) in enumerate(todo):
column_details = db[table].analyze_column(
column, total_rows=table_counts[table], value_truncate=80
column,
common_limit=common_limit,
total_rows=table_counts[table],
value_truncate=80,
most_common=not no_most,
least_common=not no_least,
)
if save:
db["_analyze_tables_"].insert(
Expand Down
54 changes: 32 additions & 22 deletions sqlite_utils/db.py
Expand Up @@ -3419,7 +3419,13 @@ def analyze(self):
self.db.analyze(self.name)

def analyze_column(
self, column: str, common_limit: int = 10, value_truncate=None, total_rows=None
self,
column: str,
common_limit: int = 10,
value_truncate=None,
total_rows=None,
most_common: bool = True,
least_common: bool = True,
) -> "ColumnDetails":
"""
Return statistics about the specified column.
Expand All @@ -3430,6 +3436,8 @@ def analyze_column(
:param common_limit: Show this many column values
:param value_truncate: Truncate display of common values to this many characters
:param total_rows: Optimization - pass the total number of rows in the table to save running a fresh ``count(*)`` query
:param most_common: If ``True``, calculate the most common values
:param least_common: If ``True``, calculate the least common values
"""
db = self.db
table = self.name
Expand All @@ -3453,45 +3461,47 @@ def truncate(value):
num_distinct = db.execute(
"select count(distinct [{}]) from [{}]".format(column, table)
).fetchone()[0]
most_common = None
least_common = None
most_common_results = None
least_common_results = None
if num_distinct == 1:
value = db.execute(
"select [{}] from [{}] limit 1".format(column, table)
).fetchone()[0]
most_common = [(truncate(value), total_rows)]
most_common_results = [(truncate(value), total_rows)]
elif num_distinct != total_rows:
most_common = [
(truncate(r[0]), r[1])
for r in db.execute(
"select [{}], count(*) from [{}] group by [{}] order by count(*) desc, [{}] limit {}".format(
column, table, column, column, common_limit
)
).fetchall()
]
most_common.sort(key=lambda p: (p[1], p[0]), reverse=True)
if num_distinct <= common_limit:
# No need to run the query if it will just return the results in reverse order
least_common = None
else:
least_common = [
if most_common:
most_common_results = [
(truncate(r[0]), r[1])
for r in db.execute(
"select [{}], count(*) from [{}] group by [{}] order by count(*), [{}] desc limit {}".format(
"select [{}], count(*) from [{}] group by [{}] order by count(*) desc, [{}] limit {}".format(
column, table, column, column, common_limit
)
).fetchall()
]
least_common.sort(key=lambda p: (p[1], p[0]))
most_common_results.sort(key=lambda p: (p[1], p[0]), reverse=True)
if least_common:
if num_distinct <= common_limit:
# No need to run the query if it will just return the results in reverse order
least_common_results = None
else:
least_common_results = [
(truncate(r[0]), r[1])
for r in db.execute(
"select [{}], count(*) from [{}] group by [{}] order by count(*), [{}] desc limit {}".format(
column, table, column, column, common_limit
)
).fetchall()
]
least_common_results.sort(key=lambda p: (p[1], p[0]))
return ColumnDetails(
self.name,
column,
total_rows,
num_null,
num_blank,
num_distinct,
most_common,
least_common,
most_common_results,
least_common_results,
)

def add_geometry_column(
Expand Down
99 changes: 96 additions & 3 deletions tests/test_analyze_tables.py
Expand Up @@ -24,11 +24,34 @@ def db_to_analyze(fresh_db):
return fresh_db


@pytest.fixture
def big_db_to_analyze_path(tmpdir):
    """Path to a database file with a "stuff" table of 100 categorised rows.

    The category distribution is deliberately skewed (A: 40, B: 30, C: 20,
    D: 10) so most/least common value analysis produces distinct results.
    """
    db_path = str(tmpdir / "test.db")
    db = Database(db_path)
    category_counts = {"A": 40, "B": 30, "C": 20, "D": 10}
    rows = [
        {"category": category}
        for category, count in category_counts.items()
        for _ in range(count)
    ]
    db["stuff"].insert_all(rows)
    return db_path


@pytest.mark.parametrize(
"column,expected",
"column,extra_kwargs,expected",
[
(
"id",
{},
ColumnDetails(
table="stuff",
column="id",
Expand All @@ -42,6 +65,7 @@ def db_to_analyze(fresh_db):
),
(
"owner",
{},
ColumnDetails(
table="stuff",
column="owner",
Expand All @@ -55,6 +79,7 @@ def db_to_analyze(fresh_db):
),
(
"size",
{},
ColumnDetails(
table="stuff",
column="size",
Expand All @@ -66,11 +91,41 @@ def db_to_analyze(fresh_db):
least_common=None,
),
),
(
"owner",
{"most_common": False},
ColumnDetails(
table="stuff",
column="owner",
total_rows=8,
num_null=0,
num_blank=0,
num_distinct=4,
most_common=None,
least_common=[("Anne", 1), ("Terry...", 2)],
),
),
(
"owner",
{"least_common": False},
ColumnDetails(
table="stuff",
column="owner",
total_rows=8,
num_null=0,
num_blank=0,
num_distinct=4,
most_common=[("Joan", 3), ("Kumar", 2)],
least_common=None,
),
),
],
)
def test_analyze_column(db_to_analyze, column, expected):
def test_analyze_column(db_to_analyze, column, extra_kwargs, expected):
assert (
db_to_analyze["stuff"].analyze_column(column, common_limit=2, value_truncate=5)
db_to_analyze["stuff"].analyze_column(
column, common_limit=2, value_truncate=5, **extra_kwargs
)
== expected
)

Expand Down Expand Up @@ -164,3 +219,41 @@ def test_analyze_table_save(db_to_analyze_path):
"least_common": None,
},
]


@pytest.mark.parametrize(
    "no_most,no_least",
    (
        (False, False),
        (True, False),
        (False, True),
        (True, True),
    ),
)
def test_analyze_table_save_no_most_no_least_options(
    no_most, no_least, big_db_to_analyze_path
):
    """--no-most / --no-least should null out the saved common-value columns."""
    args = ["analyze-tables", big_db_to_analyze_path, "--save", "--common-limit", "2"]
    for flag, enabled in (("--no-most", no_most), ("--no-least", no_least)):
        if enabled:
            args.append(flag)
    result = CliRunner().invoke(cli.cli, args)
    assert result.exit_code == 0
    saved_rows = list(Database(big_db_to_analyze_path)["_analyze_tables_"].rows)
    expected_row = {
        "table": "stuff",
        "column": "category",
        "total_rows": 100,
        "num_null": 0,
        "num_blank": 0,
        "num_distinct": 4,
        # Skipped analyses are persisted as NULL; otherwise a JSON-encoded
        # list of the top-2 (value, count) pairs, per --common-limit 2.
        "most_common": None if no_most else '[["A", 40], ["B", 30]]',
        "least_common": None if no_least else '[["D", 10], ["C", 20]]',
    }
    assert saved_rows == [expected_row]

0 comments on commit d2a7b15

Please sign in to comment.