From d2a7b15b2b930fe384e1f1715fc4af23386f4935 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 21 May 2023 09:19:30 -0700 Subject: [PATCH] Analyze tables options: --common-limit, --no-most, --no-least Closes #544 --- docs/cli-reference.rst | 11 ++-- docs/cli.rst | 10 ++-- docs/python-api.rst | 25 +++++++-- sqlite_utils/cli.py | 19 +++++-- sqlite_utils/db.py | 54 ++++++++++++-------- tests/test_analyze_tables.py | 99 ++++++++++++++++++++++++++++++++++-- 6 files changed, 178 insertions(+), 40 deletions(-) diff --git a/docs/cli-reference.rst b/docs/cli-reference.rst index 8993213e1..c1081382f 100644 --- a/docs/cli-reference.rst +++ b/docs/cli-reference.rst @@ -564,10 +564,13 @@ See :ref:`cli_analyze_tables`. sqlite-utils analyze-tables data.db trees Options: - -c, --column TEXT Specific columns to analyze - --save Save results to _analyze_tables table - --load-extension TEXT Path to SQLite extension, with optional :entrypoint - -h, --help Show this message and exit. + -c, --column TEXT Specific columns to analyze + --save Save results to _analyze_tables table + --common-limit INTEGER How many common values + --no-most Skip most common values + --no-least Skip least common values + --load-extension TEXT Path to SQLite extension, with optional :entrypoint + -h, --help Show this message and exit. .. _cli_ref_convert: diff --git a/docs/cli.rst b/docs/cli.rst index 572b0e5f2..d567cbac4 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -730,11 +730,15 @@ For each column this tool displays the number of null rows, the number of blank If you do not specify any tables every table in the database will be analyzed:: - $ sqlite-utils analyze-tables github.db + sqlite-utils analyze-tables github.db If you wish to analyze one or more specific columns, use the ``-c`` option:: - $ sqlite-utils analyze-tables github.db tags -c sha + sqlite-utils analyze-tables github.db tags -c sha + +To show more than 10 common values, use ``--common-limit 20``. To skip the most common or least common value analysis, use ``--no-most`` or ``--no-least``:: + + sqlite-utils analyze-tables github.db tags --common-limit 20 --no-least .. _cli_analyze_tables_save: @@ -743,7 +747,7 @@ Saving the analyzed table details ``analyze-tables`` can take quite a while to run for large database files. You can save the results of the analysis to a database table called ``_analyze_tables_`` using the ``--save`` option:: - $ sqlite-utils analyze-tables github.db --save + sqlite-utils analyze-tables github.db --save The ``_analyze_tables_`` table has the following schema:: diff --git a/docs/python-api.rst b/docs/python-api.rst index 4d883db42..4b143c3da 100644 --- a/docs/python-api.rst +++ b/docs/python-api.rst @@ -1115,7 +1115,26 @@ You can inspect the database to see the results like this:: Analyzing a column ================== -The ``table.analyze_column(column, common_limit=10, value_truncate=None)`` method is used by the :ref:`analyze-tables ` CLI command. It returns a ``ColumnDetails`` named tuple with the following fields: +The ``table.analyze_column(column)`` method is used by the :ref:`analyze-tables ` CLI command. + +It takes the following arguments and options: + +``column`` - required + The name of the column to analyze + +``common_limit`` + The number of most common values to return. Defaults to 10. + +``value_truncate`` + If set to an integer, values longer than this will be truncated to this length. Defaults to None. + +``most_common`` + If set to False, the ``most_common`` field of the returned ``ColumnDetails`` will be set to None. Defaults to True. + +``least_common`` + If set to False, the ``least_common`` field of the returned ``ColumnDetails`` will be set to None. Defaults to True. + +And returns a ``ColumnDetails`` named tuple with the following fields: ``table`` The name of the table @@ -1141,10 +1160,6 @@ The ``table.analyze_column(column, common_limit=10, value_truncate=None)`` metho ``least_common`` The ``N`` least common values as a list of ``(value, count)`` tuples`, or ``None`` if the table is entirely distinct or if the number of distinct values is less than N (since they will already have been returned in ``most_common``) -``N`` defaults to 10, or you can pass a custom ``N`` using the ``common_limit`` parameter. - -You can use the ``value_truncate`` parameter to truncate values in the ``most_common`` and ``least_common`` lists to a specified number of characters. - .. _python_api_add_column: Adding columns diff --git a/sqlite_utils/cli.py b/sqlite_utils/cli.py index ce354e057..0c91a8f56 100644 --- a/sqlite_utils/cli.py +++ b/sqlite_utils/cli.py @@ -2639,12 +2639,20 @@ def _content_text(p): help="Specific columns to analyze", ) @click.option("--save", is_flag=True, help="Save results to _analyze_tables table") +@click.option("--common-limit", type=int, default=10, help="How many common values") +@click.option("--no-most", is_flag=True, default=False, help="Skip most common values") +@click.option( + "--no-least", is_flag=True, default=False, help="Skip least common values" +) @load_extension_option def analyze_tables( path, tables, columns, save, + common_limit, + no_most, + no_least, load_extension, ): """Analyze the columns in one or more tables @@ -2656,10 +2664,10 @@ def analyze_tables( """ db = sqlite_utils.Database(path) _load_extensions(db, load_extension) - _analyze(db, tables, columns, save) + _analyze(db, tables, columns, save, common_limit, no_most, no_least) -def _analyze(db, tables, columns, save): +def _analyze(db, tables, columns, save, common_limit=10, no_most=False, no_least=False): if not tables: tables = db.table_names() todo = [] @@ -2672,7 +2680,12 @@ def _analyze(db, tables, columns, save): # Now we now how many we need to do for i, (table, column) in enumerate(todo): column_details = db[table].analyze_column( - column, total_rows=table_counts[table], value_truncate=80 + column, + common_limit=common_limit, + total_rows=table_counts[table], + value_truncate=80, + most_common=not no_most, + least_common=not no_least, ) if save: db["_analyze_tables_"].insert( diff --git a/sqlite_utils/db.py b/sqlite_utils/db.py index ec4bbfc03..850a3aef3 100644 --- a/sqlite_utils/db.py +++ b/sqlite_utils/db.py @@ -3419,7 +3419,13 @@ def analyze(self): self.db.analyze(self.name) def analyze_column( - self, column: str, common_limit: int = 10, value_truncate=None, total_rows=None + self, + column: str, + common_limit: int = 10, + value_truncate=None, + total_rows=None, + most_common: bool = True, + least_common: bool = True, ) -> "ColumnDetails": """ Return statistics about the specified column. @@ -3430,6 +3436,8 @@ def analyze_column( :param common_limit: Show this many column values :param value_truncate: Truncate display of common values to this many characters :param total_rows: Optimization - pass the total number of rows in the table to save running a fresh ``count(*)`` query + :param most_common: If ``True``, calculate the most common values + :param least_common: If ``True``, calculate the least common values """ db = self.db table = self.name @@ -3453,36 +3461,38 @@ def truncate(value): num_distinct = db.execute( "select count(distinct [{}]) from [{}]".format(column, table) ).fetchone()[0] - most_common = None - least_common = None + most_common_results = None + least_common_results = None if num_distinct == 1: value = db.execute( "select [{}] from [{}] limit 1".format(column, table) ).fetchone()[0] - most_common = [(truncate(value), total_rows)] + most_common_results = [(truncate(value), total_rows)] elif num_distinct != total_rows: - most_common = [ - (truncate(r[0]), r[1]) - for r in db.execute( - "select [{}], count(*) from [{}] group by [{}] order by count(*) desc, [{}] limit {}".format( - column, table, column, column, common_limit - ) - ).fetchall() - ] - most_common.sort(key=lambda p: (p[1], p[0]), reverse=True) - if num_distinct <= common_limit: - # No need to run the query if it will just return the results in revers order - least_common = None - else: - least_common = [ + if most_common: + most_common_results = [ (truncate(r[0]), r[1]) for r in db.execute( - "select [{}], count(*) from [{}] group by [{}] order by count(*), [{}] desc limit {}".format( + "select [{}], count(*) from [{}] group by [{}] order by count(*) desc, [{}] limit {}".format( column, table, column, column, common_limit ) ).fetchall() ] - least_common.sort(key=lambda p: (p[1], p[0])) + most_common_results.sort(key=lambda p: (p[1], p[0]), reverse=True) + if least_common: + if num_distinct <= common_limit: + # No need to run the query if it will just return the results in revers order + least_common_results = None + else: + least_common_results = [ + (truncate(r[0]), r[1]) + for r in db.execute( + "select [{}], count(*) from [{}] group by [{}] order by count(*), [{}] desc limit {}".format( + column, table, column, column, common_limit + ) + ).fetchall() + ] + least_common_results.sort(key=lambda p: (p[1], p[0])) return ColumnDetails( self.name, column, @@ -3490,8 +3500,8 @@ def truncate(value): num_null, num_blank, num_distinct, - most_common, - least_common, + most_common_results, + least_common_results, ) def add_geometry_column( diff --git a/tests/test_analyze_tables.py b/tests/test_analyze_tables.py index 5795a7a1b..a3e4e2f2d 100644 --- a/tests/test_analyze_tables.py +++ b/tests/test_analyze_tables.py @@ -24,11 +24,34 @@ def db_to_analyze(fresh_db): return fresh_db +@pytest.fixture +def big_db_to_analyze_path(tmpdir): + path = str(tmpdir / "test.db") + db = Database(path) + categories = { + "A": 40, + "B": 30, + "C": 20, + "D": 10, + } + to_insert = [] + for category, count in categories.items(): + for _ in range(count): + to_insert.append( + { + "category": category, + } + ) + db["stuff"].insert_all(to_insert) + return path + + @pytest.mark.parametrize( - "column,expected", + "column,extra_kwargs,expected", [ ( "id", + {}, ColumnDetails( table="stuff", column="id", @@ -42,6 +65,7 @@ def db_to_analyze(fresh_db): ), ( "owner", + {}, ColumnDetails( table="stuff", column="owner", @@ -55,6 +79,7 @@ def db_to_analyze(fresh_db): ), ( "size", + {}, ColumnDetails( table="stuff", column="size", @@ -66,11 +91,41 @@ def db_to_analyze(fresh_db): least_common=None, ), ), + ( + "owner", + {"most_common": False}, + ColumnDetails( + table="stuff", + column="owner", + total_rows=8, + num_null=0, + num_blank=0, + num_distinct=4, + most_common=None, + least_common=[("Anne", 1), ("Terry...", 2)], + ), + ), + ( + "owner", + {"least_common": False}, + ColumnDetails( + table="stuff", + column="owner", + total_rows=8, + num_null=0, + num_blank=0, + num_distinct=4, + most_common=[("Joan", 3), ("Kumar", 2)], + least_common=None, + ), + ), ], ) -def test_analyze_column(db_to_analyze, column, expected): +def test_analyze_column(db_to_analyze, column, extra_kwargs, expected): assert ( - db_to_analyze["stuff"].analyze_column(column, common_limit=2, value_truncate=5) + db_to_analyze["stuff"].analyze_column( + column, common_limit=2, value_truncate=5, **extra_kwargs + ) == expected ) @@ -164,3 +219,41 @@ def test_analyze_table_save(db_to_analyze_path): "least_common": None, }, ] + + +@pytest.mark.parametrize( + "no_most,no_least", + ( + (False, False), + (True, False), + (False, True), + (True, True), + ), +) +def test_analyze_table_save_no_most_no_least_options( + no_most, no_least, big_db_to_analyze_path +): + args = ["analyze-tables", big_db_to_analyze_path, "--save", "--common-limit", "2"] + if no_most: + args.append("--no-most") + if no_least: + args.append("--no-least") + result = CliRunner().invoke(cli.cli, args) + assert result.exit_code == 0 + rows = list(Database(big_db_to_analyze_path)["_analyze_tables_"].rows) + expected = { + "table": "stuff", + "column": "category", + "total_rows": 100, + "num_null": 0, + "num_blank": 0, + "num_distinct": 4, + "most_common": None, + "least_common": None, + } + if not no_most: + expected["most_common"] = '[["A", 40], ["B", 30]]' + if not no_least: + expected["least_common"] = '[["D", 10], ["C", 20]]' + + assert rows == [expected]