Skip to content

Commit

Permalink
Analyze tables options: --common-limit, --no-most, --no-least
Browse files Browse the repository at this point in the history
Closes #544
  • Loading branch information
simonw committed May 21, 2023
1 parent e047cc3 commit d2a7b15
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 40 deletions.
11 changes: 7 additions & 4 deletions docs/cli-reference.rst
Expand Up @@ -564,10 +564,13 @@ See :ref:`cli_analyze_tables`.
sqlite-utils analyze-tables data.db trees

Options:
-c, --column TEXT Specific columns to analyze
--save Save results to _analyze_tables table
--load-extension TEXT Path to SQLite extension, with optional :entrypoint
-h, --help Show this message and exit.
-c, --column TEXT Specific columns to analyze
--save Save results to _analyze_tables table
--common-limit INTEGER How many common values
--no-most Skip most common values
--no-least Skip least common values
--load-extension TEXT Path to SQLite extension, with optional :entrypoint
-h, --help Show this message and exit.


.. _cli_ref_convert:
Expand Down
10 changes: 7 additions & 3 deletions docs/cli.rst
Expand Up @@ -730,11 +730,15 @@ For each column this tool displays the number of null rows, the number of blank

If you do not specify any tables every table in the database will be analyzed::

$ sqlite-utils analyze-tables github.db
sqlite-utils analyze-tables github.db

If you wish to analyze one or more specific columns, use the ``-c`` option::

$ sqlite-utils analyze-tables github.db tags -c sha
sqlite-utils analyze-tables github.db tags -c sha

To show more than 10 common values, use ``--common-limit 20``. To skip the most common or least common value analysis, use ``--no-most`` or ``--no-least``::

sqlite-utils analyze-tables github.db tags --common-limit 20 --no-least

.. _cli_analyze_tables_save:

Expand All @@ -743,7 +747,7 @@ Saving the analyzed table details

``analyze-tables`` can take quite a while to run for large database files. You can save the results of the analysis to a database table called ``_analyze_tables_`` using the ``--save`` option::

$ sqlite-utils analyze-tables github.db --save
sqlite-utils analyze-tables github.db --save

The ``_analyze_tables_`` table has the following schema::

Expand Down
25 changes: 20 additions & 5 deletions docs/python-api.rst
Expand Up @@ -1115,7 +1115,26 @@ You can inspect the database to see the results like this::
Analyzing a column
==================

The ``table.analyze_column(column, common_limit=10, value_truncate=None)`` method is used by the :ref:`analyze-tables <cli_analyze_tables>` CLI command. It returns a ``ColumnDetails`` named tuple with the following fields:
The ``table.analyze_column(column)`` method is used by the :ref:`analyze-tables <cli_analyze_tables>` CLI command.

It takes the following arguments and options:

``column`` - required
The name of the column to analyze

``common_limit``
The number of most common values to return. Defaults to 10.

``value_truncate``
If set to an integer, values longer than this will be truncated to this length. Defaults to None.

``most_common``
If set to False, the ``most_common`` field of the returned ``ColumnDetails`` will be set to None. Defaults to True.

``least_common``
If set to False, the ``least_common`` field of the returned ``ColumnDetails`` will be set to None. Defaults to True.

And returns a ``ColumnDetails`` named tuple with the following fields:

``table``
The name of the table
Expand All @@ -1141,10 +1160,6 @@ The ``table.analyze_column(column, common_limit=10, value_truncate=None)`` metho
``least_common``
The ``N`` least common values as a list of ``(value, count)`` tuples, or ``None`` if the table is entirely distinct or if the number of distinct values is less than N (since they will already have been returned in ``most_common``)

``N`` defaults to 10, or you can pass a custom ``N`` using the ``common_limit`` parameter.

You can use the ``value_truncate`` parameter to truncate values in the ``most_common`` and ``least_common`` lists to a specified number of characters.

.. _python_api_add_column:

Adding columns
Expand Down
19 changes: 16 additions & 3 deletions sqlite_utils/cli.py
Expand Up @@ -2639,12 +2639,20 @@ def _content_text(p):
help="Specific columns to analyze",
)
@click.option("--save", is_flag=True, help="Save results to _analyze_tables table")
@click.option("--common-limit", type=int, default=10, help="How many common values")
@click.option("--no-most", is_flag=True, default=False, help="Skip most common values")
@click.option(
"--no-least", is_flag=True, default=False, help="Skip least common values"
)
@load_extension_option
def analyze_tables(
path,
tables,
columns,
save,
common_limit,
no_most,
no_least,
load_extension,
):
"""Analyze the columns in one or more tables
Expand All @@ -2656,10 +2664,10 @@ def analyze_tables(
"""
db = sqlite_utils.Database(path)
_load_extensions(db, load_extension)
_analyze(db, tables, columns, save)
_analyze(db, tables, columns, save, common_limit, no_most, no_least)


def _analyze(db, tables, columns, save):
def _analyze(db, tables, columns, save, common_limit=10, no_most=False, no_least=False):
if not tables:
tables = db.table_names()
todo = []
Expand All @@ -2672,7 +2680,12 @@ def _analyze(db, tables, columns, save):
# Now we know how many we need to do
for i, (table, column) in enumerate(todo):
column_details = db[table].analyze_column(
column, total_rows=table_counts[table], value_truncate=80
column,
common_limit=common_limit,
total_rows=table_counts[table],
value_truncate=80,
most_common=not no_most,
least_common=not no_least,
)
if save:
db["_analyze_tables_"].insert(
Expand Down
54 changes: 32 additions & 22 deletions sqlite_utils/db.py
Expand Up @@ -3419,7 +3419,13 @@ def analyze(self):
self.db.analyze(self.name)

def analyze_column(
self, column: str, common_limit: int = 10, value_truncate=None, total_rows=None
self,
column: str,
common_limit: int = 10,
value_truncate=None,
total_rows=None,
most_common: bool = True,
least_common: bool = True,
) -> "ColumnDetails":
"""
Return statistics about the specified column.
Expand All @@ -3430,6 +3436,8 @@ def analyze_column(
:param common_limit: Show this many column values
:param value_truncate: Truncate display of common values to this many characters
:param total_rows: Optimization - pass the total number of rows in the table to save running a fresh ``count(*)`` query
:param most_common: If ``True``, calculate the most common values
:param least_common: If ``True``, calculate the least common values
"""
db = self.db
table = self.name
Expand All @@ -3453,45 +3461,47 @@ def truncate(value):
num_distinct = db.execute(
"select count(distinct [{}]) from [{}]".format(column, table)
).fetchone()[0]
most_common = None
least_common = None
most_common_results = None
least_common_results = None
if num_distinct == 1:
value = db.execute(
"select [{}] from [{}] limit 1".format(column, table)
).fetchone()[0]
most_common = [(truncate(value), total_rows)]
most_common_results = [(truncate(value), total_rows)]
elif num_distinct != total_rows:
most_common = [
(truncate(r[0]), r[1])
for r in db.execute(
"select [{}], count(*) from [{}] group by [{}] order by count(*) desc, [{}] limit {}".format(
column, table, column, column, common_limit
)
).fetchall()
]
most_common.sort(key=lambda p: (p[1], p[0]), reverse=True)
if num_distinct <= common_limit:
# No need to run the query if it will just return the results in reverse order
least_common = None
else:
least_common = [
if most_common:
most_common_results = [
(truncate(r[0]), r[1])
for r in db.execute(
"select [{}], count(*) from [{}] group by [{}] order by count(*), [{}] desc limit {}".format(
"select [{}], count(*) from [{}] group by [{}] order by count(*) desc, [{}] limit {}".format(
column, table, column, column, common_limit
)
).fetchall()
]
least_common.sort(key=lambda p: (p[1], p[0]))
most_common_results.sort(key=lambda p: (p[1], p[0]), reverse=True)
if least_common:
if num_distinct <= common_limit:
# No need to run the query if it will just return the results in reverse order
least_common_results = None
else:
least_common_results = [
(truncate(r[0]), r[1])
for r in db.execute(
"select [{}], count(*) from [{}] group by [{}] order by count(*), [{}] desc limit {}".format(
column, table, column, column, common_limit
)
).fetchall()
]
least_common_results.sort(key=lambda p: (p[1], p[0]))
return ColumnDetails(
self.name,
column,
total_rows,
num_null,
num_blank,
num_distinct,
most_common,
least_common,
most_common_results,
least_common_results,
)

def add_geometry_column(
Expand Down
99 changes: 96 additions & 3 deletions tests/test_analyze_tables.py
Expand Up @@ -24,11 +24,34 @@ def db_to_analyze(fresh_db):
return fresh_db


@pytest.fixture
def big_db_to_analyze_path(tmpdir):
    """Path to a database file with a "stuff" table of 100 categorised rows.

    The category distribution is deliberately skewed (A: 40, B: 30, C: 20,
    D: 10) so most/least common value analysis produces distinct results.
    """
    db_path = str(tmpdir / "test.db")
    db = Database(db_path)
    category_counts = {"A": 40, "B": 30, "C": 20, "D": 10}
    rows = [
        {"category": category}
        for category, count in category_counts.items()
        for _ in range(count)
    ]
    db["stuff"].insert_all(rows)
    return db_path


@pytest.mark.parametrize(
"column,expected",
"column,extra_kwargs,expected",
[
(
"id",
{},
ColumnDetails(
table="stuff",
column="id",
Expand All @@ -42,6 +65,7 @@ def db_to_analyze(fresh_db):
),
(
"owner",
{},
ColumnDetails(
table="stuff",
column="owner",
Expand All @@ -55,6 +79,7 @@ def db_to_analyze(fresh_db):
),
(
"size",
{},
ColumnDetails(
table="stuff",
column="size",
Expand All @@ -66,11 +91,41 @@ def db_to_analyze(fresh_db):
least_common=None,
),
),
(
"owner",
{"most_common": False},
ColumnDetails(
table="stuff",
column="owner",
total_rows=8,
num_null=0,
num_blank=0,
num_distinct=4,
most_common=None,
least_common=[("Anne", 1), ("Terry...", 2)],
),
),
(
"owner",
{"least_common": False},
ColumnDetails(
table="stuff",
column="owner",
total_rows=8,
num_null=0,
num_blank=0,
num_distinct=4,
most_common=[("Joan", 3), ("Kumar", 2)],
least_common=None,
),
),
],
)
def test_analyze_column(db_to_analyze, column, expected):
def test_analyze_column(db_to_analyze, column, extra_kwargs, expected):
assert (
db_to_analyze["stuff"].analyze_column(column, common_limit=2, value_truncate=5)
db_to_analyze["stuff"].analyze_column(
column, common_limit=2, value_truncate=5, **extra_kwargs
)
== expected
)

Expand Down Expand Up @@ -164,3 +219,41 @@ def test_analyze_table_save(db_to_analyze_path):
"least_common": None,
},
]


@pytest.mark.parametrize(
    "no_most,no_least",
    (
        (False, False),
        (True, False),
        (False, True),
        (True, True),
    ),
)
def test_analyze_table_save_no_most_no_least_options(
    no_most, no_least, big_db_to_analyze_path
):
    """--no-most / --no-least should null out the saved common-value columns."""
    args = ["analyze-tables", big_db_to_analyze_path, "--save", "--common-limit", "2"]
    for flag, enabled in (("--no-most", no_most), ("--no-least", no_least)):
        if enabled:
            args.append(flag)
    result = CliRunner().invoke(cli.cli, args)
    assert result.exit_code == 0
    saved_rows = list(Database(big_db_to_analyze_path)["_analyze_tables_"].rows)
    expected_row = {
        "table": "stuff",
        "column": "category",
        "total_rows": 100,
        "num_null": 0,
        "num_blank": 0,
        "num_distinct": 4,
        # Skipped analyses are persisted as NULL; otherwise a JSON-encoded
        # list of the top-2 (value, count) pairs, per --common-limit 2.
        "most_common": None if no_most else '[["A", 40], ["B", 30]]',
        "least_common": None if no_least else '[["D", 10], ["C", 20]]',
    }
    assert saved_rows == [expected_row]

0 comments on commit d2a7b15

Please sign in to comment.