diff --git a/docs/changelog.rst b/docs/changelog.rst index 32162149..baa68647 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -11,6 +11,7 @@ Unreleased - The ``table.insert_all()`` and ``table.upsert_all()`` methods can now accept an iterator of lists or tuples as an alternative to dictionaries. The first item should be a list/tuple of column names. See :ref:`python_api_insert_lists` for details. (:issue:`672`) - **Breaking change:** The default floating point column type has been changed from ``FLOAT`` to ``REAL``, which is the correct SQLite type for floating point values. This affects auto-detected columns when inserting data. (:issue:`645`) +- **Breaking change:** Type detection is now the default behavior for the ``insert`` and ``upsert`` CLI commands when importing CSV or TSV data. Previously all columns were treated as ``TEXT`` unless the ``--detect-types`` flag was passed. Use the new ``--no-detect-types`` flag to restore the old behavior. The ``SQLITE_UTILS_DETECT_TYPES`` environment variable has been removed. (:issue:`679`) .. _v4_0a0: diff --git a/docs/cli-reference.rst b/docs/cli-reference.rst index eb8d5fd2..db0ef359 100644 --- a/docs/cli-reference.rst +++ b/docs/cli-reference.rst @@ -285,7 +285,8 @@ See :ref:`cli_inserting_data`, :ref:`cli_insert_csv_tsv`, :ref:`cli_insert_unstr --alter Alter existing table to add any missing columns --not-null TEXT Columns that should be created as NOT NULL --default ... Default value that should be set for a column - -d, --detect-types Detect types for columns in CSV/TSV data + -d, --detect-types Detect types for columns in CSV/TSV data (default) + --no-detect-types Treat all CSV/TSV columns as TEXT --analyze Run ANALYZE at the end of this operation --load-extension TEXT Path to SQLite extension, with optional :entrypoint --silent Do not show progress bar @@ -342,7 +343,8 @@ See :ref:`cli_upsert`. 
--alter Alter existing table to add any missing columns --not-null TEXT Columns that should be created as NOT NULL --default ... Default value that should be set for a column - -d, --detect-types Detect types for columns in CSV/TSV data + -d, --detect-types Detect types for columns in CSV/TSV data (default) + --no-detect-types Treat all CSV/TSV columns as TEXT --analyze Run ANALYZE at the end of this operation --load-extension TEXT Path to SQLite extension, with optional :entrypoint --silent Do not show progress bar diff --git a/docs/cli.rst b/docs/cli.rst index 91fff54a..1b9d0dad 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -508,7 +508,7 @@ Incoming CSV data will be assumed to use ``utf-8``. If your data uses a differen If you are joining across multiple CSV files they must all use the same encoding. -Column types will be automatically detected in CSV or TSV data, using the same mechanism as ``--detect-types`` described in :ref:`cli_insert_csv_tsv`. You can pass the ``--no-detect-types`` option to disable this automatic type detection and treat all CSV and TSV columns as ``TEXT``. +Column types will be automatically detected in CSV or TSV data, as described in :ref:`cli_insert_csv_tsv`. You can pass the ``--no-detect-types`` option to disable this automatic type detection and treat all CSV and TSV columns as ``TEXT``. .. _cli_memory_explicit: @@ -1263,7 +1263,7 @@ To stop inserting after a specified number of records - useful for getting a fas A progress bar is displayed when inserting data from a file. You can hide the progress bar using the ``--silent`` option. -By default every column inserted from a CSV or TSV file will be of type ``TEXT``. To automatically detect column types - resulting in a mix of ``TEXT``, ``INTEGER`` and ``REAL`` columns, use the ``--detect-types`` option (or its shortcut ``-d``). +By default, column types are automatically detected for CSV or TSV files - resulting in a mix of ``TEXT``, ``INTEGER`` and ``REAL`` columns. 
To disable type detection and treat all columns as ``TEXT``, use the ``--no-detect-types`` option. For example, given a ``creatures.csv`` file containing this: @@ -1277,9 +1277,9 @@ The following command: .. code-block:: bash - sqlite-utils insert creatures.db creatures creatures.csv --csv --detect-types + sqlite-utils insert creatures.db creatures creatures.csv --csv -Will produce this schema: +Will produce this schema with automatically detected types: .. code-block:: bash @@ -1293,11 +1293,11 @@ Will produce this schema: "weight" REAL ); -You can set the ``SQLITE_UTILS_DETECT_TYPES`` environment variable if you want ``--detect-types`` to be the default behavior: +To disable type detection and treat all columns as ``TEXT``, use ``--no-detect-types``: .. code-block:: bash - export SQLITE_UTILS_DETECT_TYPES=1 + sqlite-utils insert creatures.db creatures creatures.csv --csv --no-detect-types If a CSV or TSV file includes empty cells, like this one: diff --git a/sqlite_utils/cli.py b/sqlite_utils/cli.py index 92bb6d55..94791211 100644 --- a/sqlite_utils/cli.py +++ b/sqlite_utils/cli.py @@ -898,8 +898,12 @@ def inner(fn): "-d", "--detect-types", is_flag=True, - envvar="SQLITE_UTILS_DETECT_TYPES", - help="Detect types for columns in CSV/TSV data", + help="Detect types for columns in CSV/TSV data (default)", + ), + click.option( + "--no-detect-types", + is_flag=True, + help="Treat all CSV/TSV columns as TEXT", ), click.option( "--analyze", @@ -951,6 +955,7 @@ def insert_upsert_implementation( not_null=None, default=None, detect_types=None, + no_detect_types=False, analyze=False, load_extension=None, silent=False, @@ -1019,7 +1024,8 @@ def insert_upsert_implementation( ) else: docs = (dict(zip(headers, row)) for row in reader) - if detect_types: + # detect_types is now the default, unless --no-detect-types is passed + if not no_detect_types: tracker = TypeTracker() docs = tracker.wrap(docs) elif lines: @@ -1191,6 +1197,7 @@ def insert( stop_after, alter, detect_types, + 
no_detect_types, analyze, load_extension, silent, @@ -1273,6 +1280,7 @@ def insert( replace=replace, truncate=truncate, detect_types=detect_types, + no_detect_types=no_detect_types, analyze=analyze, load_extension=load_extension, silent=silent, @@ -1311,6 +1319,7 @@ def upsert( not_null, default, detect_types, + no_detect_types, analyze, load_extension, silent, @@ -1356,6 +1365,7 @@ def upsert( not_null=not_null, default=default, detect_types=detect_types, + no_detect_types=no_detect_types, analyze=analyze, load_extension=load_extension, silent=silent, @@ -1443,6 +1453,7 @@ def bulk( not_null=set(), default={}, detect_types=False, + no_detect_types=True, load_extension=load_extension, silent=False, bulk_sql=sql, diff --git a/tests/test_cli.py b/tests/test_cli.py index 668068f6..4198727e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,7 +4,6 @@ from pathlib import Path import subprocess import sys -from unittest import mock import json import os import pytest @@ -1907,7 +1906,16 @@ def test_insert_encoding(tmpdir): # Using --encoding=latin-1 should work good_result = CliRunner().invoke( cli.cli, - ["insert", db_path, "places", csv_path, "--encoding", "latin-1", "--csv"], + [ + "insert", + db_path, + "places", + csv_path, + "--encoding", + "latin-1", + "--csv", + "--no-detect-types", + ], catch_exceptions=False, ) assert good_result.exit_code == 0 @@ -2196,7 +2204,7 @@ def test_import_no_headers(tmpdir, args, tsv): csv_file.write("Tracy{sep}Spider{sep}7\n".format(sep=sep)) result = CliRunner().invoke( cli.cli, - ["insert", db_path, "creatures", csv_path] + args, + ["insert", db_path, "creatures", csv_path] + args + ["--no-detect-types"], catch_exceptions=False, ) assert result.exit_code == 0, result.output @@ -2245,13 +2253,22 @@ def test_csv_insert_bom(tmpdir): fp.write(b"\xef\xbb\xbfname,age\nCleo,5") result = CliRunner().invoke( cli.cli, - ["insert", db_path, "broken", bom_csv_path, "--encoding", "utf-8", "--csv"], + [ + "insert", + db_path, + 
"broken", + bom_csv_path, + "--encoding", + "utf-8", + "--csv", + "--no-detect-types", + ], catch_exceptions=False, ) assert result.exit_code == 0 result2 = CliRunner().invoke( cli.cli, - ["insert", db_path, "fixed", bom_csv_path, "--csv"], + ["insert", db_path, "fixed", bom_csv_path, "--csv", "--no-detect-types"], catch_exceptions=False, ) assert result2.exit_code == 0 @@ -2263,43 +2280,40 @@ def test_csv_insert_bom(tmpdir): ] -@pytest.mark.parametrize("option_or_env_var", (None, "-d", "--detect-types")) -def test_insert_detect_types(tmpdir, option_or_env_var): +@pytest.mark.parametrize("option", (None, "-d", "--detect-types")) +def test_insert_detect_types(tmpdir, option): + """Test that type detection is now the default behavior""" db_path = str(tmpdir / "test.db") data = "name,age,weight\nCleo,6,45.5\nDori,1,3.5" extra = [] - if option_or_env_var: - extra = [option_or_env_var] + if option: + extra = [option] - def _test(): - result = CliRunner().invoke( - cli.cli, - ["insert", db_path, "creatures", "-", "--csv"] + extra, - catch_exceptions=False, - input=data, - ) - assert result.exit_code == 0 - db = Database(db_path) - assert list(db["creatures"].rows) == [ - {"name": "Cleo", "age": 6, "weight": 45.5}, - {"name": "Dori", "age": 1, "weight": 3.5}, - ] - - if option_or_env_var is None: - # Use environment variable instead of option - with mock.patch.dict(os.environ, {"SQLITE_UTILS_DETECT_TYPES": "1"}): - _test() - else: - _test() + result = CliRunner().invoke( + cli.cli, + ["insert", db_path, "creatures", "-", "--csv"] + extra, + catch_exceptions=False, + input=data, + ) + assert result.exit_code == 0 + db = Database(db_path) + assert list(db["creatures"].rows) == [ + {"name": "Cleo", "age": 6, "weight": 45.5}, + {"name": "Dori", "age": 1, "weight": 3.5}, + ] -@pytest.mark.parametrize("option", ("-d", "--detect-types")) +@pytest.mark.parametrize("option", (None, "-d", "--detect-types")) def test_upsert_detect_types(tmpdir, option): + """Test that type detection 
is now the default behavior for upsert""" db_path = str(tmpdir / "test.db") data = "id,name,age,weight\n1,Cleo,6,45.5\n2,Dori,1,3.5" + extra = [] + if option: + extra = [option] result = CliRunner().invoke( cli.cli, - ["upsert", db_path, "creatures", "-", "--csv", "--pk", "id"] + [option], + ["upsert", db_path, "creatures", "-", "--csv", "--pk", "id"] + extra, catch_exceptions=False, input=data, ) @@ -2312,12 +2326,12 @@ def test_upsert_detect_types(tmpdir, option): def test_csv_detect_types_creates_real_columns(tmpdir): - """Test that CSV import with --detect-types creates REAL columns for floats""" + """Test that CSV import creates REAL columns for floats (default behavior)""" db_path = str(tmpdir / "test.db") data = "name,age,weight\nCleo,6,45.5\nDori,1,3.5" result = CliRunner().invoke( cli.cli, - ["insert", db_path, "creatures", "-", "--csv", "--detect-types"], + ["insert", db_path, "creatures", "-", "--csv"], catch_exceptions=False, input=data, ) @@ -2333,6 +2347,68 @@ def test_csv_detect_types_creates_real_columns(tmpdir): ) +def test_insert_no_detect_types(tmpdir): + """Test that --no-detect-types treats all columns as TEXT""" + db_path = str(tmpdir / "test.db") + data = "name,age,weight\nCleo,6,45.5\nDori,1,3.5" + result = CliRunner().invoke( + cli.cli, + ["insert", db_path, "creatures", "-", "--csv", "--no-detect-types"], + catch_exceptions=False, + input=data, + ) + assert result.exit_code == 0 + db = Database(db_path) + # All columns should be TEXT when --no-detect-types is used + assert list(db["creatures"].rows) == [ + {"name": "Cleo", "age": "6", "weight": "45.5"}, + {"name": "Dori", "age": "1", "weight": "3.5"}, + ] + assert db["creatures"].schema == ( + 'CREATE TABLE "creatures" (\n' + ' "name" TEXT,\n' + ' "age" TEXT,\n' + ' "weight" TEXT\n' + ")" + ) + + +def test_upsert_no_detect_types(tmpdir): + """Test that --no-detect-types treats all columns as TEXT for upsert""" + db_path = str(tmpdir / "test.db") + data = 
"id,name,age,weight\n1,Cleo,6,45.5\n2,Dori,1,3.5" + result = CliRunner().invoke( + cli.cli, + [ + "upsert", + db_path, + "creatures", + "-", + "--csv", + "--pk", + "id", + "--no-detect-types", + ], + catch_exceptions=False, + input=data, + ) + assert result.exit_code == 0 + db = Database(db_path) + # All columns should be TEXT when --no-detect-types is used + assert list(db["creatures"].rows) == [ + {"id": "1", "name": "Cleo", "age": "6", "weight": "45.5"}, + {"id": "2", "name": "Dori", "age": "1", "weight": "3.5"}, + ] + assert db["creatures"].schema == ( + 'CREATE TABLE "creatures" (\n' + ' "id" TEXT PRIMARY KEY,\n' + ' "name" TEXT,\n' + ' "age" TEXT,\n' + ' "weight" TEXT\n' + ")" + ) + + def test_integer_overflow_error(tmpdir): db_path = str(tmpdir / "test.db") result = CliRunner().invoke( diff --git a/tests/test_cli_insert.py b/tests/test_cli_insert.py index a699c944..9f21b001 100644 --- a/tests/test_cli_insert.py +++ b/tests/test_cli_insert.py @@ -227,7 +227,7 @@ def test_insert_csv_tsv(content, options, db_path, tmpdir): fp.write(content) result = CliRunner().invoke( cli.cli, - ["insert", db_path, "data", file_path] + options, + ["insert", db_path, "data", file_path] + options + ["--no-detect-types"], catch_exceptions=False, ) assert result.exit_code == 0 @@ -236,7 +236,7 @@ def test_insert_csv_tsv(content, options, db_path, tmpdir): @pytest.mark.parametrize("empty_null", (True, False)) def test_insert_csv_empty_null(db_path, empty_null): - options = ["--csv"] + options = ["--csv", "--no-detect-types"] if empty_null: options.append("--empty-null") result = CliRunner().invoke( @@ -430,7 +430,7 @@ def test_insert_text(db_path): "options,input", ( ([], '[{"id": "1", "name": "Bob"}, {"id": "2", "name": "Cat"}]'), - (["--csv"], "id,name\n1,Bob\n2,Cat"), + (["--csv", "--no-detect-types"], "id,name\n1,Bob\n2,Cat"), (["--nl"], '{"id": "1", "name": "Bob"}\n{"id": "2", "name": "Cat"}'), ), ) diff --git a/tests/test_sniff.py b/tests/test_sniff.py index 
36cc4718..62fac86b 100644 --- a/tests/test_sniff.py +++ b/tests/test_sniff.py @@ -12,7 +12,7 @@ def test_sniff(tmpdir, filepath): runner = CliRunner() result = runner.invoke( cli.cli, - ["insert", db_path, "creatures", str(filepath), "--sniff"], + ["insert", db_path, "creatures", str(filepath), "--sniff", "--no-detect-types"], catch_exceptions=False, ) assert result.exit_code == 0, result.stdout