
Commit

Detect types for sqlite-utils memory CSV, opt out with --no-detect-types - closes #282
simonw committed Jun 19, 2021
1 parent fd9867d commit ec5174e
Showing 4 changed files with 50 additions and 15 deletions.
2 changes: 2 additions & 0 deletions docs/cli.rst
@@ -291,6 +291,8 @@ Incoming CSV data will be assumed to use ``utf-8``. If your data uses a differen
 
 If you are joining across multiple CSV files they must all use the same encoding.
 
+Column types will be automatically detected in CSV or TSV data, using the same mechanism as ``--detect-types`` described in :ref:`cli_insert_csv_tsv`. You can pass the ``--no-detect-types`` option to disable this automatic type detection and treat all CSV and TSV columns as ``TEXT``.
+
 .. _cli_memory_explicit:
 
 Explicitly specifying the format
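The detection mechanism the new documentation paragraph refers to is the TypeTracker helper from sqlite_utils.utils, which is what the cli.py change below wires into the memory command. A rough sketch of the same pattern used directly from Python — the sample CSV bytes, the "demo" table name and the Database(memory=True) call are illustrative assumptions, not part of this commit:

import io

from sqlite_utils import Database
from sqlite_utils.utils import Format, TypeTracker, rows_from_file

# Illustrative sample data; the CLI reads this from a file or stdin instead
csv_fp = io.BytesIO(b"id,name,weight\n1,Cleo,45.5\n2,Bants,3.5")
rows = rows_from_file(csv_fp, format=Format.CSV)

db = Database(memory=True)
tracker = TypeTracker()
# wrap() yields each row unchanged while recording the values seen per column
db["demo"].insert_all(tracker.wrap(rows))
# tracker.types maps each column to the narrowest type it observed,
# and transform() rewrites the table schema to match
db["demo"].transform(types=tracker.types)

print(db["demo"].schema)  # id and weight are now typed columns; name stays TEXT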
13 changes: 13 additions & 0 deletions sqlite_utils/cli.py
@@ -1167,6 +1167,12 @@ def query(
     "--encoding",
     help="Character encoding for CSV input, defaults to utf-8",
 )
+@click.option(
+    "-n",
+    "--no-detect-types",
+    is_flag=True,
+    help="Treat all CSV/TSV columns as TEXT",
+)
 @click.option("--dump", is_flag=True, help="Dump SQL for in-memory database")
 @click.option(
     "--save",
@@ -1189,6 +1195,7 @@ def memory(
     raw,
     param,
     encoding,
+    no_detect_types,
     dump,
     save,
     load_extension,
@@ -1235,7 +1242,13 @@
         csv_table = csv_path.stem
         csv_fp = csv_path.open("rb")
         rows = rows_from_file(csv_fp, format=format, encoding=encoding)
+        tracker = None
+        if not no_detect_types:
+            tracker = TypeTracker()
+            rows = tracker.wrap(rows)
         db[csv_table].insert_all(rows, alter=True)
+        if tracker is not None:
+            db[csv_table].transform(types=tracker.types)
         # Add convenient t / t1 / t2 views
         view_names = ["t{}".format(i + 1)]
         if i == 0:
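One detail worth noting in the hunk above: the rows are still inserted first (CSV values arrive as strings, so the table starts out all TEXT), and transform() then rewrites the schema to the types the tracker observed. A minimal illustration of that two-step behaviour, with a throwaway table and hand-picked types standing in for tracker.types:

from sqlite_utils import Database

db = Database(memory=True)
# CSV parsing yields strings, so the freshly inserted columns are all TEXT
db["demo"].insert_all([{"id": "1", "weight": "45.5"}])
print(db["demo"].columns_dict)  # {'id': <class 'str'>, 'weight': <class 'str'>}

# transform() copies the data into a new table with the requested column types
db["demo"].transform(types={"id": int, "weight": float})
print(db["demo"].columns_dict)  # {'id': <class 'int'>, 'weight': <class 'float'>}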
1 change: 1 addition & 0 deletions sqlite_utils/utils.py
@@ -147,6 +147,7 @@ def rows_from_file(
     format=None,
     dialect=None,
     encoding=None,
+    detect_types=False,
 ) -> Generator[dict, None, None]:
     if format == Format.JSON:
         decoded = json.load(fp)
49 changes: 34 additions & 15 deletions tests/test_cli_memory.py
@@ -33,7 +33,7 @@ def test_memory_csv(tmpdir, sql_from, use_stdin):
     assert result.exit_code == 0
     assert (
         result.output.strip()
-        == '{"id": "1", "name": "Cleo"}\n{"id": "2", "name": "Bants"}'
+        == '{"rowid": 1, "id": 1, "name": "Cleo"}\n{"rowid": 2, "id": 2, "name": "Bants"}'
     )


@@ -57,8 +57,8 @@ def test_memory_tsv(tmpdir, use_stdin):
     )
     assert result.exit_code == 0, result.output
     assert json.loads(result.output.strip()) == [
-        {"id": "1", "name": "Cleo"},
-        {"id": "2", "name": "Bants"},
+        {"rowid": 1, "id": 1, "name": "Cleo"},
+        {"rowid": 2, "id": 2, "name": "Bants"},
     ]


@@ -82,8 +82,8 @@ def test_memory_json(tmpdir, use_stdin):
     )
     assert result.exit_code == 0, result.output
     assert json.loads(result.output.strip()) == [
-        {"name": "Bants", "age": None},
-        {"name": "Dori", "age": 1},
+        {"rowid": 1, "name": "Bants", "age": None},
+        {"rowid": 2, "name": "Dori", "age": 1},
     ]


@@ -106,7 +106,10 @@ def test_memory_json_nl(tmpdir, use_stdin):
         input=data,
     )
     assert result.exit_code == 0, result.output
-    assert json.loads(result.output.strip()) == [{"name": "Bants"}, {"name": "Dori"}]
+    assert json.loads(result.output.strip()) == [
+        {"rowid": 1, "name": "Bants"},
+        {"rowid": 2, "name": "Dori"},
+    ]


@pytest.mark.parametrize("use_stdin", (True, False))
@@ -143,10 +146,11 @@ def test_memory_csv_encoding(tmpdir, use_stdin):
     )
     assert result.exit_code == 0, result.output
     assert json.loads(result.output.strip()) == {
+        "rowid": 1,
         "date": "2020-03-04",
-        "name": "S\u00e3o Paulo",
-        "latitude": "-23.561",
-        "longitude": "-46.645",
+        "name": "São Paulo",
+        "latitude": -23.561,
+        "longitude": -46.645,
     }


@@ -160,12 +164,13 @@ def test_memory_dump(extra_args):
     assert result.exit_code == 0
     assert result.output.strip() == (
         "BEGIN TRANSACTION;\n"
-        "CREATE TABLE [stdin] (\n"
-        "   [id] TEXT,\n"
+        'CREATE TABLE "stdin" (\n'
+        "   [rowid] INTEGER PRIMARY KEY,\n"
+        "   [id] INTEGER,\n"
         "   [name] TEXT\n"
         ");\n"
-        "INSERT INTO \"stdin\" VALUES('1','Cleo');\n"
-        "INSERT INTO \"stdin\" VALUES('2','Bants');\n"
+        "INSERT INTO \"stdin\" VALUES(1,1,'Cleo');\n"
+        "INSERT INTO \"stdin\" VALUES(2,2,'Bants');\n"
         "CREATE VIEW t1 AS select * from [stdin];\n"
        "CREATE VIEW t AS select * from [stdin];\n"
         "COMMIT;"
@@ -183,6 +188,20 @@ def test_memory_save(tmpdir, extra_args):
     assert result.exit_code == 0
     db = Database(save_to)
     assert list(db["stdin"].rows) == [
-        {"id": "1", "name": "Cleo"},
-        {"id": "2", "name": "Bants"},
+        {"rowid": 1, "id": 1, "name": "Cleo"},
+        {"rowid": 2, "id": 2, "name": "Bants"},
     ]
+
+
+@pytest.mark.parametrize("option", ("-n", "--no-detect-types"))
+def test_memory_no_detect_types(option):
+    result = CliRunner().invoke(
+        cli.cli,
+        ["memory", "-", "select * from stdin"] + [option],
+        input="id,name,weight\n1,Cleo,45.5\n2,Bants,3.5",
+    )
+    assert result.exit_code == 0, result.output
+    assert json.loads(result.output.strip()) == [
+        {"id": "1", "name": "Cleo", "weight": "45.5"},
+        {"id": "2", "name": "Bants", "weight": "3.5"},
+    ]
