
Commit

Detect types for sqlite-utils memory CSV, opt out with --no-detect-types - closes #282
simonw committed Jun 19, 2021
1 parent fd9867d commit ec5174e
Showing 4 changed files with 50 additions and 15 deletions.
2 changes: 2 additions & 0 deletions docs/cli.rst
@@ -291,6 +291,8 @@ Incoming CSV data will be assumed to use ``utf-8``. If your data uses a differen
 
 If you are joining across multiple CSV files they must all use the same encoding.
 
+Column types will be automatically detected in CSV or TSV data, using the same mechanism as ``--detect-types`` described in :ref:`cli_insert_csv_tsv`. You can pass the ``--no-detect-types`` option to disable this automatic type detection and treat all CSV and TSV columns as ``TEXT``.
+
 .. _cli_memory_explicit:
 
 Explicitly specifying the format
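The detection mechanism the new documentation paragraph refers to is the TypeTracker helper from sqlite_utils.utils, which is what the cli.py change below wires into the memory command. A rough sketch of the same pattern used directly from Python — the sample CSV bytes, the "demo" table name and the Database(memory=True) call are illustrative assumptions, not part of this commit:

import io

from sqlite_utils import Database
from sqlite_utils.utils import Format, TypeTracker, rows_from_file

# Illustrative sample data; the CLI reads this from a file or stdin instead
csv_fp = io.BytesIO(b"id,name,weight\n1,Cleo,45.5\n2,Bants,3.5")
rows = rows_from_file(csv_fp, format=Format.CSV)

db = Database(memory=True)
tracker = TypeTracker()
# wrap() yields each row unchanged while recording the values seen per column
db["demo"].insert_all(tracker.wrap(rows))
# tracker.types maps each column to the narrowest type it observed,
# and transform() rewrites the table schema to match
db["demo"].transform(types=tracker.types)

print(db["demo"].schema)  # id and weight are now typed columns; name stays TEXT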
13 changes: 13 additions & 0 deletions sqlite_utils/cli.py
@@ -1167,6 +1167,12 @@ def query(
     "--encoding",
     help="Character encoding for CSV input, defaults to utf-8",
 )
+@click.option(
+    "-n",
+    "--no-detect-types",
+    is_flag=True,
+    help="Treat all CSV/TSV columns as TEXT",
+)
 @click.option("--dump", is_flag=True, help="Dump SQL for in-memory database")
 @click.option(
     "--save",
@@ -1189,6 +1195,7 @@ def memory(
     raw,
     param,
     encoding,
+    no_detect_types,
     dump,
     save,
     load_extension,
@@ -1235,7 +1242,13 @@
         csv_table = csv_path.stem
         csv_fp = csv_path.open("rb")
         rows = rows_from_file(csv_fp, format=format, encoding=encoding)
+        tracker = None
+        if not no_detect_types:
+            tracker = TypeTracker()
+            rows = tracker.wrap(rows)
         db[csv_table].insert_all(rows, alter=True)
+        if tracker is not None:
+            db[csv_table].transform(types=tracker.types)
         # Add convenient t / t1 / t2 views
         view_names = ["t{}".format(i + 1)]
         if i == 0:
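One detail worth noting in the hunk above: the rows are still inserted first (CSV values arrive as strings, so the table starts out all TEXT), and transform() then rewrites the schema to the types the tracker observed. A minimal illustration of that two-step behaviour, with a throwaway table and hand-picked types standing in for tracker.types:

from sqlite_utils import Database

db = Database(memory=True)
# CSV parsing yields strings, so the freshly inserted columns are all TEXT
db["demo"].insert_all([{"id": "1", "weight": "45.5"}])
print(db["demo"].columns_dict)  # {'id': <class 'str'>, 'weight': <class 'str'>}

# transform() copies the data into a new table with the requested column types
db["demo"].transform(types={"id": int, "weight": float})
print(db["demo"].columns_dict)  # {'id': <class 'int'>, 'weight': <class 'float'>}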
1 change: 1 addition & 0 deletions sqlite_utils/utils.py
@@ -147,6 +147,7 @@ def rows_from_file(
     format=None,
     dialect=None,
     encoding=None,
+    detect_types=False,
 ) -> Generator[dict, None, None]:
     if format == Format.JSON:
         decoded = json.load(fp)
49 changes: 34 additions & 15 deletions tests/test_cli_memory.py
@@ -33,7 +33,7 @@ def test_memory_csv(tmpdir, sql_from, use_stdin):
     assert result.exit_code == 0
     assert (
         result.output.strip()
-        == '{"id": "1", "name": "Cleo"}\n{"id": "2", "name": "Bants"}'
+        == '{"rowid": 1, "id": 1, "name": "Cleo"}\n{"rowid": 2, "id": 2, "name": "Bants"}'
     )


@@ -57,8 +57,8 @@ def test_memory_tsv(tmpdir, use_stdin):
     )
     assert result.exit_code == 0, result.output
     assert json.loads(result.output.strip()) == [
-        {"id": "1", "name": "Cleo"},
-        {"id": "2", "name": "Bants"},
+        {"rowid": 1, "id": 1, "name": "Cleo"},
+        {"rowid": 2, "id": 2, "name": "Bants"},
     ]


@@ -82,8 +82,8 @@ def test_memory_json(tmpdir, use_stdin):
     )
     assert result.exit_code == 0, result.output
     assert json.loads(result.output.strip()) == [
-        {"name": "Bants", "age": None},
-        {"name": "Dori", "age": 1},
+        {"rowid": 1, "name": "Bants", "age": None},
+        {"rowid": 2, "name": "Dori", "age": 1},
     ]


@@ -106,7 +106,10 @@ def test_memory_json_nl(tmpdir, use_stdin):
         input=data,
     )
     assert result.exit_code == 0, result.output
-    assert json.loads(result.output.strip()) == [{"name": "Bants"}, {"name": "Dori"}]
+    assert json.loads(result.output.strip()) == [
+        {"rowid": 1, "name": "Bants"},
+        {"rowid": 2, "name": "Dori"},
+    ]


@pytest.mark.parametrize("use_stdin", (True, False))
@@ -143,10 +146,11 @@ def test_memory_csv_encoding(tmpdir, use_stdin):
     )
     assert result.exit_code == 0, result.output
     assert json.loads(result.output.strip()) == {
+        "rowid": 1,
         "date": "2020-03-04",
-        "name": "S\u00e3o Paulo",
-        "latitude": "-23.561",
-        "longitude": "-46.645",
+        "name": "São Paulo",
+        "latitude": -23.561,
+        "longitude": -46.645,
     }


@@ -160,12 +164,13 @@ def test_memory_dump(extra_args):
     assert result.exit_code == 0
     assert result.output.strip() == (
         "BEGIN TRANSACTION;\n"
-        "CREATE TABLE [stdin] (\n"
-        "   [id] TEXT,\n"
+        'CREATE TABLE "stdin" (\n'
+        "   [rowid] INTEGER PRIMARY KEY,\n"
+        "   [id] INTEGER,\n"
         "   [name] TEXT\n"
         ");\n"
-        "INSERT INTO \"stdin\" VALUES('1','Cleo');\n"
-        "INSERT INTO \"stdin\" VALUES('2','Bants');\n"
+        "INSERT INTO \"stdin\" VALUES(1,1,'Cleo');\n"
+        "INSERT INTO \"stdin\" VALUES(2,2,'Bants');\n"
         "CREATE VIEW t1 AS select * from [stdin];\n"
        "CREATE VIEW t AS select * from [stdin];\n"
         "COMMIT;"
@@ -183,6 +188,20 @@ def test_memory_save(tmpdir, extra_args):
     assert result.exit_code == 0
     db = Database(save_to)
     assert list(db["stdin"].rows) == [
-        {"id": "1", "name": "Cleo"},
-        {"id": "2", "name": "Bants"},
+        {"rowid": 1, "id": 1, "name": "Cleo"},
+        {"rowid": 2, "id": 2, "name": "Bants"},
     ]
+
+
+@pytest.mark.parametrize("option", ("-n", "--no-detect-types"))
+def test_memory_no_detect_types(option):
+    result = CliRunner().invoke(
+        cli.cli,
+        ["memory", "-", "select * from stdin"] + [option],
+        input="id,name,weight\n1,Cleo,45.5\n2,Bants,3.5",
+    )
+    assert result.exit_code == 0, result.output
+    assert json.loads(result.output.strip()) == [
+        {"id": "1", "name": "Cleo", "weight": "45.5"},
+        {"id": "2", "name": "Bants", "weight": "3.5"},
+    ]
