Skip to content

Commit

Permalink
Better character encoding detection
Browse files Browse the repository at this point in the history
- Run detection against first 2MB, not first 2KB
- If ascii detected use latin-1 since it is a safe superset
- New undocumented ?_num_bytes_to_detect_with=int param, used by tests

Closes #25
  • Loading branch information
simonw committed Jul 3, 2022
1 parent a6032f3 commit 8f9e6bf
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 16 deletions.
13 changes: 11 additions & 2 deletions datasette_upload_csvs/__init__.py
Expand Up @@ -43,6 +43,11 @@ async def upload_csvs(scope, receive, datasette, request):
):
raise Forbidden("Permission denied for upload-csvs")

num_bytes_to_detect_with = 2048 * 1024
# ?_num_bytes_to_detect_with= can override this, used by the tests
if request.args.get("_num_bytes_to_detect_with"):
num_bytes_to_detect_with = int(request.args["_num_bytes_to_detect_with"])

# For the moment just use the first database that's not immutable
db = [
db
Expand Down Expand Up @@ -73,10 +78,14 @@ async def upload_csvs(scope, receive, datasette, request):
task_id = str(uuid.uuid4())

# Use the first 2MB to detect the character encoding
first_bytes = csv.file.read(2048)
first_bytes = csv.file.read(num_bytes_to_detect_with)
csv.file.seek(0)
encoding = detect(first_bytes)["encoding"]
print(encoding)

# latin-1 is a superset of ascii, and less likely to hit errors
# https://github.com/simonw/datasette-upload-csvs/issues/25
if encoding == "ascii":
encoding = "latin-1"

def insert_initial_record(conn):
database = sqlite_utils.Database(conn)
Expand Down
47 changes: 33 additions & 14 deletions tests/test_datasette_upload_csvs.py
Expand Up @@ -2,6 +2,7 @@
import asyncio
from asgi_lifespan import LifespanManager
import json
from unittest.mock import ANY
import pytest
import httpx
import sqlite_utils
Expand Down Expand Up @@ -75,6 +76,9 @@ async def test_menu(auth):
"IncidentNotionalCost(£)": "255",
},
]
LATIN1_AFTER_FIRST_2KB = ("just_one_column\n" + "aabbcc\n" * 1048 + "a.b.é").encode(
"latin-1"
)


@pytest.mark.asyncio
Expand All @@ -89,12 +93,19 @@ async def test_menu(auth):
SIMPLE_EXPECTED,
),
("not-utf8.csv", NOT_UTF8, "/data/not-utf8", NOT_UTF8_EXPECTED),
("latin1-after-x.csv", "LATIN1_AFTER_FIRST_2KB", "/data/latin1-after-x", ANY),
),
)
@pytest.mark.parametrize("use_xhr", (True, False))
async def test_upload(tmpdir, filename, content, expected_url, expected_rows, use_xhr):
path = str(tmpdir / "data.db")
db = sqlite_utils.Database(path)
db.vacuum()
db.enable_wal()
binary_content = content
# Trick to avoid a multi-kilobyte string being part of the pytest rendered test name:
if content == "LATIN1_AFTER_FIRST_2KB":
binary_content = LATIN1_AFTER_FIRST_2KB

db["hello"].insert({"hello": "world"})

Expand All @@ -111,9 +122,13 @@ async def test_upload(tmpdir, filename, content, expected_url, expected_rows, us
cookies["ds_csrftoken"] = csrftoken

# Now try uploading a file
files = {"csv": (filename, content, "text/csv")}
files = {"csv": (filename, binary_content, "text/csv")}
response = await client.post(
"http://localhost/-/upload-csvs",
"http://localhost/-/upload-csvs{}".format(
"?_num_bytes_to_detect_with=2048"
if content == "LATIN1_AFTER_FIRST_2KB"
else ""
),
cookies=cookies,
data={"csrftoken": csrftoken, "xhr": "1" if use_xhr else ""},
files=files,
Expand All @@ -125,18 +140,22 @@ async def test_upload(tmpdir, filename, content, expected_url, expected_rows, us
assert expected_url in response.text

# Now things get tricky... the upload is running in a thread, so poll for completion
await asyncio.sleep(1)
response = await client.get(
"http://localhost/data/_csv_progress_.json?_shape=array"
)
rows = json.loads(response.content)
assert 1 == len(rows)
assert {
"filename": filename[:-4], # Strip off .csv ending
"bytes_todo": len(content),
"bytes_done": len(content),
"rows_done": 2,
}.items() <= rows[0].items()
fail_after = 20
iterations = 0
while True:
response = await client.get(
"http://localhost/data/_csv_progress_.json?_shape=array"
)
rows = json.loads(response.content)
assert 1 == len(rows)
row = rows[0]
assert row["filename"] == filename[:-4]
assert not row["error"], row
if row["bytes_todo"] == row["bytes_done"]:
break
iterations += 1
assert iterations < fail_after, "Took too long: {}".format(row)
await asyncio.sleep(0.5)

rows = list(db[filename[:-4]].rows)
assert rows == expected_rows
Expand Down

0 comments on commit 8f9e6bf

Please sign in to comment.