Skip to content

Commit

Permalink
Switch to dash encoding for table/database/row-pk in paths
Browse files Browse the repository at this point in the history
* Dash encoding functions, tests and docs, refs #1439
* dash encoding is now like percent encoding but with dashes
* Use dash-encoding for row PKs and ?_next=, refs #1439
* Use dash encoding for table names, refs #1439
* Use dash encoding for database names, too, refs #1439

See also https://simonwillison.net/2022/Mar/5/dash-encoding/
  • Loading branch information
simonw committed Mar 7, 2022
1 parent de810f4 commit 1baa030
Show file tree
Hide file tree
Showing 13 changed files with 173 additions and 53 deletions.
10 changes: 5 additions & 5 deletions datasette/url_builder.py
@@ -1,4 +1,4 @@
from .utils import path_with_format, HASH_LENGTH, PrefixedUrlString
from .utils import dash_encode, path_with_format, HASH_LENGTH, PrefixedUrlString
import urllib


Expand Down Expand Up @@ -31,20 +31,20 @@ def database(self, database, format=None):
db = self.ds.databases[database]
if self.ds.setting("hash_urls") and db.hash:
path = self.path(
f"{urllib.parse.quote(database)}-{db.hash[:HASH_LENGTH]}", format=format
f"{dash_encode(database)}-{db.hash[:HASH_LENGTH]}", format=format
)
else:
path = self.path(urllib.parse.quote(database), format=format)
path = self.path(dash_encode(database), format=format)
return path

def table(self, database, table, format=None):
path = f"{self.database(database)}/{urllib.parse.quote_plus(table)}"
path = f"{self.database(database)}/{dash_encode(table)}"
if format is not None:
path = path_with_format(path=path, format=format)
return PrefixedUrlString(path)

def query(self, database, query, format=None):
path = f"{self.database(database)}/{urllib.parse.quote_plus(query)}"
path = f"{self.database(database)}/{dash_encode(query)}"
if format is not None:
path = path_with_format(path=path, format=format)
return PrefixedUrlString(path)
Expand Down
41 changes: 37 additions & 4 deletions datasette/utils/__init__.py
Expand Up @@ -112,12 +112,12 @@ async def await_me_maybe(value: typing.Any) -> typing.Any:


def urlsafe_components(token):
"""Splits token on commas and URL decodes each component"""
return [urllib.parse.unquote_plus(b) for b in token.split(",")]
"""Splits token on commas and dash-decodes each component"""
return [dash_decode(b) for b in token.split(",")]


def path_from_row_pks(row, pks, use_rowid, quote=True):
"""Generate an optionally URL-quoted unique identifier
"""Generate an optionally dash-quoted unique identifier
for a row from its primary keys."""
if use_rowid:
bits = [row["rowid"]]
Expand All @@ -126,7 +126,7 @@ def path_from_row_pks(row, pks, use_rowid, quote=True):
row[pk]["value"] if isinstance(row[pk], dict) else row[pk] for pk in pks
]
if quote:
bits = [urllib.parse.quote_plus(str(bit)) for bit in bits]
bits = [dash_encode(str(bit)) for bit in bits]
else:
bits = [str(bit) for bit in bits]

Expand Down Expand Up @@ -1140,3 +1140,36 @@ def add_cors_headers(headers):
headers["Access-Control-Allow-Origin"] = "*"
headers["Access-Control-Allow-Headers"] = "Authorization"
headers["Access-Control-Expose-Headers"] = "Link"


_DASH_ENCODING_SAFE = frozenset(
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
b"abcdefghijklmnopqrstuvwxyz"
b"0123456789_"
# This is the same as Python percent-encoding but I removed
# '.' and '-' and '~'
)


class DashEncoder(dict):
# Keeps a cache internally, via __missing__
def __missing__(self, b):
# Handle a cache miss, store encoded string in cache and return.
res = chr(b) if b in _DASH_ENCODING_SAFE else "-{:02X}".format(b)
self[b] = res
return res


_dash_encoder = DashEncoder().__getitem__


@documented
def dash_encode(s: str) -> str:
    "Returns dash-encoded string - for example ``/foo/bar`` -> ``-2Ffoo-2Fbar``"
    # Encode to UTF-8 first so non-ASCII characters become multi-byte
    # sequences; each byte is then dash-encoded individually.
    return "".join(map(_dash_encoder, s.encode("utf-8")))


@documented
def dash_decode(s: str) -> str:
    "Decodes a dash-encoded string, so ``-2Ffoo-2Fbar`` -> ``/foo/bar``"
    # Turn every dash back into a percent sign, then let the stdlib
    # percent-decoder interpret the two-digit hex pairs (and reassemble
    # multi-byte UTF-8 sequences).
    percent_form = s.replace("-", "%")
    return urllib.parse.unquote(percent_form)
24 changes: 12 additions & 12 deletions datasette/views/base.py
Expand Up @@ -17,6 +17,8 @@
InvalidSql,
LimitedWriter,
call_with_supported_arguments,
dash_decode,
dash_encode,
path_from_row_pks,
path_with_added_args,
path_with_removed_args,
Expand Down Expand Up @@ -203,17 +205,17 @@ async def data(self, request, database, hash, **kwargs):
async def resolve_db_name(self, request, db_name, **kwargs):
hash = None
name = None
db_name = urllib.parse.unquote_plus(db_name)
if db_name not in self.ds.databases and "-" in db_name:
decoded_name = dash_decode(db_name)
if decoded_name not in self.ds.databases and "-" in db_name:
# No matching DB found, maybe it's a name-hash?
name_bit, hash_bit = db_name.rsplit("-", 1)
if name_bit not in self.ds.databases:
if dash_decode(name_bit) not in self.ds.databases:
raise NotFound(f"Database not found: {name}")
else:
name = name_bit
name = dash_decode(name_bit)
hash = hash_bit
else:
name = db_name
name = decoded_name

try:
db = self.ds.databases[name]
Expand All @@ -233,21 +235,19 @@ async def async_table_exists(t):
return await db.table_exists(t)

table, _format = await resolve_table_and_format(
table_and_format=urllib.parse.unquote_plus(
kwargs["table_and_format"]
),
table_and_format=dash_decode(kwargs["table_and_format"]),
table_exists=async_table_exists,
allowed_formats=self.ds.renderers.keys(),
)
kwargs["table"] = table
if _format:
kwargs["as_format"] = f".{_format}"
elif kwargs.get("table"):
kwargs["table"] = urllib.parse.unquote_plus(kwargs["table"])
kwargs["table"] = dash_decode(kwargs["table"])

should_redirect = self.ds.urls.path(f"{name}-{expected}")
if kwargs.get("table"):
should_redirect += "/" + urllib.parse.quote_plus(kwargs["table"])
should_redirect += "/" + dash_encode(kwargs["table"])
if kwargs.get("pk_path"):
should_redirect += "/" + kwargs["pk_path"]
if kwargs.get("as_format"):
Expand Down Expand Up @@ -467,15 +467,15 @@ async def async_table_exists(t):
return await db.table_exists(t)

table, _ext_format = await resolve_table_and_format(
table_and_format=urllib.parse.unquote_plus(args["table_and_format"]),
table_and_format=dash_decode(args["table_and_format"]),
table_exists=async_table_exists,
allowed_formats=self.ds.renderers.keys(),
)
_format = _format or _ext_format
args["table"] = table
del args["table_and_format"]
elif "table" in args:
args["table"] = urllib.parse.unquote_plus(args["table"])
args["table"] = dash_decode(args["table"])
return _format, args

async def view_get(self, request, database, hash, correct_hash_provided, **kwargs):
Expand Down
9 changes: 5 additions & 4 deletions datasette/views/table.py
Expand Up @@ -12,6 +12,7 @@
MultiParams,
append_querystring,
compound_keys_after_sql,
dash_encode,
escape_sqlite,
filters_should_redirect,
is_url,
Expand Down Expand Up @@ -142,7 +143,7 @@ async def display_columns_and_rows(
'<a href="{base_url}{database}/{table}/{flat_pks_quoted}">{flat_pks}</a>'.format(
base_url=base_url,
database=database,
table=urllib.parse.quote_plus(table),
table=dash_encode(table),
flat_pks=str(markupsafe.escape(pk_path)),
flat_pks_quoted=path_from_row_pks(row, pks, not pks),
)
Expand Down Expand Up @@ -199,8 +200,8 @@ async def display_columns_and_rows(
link_template.format(
database=database,
base_url=base_url,
table=urllib.parse.quote_plus(other_table),
link_id=urllib.parse.quote_plus(str(value)),
table=dash_encode(other_table),
link_id=dash_encode(str(value)),
id=str(markupsafe.escape(value)),
label=str(markupsafe.escape(label)) or "-",
)
Expand Down Expand Up @@ -765,7 +766,7 @@ async def data(
if prefix is None:
prefix = "$null"
else:
prefix = urllib.parse.quote_plus(str(prefix))
prefix = dash_encode(str(prefix))
next_value = f"{prefix},{next_value}"
added_args = {"_next": next_value}
if sort:
Expand Down
26 changes: 26 additions & 0 deletions docs/internals.rst
Expand Up @@ -876,6 +876,32 @@ Utility function for calling ``await`` on a return value if it is awaitable, oth

.. autofunction:: datasette.utils.await_me_maybe

.. _internals_dash_encoding:

Dash encoding
-------------

Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URLs that reference them.

Dash encoding uses the same algorithm as `URL percent-encoding <https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding>`__, but with the ``-`` hyphen character used in place of ``%`` — and, unlike percent-encoding, the ``.``, ``-`` and ``~`` characters are themselves encoded.

Any character other than ``A-Z``, ``a-z``, ``0-9`` or ``_`` is replaced by a hyphen followed by the two-digit uppercase hexadecimal value of each of its UTF-8 bytes. For example:

- ``/`` becomes ``-2F``
- ``.`` becomes ``-2E``
- ``%`` becomes ``-25``
- ``-`` becomes ``-2D``
- Space character becomes ``-20``
- ``polls/2022.primary`` becomes ``polls-2F2022-2Eprimary``

.. _internals_utils_dash_encode:

.. autofunction:: datasette.utils.dash_encode

.. _internals_utils_dash_decode:

.. autofunction:: datasette.utils.dash_decode

.. _internals_tracer:

datasette.tracer
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures.py
Expand Up @@ -406,6 +406,7 @@ def generate_sortable_rows(num):
);
INSERT INTO compound_primary_key VALUES ('a', 'b', 'c');
INSERT INTO compound_primary_key VALUES ('a/b', '.c-d', 'c');
CREATE TABLE compound_three_primary_keys (
pk1 varchar(30),
Expand Down
19 changes: 16 additions & 3 deletions tests/test_api.py
Expand Up @@ -143,7 +143,7 @@ def test_database_page(app_client):
"name": "compound_primary_key",
"columns": ["pk1", "pk2", "content"],
"primary_keys": ["pk1", "pk2"],
"count": 1,
"count": 2,
"hidden": False,
"fts_table": None,
"foreign_keys": {"incoming": [], "outgoing": []},
Expand Down Expand Up @@ -942,7 +942,7 @@ def test_cors(app_client_with_cors, path, status_code):
)
def test_database_with_space_in_name(app_client_two_attached_databases, path):
response = app_client_two_attached_databases.get(
"/extra database" + path, follow_redirects=True
"/extra-20database" + path, follow_redirects=True
)
assert response.status == 200

Expand All @@ -953,7 +953,7 @@ def test_common_prefix_database_names(app_client_conflicting_database_names):
d["name"]
for d in app_client_conflicting_database_names.get("/-/databases.json").json
]
for db_name, path in (("foo", "/foo.json"), ("foo-bar", "/foo-bar.json")):
for db_name, path in (("foo", "/foo.json"), ("foo-bar", "/foo-2Dbar.json")):
data = app_client_conflicting_database_names.get(path).json
assert db_name == data["database"]

Expand Down Expand Up @@ -992,3 +992,16 @@ async def test_hidden_sqlite_stat1_table():
data = (await ds.client.get("/db.json?_show_hidden=1")).json()
tables = [(t["name"], t["hidden"]) for t in data["tables"]]
assert tables == [("normal", False), ("sqlite_stat1", True)]


@pytest.mark.asyncio
@pytest.mark.parametrize("db_name", ("foo", r"fo%o", "f~/c.d"))
async def test_dash_encoded_database_names(db_name):
ds = Datasette()
ds.add_memory_database(db_name)
response = await ds.client.get("/.json")
assert db_name in response.json().keys()
path = response.json()[db_name]["path"]
# And the JSON for that database
response2 = await ds.client.get(path + ".json")
assert response2.status_code == 200
5 changes: 3 additions & 2 deletions tests/test_cli.py
Expand Up @@ -9,6 +9,7 @@
from datasette.plugins import DEFAULT_PLUGINS
from datasette.cli import cli, serve
from datasette.version import __version__
from datasette.utils import dash_encode
from datasette.utils.sqlite import sqlite3
from click.testing import CliRunner
import io
Expand Down Expand Up @@ -294,12 +295,12 @@ def test_weird_database_names(ensure_eventloop, tmpdir, filename):
assert result1.exit_code == 0, result1.output
filename_no_stem = filename.rsplit(".", 1)[0]
expected_link = '<a href="/{}">{}</a>'.format(
urllib.parse.quote(filename_no_stem), filename_no_stem
dash_encode(filename_no_stem), filename_no_stem
)
assert expected_link in result1.output
# Now try hitting that database page
result2 = runner.invoke(
cli, [db_path, "--get", "/{}".format(urllib.parse.quote(filename_no_stem))]
cli, [db_path, "--get", "/{}".format(dash_encode(filename_no_stem))]
)
assert result2.exit_code == 0, result2.output

Expand Down

0 comments on commit 1baa030

Please sign in to comment.