Skip to content

Commit

Permalink
bluesky.from_as1: handle bad facet indices that point inside Unicode code points
Browse files Browse the repository at this point in the history
  • Loading branch information
snarfed committed Feb 4, 2024
1 parent 4a9fa34 commit decf81e
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ Changelog
* Add support for `app.bsky.feed.repost`, `app.bsky.graph.defs#listView`.
* Add `actor`/`author` based on `repo_did`.
* Improve `url` field: include custom handles, only use `repo_did/handle` for `app.bsky.actor.profile`.
* Handle bad facet indices that point inside Unicode code points ([example](https://bsky.app/profile/did:plc:2ythpj4pwwpka2ljkabouubm/post/3kkfszbaiic2g); [discussion](https://discord.com/channels/1097580399187738645/1097580399187738648/1203118842516082848)).
* `from_as1`:
* Add `out_type` kwarg to specify desired output type, eg `app.bsky.actor.profile` vs `app.bsky.actor.defs#profileViewBasic` vs `app.bsky.actor.defs#profileView`.
* Add `blobs` kwarg to provide blob objects to use for image URLs.
Expand Down
14 changes: 9 additions & 5 deletions granary/bluesky.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,12 +736,16 @@ def to_as1(obj, type=None, uri=None, repo_did=None, repo_handle=None,
# convert indices from UTF-8 encoded bytes to Unicode chars (code points)
# https://github.com/snarfed/atproto/blob/5b0c2d7dd533711c17202cd61c0e101ef3a81971/lexicons/app/bsky/richtext/facet.json#L34
byte_start = index.get('byteStart')
if byte_start is not None:
tag['startIndex'] = len(text.encode()[:byte_start].decode())
byte_end = index.get('byteEnd')
if byte_end is not None:
tag['displayName'] = text.encode()[byte_start:byte_end].decode()
tag['length'] = len(tag['displayName'])

try:
if byte_start is not None:
tag['startIndex'] = len(text.encode()[:byte_start].decode())
if byte_end is not None:
tag['displayName'] = text.encode()[byte_start:byte_end].decode()
tag['length'] = len(tag['displayName'])
except UnicodeDecodeError as e:
logger.warning(f"Couldn't apply facet {facet} to unicode text: {text}")

tags.append(tag)

Expand Down
28 changes: 28 additions & 0 deletions granary/tests/test_bluesky.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,34 @@ def test_to_as1_facet_link_and_embed(self):
'tags': [FACET_TAG],
}), to_as1(bsky))

def test_to_as1_facet_bad_index_inside_unicode_code_point(self):
    # Regression case: the facet's byteStart lands in the middle of a Unicode
    # code point. The text is 12 ASCII bytes followed by three-byte UTF-8
    # characters, so byte offset 50 is not on a character boundary.
    #
    # Original post and discussion:
    # https://bsky.app/profile/did:plc:2ythpj4pwwpka2ljkabouubm/post/3kkfszbaiic2g
    # https://discord.com/channels/1097580399187738645/1097580399187738648/1203118842516082848
    text = 'TIL: DNDEBUGはおいそれと外せない(問題が起こるので外そうとしていたけど思い直している)'
    created_at = '2007-07-07T03:04:05'
    link = 'https://seclists.org/bugtraq/2018/Dec/46'

    # The expected tag carries only its type and URL: no startIndex, length,
    # or displayName, since the byte range can't be mapped onto the text.
    expected = {
        'objectType': 'note',
        'published': created_at,
        'content': text,
        'tags': [{
            'objectType': 'article',
            'url': link,
        }],
    }

    link_facet = {
        'features': [{
            '$type': 'app.bsky.richtext.facet#link',
            'uri': link,
        }],
        'index': {
            'byteEnd': 90,
            'byteStart': 50,
        },
    }
    post = {
        '$type': 'app.bsky.feed.post',
        'text': text,
        'createdAt': created_at,
        'facets': [link_facet],
    }

    self.assert_equals(expected, to_as1(post))

def test_blob_to_url(self):
self.assertIsNone(blob_to_url(blob={'foo': 'bar'}, repo_did='x', pds='y'))
self.assertEqual(NEW_BLOB_URL, blob_to_url(blob=NEW_BLOB,
Expand Down

0 comments on commit decf81e

Please sign in to comment.