Skip to content

Commit

Permalink
instagram: improve HTML scraping error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
snarfed committed Dec 4, 2016
1 parent 4cdd0d0 commit 88dfd3b
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 1 deletion.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ Changelog
### 1.7 - unreleased
* Twitter:
  * Linkify @-mentions and hashtags in `preview_create()`.
* Instagram:
  * Improve HTML scraping error handling.

### 1.6 - 2016-11-26
* Twitter:
Expand Down
6 changes: 5 additions & 1 deletion granary/instagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,7 @@ def html_to_activities(self, html):
script_start = '<script type="text/javascript">window._sharedData = '
start = html.find(script_start)
if start == -1:
logging.warning('JSON script tag not found! Raw HTML:\n' + html)
return [], None

# App Engine's Python 2.7.5 json module doesn't support unpaired surrogate
Expand All @@ -736,7 +737,10 @@ def html_to_activities(self, html):
json_module = json

start += len(script_start)
end = html.index(';</script>', start)
end = html.find(';</script>', start)
if end == -1:
logging.warning('JSON script close tag not found! Raw HTML:\n' + html)
return [], None
data = json_module.loads(html[start:end])

entry_data = data.get('entry_data', {})
Expand Down
12 changes: 12 additions & 0 deletions granary/test/test_instagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,8 @@ def tag_uri(name):
HTML_PROFILE_PRIVATE_COMPLETE = HTML_HEADER + json.dumps(HTML_PROFILE_PRIVATE) + HTML_FOOTER
HTML_PHOTO_COMPLETE = HTML_HEADER + json.dumps(HTML_PHOTO_PAGE) + HTML_FOOTER
HTML_VIDEO_COMPLETE = HTML_HEADER + json.dumps(HTML_VIDEO_PAGE) + HTML_FOOTER
# Malformed-page fixtures for the html_to_activities() error handling tests.
# Photo page JSON followed by HTML_FOOTER, with HTML_HEADER left off.
HTML_PHOTO_MISSING_HEADER = json.dumps(HTML_PHOTO_PAGE) + HTML_FOOTER
# HTML_HEADER followed by the photo page JSON, with HTML_FOOTER left off.
HTML_PHOTO_MISSING_FOOTER = HTML_HEADER + json.dumps(HTML_PHOTO_PAGE)


class InstagramTest(testutil.HandlerTest):
Expand Down Expand Up @@ -1363,6 +1365,16 @@ def test_html_to_activities_missing_video_url(self):
del expected[1]['object']['attachments'][0]['stream'][0]['url']
self.assert_equals(expected, activities)

def test_html_to_activities_missing_header(self):
    """html_to_activities() on a page built without HTML_HEADER.

    Expects an empty activity list and a None viewer rather than an
    exception.
    """
    result = self.instagram.html_to_activities(HTML_PHOTO_MISSING_HEADER)
    parsed_activities, parsed_viewer = result
    self.assert_equals([], parsed_activities)
    self.assertIsNone(parsed_viewer)

def test_html_to_activities_missing_footer(self):
    """html_to_activities() on a page built without HTML_FOOTER.

    Expects an empty activity list and a None viewer rather than an
    exception.
    """
    result = self.instagram.html_to_activities(HTML_PHOTO_MISSING_FOOTER)
    parsed_activities, parsed_viewer = result
    self.assert_equals([], parsed_activities)
    self.assertIsNone(parsed_viewer)

def test_id_to_shortcode(self):
for shortcode, id in (
(None, None),
Expand Down

0 comments on commit 88dfd3b

Please sign in to comment.