Skip to content

Commit

Permalink
google+: update HTML scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
snarfed committed May 15, 2017
1 parent 264cd5b commit abfd2b7
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 11 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ Changelog
* microformats2:
* Add [`u-featured`](https://indieweb.org/featured) to ActivityStreams `image`.
* Minor whitespace change (added `<p>`) when rendering locations as HTML.
* Google+:
* Update HTML scraping to handle changed serialized JSON data format.
* Atom:
* Add new `reader` query param for toggling rendering decisions that are specific to feed readers. Right now, it only affects location: it's rendered in the content when `reader=true` (the default) and omitted when `reader=false`.
* Include author name when rendering attached articles and notes (e.g. quote tweets).
Expand Down
6 changes: 3 additions & 3 deletions granary/googleplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,9 @@ def html_to_activities(self, html):
list of ActivityStreams activity dicts
"""
# extract JSON data blob
script_start = "<script>AF_initDataCallback({key: '161', isError: false , hash: '14', data:"
script_start = "AF_initDataCallback({key: 'ds:5', isError: false , hash: '10', data:function(){return"
start = html.find(script_start)
end = html.find('});</script>', start)
end = html.find('}});</script>', start)
if start == -1 or end == -1:
return []
start += len(script_start)
Expand All @@ -327,7 +327,7 @@ def html_to_activities(self, html):
for i in range(2):
html = re.sub(r'([,[])\s*([],])', r'\1null\2', html)

data = json.loads(html)[1][7][1:]
data = json.loads(html)[0][7]
data = [d[6].values()[0] for d in data if len(d) >= 7 and d[6]]

activities = []
Expand Down
15 changes: 7 additions & 8 deletions granary/test/test_googleplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,7 @@ def tag_uri(name):

# HTML from http://plus.google.com/
HTML_ACTIVITY_GP = [
["..."],
[1002, None, None, None, None, [1001, "z13gjrz4ymeldtd5f04chnrixnvpjjqy42o"],
[1002, None, None, None, None, [1002, "z13gjrz4ymeldtd5f04chnrixnvpjjqy42o"],
{"33558957" : [
"",
"",
Expand Down Expand Up @@ -240,14 +239,14 @@ def tag_uri(name):
...
</style></head><body class="Td lj"><input type="text" name="hist_state" id="hist_state" style="display:none;"><iframe id="hist_frame" name="hist_frame1623222153" class="ss" tabindex="-1"></iframe><script>window['OZ_wizstart'] && window['OZ_wizstart']()</script>
<script>AF_initDataCallback({key: '199', isError: false , hash: '13', data:[2,0]
});</script><script>AF_initDataCallback({key: '161', isError: false , hash: '14', data:["os.con",[[]
});</script><script>AF_initDataCallback({key: 'ds:5', isError: false , hash: '10', data:function(){return [[
,"these few lines test the code that collapses commas",
[,1,1,,,,20,,"social.google.com",[,]
,,,2,,,0,,15,,[[1002,2],"..."]],,[,],,,"""
,,,,,"""

HTML_ACTIVITIES_GP_FOOTER = """
]
]
});</script></body></html>"""
}});</script></body></html>"""

HTML_ACTIVITY_AS = { # Google+
'id': tag_uri('z13gjrz4ymeldtd5f04chnrixnvpjjqy42o'),
Expand Down Expand Up @@ -523,7 +522,7 @@ def test_html_to_activities(self):

def test_html_to_activities_plusoned(self):
html_gp = copy.deepcopy(HTML_ACTIVITY_GP)
html_gp[1][6].values()[0][69] = [
html_gp[0][6].values()[0][69] = [
202,
[['Billy Bob',
'1056789',
Expand Down Expand Up @@ -555,7 +554,7 @@ def test_html_to_activities_plusoned(self):
def test_html_to_activities_similar_to_plusoned(self):
html_gp = copy.deepcopy(HTML_ACTIVITY_GP)
for data_at_69 in None, [], [None], [None, None], [None, [None]]:
html_gp[1][6].values()[0][69] = data_at_69
html_gp[0][6].values()[0][69] = data_at_69
html = (HTML_ACTIVITIES_GP_HEADER + json.dumps(html_gp) +
HTML_ACTIVITIES_GP_FOOTER)
self.assert_equals([HTML_ACTIVITY_AS],
Expand Down

0 comments on commit abfd2b7

Please sign in to comment.