-
Notifications
You must be signed in to change notification settings - Fork 51
/
original_post_discovery.py
550 lines (463 loc) 路 20.4 KB
/
original_post_discovery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
"""Augments the standard original_post_discovery algorithm with a
reverse lookup that supports posts without a backlink or citation.
Performs a reverse-lookup that scans the activity's author's h-feed
for posts with rel=syndication links. As we find syndicated copies,
save the relationship. If we find the original post for the activity
in question, return the original's URL.
See http://indiewebcamp.com/posse-post-discovery for more detail.
This feature adds costs in terms of HTTP requests and database
lookups in the following primary cases:
- Author's domain is known to be invalid or blacklisted, there will
be 0 requests and 0 DB lookups.
- For a syndicated post that has been seen previously (regardless of
whether discovery was successful), there will be 0 requests and 1
DB lookup.
- The first time a syndicated post has been seen:
- 1 to 2 HTTP requests to get and parse the h-feed plus 1 additional
request for *each* post permalink that has not been seen before.
- 1 DB query for the initial check plus 1 additional DB query for
*each* post permalink.
"""
import datetime
import itertools
import logging
import mf2py
import requests
import urlparse
import util
from granary import source as gr_source
from google.appengine.api.datastore import MAX_ALLOWABLE_QUERIES
from bs4 import BeautifulSoup
import models
from models import SyndicatedPost
from google.appengine.api import memcache
MAX_AUTHOR_URLS = 5
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously
  found SyndicatedPosts but will not do posse-post-discovery to find
  new ones.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs

  Returns: ([string original post URLs], [string mention URLs]) tuple
  """
  if not source.updates:
    source.updates = {}

  # start with the silo-agnostic discovery from granary
  originals, mentions = gr_source.Source.original_post_discovery(
      activity, domains=source.domains, cache=memcache,
      include_redirect_sources=include_redirect_sources,
      headers=util.USER_AGENT_HEADER)

  activity_obj = activity.get('object', {})
  author_id = (activity_obj.get('author', {}).get('id') or
               activity.get('author', {}).get('id'))
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  def to_webmention_targets(urls):
    # resolve redirects and keep only urls that are valid webmention targets
    targets = set()
    for candidate in urls:
      final_url, _, send = util.get_webmention_target(candidate)
      if send:
        targets.add(final_url)
        if include_redirect_sources:
          targets.add(candidate)
    return targets

  originals = to_webmention_targets(originals)
  mentions = to_webmention_targets(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return originals, mentions

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndication_url = activity_obj.get('url') or activity.get('url')
  if not syndication_url:
    logging.debug('no syndication url, cannot process h-entries')
    return originals, mentions

  # use the canonical syndication url on both sides, so that we have
  # the best chance of finding a match. Some silos allow several
  # different permalink formats to point to the same place (e.g.,
  # facebook user id instead of user name)
  syndication_url = source.canonicalize_syndication_url(
      util.follow_redirects(syndication_url).url)
  originals.update(_posse_post_discovery(
      source, activity, syndication_url, fetch_hfeed))

  return originals, mentions
def refetch(source):
  """Refetch the author's URLs and look for new or updated syndication
  links that might not have been there the first time we looked.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPosts
  """
  if not source.updates:
    source.updates = {}
  logging.debug('attempting to refetch h-feed for %s', source.label())

  # process every author URL, merging the discovered relationships
  discovered = {}
  for author_url in _get_author_urls(source):
    discovered.update(_process_author(source, author_url, refetch=True))

  # remember when we last looked, so polling can decide when to refetch next
  now = util.now_fn()
  logging.debug('updating source last_hfeed_fetch %s', now)
  source.updates['last_hfeed_fetch'] = now
  return discovered
def targets_for_response(resp, originals, mentions):
  """Returns the URLs that we should send webmentions to for a given response.

  ...specifically, all responses except posts get sent to original post URLs,
  but only posts and comments get sent to mentioned URLs.

  Args:
    resp: ActivityStreams response object
    originals, mentions: sequence of string URLs

  Returns: set of string URLs
  """
  # renamed from `type` so we don't shadow the builtin
  resp_type = models.Response.get_type(resp)
  targets = set()
  if resp_type != 'post':
    # use update() instead of |= so any sequence (not just a set) works,
    # matching the docstring's contract
    targets.update(originals)
  if resp_type in ('post', 'comment'):
    targets.update(mentions)
  return targets
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discover.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
      trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
      author's feed if we don't have a previously stored
      relationship.

  Return:
    sequence of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s', syndication_url)
  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in _get_author_urls(source):
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url, [])
    now = util.now_fn()
    logging.debug('updating source last_hfeed_fetch %s', now)
    # store the same timestamp we just computed and logged. previously this
    # called util.now_fn() a second time, so the stored value could differ
    # slightly from the logged one (and from refetch()'s behavior).
    source.updates['last_hfeed_fetch'] = now

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  originals = [r.original for r in relationships if r.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Fetches the author's h-feed (and any rel=feed pages it links to), maps
  each h-entry permalink to its entry, then delegates per-permalink work
  to _process_entry.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}
  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  # parse the homepage and collect its top-level h-feed items
  author_dom = BeautifulSoup(author_resp.text)
  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue
    # hrefs may be relative to the author page
    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'
    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  # fetch each rel-feed page and merge its items into feeditems
  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))
      # a rel-feed on another domain implies the author owns that domain too
      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url,
                      exc_info=True)

  # map each h-entry's string permalink(s) to the entry itself
  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than,
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  # group the preexisting SyndicatedPosts by their original permalink
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  # process each permalink and merge the newly discovered relationships
  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
def _merge_hfeeds(feed1, feed2):
"""Merge items from two h-feeds into a composite feed. Skips items in
feed2 that are already represented in feed1, based on the "url" property.
Args:
feed1: a list of dicts
feed2: a list of dicts
Returns:
a list of dicts
"""
seen = set()
for item in feed1:
for url in item.get('properties', {}).get('url', []):
if isinstance(url, basestring):
seen.add(url)
return feed1 + [item for item in feed2 if all(
url not in seen for url in item.get('properties', {}).get('url', []))]
def _find_feed_items(feed_url, feed_doc):
  """Extract feed items from a given URL and document. If the top-level
  h-* item is an h-feed, return its children. Otherwise, returns the
  top-level items.

  Args:
    feed_url: a string. the URL passed to mf2py parser
    feed_doc: a string or BeautifulSoup object. document is passed to
      mf2py parser

  Returns:
    a list of dicts, each one representing an mf2 h-* item
  """
  parsed = mf2py.Parser(url=feed_url, doc=feed_doc).to_dict()
  items = parsed['items']
  # prefer the children of the first explicit h-feed, if any
  for item in items:
    if 'h-feed' in item['type']:
      return item.get('children', [])
  logging.debug('No h-feed found, fallback to top-level h-entrys.')
  return items
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: models.Source subclass
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      # robustness fix: default 'rels' to {} so a parse result without a
      # rels dict doesn't raise AttributeError
      relsynd = parsed.get('rels', {}).get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    # bug fix: materialize the chain into a list. membership tests ('in')
    # against a bare generator consume it, so every preexisting post after
    # the first check was compared against an exhausted iterator and could
    # be deleted even though it still appears in results.
    result_syndposts = list(itertools.chain(*results.values()))
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new SyndicatedPost in the
  db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls
    preexisting: a list of previously discovered SyndicatedPosts

  Returns: dict mapping string syndication url to list of SyndicatedPost
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for raw_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    canonical = util.follow_redirects(raw_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    canonical = source.canonicalize_syndication_url(canonical)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    if util.domain_from_link(canonical) != source.GR_CLASS.DOMAIN:
      continue
    # we may have already seen this relationship, save a DB lookup by
    # finding it in the preexisting list
    match = None
    for sp in preexisting:
      if sp.syndication == canonical and sp.original == permalink:
        match = sp
        break
    if not match:
      logging.debug('saving discovered relationship %s -> %s',
                    canonical, permalink)
      match = SyndicatedPost.insert(
        source, syndication=canonical, original=permalink)
    results.setdefault(canonical, []).append(match)
  return results
def _get_author_urls(source):
  """Return the author URLs to run posse-post-discovery against,
  capped at MAX_AUTHOR_URLS entries (extras are logged and dropped)."""
  urls = source.get_author_urls()
  if len(urls) <= MAX_AUTHOR_URLS:
    return urls
  logging.warning('user has over %d URLs! only running PPD on %s. skipping %s.',
                  MAX_AUTHOR_URLS, urls[:MAX_AUTHOR_URLS], urls[MAX_AUTHOR_URLS:])
  return urls[:MAX_AUTHOR_URLS]