posse-post-discovery: reduce number of NDB operations #199

Merged
merged 3 commits into from Jun 23, 2014
49 changes: 38 additions & 11 deletions models.py
@@ -629,6 +629,14 @@ def query_by_original(cls, source, url):
return cls.query(cls.original == url,
ancestor=source.key).get()

@classmethod
def query_by_originals(cls, source, urls):
# 30 item limit for IN queries, chain together multiple queries
urls = list(urls)
return itertools.chain.from_iterable(
cls.query(cls.original.IN(urls[i:i + 30]), ancestor=source.key)
for i in xrange(0, len(urls), 30))
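For reference, the batching arithmetic behind query_by_originals can be sketched standalone (plain Python, no ndb dependency; names here are illustrative): the datastore rejects IN filters with more than 30 values, so the URL list is sliced into windows of 30 and the per-slice results are chained back into a single iterable.

```python
from itertools import chain

def batched(seq, size=30):
    """Slice seq into consecutive chunks of at most `size` items,
    mirroring the 30-value limit on datastore IN queries."""
    seq = list(seq)
    return [seq[i:i + size] for i in range(0, len(seq), size)]

urls = ['http://example.com/post/%d' % i for i in range(65)]
chunks = batched(urls)
sizes = [len(c) for c in chunks]  # 65 urls -> chunks of 30, 30, and 5

# chaining the per-chunk results back together preserves input order
flattened = list(chain.from_iterable(chunks))
```

Each chunk would become one IN query in the real method; chaining the query iterators keeps the caller's interface identical to a single query.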

@classmethod
def query_by_syndication(cls, source, url):
return cls.query(cls.syndication == url,
@@ -638,11 +646,22 @@ def query_by_syndication(cls, source, url):
@ndb.transactional
def get_or_insert_by_syndication_url(cls, source, syndication,
original):
"""Insert a relationship from syndication-url -> original.
"""Insert a relationship from syndication-url -> original, replacing
Owner:

i haven't fully followed the logic here yet, but i'm curious what will happen if we have an existing (non-blank) SyndicatedPost where one of the values is the same, but the other is different. i'm guessing we'll leave it, and not store the new relationship?

i haven't thought through what we should do in that case, but i'm guessing we should allow overlapping entities like that, ie leave the existing one and also store the new one.

one catch is when an existing relationship is removed from an h-entry. the concrete case would be you posse to twitter, we grab your rel-syndication link, then later you notice that your possed tweet had a typo, so you delete it, post a new correct one, and update your rel-syndication link.

doesn't have to be handled in this PR, but worth thinking about.

Contributor Author:

Good point. Right now the refetch behavior explicitly only tries to update blanks...does not support changes yet. Sigh :)

> i'm curious what will happen if we have an existing (non-blank) SyndicatedPost where one of the values is the same, but the other is different.

Differs depending on which value is new.

If there is an existing this original -> different syndication relationship, a) we should not be in this method because there is an earlier check for a preexisting relationship that should have prevented it, and b) if we somehow are here, this would add a new SyndicatedPost for the same original URL.

If there is an existing different original -> this syndication relationship, this method would not save a new one.

Owner:

ok, thanks. i'm fine with just adding a TODO, or even just a comment if we're happy with the current behavior for now. we can file an issue to eventually support changed rel-syndication urls in the future if we want.

btw, if you're pretty confident that we should never get here if there's an existing this original -> different syndication relationship, then maybe add an assert?

Contributor Author:

OK I added some comments and restructured the method to be clearer. There is already a TODO to add support for updated URLs in tasks.refetch_hfeed if that is good enough.

And actually it should be OK for there to be multiple syndication URLs (even for the same Source) pointing to the same original, like in the case of syndicating a longer post to a pmarca-style tweetstorm. So I was wrong to say that we shouldn't ever be there with this original -> different syndication.

Owner:

thanks!

so, i'm still thinking through the logic and all possible cases here. it sounds like this is a fully many-to-many relationship, ie we expect that we'll see some SyndicatedPosts with the same original and different syndications (use case: pmarca), and we also expect some with the same syndication and different originals (use case: amending a previous post).

given that, i suspect we need to fully replace query_by_original with query_by_originals, and same with syndication, and change all callers to handle (usually iterate over) multiple results. does that sound right?

apologies if this is all scope creep for this PR. i'm honestly not sure myself. :P

also, minor: consider renaming this method get_or_insert_with, or something similar, since it provides both original and syndication.

blank placeholder relationships if they exist.

This does a check-and-set inside a transaction to avoid putting
duplicates in the database because we assume each syndicated post
can only have one original.
duplicates in the database because we assume each syndication URL
can only have one original. If there is already a non-blank
SyndicatedPost for this syndication URL, this function will return
without saving anything.

If there is a pre-existing non-blank SyndicationPost for this
original, this function will add another relationship for the same
original.

If there are pre-existing syndication->None or original->None
relationships, this function will remove them before adding a
new non-blank relationship.

Args:
source: models.Source subclass
@@ -651,14 +670,22 @@ def get_or_insert_by_syndication_url(cls, source, syndication,
"""
relationship = cls.query_by_syndication(source, syndication)

# replace blank relationships with newly discovered ones
if relationship and original and not relationship.original:
relationship.key.delete()
relationship = None
# do not overwrite a preexisting relationship
if relationship and relationship.original:
Owner:

(if we do change the query callers to handle multiple results, i think we'd look for a SyndicatedPost that matches both original and syndication here, and continue if we don't find one, right?)

Contributor Author:

Yes definitely, when we handle >1 original per syndication URL in the future, this will only need to give up in the case of exact duplicates.

Right now, the syndication column is supposed to be unique, so it gives up on the existence of any non-blank SyndicatedPost for this syndication URL.

return relationship

# if this is a non-blank relationship, remove pre-existing blanks
if original and syndication:
# remove syndication->None relationships
if relationship and not relationship.original:
relationship.key.delete()

if not relationship:
relationship = cls(parent=source.key, original=original,
syndication=syndication)
relationship.put()
# remove original->None relationships too
rel_by_original = cls.query_by_original(source, original)
if rel_by_original and not rel_by_original.syndication:
rel_by_original.key.delete()

relationship = cls(parent=source.key, original=original,
syndication=syndication)
relationship.put()
return relationship
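Setting the transaction machinery aside, the check-and-set flow above can be modeled with an in-memory list of relationships (a sketch only; the function name and the plain-dict representation are illustrative stand-ins for the NDB entities, not Bridgy's actual API):

```python
def get_or_insert(store, syndication, original):
    """Model of get_or_insert_by_syndication_url's branching, with a
    list of {'original', 'syndication'} dicts standing in for the
    datastore."""
    matches = [r for r in store if r['syndication'] == syndication]
    rel = matches[0] if matches else None

    # never overwrite a preexisting non-blank relationship
    if rel and rel['original']:
        return rel

    # a non-blank relationship replaces any blank placeholders
    if original and syndication:
        # drop a blank syndication->None placeholder
        if rel and not rel['original']:
            store.remove(rel)
        # drop blank original->None placeholders too
        for blank in [r for r in store
                      if r['original'] == original
                      and not r['syndication']]:
            store.remove(blank)

    rel = {'original': original, 'syndication': syndication}
    store.append(rel)
    return rel
```

A blank placeholder inserted first is thus deleted and replaced once the real original is discovered, while a later conflicting original for the same syndication URL is ignored, matching the "do not overwrite" early return in the diff.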
38 changes: 19 additions & 19 deletions original_post_discovery.py
@@ -252,10 +252,15 @@ def _process_author(source, author_url, refetch_blanks=False):
for permalink in child['properties'].get('url', []):
permalinks.add(permalink)

# query all preexisting permalinks at once, instead of once per link
preexisting = {r.original: r for r in
SyndicatedPost.query_by_originals(source, permalinks)}

results = {}
for permalink in permalinks:
logging.debug('processing permalink: %s', permalink)
results.update(_process_entry(source, permalink, refetch_blanks))
results.update(_process_entry(source, permalink, refetch_blanks,
preexisting))

if results:
# keep track of the last time we've seen rel=syndication urls for
@@ -269,36 +274,31 @@ def _process_author(source, author_url, refetch_blanks=False):
return results


def _process_entry(source, permalink, refetch_blanks):
def _process_entry(source, permalink, refetch_blanks, preexisting):
"""Fetch and process an h-entry, saving a new SyndicatedPost to the
DB if successful.

Args:
permalink: url of the unprocessed post
syndication_url: url of the syndicated content
refetch_blanks: boolean whether we should ignore blank preexisting
SyndicatedPosts
preexisting: dict of original url to SyndicatedPost

Return:
a map from syndicated url to new models.SyndicatedPosts
a dict from syndicated url to new models.SyndicatedPosts
"""
results = {}
preexisting_relationship = preexisting.get(permalink)

# TODO replace this with one query for the Source as a
# whole. querying each permalink individually is expensive.
preexisting_relationship = SyndicatedPost.query_by_original(source, permalink)

# refetching to look for SyndicatedPosts that didn't have
# a rel=syndication url the first time we checked
if (refetch_blanks and preexisting_relationship
and not preexisting_relationship.syndication):
logging.debug('deleting blank SyndicatedPost for original %s',
permalink)
preexisting_relationship.key.delete()
preexisting_relationship = None

# if the post has already been processed. do not add to the results
# since this method only returns *newly* discovered relationships
# if the post has already been processed, do not add to the results
# since this method only returns *newly* discovered relationships.
if preexisting_relationship:
return results
# if we're refetching blanks and this one is blank, do not return
if refetch_blanks and not preexisting_relationship.syndication:
logging.debug('ignoring blank relationship for original %s', permalink)
else:
return results

syndication_urls = set()
parsed = None
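The refetch branching at the top of the reworked _process_entry can be summarized as a small predicate (a sketch; the function name and the dict shape are hypothetical, not code from the PR):

```python
def should_process(preexisting_rel, refetch_blanks):
    """Return True when _process_entry should fetch the permalink:
    either it has never been seen before, or it is a blank
    placeholder and refetch_blanks asks us to retry it."""
    if preexisting_rel is None:
        return True   # newly discovered permalink: process it
    if refetch_blanks and not preexisting_rel.get('syndication'):
        return True   # blank placeholder: try again to fill it in
    return False      # known relationship: skip, only report new ones
```

This captures why the method returns early for known non-blank relationships: it only reports *newly* discovered syndication links.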