mf2: remember which content is HTML, keep its newlines, and don't translate them to <br>

For #130; also related to #80. I strongly suspect this will cause a regression somewhere, but I'm not sure where yet. :/
snarfed · Feb 26, 2018 · 6260e32 · 6260e32
1 parent 0b94583
commit 6260e32
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -243,6 +243,7 @@ Changelog
   * Add `fetch_mf2` kwarg to `json_to_object()` for fetching additional pages over HTTP to determine authorship.
   * Generate explicit blank `p-name` in HTML to prevent old flawed [implied p-name handling](http://microformats.org/wiki/microformats2-implied-properties) [#131](https://github.com/snarfed/granary/issues/131).
   * Fix `share` verb handling in `activity_to_json()` and `activities_to_html()` [#134](https://github.com/snarfed/granary/issues/134).
+  * Remember which content contains HTML, preserve newlines in it, and don't translate those newlines to `<br>`s ([#130](https://github.com/snarfed/granary/issues/130)).
 * Atom:
   * Fix timezone bugs in `updated` and `published`.
 * JSON Feed:

diff --git a/granary/microformats2.py b/granary/microformats2.py
@@ -103,7 +103,7 @@ def get_string_urls(objs):
   return urls
 
 
-def get_html(val, keep_newlines=False):
+def get_html(val):
   """Returns a string value that may have HTML markup.
 
   Args:
@@ -119,8 +119,6 @@ def get_html(val, keep_newlines=False):
     # https://github.com/snarfed/granary/issues/80
     # https://indiewebcamp.com/note#Indieweb_whitespace_thinking
     html = val['html']
-    if not keep_newlines:
-      html = html.replace('\n', ' ')
     return html.strip()
 
   return get_text(val)
@@ -462,7 +460,14 @@ def html_to_activities(html, url=None, actor=None):
   parsed = mf2py.parse(doc=html, url=url)
   hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
   items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
-  return [{'object': json_to_object(item, actor=actor)} for item in items]
+
+  activities = []
+  for item in items:
+    obj = json_to_object(item, actor=actor)
+    obj['content_is_html'] = True
+    activities.append({'object': obj})
+
+  return activities
 
 
 def activities_to_html(activities):
@@ -571,7 +576,7 @@ def json_to_html(obj, parent_props=None):
         children.append(json_to_html(target, ['u-' + mftype + '-of']))
 
   # set up content and name
-  content_html = get_html(prop.get('content', {}), keep_newlines=True)
+  content_html = get_html(prop.get('content', {}))
   content_classes = []
 
   if content_html:
@@ -743,9 +748,10 @@ def render_content(obj, include_location=True, synthesize_content=True,
 
     content += orig[last_end:]
 
-  # convert newlines to <br>s
-  # do this *after* linkifying tags so we don't have to shuffle indices over
-  content = content.replace('\n', '<br />\n')
+  if not obj.get('content_is_html'):
+    # convert newlines to <br>s
+    # do this *after* linkifying tags so we don't have to shuffle indices over
+    content = content.replace('\n', '<br />\n')
 
   # linkify embedded links. ignore the "mention" tags that we added ourselves.
   # TODO: fix the bug in test_linkify_broken() in webutil/util_test.py, then

diff --git a/granary/test/test_atom.py b/granary/test/test_atom.py
@@ -528,12 +528,19 @@ def test_html_to_atom(self):
   <activity:object-type>http://activitystrea.ms/schema/1.0/note</activity:object-type>
 
   <id>http://my/post</id>
-  <title>my content</title>
+  <title>my content
+x
+y
+z</title>
 
   <content type="xhtml">
   <div xmlns="http://www.w3.org/1999/xhtml">
 
 my content
+<pre>  x
+    y
+ z
+</pre>
 
   </div>
   </content>
@@ -561,7 +568,14 @@ def test_html_to_atom(self):
 
 <article class="h-entry">
 <a class="u-url" href="http://my/post" />
-<div class="e-content">my content</div>
+<div class="e-content">
+my content
+<pre>
+  x
+    y
+ z
+</pre>
+</div>
 </article>
 </div>
 """, 'https://my.site/feed'),

diff --git a/granary/test/test_microformats2.py b/granary/test/test_microformats2.py
@@ -641,11 +641,11 @@ def test_json_to_object_converts_text_newlines_to_brs(self):
       'properties': {'content': [{'value': 'asdf\nqwer'}]},
     }))
 
-  def test_json_to_object_drops_html_newlines(self):
-    """HTML newlines should be discarded."""
+  def test_json_to_object_keeps_html_newlines(self):
+    """HTML newlines should be preserved."""
     self.assert_equals({
       'objectType': 'note',
-      'content': 'asdf qwer',
+      'content': 'asdf\nqwer',
     }, microformats2.json_to_object({
       'properties': {'content': [{'html': 'asdf\nqwer', 'value': ''}]},
     }))

diff --git a/granary/test/testdata/repost.as-from-mf2.json b/granary/test/testdata/repost.as-from-mf2.json
@@ -4,7 +4,7 @@
   "id": "tag:example.com,2001:3344",
   "published": "2012-12-05T00:58:26+00:00",
   "url": "http://example.com/this/repost",
-  "content": "Shared <a href=\"http://example.com/original/post\">a post</a> by   <span class=\"h-card\">     <a class=\"p-name u-url\" href=\"http://example.com/bob\">Bob</a>        </span> The original post",
+  "content": "Shared <a href=\"http://example.com/original/post\">a post</a> by   <span class=\"h-card\">\n   \n<a class=\"p-name u-url\" href=\"http://example.com/bob\">Bob</a>\n    \n  </span>\nThe original post",
   "object": {
     "author": {
       "objectType": "person",