wikirend.py: cancel unclosed nested font styles

If we are closing a font style and we encounter unclosed nested inner font styles, we reach back into the already rendered results and cancel those font styles by replacing each opening font style tag with its raw characters.

This fixes the rendering of:

    _x{1}x*_ .... _xx*_

This is a hack. The need for this hack shows that our current single-pass approach to rendering wikitext is flawed; in an ideal world it would be clear to the renderer that the first '*' did not have a closing '*' within its scope. But we don't actually work out scopes; we just do textual lookahead, so the first '*' sees the second '*' and thinks there's a matched pair.

We can only do this while closing a style that has other styles nested inside it, for reasons beyond the scope of this commit message.
siebenmann · Sep 15, 2015 · 4e95e16 · 4e95e16
1 parent f4f1301
commit 4e95e16
Showing 1 changed file with 47 additions and 5 deletions.
diff --git a/wikirend.py b/wikirend.py
@@ -279,6 +279,44 @@ def inline_plaintext(rend, txt):
 # for this, but I didn't so that's life and we cope.	- cks
 font_end_res['_'] = font_start_res['_'] = re.compile("(.*?)_")
 
+# Reverse inline font styles by reaching back into the rendered content
+# and converting the start tag to its original character. This is a
+# hack, but a convenient one that fixes various rendering issues for
+# stuff that people write.
+#
+# As a hack, it really only works when closing improperly nested font
+# styles, because in this case we can be fairly certain that we are
+# reaching back to fix the proper opening. It cannot be used to fix
+# up paragraphs without various sorts of malfunctions; for them, the
+# errant unclosed font style just runs to the end of the paragraph.
+#
+# This should not be necessary if we properly matched start and end
+# tags in general, but we only do purely textual forward lookup for
+# end tags and their positions and that can be fooled by tags inside
+# various nesting constructs (eg '((...))' and '[[..]]'). Really we
+# need an additional resolution pass over the raw tokenized text.
+#
+_lpairs = { "</em>": ["<em>", "*"], "</strong>": ["<strong>", "~~"],  # end tag -> [start tag, raw wikitext marker]
+	    "</code>": ["<code>", "_"], }
+def unwind_inline(rend, offtag):  # close offtag, cancelling unclosed font styles nested inside it
+	assert offtag in rend.inlineEndStack  # precondition; inline_font checks this before calling
+	while rend.inlineEndStack:
+		s = rend.inlineEndStack.pop(0)  # pending end tags are taken from the front of the list
+		if s == offtag:
+			break  # reached the style that is actually being closed
+		if s not in _lpairs:
+			rend.result.append(s)  # not one of the font styles above: just emit its end tag
+			continue
+		src, rep = _lpairs[s]
+		for i in range(len(rend.result)-1, -1, -1):  # scan the rendered output from newest to oldest
+			if rend.result[i] == src:
+				rend.result[i] = rep  # turn the most recent start tag back into its raw text
+				break
+		else:
+			# should never happen?!
+			rend.result.append(s)  # no start tag found in the output: fall back to emitting the end tag
+	rend.result.append(offtag)
+
 def inline_font(rend, style, text):
 	if style == '*':
 		hstyle = 'em'
@@ -308,11 +346,15 @@ def inline_font(rend, style, text):
 	# bail.
 	offtag = end_entity[hstyle]
 	if offtag in rend.inlineEndStack:
-		s = rend.inlineEndStack.pop(0)
-		while s != offtag:
-			rend.result.append(s)
-			s = rend.inlineEndStack.pop(0)
-		rend.result.append(s)
+		#s = rend.inlineEndStack.pop(0)
+		#while s != offtag:
+		#	rend.result.append(s)
+		#	s = rend.inlineEndStack.pop(0)
+		#rend.result.append(s)
+		# rather than closing off unterminated inline styles,
+		# we unwind them, turning the start tag into its original
+		# string. this is imperfect.
+		unwind_inline(rend, offtag)
 		return
 
 	# We insist that start tags be followed by non-whitespace.