From 4e95e16e6edab6d8af36607568f2d9b638d1d33d Mon Sep 17 00:00:00 2001
From: Chris Siebenmann <cks.git01@cs.toronto.edu>
Date: Tue, 15 Sep 2015 16:05:06 -0400
Subject: [PATCH] wikirend.py: cancel unclosed nested font styles

If we are closing a font style and we encounter unclosed nested inner
font styles, we reach back into the already rendered results and cancel
those font styles by replacing each one's opening tag with its original
raw characters.  This fixes the rendering of:

	_x{1}x*_ .... _xx*_

This is a hack. The need for this hack shows that our current single
pass approach to rendering out wikitext is flawed; in an ideal world it
would be clear to the renderer that the first '*' did not have a closing
'*' within its scope. But we don't actually work out scopes; we just
do textual lookahead, so the first '*' sees the second '*' and thinks
there's a matched pair.

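We can only do this while closing an outer font style that has unclosed
styles nested inside it; doing it anywhere else (e.g. at the end of a
paragraph) misfires, for reasons beyond the scope of this commit message.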
---
 wikirend.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 5 deletions(-)
diff --git a/wikirend.py b/wikirend.py
index 0951582..b3dff89 100644
--- a/wikirend.py
+++ b/wikirend.py
@@ -279,6 +279,44 @@ def inline_plaintext(rend, txt):
 # for this, but I didn't so that's life and we cope.	- cks
 font_end_res['_'] = font_start_res['_'] = re.compile("(.*?)_")
 
+# Reverse inline font styles by reaching back into the rendered content
+# and converting the start tag to its original character. This is a
+# hack, but a convenient one that fixes various rendering issues for
+# stuff that people write.
+#
+# As a hack, it only really works when closing improperly nested font
+# styles, because in that case we can be fairly certain that we are
+# reaching back to fix the right opening tag. It cannot be used to fix
+# up paragraphs without various sorts of malfunctions; for them, the
+# errant unclosed font style just runs to the end of the paragraph.
+#
+# This should not be necessary if we properly matched start and end
+# tags in general, but we only do purely textual forward lookup for
+# end tags and their positions and that can be fooled by tags inside
+# various nesting constructs (e.g. '((...))' and '[[...]]'). Really we
+# need an additional resolution pass over the raw tokenized text.
+#
+_lpairs = { "</em>": ["<em>", "*"], "</strong>": ["<strong>", "~~"],
+	    "</code>": ["<code>", "_"], }
+def unwind_inline(rend, offtag):
+	assert offtag in rend.inlineEndStack
+	while rend.inlineEndStack:
+		s = rend.inlineEndStack.pop(0)
+		if s == offtag:
+			break
+		if s not in _lpairs:
+			rend.result.append(s)
+			continue
+		src, rep = _lpairs[s]
+		for i in range(len(rend.result)-1, -1, -1):
+			if rend.result[i] == src:
+				rend.result[i] = rep
+				break
+		else:
+			# start tag not found (should never happen?!); emit end tag
+			rend.result.append(s)
+	rend.result.append(offtag)
+
 def inline_font(rend, style, text):
 	if style == '*':
 		hstyle = 'em'
@@ -308,11 +346,15 @@ def inline_font(rend, style, text):
 	# bail.
 	offtag = end_entity[hstyle]
 	if offtag in rend.inlineEndStack:
-		s = rend.inlineEndStack.pop(0)
-		while s != offtag:
-			rend.result.append(s)
-			s = rend.inlineEndStack.pop(0)
-		rend.result.append(s)
+		#s = rend.inlineEndStack.pop(0)
+		#while s != offtag:
+		#	rend.result.append(s)
+		#	s = rend.inlineEndStack.pop(0)
+		#rend.result.append(s)
+		# rather than closing off unterminated inline styles,
+		# we unwind them, turning the start tag into its original
+		# string. this is imperfect.
+		unwind_inline(rend, offtag)
 		return
 
 	# We insist that start tags be followed by non-whitespace.