# Generate beautiful HTML diffs from two texts

The goal is to use this as the HTML output of the `generate_diff.py` script.

In [1]:
# open pre-computed files for now
# Both files are read completely into memory as text; `before_txt` holds the
# "source" version and `after_txt` the "modified" version of the same document.
# NOTE(review): no `encoding=` is passed, so decoding uses the platform default
# -- confirm the files' encoding and consider making it explicit.

with open("../output/Strafprozessordnung_source_0145-21.pdf.txt", "r") as before_file:
    before_txt = before_file.read()
    
with open("../output/Strafprozessordnung_modified_0145-21.pdf.txt", "r") as after_file:
    after_txt = after_file.read()

## Main part

In [52]:
from typing import List, Any, Callable, Tuple, Union

# A token is a plain string (a piece of text, or the whitespace following it).
Token = str
# A list of tokens; the same alias is also used for lists of lines.
TokenList = List[Token]

In [65]:
import spacy

# Load the spaCy pipeline "de_core_news_sm" once at module level; tokenize()
# calls this shared `nlp` object for every string it processes.
nlp = spacy.load("de_core_news_sm")

In [150]:
import regex as re


# Pre-compiled patterns. Raw strings are used so that the backslash reaches
# the regex engine instead of being treated as an (invalid) Python string
# escape, which triggers a warning on current Python versions.
whitespace = re.compile(r'\s+')  # one or more whitespace characters
end_sentence = re.compile(r'[.!?]\s+')  # '.', '!' or '?' followed by whitespace


def tokenize(s: str) -> TokenList:
    """Split ``s`` into token texts interleaved with their trailing whitespace.

    The string is run through the module-level ``nlp`` pipeline. Every token
    contributes two consecutive entries to the result: its ``text`` and its
    ``whitespace_`` attribute, so the returned list is twice as long as the
    number of tokens.
    """
    return [piece for tok in nlp(s) for piece in (tok.text, tok.whitespace_)]

def untokenize(ts: TokenList) -> str:
    """Concatenate the tokens in ``ts`` into one string, adding no separator."""
    separator = ''
    return separator.join(ts)

def sentencize(s: str) -> TokenList:
    """Split ``s`` into "sentences", which here means its lines.

    The split happens on every ``"\\n"``; the newline characters themselves
    are not part of the returned pieces.
    """
    lines = s.split("\n")
    return lines

def unsentencise(ts: TokenList) -> str:
    """Concatenate a list of sentences into a single string.

    No separator is inserted between the pieces, so this does not restore
    newlines that were removed when splitting.
    """
    glue = ''
    return glue.join(ts)

def html_unsentencise(ts: TokenList) -> str:
    """Join a list of sentences into one string intended for HTML display.

    Each sentence is formatted as-is (no wrapping markup is added) and the
    results are concatenated without a separator.
    """
    rendered = [f'{t}' for t in ts]
    return ''.join(rendered)

In [271]:
import difflib


def mark_text(text: str) -> str:
    """Wrap ``text`` in a ``<span>`` that colours it red."""
    opening = '<span style="color: red;">'
    closing = '</span>'
    return f'{opening}{text}{closing}'
    
    
def mark_span_left(text: TokenList) -> TokenList:
    """Highlight a run of tokens with the left-hand (red-ish) background.

    The opening ``<span>`` tag is prepended to the first token and the closing
    tag appended to the last one. The list is modified in place and the same
    list object is returned; an empty list is returned untouched.
    """
    if len(text) == 0:
        return text
    text[0] = '<span style="background: #ff5631;border-radius: 5px;">' + text[0]
    text[-1] = text[-1] + '</span>'
    return text


def mark_span_right(text: TokenList) -> TokenList:
    """Highlight a run of tokens with the right-hand (green) background.

    The opening ``<span>`` tag is prepended to the first token and the closing
    tag appended to the last one. The list is modified in place and the same
    list object is returned; an empty list is returned untouched.
    """
    if len(text) == 0:
        return text
    text[0] = '<span style="background: #2ea769;border-radius: 3px;">' + text[0]
    text[-1] = text[-1] + '</span>'
    return text


def markup_diff(a: TokenList, b: TokenList,
                mark_left: Callable[[TokenList], TokenList] = mark_span_left,
                mark_right: Callable[[TokenList], TokenList] = mark_span_right,
                default_mark: Callable[[TokenList], TokenList] = lambda x: x,
                isjunk: Union[None, Callable[[Token], bool]] = None) -> Tuple[TokenList, TokenList]:
    """Return copies of ``a`` and ``b`` with their differences marked up.

    The two token lists are compared with ``difflib.SequenceMatcher``
    (``autojunk`` disabled). Runs reported as ``'equal'`` are passed through
    ``default_mark``; every other run of ``a`` goes through ``mark_left`` and
    the corresponding run of ``b`` through ``mark_right``.

    Args:
        a: Tokens of the left-hand text.
        b: Tokens of the right-hand text.
        mark_left: Called with each differing slice of ``a``.
        mark_right: Called with each differing slice of ``b``.
        default_mark: Called with each matching slice of ``a`` and of ``b``.
        isjunk: Forwarded to ``SequenceMatcher``; junk is ignored by the differ.

    Returns:
        ``(out_a, out_b)``, the processed token lists.

    Raises:
        AssertionError: If a mark function changed the number of tokens.
    """
    seqmatcher = difflib.SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)
    out_a, out_b = [], []
    for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        if tag == 'equal':
            markup_left = markup_right = default_mark
        else:
            markup_left, markup_right = mark_left, mark_right
        # The mark functions receive slices of the inputs, not the inputs
        # themselves; the slice may be empty (e.g. the `a` side of an insert).
        out_a += markup_left(a[a0:a1])
        out_b += markup_right(b[b0:b1])
    # Marking must only decorate tokens, never add or drop them.
    assert len(out_a) == len(a), "mark functions must preserve the token count of a"
    assert len(out_b) == len(b), "mark functions must preserve the token count of b"
    return out_a, out_b

In [272]:
def align_seqs(a: TokenList, b: TokenList, fill: Token = '') -> Tuple[TokenList, TokenList]:
    """Pad ``a`` and ``b`` so that matching entries end up at the same index.

    The sequences are compared with ``difflib.SequenceMatcher`` (``autojunk``
    disabled). For every opcode, the shorter of the two slices is extended
    with ``fill`` until both slices have equal length, and the padded slices
    are appended to the outputs.

    Returns:
        ``(out_a, out_b)``, two lists of identical length.
    """
    matcher = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    padded_a, padded_b = [], []
    for _tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        chunk_a = a[a_lo:a_hi]
        chunk_b = b[b_lo:b_hi]
        width = max(len(chunk_a), len(chunk_b))
        # Pad whichever side is shorter; the longer side gets zero fillers.
        padded_a.extend(chunk_a + [fill] * (width - len(chunk_a)))
        padded_b.extend(chunk_b + [fill] * (width - len(chunk_b)))
    assert len(padded_a) == len(padded_b)
    return padded_a, padded_b

In [273]:
from itertools import zip_longest


def html_sidebyside(a, b):
    """Render two lists of HTML fragments as a two-column CSS grid.

    ``a`` supplies the left column and ``b`` the right column, one grid row
    per pair; if the lists differ in length the shorter one is padded with
    empty strings. Leading whitespace of a fragment is mirrored as ``&nbsp;``
    entities so the indentation stays visible in the rendered HTML. A row in
    which either side contains ``<span`` (i.e. carries diff markup) gets a
    coloured background on both cells.

    Args:
        a: Fragments for the left column (already HTML-safe).
        b: Fragments for the right column (already HTML-safe).

    Returns:
        The complete grid as one HTML string, including the closing ``</div>``.
    """
    parts = [
        # Set the panel display
        '<div style="display: grid;grid-template-columns: 1fr 1fr;grid-column-gap: 70px;grid-row-gap: 1px;">',
        # There's some CSS in Jupyter notebooks that makes the first pair
        # unalign. This is a workaround
        '<p></p><p></p>',
    ]
    for left, right in zip_longest(a, b, fillvalue=''):
        left = (len(left) - len(left.lstrip())) * "&nbsp;" + left
        right = (len(right) - len(right.lstrip())) * "&nbsp;" + right
        if "<span" in left or "<span" in right:
            parts.append(f'<div style="background: #ffcbbd;">{left}</div>')
            parts.append(f'<div style="background: #c8f0da;">{right}</div>')
        else:
            parts.append(f'<div>{left}</div>')
            parts.append(f'<div>{right}</div>')
    # Close the grid container opened above so the markup is well-formed.
    parts.append('</div>')
    return ''.join(parts)

In [274]:
import html


def html_diffs(a, b):
    """Build a side-by-side HTML diff of the texts ``a`` and ``b``.

    Both texts are split into lines and the lines aligned with ``align_seqs``.
    Each aligned pair of lines is tokenized, the tokens are HTML-escaped, and
    the differing tokens are wrapped in highlighting markup by
    ``markup_diff``. The resulting lines are laid out by ``html_sidebyside``.

    Args:
        a: The original ("before") text, as plain (unescaped) text.
        b: The modified ("after") text, as plain (unescaped) text.

    Returns:
        An HTML string showing ``a`` on the left and ``b`` on the right.
    """
    out_a, out_b = [], []
    for sent_a, sent_b in zip(*align_seqs(sentencize(a), sentencize(b))):
        # Escape per token, after tokenizing: the tokenizer then sees the real
        # text rather than entities such as "&amp;", and highlighting markup
        # can only be placed around whole escaped tokens, never inside an
        # entity.
        tokens_a = [html.escape(tok) for tok in tokenize(sent_a)]
        tokens_b = [html.escape(tok) for tok in tokenize(sent_b)]
        mark_a, mark_b = markup_diff(tokens_a, tokens_b)
        out_a.append(untokenize(mark_a))
        out_b.append(untokenize(mark_b))

    return html_sidebyside(out_a, out_b)

In [275]:
from IPython.display import HTML, display

# Diff of the complete texts (currently disabled):
# html_string = html_diffs(before_txt, after_txt)
# Only the first 30000 characters of each text are diffed here.
html_string = html_diffs(before_txt[:30000], after_txt[:30000])

# save to file
# NOTE(review): written with the platform default encoding -- confirm that
# this matches how the HTML file is later opened.
with open("html_file.html", "w") as html_file:
    html_file.write(html_string)

# show also here
# Render the same HTML inline via IPython's display machinery.
display(HTML(html_string))