In [1]:
import difflib
import html
import json
import os

from IPython.display import HTML, display

In [2]:
def read_json(fpath: str):
    """Load and return the JSON document stored at ``fpath``.

    Args:
        fpath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding. The ``with`` block closes the file,
    # so no manual ``close()`` is needed.
    with open(fpath, "r", encoding="utf-8") as f:
        return json.load(f)

# Directory containing the generated outputs; the three result files below are
# loaded from it in the order uwm, gumbel, inverse.
root_output_path = "../data/output_v2"
uwm_data, gumbel_data, inverse_data = (
    read_json(os.path.join(root_output_path, result_file))
    for result_file in (
        "data_uwm_n200.json",
        "data_gumbel_n200.json",
        "data_inverse_n200.json",
    )
)

In [3]:
# view text differences
def view_text_difference(text1: str, text2: str, text1_title: str = "Without Watermarking", text2_title: str = "With Watermarking"):
    """Display two texts side by side with their word-level differences coloured.

    Both texts are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. Words shared by both texts are shown unstyled;
    words that were replaced, deleted (left column only) or inserted (right
    column only) are wrapped in a coloured ``<span>``. Every word is
    HTML-escaped before being placed in the table, so characters such as ``<``
    or ``&`` in the texts are shown literally instead of being parsed as markup.

    Args:
        text1: Text shown in the left column.
        text2: Text shown in the right column.
        text1_title: Header of the left column. Inserted into the HTML as-is.
        text2_title: Header of the right column. Inserted into the HTML as-is.

    Returns:
        None. The comparison table is rendered through ``IPython.display``.
    """
    # Tokenize by whitespace
    tokens1 = text1.split()
    tokens2 = text2.split()

    # autojunk=False: the default heuristic treats frequently occurring items
    # as junk once a sequence reaches 200 elements, which would stop common
    # words from anchoring matches in longer texts.
    matcher = difflib.SequenceMatcher(None, tokens1, tokens2, autojunk=False)

    left_parts = []
    right_parts = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            left_parts.extend(html.escape(tok) for tok in tokens1[i1:i2])
            right_parts.extend(html.escape(tok) for tok in tokens2[j1:j2])
        elif tag == 'replace':
            left_parts.extend(_colored_spans(tokens1[i1:i2], "#fbb"))
            right_parts.extend(_colored_spans(tokens2[j1:j2], "#bfb"))
        elif tag == 'delete':
            left_parts.extend(_colored_spans(tokens1[i1:i2], "#fdd"))
        elif tag == 'insert':
            right_parts.extend(_colored_spans(tokens2[j1:j2], "#dfd"))

    # Join with spaces
    html1 = " ".join(left_parts)
    html2 = " ".join(right_parts)

    # Display side by side
    display(HTML(f"""
    <table style="width:100%; table-layout: fixed; border-collapse: collapse;">
    <tr>
        <th style="border:1px solid black; padding:5px;">{text1_title}</th>
        <th style="border:1px solid black; padding:5px;">{text2_title}</th>
    </tr>
    <tr>
        <td style="border:1px solid black; padding:5px; white-space:pre-wrap; word-wrap:break-word;">{html1}</td>
        <td style="border:1px solid black; padding:5px; white-space:pre-wrap; word-wrap:break-word;">{html2}</td>
    </tr>
    </table>
    """))


def _colored_spans(tokens, color):
    """Wrap each token, HTML-escaped, in a ``<span>`` with the given text colour."""
    return [f"<span style='color:{color};'> {html.escape(tok)} </span>" for tok in tokens]

In [4]:
# Compare the uwm and gumbel generations for the same sample index.
sample_index = 50

text1, text2 = (
    run['data'][sample_index]['output'] for run in (uwm_data, gumbel_data)
)

view_text_difference(text1, text2)

Without Watermarking,With Watermarking
"A live events and bespoke creative technical solutions company based in Loughborough is seeking a technical project manager to join their growing production team. You may currently be working be working as an audio-visual/AV project manager or technical production manager for distro Blender Machines on Involvement. See this for more details. We have a full range of teams and an extensive training base under one roof, featuring both audio and audio-visual teams. Our team is that skill level to support the project management process and can support individuals with technical technical skills. The framework used by Blender Machines is be it an FineMusic system, a presentation package, a client portal, a web-native project workflow (not adding all of your customization options on demand), a bespoke presentation package or a web-native project workflow. What's an important thing to know about YouTube this sort of business, as the majority of Black users like videos for their entertainment content, and the chances of this happening elsewhere is frequently limited, so I would be prepared to chip in massive sums of money for exposure, an important part of why the company's growth is about 10-20% a year as of late 2017. I am looking for someone to","A live events and bespoke creative technical solutions company based in Loughborough is seeking a technical project manager to join their growing production team. You may currently be working be working as an audio-visual/AV project manager or technical production manager for distro Blender Machines on Involvement. See this for more details. We have a full range of teams and an extensive training base under one roof, featuring both audio and audio-visual teams. Our team is that skill level to be right out at the take and far more technical to deliver the best possible results. Tom just arrived as a project manager in 2016, so we're suggesting you take a look at our portfolio portfolio. 
We've got so many quality people on site to come in and help us with this. Do you work for us? If so, who do you work with and what would you like to see around us? If so, we're open to business! We're a tiny firm with high standards but will promise our clients maximum results today! Contact: fredgemann@paxdeal.messaging.com Careers & Benefits The Whipple work with image makers is at a despondent stage as everyone prefers images produced"


In [38]:
def view_text_highlight(sample_index: int, fname: str, model_name: str = "facebook/opt-125m", segments=None):
    """Render one generated sample token by token with highlighted index ranges.

    The sample's ``output`` text is read from ``fname`` (resolved under
    ``root_output_path``), passed through ``ftfy.fix_text`` and tokenized with
    the ``model_name`` tokenizer. Each token whose position falls inside one of
    ``segments`` is drawn on that segment's background colour; tokens that fall
    outside every segment are not rendered. A legend listing the labels that
    were actually used is shown above the text.

    Args:
        sample_index: Index into the file's ``data`` list.
        fname: JSON file name, joined onto ``root_output_path``.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        segments: Iterable of ``((start, end), color, label)`` entries. Token
            positions are matched against the half-open range ``[start, end)``
            and the first matching entry wins. Positions index the tokenizer
            output, so any special token the tokenizer adds occupies a
            position too. Defaults to the four built-in ranges below.

    Returns:
        None. The highlighted text is rendered through ``IPython.display``.
    """
    # Imported here rather than at the top of the notebook so the heavy
    # dependencies are only needed when this view is actually used.
    from transformers import AutoTokenizer
    import ftfy

    if segments is None:
        segments = [
            # ((0, 50), "#d0e1ff", "Input"),
            ((0, 50), "#e0e0e0", "Unwatermarked"),
            ((50, 70), "#e0e0e0", "Unwatermarked"),
            ((70, 100), "#ffd6d6", "Gumbel Watermark"),
            ((100, 150), "#e0e0e0", "Unwatermarked"),
        ]
    # Materialize once: the segments are scanned again for every token.
    segments = list(segments)

    dat = read_json(os.path.join(root_output_path, fname))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_text = ftfy.fix_text(dat['data'][sample_index]['output'])
    encoding = tokenizer(raw_text, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])

    # Turn each vocabulary token back into display text, one token at a time.
    cleaned_tokens = []
    for token in tokens:
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
            text = tokenizer.decode([token_id], clean_up_tokenization_spaces=True)
        except KeyError:
            text = ""  # fallback if token not in vocab

        # Special sentence markers are blanked but keep their position.
        if text in ("</s>", "<s>"):
            text = ""

        cleaned_tokens.append(text)

    html_tokens = []
    legend = {}  # label -> colour, in order of first use

    for idx, tok in enumerate(cleaned_tokens):
        for (start, end), color, label in segments:
            if start <= idx < end:
                # Record the label of the segment that was actually applied,
                # so the legend reflects what is shown.
                legend.setdefault(label, color)
                html_tokens.append(
                    f"<span style='background:{color}; color:black;"
                    f"border-radius:3px; margin:1px;'>{html.escape(tok)}</span>"
                )
                break

    legend_html = "".join(
        f"<span style='background:{c}; color:black; padding:2px 5px; border-radius:3px; margin-right:8px;'>{html.escape(lbl)}</span>"
        for lbl, c in legend.items()
    )

    display(HTML(f"""
    <div style="background:white; padding:10px; font-family:monospace; line-height:1.6; font-size:14px;">
        <div style="margin-bottom:8px;">Legend: {legend_html}</div>
    {''.join(html_tokens)}
    </div>
    """))


# Highlight sample 15 of "data_uwm_n100.json"; this rebinds the notebook-level
# sample_index that an earlier cell set to 50.
# NOTE(review): the earlier cells load the *_n200 files, while this call reads
# an n100 file, and view_text_highlight labels token positions 70-100 as
# "Gumbel Watermark" even though the file name says "uwm" — confirm that this
# file/label pairing is intended.
sample_index = 15
view_text_highlight(sample_index, fname = "data_uwm_n100.json")
