In [1]:
#@title Install Dependencies and Download Models

!pip install transformers datasets

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import LogitsProcessor, set_seed
import numpy as np
import datasets

# The tokenizer and the language model are both loaded from the same
# pretrained checkpoint, so the name is spelled out only once.
GPT2_CHECKPOINT = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m9

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on a runtime without a GPU instead of
# raising when the model is moved.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Switch to evaluation mode: the model is only used for generation here.
model = model.eval()

In [3]:
# @title Download lab files

import sys

![ ! -d 'llm_lab' ] && git clone https://github.com/ethz-privsec/llm_lab.git
%cd llm_lab
!git pull https://github.com/ethz-privsec/llm_lab.git
%cd ..
if "llm_lab" not in sys.path:
  sys.path.append("llm_lab")

Cloning into 'llm_lab'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 31 (delta 11), reused 26 (delta 6), pack-reused 0[K
Unpacking objects: 100% (31/31), 151.95 KiB | 4.60 MiB/s, done.
/content/llm_lab
From https://github.com/ethz-privsec/llm_lab
 * branch            HEAD       -> FETCH_HEAD
Already up to date.
/content


In [4]:
# @title Example of how to generate 100 tokens of text without watermarking

from llm_lab.gpt_generate import generate_with_seed, gen_red_list

# Baseline: sample a continuation of the prompt with no logits processor,
# i.e. without any watermark, using the fixed seed.
prompt = "Boston is one of the oldest municipalities in America,"
unwatermarked_text = generate_with_seed(model, tokenizer, prompt, seed=42)
print(unwatermarked_text)

Boston is one of the oldest municipalities in America, but it's also among those with a history that dates back to at least 1857. The city has been home for most Aryan settlers from California until they settled here around 1787 when Columbus' New World arrived on Endurance Island and secured their land close by (the site was known as "Ferry Rock"). In other words: Kimball County doesn't exist right now — just be sure you're aware of its existence…
 [ Read more ] Thom Ann / Fox News $64.99


You will now implement three different watermarking schemes:
1. A simple scheme that never outputs the letter 'e' (lowercase or uppercase)
2. A red-list scheme, that generates a random list of banned tokens for each token generation.
3. A soft red-list scheme, that also generates a random red-list, but just biases the LLM against these tokens instead of outright banning them, by subtracting the value `logit_offset=2` from the logits of each red-listed token.

You should implement each of these schemes as a `LogitsProcessor` class.

For the red-list schemes, you should use `gpt_generate.gen_red_list` to generate a red list containing 50% of the LLM's vocabulary.
The seed for generating the pseudorandom red list is computed from the previous token processed by the model.

So for example, if the model has so far processed the string "my name is " (which tokenizes as `[1820, 1438, 318, 220]`), then the red list for the next token to be generated is `gen_red_list(torch.LongTensor([220]), model.config.vocab_size) = [43383,  7006, 40846, ...]`.

In [5]:
#@title Exercise 4.1: Implement a trivial watermarking scheme that samples text without any 'e' (lowercase or uppercase)

class NoEsLogitsProcessor(LogitsProcessor):
    """Trivial watermark: never let the model emit a token containing 'e' or 'E'.

    At construction time every token id in ``range(model.config.vocab_size)``
    is decoded with the notebook-level ``tokenizer``; ids whose decoded text
    contains the letter (either case) are collected in ``self.idx``. On each
    call the logits of those ids are set to ``-inf``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Ids of all tokens whose decoded text contains 'e' or 'E'.
        self.idx = [
            token_id
            for token_id in range(model.config.vocab_size)
            if 'e' in tokenizer.decode([token_id]) or 'E' in tokenizer.decode([token_id])
        ]

        # Index tensor built once here, so that __call__ does not have to
        # convert the Python list into a tensor on every generation step.
        self._banned_ids = torch.tensor(self.idx, dtype=torch.long)

    def __call__(self, input_ids, scores):
        """
        Processes the output scores of the LLM before generating the next token.
        Args:
            input_ids: torch.LongTensor of shape (batch_size, sequence_length) — Indices of input sequence tokens in the vocabulary.
            scores: torch.FloatTensor of shape (batch_size, model.config.vocab_size) — Logits for the next token to be generated.
        Returns: torch.FloatTensor of shape (batch_size, model.config.vocab_size) — The processed logits.
            This is the same ``scores`` tensor, modified in place.
        """
        # Move the cached index tensor to the device of the logits the first
        # time it is needed there, then keep reusing it.
        if self._banned_ids.device != scores.device:
            self._banned_ids = self._banned_ids.to(scores.device)

        # Banned tokens get probability zero for every sequence in the batch.
        scores[:, self._banned_ids] = float('-inf')

        return scores


# Build the processor once; it is reused later for the batch generations.
no_e_processor = NoEsLogitsProcessor()

# The prompt itself contains no letter 'e'.
prompt = "Anton Vowl is missing. Ransacking his Paris flat, a group of his faithful companions trawl through his diary for any hint as to his location and, insidiously, a ghost, from Vowl's past starts to cast its malignant shadow.\n "
output = generate_with_seed(
    model,
    tokenizer,
    prompt,
    logits_processor=no_e_processor,
    seed=42,
)
print(output)

Anton Vowl is missing. Ransacking his Paris flat, a group of his faithful companions trawl through his diary for any hint as to his location and, insidiously, a ghost, from Vowl's past starts to cast its malignant shadow.
  In fact that protagonist who was in such agony at first sight looks nothing but an ill-lucard son on all fours — sadistic bastard? A coward (and possibly also unkind) *cough* Mr N'Vow! Finally coming into contact with both Jonsonakos ("Christina Tyngrav") whom will do anything Ali affords him if it suits us; or two kilograms wolfish right off B1226 "Bitch" Gokai: about


In [21]:
#@title Exercise 4.2: Implement a red-list watermarking scheme

class RedListLogitsProcessor(LogitsProcessor):
    """Hard red-list watermark.

    For every sequence in the batch, a red list is generated with
    ``gen_red_list`` from the last token of that sequence, and the logits of
    all red-listed tokens are set to ``-inf``.

    Args:
        red_frac: value forwarded to ``gen_red_list`` as ``frac_red``
            (the fraction of the vocabulary to red-list).
    """

    def __init__(self, red_frac=0.5, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.red_frac = red_frac

    def __call__(self, input_ids, scores):
        """
        Processes the output scores of the LLM before generating the next token.
        Args:
            input_ids: torch.LongTensor of shape (batch_size, sequence_length) — Indices of input sequence tokens in the vocabulary.
            scores: torch.FloatTensor of shape (batch_size, model.config.vocab_size) — Logits for the next token to be generated.
        Returns: torch.FloatTensor of shape (batch_size, model.config.vocab_size) — The processed logits.
        """
        batch_size = scores.shape[0]
        for row in range(batch_size):
            # The red list is seeded by the most recent token of this sequence,
            # passed as a one-element CPU LongTensor.
            prev_token = torch.LongTensor([input_ids[row, -1].detach().cpu()])
            banned = gen_red_list(prev_token, model.config.vocab_size, frac_red=self.red_frac)
            # Red-listed tokens can never be sampled for this row.
            scores[row, banned] = float('-inf')

        return scores

# Default settings: red_frac=0.5. The instance is reused for the batch generations.
red_list_processor = RedListLogitsProcessor()

prompt = "Boston is one of the oldest municipalities in America,"
output = generate_with_seed(
    model,
    tokenizer,
    prompt,
    logits_processor=red_list_processor,
    seed=42,
)
print(output)

Boston is one of the oldest municipalities in America, but it's not a spiritual successor. It had been created after former Premier Harry Marnelli drew up legislation on behalf (in his first term as mayor) and appointed two new members: Daniel Blakeley Ryves from Endurance Township -- who served five terms earlier this year under current Mayor Philip Ayroyama — Michael Rieckenham Jr.. The township voted for Yang last week when an election was held against state ballot measures to dissolve public school systems because they have problems or defects with charter dogs


In [22]:
#@title Exercise 4.3: Implement a soft red-list watermarking scheme

class SoftRedListLogitsProcessor(LogitsProcessor):
    """Soft red-list watermark.

    For every sequence in the batch, a red list is generated with
    ``gen_red_list`` from the last token of that sequence. Red-listed tokens
    are not banned; their logits are lowered by ``logit_offset``.

    Args:
        red_frac: value forwarded to ``gen_red_list`` as ``frac_red``
            (the fraction of the vocabulary to red-list).
        logit_offset: amount subtracted from the logit of each red-listed token.
    """

    def __init__(self, red_frac=0.5, logit_offset=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.red_frac = red_frac
        self.logit_offset = logit_offset

    def __call__(self, input_ids, scores):
        """
        Processes the output scores of the LLM before generating the next token.
        Args:
            input_ids: torch.LongTensor of shape (batch_size, sequence_length) — Indices of input sequence tokens in the vocabulary.
            scores: torch.FloatTensor of shape (batch_size, model.config.vocab_size) — Logits for the next token to be generated.
        Returns: torch.FloatTensor of shape (batch_size, model.config.vocab_size) — The processed logits.
        """
        batch_size = scores.shape[0]
        for row in range(batch_size):
            # The red list is seeded by the most recent token of this sequence,
            # passed as a one-element CPU LongTensor.
            prev_token = torch.LongTensor([input_ids[row, -1].detach().cpu()])
            penalised = gen_red_list(prev_token, model.config.vocab_size, frac_red=self.red_frac)
            # Bias against red-listed tokens instead of forbidding them.
            scores[row, penalised] -= self.logit_offset

        return scores

# Default settings: red_frac=0.5, logit_offset=2.0. Reused for the batch generations.
soft_red_list_processor = SoftRedListLogitsProcessor()

prompt = "Boston is one of the oldest municipalities in America,"
output = generate_with_seed(
    model,
    tokenizer,
    prompt,
    logits_processor=soft_red_list_processor,
    seed=42,
)
print(output)

Boston is one of the oldest municipalities in America, but it's not a spiritual successor. It had been created after former Premier Harry Mudd and his family fled Sudan on Aug 13 2001 following atrocities by rebels there which left around 700 people dead including children under 18 years old who died at an open fire triggered earlier this year when Russian forces attacked rebel-controlled areas along military lines near Benghazi.
The elected members are from Sikh communities (with four more than LDS leaders) working alongside African Christians seeking local leadership to support both religious groups as well As for


Okay! We're now ready to start generating watermarked text.
We give you 20 prompts in `data/watermark_prompts.txt`.
For each of these, generate 100 more tokens using each of the three watermarking schemes, and save all of this as a numpy array for submission.

MAKE SURE TO USE `seed=42` FOR ALL YOUR GENERATIONS.

In [23]:
from tqdm import trange

seed = 42  # DON'T CHANGE THIS!!!

# Every generation produces exactly this many new tokens (min == max).
min_new_tokens = 100
max_new_tokens = min_new_tokens

with open('llm_lab/data/watermark_prompts.txt') as prompt_file:
  prompts = prompt_file.read().splitlines()

# Order matters: for each prompt the outputs are stored as
# (no-e, red-list, soft red-list).
processors = [no_e_processor, red_list_processor, soft_red_list_processor]
outputs = []

for prompt_idx in trange(20):
  for processor in processors[:3]:
    outputs.append(
        generate_with_seed(model, tokenizer, prompts[prompt_idx], logits_processor=processor,
                           min_new_tokens=min_new_tokens, max_new_tokens=max_new_tokens, seed=seed)
    )

print(outputs)
np.save("Q4_gens.npy", outputs)

100%|██████████| 20/20 [01:24<00:00,  4.24s/it]






For the final part of this question, you now have to try and detect watermarked text.
We give you 80 pieces of text in `data/watermarked_gens.npy`.
For each piece of text you have to guess whether it was generated with:

1.   No watermark
2.   The dummy "no E's" watermark
3.   The red-list watermark
4.   The soft red-list watermark

We use the same `generate_with_seed` and `gen_red_list` implementations as you. Our red-list watermarking scheme also uses the same parameters (i.e., 50% of the tokens are red-listed, and for the soft version we subtract 2.0 from the logits).

Exactly 20 of the 80 texts are generated with each of the 4 options above. Each text is comprised of a short prompt, followed by 100-200 generated tokens.

Store your guesses (1,2,3,4) for each piece of text in a numpy array.

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code; only load this file from a trusted source.
outputs_secret = np.load("llm_lab/data/watermarked_gens.npy", allow_pickle=True)
assert len(outputs_secret) == 80

# NOTE(review): placeholder guesses. Labels 1..4 are assigned in fixed runs of
# 20 by position; nothing here inspects the texts in outputs_secret.
my_guesses = [label for label in (1, 2, 3, 4) for _ in range(20)]

assert len(my_guesses) == 80
np.save("Q4_guesses.npy", np.asarray(my_guesses))