In [1]:
import torch
import numpy as np
import os
import json
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
from typing import Any

In [2]:
# Environment variable read by the Hugging Face `tokenizers` backend; setting it
# explicitly to "true" opts in to parallel tokenization for this kernel process.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [3]:
root_data_path = "../data"
output_data_path = "../data/output"

def get_prompts(data_dir=None):
    """Read the prompt file and split it into individual prompts.

    Args:
        data_dir: Directory containing ``prompts_subset.txt``. Defaults to the
            module-level ``root_data_path`` when omitted.

    Returns:
        list[str]: The file content split on the separator line ``===``
        (i.e. on the exact string ``"\\n===\\n"``). Entries are returned as-is;
        no stripping or filtering of empty entries is performed.
    """
    if data_dir is None:
        data_dir = root_data_path
    prompt_file = os.path.join(data_dir, "prompts_subset.txt")
    # Pin the encoding: with errors="ignore" a wrong platform default encoding
    # would silently drop or mangle characters instead of raising.
    with open(prompt_file, "r", encoding="utf-8", errors="ignore") as f:
        return f.read().split("\n===\n")

# Load all prompts once; later cells iterate over `prompt_list` in batches.
prompt_list = get_prompts()
# Sanity check: show at most the first 500 characters of the first prompt.
print(prompt_list[0][:500])

The Mapes family of Effingham enjoy the Lincoln Park Zoo in Chicago with their children including their adopted children, Regino and Regina, who were born in the Philippines.
Misty Mapes and her husband, Patrick, of Effingham always had a desire to add to their family through adoption.
That dream became a reality in part due to Gift of Adoption Fund, a nonprofit organization that provides financial support to families that need help to pay for the hefty cost of adopting a child.
The Mapes, who h


In [4]:
# Model identifier passed to `from_pretrained` below and recorded in the output
# configuration. To try another model, comment this line out and uncomment
# exactly one of the alternatives.
model_name = "facebook/opt-125m"
# model_name = "google/gemma-3-270m"
# model_name = "openai-community/gpt2"
# model_name = "EleutherAI/gpt-neo-125m"

# Get pytorch device
def get_torch_device(force_cpu: bool = False):
    """Choose the torch device to run on.

    Preference order is CUDA (first GPU), then Apple MPS, then CPU.

    Args:
        force_cpu: When True, skip accelerator detection and return the CPU device.

    Returns:
        torch.device: ``cuda:0``, ``mps`` or ``cpu``.
    """
    if not force_cpu:
        if torch.cuda.is_available():
            return torch.device("cuda:0")
        if torch.backends.mps.is_available():
            return torch.device("mps")
    # Either CPU was requested explicitly or no accelerator was detected.
    return torch.device("cpu")

# Cap the number of intra-op CPU threads torch may use.
torch.set_num_threads(8)

# CPU is forced here even when an accelerator is available.
device = get_torch_device(force_cpu=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# The vocabulary size is the number of rows of the output embedding matrix,
# i.e. the width of the logits the model produces.
vocab_size = model.get_output_embeddings().weight.size(0)
print(
    f"There are {vocab_size} many words in vocabulary",
    f"The model {model_name} is loaded on device: {device}",
    sep="\n",
)

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

There are 50272 many words in vocabulary
The model facebook/opt-125m is loaded on device: cpu


model.safetensors:   0%|          | 0.00/251M [00:00<?, ?B/s]

In [5]:
# generate llm text without watermarking
def unwatermarked_token_generation(probs, counter, vocab_size, seed = 1234):
    """Draw the next token by plain multinomial sampling (no watermark).

    Args:
        probs: Next-token probabilities; the caller in this notebook passes a
            tensor of shape (1, vocab_size).
        counter: Position index; added to ``seed`` so each position uses its
            own reproducible random stream.
        vocab_size: Unused; accepted so all token generation functions share
            the same signature.
        seed: Base seed of the random stream.

    Returns:
        torch.Tensor: The sampled token index, one sample per row of ``probs``.
    """
    # The generator lives on the same device as `probs`, as multinomial requires.
    rng = torch.Generator(device=probs.device).manual_seed(seed + counter)
    return torch.multinomial(probs, num_samples=1, generator=rng)

In [6]:
############
# Gumbel watermarking
def gumbel_token_generation(probs: torch.Tensor, counter, vocab_size, seed=1234):
    """Pick the next token with the Gumbel-style watermark rule.

    One uniform variate per vocabulary entry is drawn from a stream seeded with
    ``seed + counter``; the token maximising ``log(u_k) / p_k`` is selected.

    Args:
        probs: Next-token probabilities; only row 0 is used (the caller in this
            notebook passes shape (1, vocab_size)).
        counter: Position index, added to ``seed``.
        vocab_size: Number of uniform variates to draw; must match the length
            of ``probs[0]``.
        seed: Base seed shared with the matching pivot statistic.

    Returns:
        torch.Tensor: Selected token index with shape (1, 1).
    """
    # The noise is always drawn on CPU so the stream does not depend on the
    # device, then moved next to `probs` for the element-wise division.
    rng = torch.Generator().manual_seed(seed + counter)
    uniforms = torch.rand(vocab_size, generator=rng).to(probs.device)
    scores = uniforms.log() / probs[0]
    return scores.argmax().view(-1, 1)

# pivot statistic
def pivot_statistic_gumbel_func(gen_tokens, vocab_size, seed=1234):
    """Compute the Gumbel pivot statistic for each generated token.

    For position ``t`` the uniform noise vector is regenerated from the seed
    ``seed + t`` and the statistic is ``-log(1 - u[token_t])``.

    Args:
        gen_tokens: Iterable of generated token indices, in generation order.
        vocab_size: Length of the noise vector to regenerate per position.
        seed: Seed used for position 0; must line up with the seed the
            generation step used for the first generated token.

    Returns:
        list[float]: One pivot value per token.
    """
    def _pivot(position, token):
        # Rebuild exactly the noise vector that was used at this position.
        rng = torch.Generator().manual_seed(seed + position)
        uniforms = torch.rand(vocab_size, generator=rng)
        return -torch.log(1 - uniforms[token]).item()

    return [_pivot(position, token) for position, token in enumerate(gen_tokens)]

In [7]:
######################
# Inverse Watermarking

# generate llm text with inverse watermarking
def inverse_token_generation(probs: torch.Tensor, counter, vocab_size, seed=1234):
    """Pick the next token with the inverse-transform watermark rule.

    A uniform variate ``u`` and a random permutation ``pi`` of the vocabulary
    are drawn from a stream seeded with ``seed + counter``. The probabilities
    are laid out so that token ``k`` sits at position ``pi[k]``, and the token
    whose cumulative-probability interval contains ``u`` is returned.

    Args:
        probs: Next-token probabilities; only row 0 is used (the caller in this
            notebook passes shape (1, vocab_size)).
        counter: Position index, added to ``seed``.
        vocab_size: Size of the permutation; must match the length of ``probs[0]``.
        seed: Base seed shared with the matching pivot statistic.

    Returns:
        torch.Tensor: Selected token index (CPU, int64) with shape (1, 1).
    """
    g = torch.Generator()
    g.manual_seed(seed + counter)
    unif_noise = torch.rand(1, generator=g)  # (1,)
    pi = torch.randperm(vocab_size, generator=g)  # random permutation (vocab_size, )
    inv_pi = torch.empty_like(pi)
    inv_pi[pi] = torch.arange(vocab_size)  # inv_pi[position] -> token at that position

    probs_shuffled = probs[0, inv_pi]  # probabilities in permuted order
    cdf = torch.cumsum(probs_shuffled, dim=0)  # (vocab_size,)
    # First position whose cumulative probability is >= u.
    index = torch.searchsorted(cdf, unif_noise.item(), right=False)

    # The cumulative sum can end slightly below 1 (float rounding, or probs
    # that do not sum to one); searchsorted then returns `vocab_size`, which is
    # out of range. Clamp to the last position. Converting to a Python int also
    # avoids indexing the CPU tensor `inv_pi` with an index on another device.
    position = min(int(index), vocab_size - 1)

    # Map the sampled position back to the original vocabulary index.
    return inv_pi[position].view(-1, 1)


def pivot_statistic_inverse_func(gen_tokens, vocab_size, seed=1234):
    """Compute the inverse-transform pivot statistic for each generated token.

    For position ``t`` the uniform variate ``u`` and permutation ``pi`` are
    regenerated from the seed ``seed + t`` (in the same draw order as the
    generation step). The statistic is ``1 - |pi[token_t] / (vocab_size - 1) - u|``.

    Args:
        gen_tokens: Iterable of generated token indices, in generation order.
        vocab_size: Size of the permutation to regenerate per position.
        seed: Seed used for position 0; must line up with the seed the
            generation step used for the first generated token.

    Returns:
        list: One pivot value per token, each in [0, 1].
    """
    # pi[token] ranges over 0..vocab_size-1, so dividing by this maps it to [0, 1].
    max_rank = vocab_size - 1

    def _pivot(position, token):
        rng = torch.Generator().manual_seed(seed + position)
        uniform = torch.rand(1, generator=rng)  # drawn first, as in generation
        permutation = torch.randperm(vocab_size, generator=rng)
        gap = (permutation[token] / max_rank - uniform).item()
        # Original author's note: taking 1 - |gap| is meant to make the mean
        # small under H0 (no watermark).
        return 1 - np.abs(gap)

    return [_pivot(position, token) for position, token in enumerate(gen_tokens)]

In [8]:
# some utility functions for generating llm texts
def generate_llm_tokens(
    prompts: list[str],
    tokenizer,  # usually AutoTokenizer
    model,  # usually AutoModelForCausalLM
    token_generation_func: Any,  # a token generation function, or a dict <start_index>:<token_gen_func>, see below.
    verbose=False,
    prompt_tokens=50,  # take the first 50 tokens of prompt as input
    out_tokens=50,  # output next 50 tokens
    vocab_size=None,
    batch_size = 8,
    max_token_input_length = 256
):
    """Generate ``out_tokens`` tokens for a batch of prompts with a pluggable sampler.

    Args:
        prompts: Prompt strings. At most the first ``batch_size`` are used; fewer
            than ``batch_size`` prompts (e.g. a short final batch) is allowed.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and providing ``batch_decode``.
        model: Causal LM; called as ``model(input_ids)`` /
            ``model(input_ids, past_key_values=...)`` and expected to return an
            object with ``logits`` and ``past_key_values``.
        token_generation_func: Either one sampler
            ``f(probs=, counter=, vocab_size=) -> tensor`` or a schedule dict
            ``{start_index: sampler}`` (keys may be ints or numeric strings).
            At output position ``t`` the sampler with the largest
            ``start_index <= t`` is used, which allows switching watermarking
            schemes mid-sequence, e.g. ``{"0": f1, "100": f2, "200": f3}``.
        verbose: Show a tqdm progress bar over output positions.
        prompt_tokens: Number of leading prompt tokens kept as model input. It
            is also the offset added to the position passed to the sampler as
            ``counter``.
        out_tokens: Number of tokens to generate.
        vocab_size: Vocabulary size passed to the sampler; taken from the
            model's output embedding when None or negative.
        batch_size: Maximum number of prompts processed by this call.
        max_token_input_length: Maximum number of trailing tokens kept in the
            running model input.

    Returns:
        list[dict]: One dict per processed prompt with keys ``"prompt"``
        (decoded truncated prompt), ``"gen_tokens"`` (list of generated token
        ids) and ``"output"`` (decoded prompt + generation).

    Raises:
        ValueError: If a schedule dict has no sampler applicable to some
            output position (e.g. it is empty or lacks a ``0`` start).
    """
    if vocab_size is None or vocab_size < 0:
        vocab_size = model.get_output_embeddings().weight.shape[0]

    # Normalise the sampler argument into (start_index, sampler) pairs sorted by
    # descending start, so the first pair with start <= position is the active one.
    if isinstance(token_generation_func, dict):
        schedule = sorted(
            ((int(start), func) for start, func in token_generation_func.items()),
            key=lambda item: item[0],
            reverse=True,
        )
    else:
        schedule = [(0, token_generation_func)]

    batch_prompts = prompts[:batch_size]
    if len(batch_prompts) == 0:
        return []

    # NOTE(review): only `input_ids` is used; the tokenizer's attention mask is
    # not forwarded to the model, so padding tokens of shorter prompts are
    # processed like ordinary tokens — TODO confirm this is acceptable.
    tokens = tokenizer(
        batch_prompts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )
    torch_prompt = tokens['input_ids'][:, :prompt_tokens]
    inputs = torch_prompt.to(model.device)
    inputs_to_decode = inputs
    # Use the real number of rows, which may be smaller than `batch_size`.
    n_rows = inputs.shape[0]
    counter_range = tqdm(range(out_tokens)) if verbose else range(out_tokens)

    gen_tokens = []  # one list of n_rows token ids per output position
    past = None
    for counter in counter_range:
        with torch.no_grad():
            if past:
                # With a key/value cache only the newest token has to be fed.
                output = model(inputs[:,-1:], past_key_values = past)
            else:
                output = model(inputs)
        probs = torch.nn.functional.softmax(output.logits[:, -1, :], dim = 1)  # next-token distribution per row
        past = output.past_key_values

        # Pick the sampler that is active at this output position.
        token_gen_func = next((func for start, func in schedule if start <= counter), None)
        if token_gen_func is None:
            raise ValueError(
                f"No token generation function is scheduled for output position {counter}; "
                "the schedule needs an entry starting at 0."
            )

        # The sampler is applied row by row; every row at a given position gets
        # the same `counter`, offset by the prompt length.
        step_tokens = [
            int(
                token_gen_func(
                    probs = probs[row, :].view(1, -1),
                    counter = counter + prompt_tokens,
                    vocab_size = vocab_size
                ).item()
            )
            for row in range(n_rows)
        ]
        gen_tokens.append(step_tokens)

        new_column = torch.tensor(step_tokens, dtype = inputs.dtype, device = model.device).view(-1, 1)  # (n_rows, 1)
        inputs = torch.concat((inputs, new_column), dim = 1)
        inputs_to_decode = torch.concat((inputs_to_decode, new_column), dim = 1)  # full, never-truncated sequence

        # Keep only the trailing window of the running input. When the cache
        # branch above is taken only the last column of `inputs` is fed.
        if inputs.shape[1] > max_token_input_length:
            inputs = inputs[:, -max_token_input_length:]

    # at the end, produce the decoded text
    out_text_list = tokenizer.batch_decode(inputs_to_decode)
    input_text_list = tokenizer.batch_decode(torch_prompt)
    return [{
        "prompt": input_text_list[row],
        "gen_tokens": [step[row] for step in gen_tokens],
        "output": out_text_list[row]
    } for row in range(n_rows)]

In [13]:
# generate the data for a specific configuration
# --- sequence lengths ---
prompt_tokens = 50    # leading prompt tokens fed to the model
output_tokens = 500   # tokens generated per prompt

# --- batching ---
batch_size = 8
max_token_input_length = 256

# --- output ---
output_filename = "data_inverse_n500_facebook_opt125m.json"

# --- watermark schedule ---
# Maps the output position (as a string key) at which a sampler becomes active
# to that sampler; it stays active until the next listed position.
token_generation_func = {
    str(start_index): sampler
    for start_index, sampler in (
        (0, unwatermarked_token_generation),
        (100, inverse_token_generation),
        (200, unwatermarked_token_generation),
        (400, inverse_token_generation),
        (450, unwatermarked_token_generation),
    )
}

# Pivot statistic matching the watermark used above; set to None to skip
# computing pivots.
pivot_func = pivot_statistic_inverse_func
# pivot_func = None

In [14]:
# Run the main simulation loop
# The generation step seeds position t with 1234 + prompt_tokens + t, so the
# pivot statistic for the first generated token must start from this seed.
pivot_seed = 1234 + prompt_tokens
# `batch_size` and `max_token_input_length` are taken from the configuration
# cell; they are deliberately not redefined here so the two cannot drift apart.

# Describe which sampler is active on which half-open range [start, end) of
# output positions. A plain (non-dict) sampler covers the whole range.
if isinstance(token_generation_func, dict):
    sampler_by_start = {int(start): func for start, func in token_generation_func.items()}
else:
    sampler_by_start = {0: token_generation_func}
change_points = sorted(sampler_by_start)
intervals = [
    # The interval type is the sampler name up to its first underscore,
    # e.g. "inverse" or "unwatermarked".
    (start, end, sampler_by_start[start].__name__.split('_')[0])
    for start, end in zip(change_points, change_points[1:] + [output_tokens])
]

data_out_conf = {
    "model_name": model_name,
    "intervals": intervals,
    "prompt_tokens": prompt_tokens,
    "out_tokens": output_tokens,
    "vocab_size": vocab_size
}

# Create the output directory up front so a missing folder is not discovered
# only after the first (expensive) batch has been generated.
os.makedirs(output_data_path, exist_ok=True)
output_file = os.path.join(output_data_path, output_filename)

response_list = []

def save_results():
    """Write configuration and all responses collected so far to the output file."""
    # Write to a temporary file and rename, so an interrupted run never leaves
    # a truncated JSON file behind.
    tmp_file = output_file + ".tmp"
    with open(tmp_file, "w") as f:
        json.dump({"configuration": data_out_conf, "data": response_list}, f)
    os.replace(tmp_file, output_file)

for i in tqdm(range(0, len(prompt_list), batch_size), desc="Processing batches"):
    prompt_batch = prompt_list[i:(i+batch_size)]
    response = generate_llm_tokens(
        prompt_batch,
        tokenizer,
        model,
        token_generation_func=token_generation_func,
        verbose=False,
        out_tokens=output_tokens,
        prompt_tokens=prompt_tokens,
        vocab_size=vocab_size,
        # Pass the real batch length: keeps the configured batch size in effect
        # and handles a shorter final batch.
        batch_size=len(prompt_batch),
        max_token_input_length=max_token_input_length
    )
    if pivot_func is not None:
        # calculate pivot function as well
        for item in response:
            item["pivots"] = pivot_func(item["gen_tokens"], seed = pivot_seed, vocab_size = vocab_size)
    response_list.extend(response)

    # checkpoint after every batch
    save_results()

# save it at last as well (also covers the case of an empty prompt list)
save_results()

Processing batches:   0%|          | 0/25 [00:00<?, ?it/s]