In [5]:
!pip install transformers
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [8]:
model = T5ForConditionalGeneration.from_pretrained('t5-large')
tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=1024)
# model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b")
# tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b")

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [9]:
text = """Potential harms of large language models can be mitigated
by watermarking model output, i.e., embedding signals into
generated text that are invisible to humans but algorithmically detectable from a short span of tokens. We propose a
watermarking framework for proprietary language models.
The watermark can be embedded with negligible impact
on text quality, and can be detected using an efficient opensource algorithm without access to the language model API
or parameters. The watermark works by selecting a randomized set of “green” tokens before a word is generated, and
then softly promoting use of green tokens during sampling."""
inputs = tokenizer(["summarize: " + text], return_tensors="pt")

In [11]:
summary = model.generate(**inputs)
print(tokenizer.decode(summary[0]))



<pad>watermarking language models can mitigate potential harms. watermarks can be embedded with


In [13]:
!git clone https://github.com/jwkirchenbauer/lm-watermarking.git

Cloning into 'lm-watermarking'...
remote: Enumerating objects: 313, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 313 (delta 10), reused 5 (delta 5), pack-reused 288[K
Receiving objects: 100% (313/313), 11.98 MiB | 8.21 MiB/s, done.
Resolving deltas: 100% (87/87), done.


In [12]:
import sys
sys.path.insert(0, "/content/lm-watermarking")

In [13]:
from watermark_processor import WatermarkDetector, WatermarkLogitsProcessor
from transformers import (LogitsProcessorList)

In [14]:
 input_text = (
        "The diamondback terrapin or simply terrapin (Malaclemys terrapin) is a "
        "species of turtle native to the brackish coastal tidal marshes of the "
        "Northeastern and southern United States, and in Bermuda.[6] It belongs "
        "to the monotypic genus Malaclemys. It has one of the largest ranges of "
        "all turtles in North America, stretching as far south as the Florida Keys "
        "and as far north as Cape Cod.[7] The name 'terrapin' is derived from the "
        "Algonquian word torope.[8] It applies to Malaclemys terrapin in both "
        "British English and American English. The name originally was used by "
        "early European settlers in North America to describe these brackish-water "
        "turtles that inhabited neither freshwater habitats nor the sea. It retains "
        "this primary meaning in American English.[8] In British English, however, "
        "other semi-aquatic turtle species, such as the red-eared slider, might "
        "also be called terrapins. The common name refers to the diamond pattern "
        "on top of its shell (carapace), but the overall pattern and coloration "
        "vary greatly. The shell is usually wider at the back than in the front, "
        "and from above it appears wedge-shaped. The shell coloring can vary "
        "from brown to grey, and its body color can be grey, brown, yellow, "
        "or white. All have a unique pattern of wiggly, black markings or spots "
        "on their body and head. The diamondback terrapin has large webbed "
        "feet.[9] The species is"
)

# output_text = "watermarking language models can mitigate potential harms. watermarks can be embedded with"

In [25]:
len(input_text)

1373

In [26]:
watermark_processor = WatermarkLogitsProcessor(vocab=list(tokenizer.get_vocab().values()),
                                               gamma=0.25,
                                               delta=2.0,
                                               seeding_scheme="simple_1")

tokenized_input = tokenizer(input_text, return_tensors="pt").to(model.device)
print(tokenized_input["input_ids"].shape)
# note that if the model is on cuda, then the input is on cuda
# and thus the watermarking rng is cuda-based.
# This is a different generator than the cpu-based rng in pytorch!

output_tokens = model.generate(**tokenized_input,
                               logits_processor=LogitsProcessorList([watermark_processor]))

# if decoder only model, then we need to isolate the
# newly generated tokens as only those are watermarked, the input/prompt is not
# output_tokens = output_tokens[:,tokenized_input["input_ids"].shape[-1]:]

output_text = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

torch.Size([1, 356])


In [33]:
output_text

'hello everybody my name is pouya'

In [30]:
output_text = "hello everybody my name is pouya"

In [34]:
watermark_detector = WatermarkDetector(vocab=list(tokenizer.get_vocab().values()),
                                        gamma=0.25, # should match original setting
                                        seeding_scheme="simple_1", # should match original setting
                                        device="cpu", # must match the original rng device type
                                        tokenizer=tokenizer,
                                        z_threshold=4.0,
                                        normalizers=[],
                                        ignore_repeated_bigrams=False)

score_dict = watermark_detector.detect(output_text) # or any other text of interest to analyze

In [35]:
score_dict

{'num_tokens_scored': 7,
 'num_green_tokens': 3,
 'green_fraction': 0.42857142857142855,
 'z_score': 1.091089451179962,
 'p_value': 0.13761676203741713,
 'prediction': False}