In [5]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.1 tokenizers-0.13.3 transformers-4.31.0
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86

In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [8]:
model = T5ForConditionalGeneration.from_pretrained('t5-large')
tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=1024)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [8]:
text = """I love my cat because """
inputs = tokenizer(["summarize: " + text], return_tensors="pt")

In [9]:
summary = model.generate(**inputs, return_dict_in_generate=True, max_length=30, output_scores=True)

<class 'transformers.generation.utils.GreedySearchEncoderDecoderOutput'>
<pad>i love my cat because he is so sweet. he is a good listener and he is a good listen


In [10]:
"len(summary.sequences[0]): ", len(summary.sequences[0])

('len(summary.sequences[0]): ', 30)

In [11]:
print("len(summary.scores): ", len(summary.scores))
print("summary.scores[1].shape: ", summary.scores[1].shape)

len(summary.scores):  29
summary.scores[1].shape:  torch.Size([1, 32128])


In [12]:
!git clone https://github.com/jwkirchenbauer/lm-watermarking.git

Cloning into 'lm-watermarking'...
remote: Enumerating objects: 313, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 313 (delta 10), reused 5 (delta 5), pack-reused 288[K
Receiving objects: 100% (313/313), 11.98 MiB | 16.02 MiB/s, done.
Resolving deltas: 100% (87/87), done.


In [13]:
import sys
sys.path.insert(0, "/content/lm-watermarking")

In [14]:
from watermark_processor import WatermarkDetector, WatermarkLogitsProcessor
from transformers import (LogitsProcessorList)

In [15]:
 input_text = (
        "The diamondback terrapin or simply terrapin (Malaclemys terrapin) is a "
        "species of turtle native to the brackish coastal tidal marshes of the "
        "Northeastern and southern United States, and in Bermuda.[6] It belongs "
        "to the monotypic genus Malaclemys. It has one of the largest ranges of "
        "all turtles in North America, stretching as far south as the Florida Keys "
        "and as far north as Cape Cod.[7] The name 'terrapin' is derived from the "
        "Algonquian word torope.[8] It applies to Malaclemys terrapin in both "
        "British English and American English. The name originally was used by "
        "early European settlers in North America to describe these brackish-water "
        "turtles that inhabited neither freshwater habitats nor the sea. It retains "
        "this primary meaning in American English.[8] In British English, however, "
        "other semi-aquatic turtle species, such as the red-eared slider, might "
        "also be called terrapins. The common name refers to the diamond pattern "
        "on top of its shell (carapace), but the overall pattern and coloration "
        "vary greatly. The shell is usually wider at the back than in the front, "
        "and from above it appears wedge-shaped. The shell coloring can vary "
        "from brown to grey, and its body color can be grey, brown, yellow, "
        "or white. All have a unique pattern of wiggly, black markings or spots "
        "on their body and head. The diamondback terrapin has large webbed "
        "feet.[9] The species is"
)

# output_text = "watermarking language models can mitigate potential harms. watermarks can be embedded with"

In [16]:
watermark_processor = WatermarkLogitsProcessor(vocab=list(tokenizer.get_vocab().values()),
                                               gamma=0.25,
                                               delta=2.0,
                                               seeding_scheme="simple_1")

tokenized_input = tokenizer(input_text, return_tensors="pt").to(model.device)
print(tokenized_input["input_ids"].shape)
# note that if the model is on cuda, then the input is on cuda
# and thus the watermarking rng is cuda-based.
# This is a different generator than the cpu-based rng in pytorch!

output_tokens = model.generate(**tokenized_input,
                               logits_processor=LogitsProcessorList([watermark_processor]), max_new_tokens=50, return_dict_in_generate=True, max_length=30, output_scores=True)

# if decoder only model, then we need to isolate the
# newly generated tokens as only those are watermarked, the input/prompt is not
# output_tokens = output_tokens[:,tokenized_input["input_ids"].shape[-1]:]
output_text = tokenizer.batch_decode(output_tokens.sequences, skip_special_tokens=True)[0]
output_text

torch.Size([1, 356])


Both `max_new_tokens` (=50) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'on the back of their shells, and also has large, webbed endemic to Bermuda, the Bahamas, and the Bahamas.[7] found in the Atlantic Ocean and the Pacific Ocean. found in the Atlantic Ocean'

In [17]:
output_text = "hello everybody my name is pouya"

In [18]:
watermark_detector = WatermarkDetector(vocab=list(tokenizer.get_vocab().values()),
                                        gamma=0.25, # should match original setting
                                        seeding_scheme="simple_1", # should match original setting
                                        device="cpu", # must match the original rng device type
                                        tokenizer=tokenizer,
                                        z_threshold=4.0,
                                        normalizers=[],
                                        ignore_repeated_bigrams=False)

score_dict = watermark_detector.detect(output_text) # or any other text of interest to analyze

In [19]:
score_dict

{'num_tokens_scored': 7,
 'num_green_tokens': 3,
 'green_fraction': 0.42857142857142855,
 'z_score': 1.091089451179962,
 'p_value': 0.13761676203741713,
 'prediction': False}

## CNN dataset

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
from datasets import load_dataset

In [3]:
ds = load_dataset('cnn_dailymail', '3.0.0')

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [21]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-large')

articles = [d['article'] for d in ds['train']]
summaries = [d['highlights'] for d in ds['train']]

# inputs = tokenizer(articles, truncation=True, padding='longest', return_tensors='pt')
# targets = tokenizer(summaries, truncation=True, padding='longest', return_tensors='pt')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [22]:
type(articles)

list

Collecting lmppl
  Downloading lmppl-0.3.1.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate (from lmppl)
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai (from lmppl)
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf<3.20 (from lmppl)
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: lmppl
  Building wheel for lmppl (setup.py) ... [?25l[?25hdone
  Created wheel for lmppl: filename=lmppl-0.3.1-py3-none-any.whl size=13113 sha256=247bfd49f250733d

In [10]:
import lmppl

In [11]:
scorer = lmppl.EncoderDecoderLM('google/flan-t5-large')

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [14]:
inputs = [
    'I dropped my laptop on my knee, and someone stole my coffee.',
    'I dropped my laptop on my knee, and someone stole my coffee.'
]
outputs = [
    'and the door is a blackboard.',
    'I am sad.'
]
ppl = scorer.get_perplexity(input_texts=inputs, output_texts=outputs)
ppl
# print(list(zip(outputs, ppl)))
# print(f"prediction: {outputs[ppl.index(min(ppl))]}")

100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


[47.66353959810042, 15.044857001110119]