In [166]:
from datasets import load_dataset

ds_book_corpus = load_dataset("bookcorpus/bookcorpus", trust_remote_code=True)

In [73]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import torch.nn.functional as F
import numpy as np

# Load the RoBERTa model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

# def calculate_perplexity(sentence, model, tokenizer):
#     encoded_input = tokenizer(sentence, return_tensors='pt', truncation=True)
#     input_ids = encoded_input['input_ids'].squeeze()  # Shape: [seq_len]
#     attention_mask = encoded_input['attention_mask']
    
#     total_log_prob = 0.0
#     N = len(input_ids)
    
#     for i in range(1, N-1):  # Skip [CLS] and [SEP] if present
#         masked_input_ids = input_ids.clone()  # Clone to avoid in-place modification
#         masked_input_ids[i] = tokenizer.mask_token_id  # Mask the i-th token
        
#         # Predict the masked token
#         with torch.no_grad():
#             outputs = model(input_ids=masked_input_ids.unsqueeze(0), attention_mask=attention_mask)
#             logits = outputs.logits
        
#         # Convert logits to probabilities
#         predicted_probs = F.softmax(logits[0, i], dim=-1)
        
#         # Get the probability of the original token
#         original_token_id = input_ids[i].item()
#         original_token_prob = predicted_probs[original_token_id].item()
        
#         # Add the log probability of the original token
#         total_log_prob += np.log(original_token_prob)
    
#     # Compute pseudo-perplexity
#     avg_log_prob = total_log_prob / (N - 2)  # Exclude [CLS] and [SEP] tokens
#     pseudo_perplexity = np.exp(-avg_log_prob)
    
#     return pseudo_perplexity

def score(sentence, model, tokenizer):
    # https://arxiv.org/abs/1910.14659
    tensor_input = tokenizer(sentence, return_tensors='pt', truncation=True)
    repeat_input = tensor_input['input_ids'].repeat(tensor_input['input_ids'].size(-1)-2, 1)
    mask = torch.ones(tensor_input['input_ids'].size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    labels = repeat_input.masked_fill( masked_input != tokenizer.mask_token_id, -100)
    with torch.inference_mode():
        loss = model(masked_input, labels=labels).loss
    return np.exp(loss.item())


# Example usage
sentence = "London is the capital of Great Britain."
pseudo_perplexity = score(sentence, model, tokenizer)
print("Pseudo-perplexity:", pseudo_perplexity)



Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pseudo-perplexity: 1.0968683577162155


In [167]:

from transformers import AutoModelForCausalLM, AutoTokenizer
gpt_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")


# Load the RoBERTa model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("controlled/models/RoBERTa")
model = AutoModelForMaskedLM.from_pretrained("controlled/models/RoBERTa")


In [169]:

from ml_security.utils.utils import get_device
from ml_security.utils.nlp_utils import calculate_perplexity

DEVICE = get_device(allow_mps=False)

Using CPU


In [152]:
file_path = "experiment-02.txt"
file_path_ids = file_path.replace(".txt", "") + "-idxs" + ".txt"
output_file = file_path.replace(".txt", "") + "-results" + ".json"

import zlib
from tqdm import tqdm
import json

with open(file_path_ids, 'r') as f:
    lines = f.readlines()
    idxs = [int(line.strip()) for line in lines]
# repeat each entry twice, but have them in sequence
idxs = [idx for idx in idxs for _ in range(2)]


# with open(file_path, 'r') as f:
#     lines = f.readlines()
#     for line_idx, line in tqdm(enumerate(lines), total=len(lines)):
#         perpl = score(line, model, tokenizer)
#         zlib_entropy = len(zlib.compress(bytes(line, 'utf-8')))
#         gpt2_perpl = calculate_perplexity(line, gpt_model, gpt_tokenizer, DEVICE).item()
#         with open(output_file, 'a') as f:
#             f.write(json.dumps({"line": line, "perplexity": perpl, "zlib_entropy": zlib_entropy, "gpt2_perplexity": gpt2_perpl, "idx": idxs[line_idx]}) + "\n")


In [165]:
# For each id in idxs, get the corresponding text from ds_book_corpus
for idx in tqdm(idxs):
    line = ds_book_corpus['train'][idx]['text']
    tokens = tokenizer.tokenize(line)
    # get a token at random and replace it with a mask token
    mask_idx = np.random.randint(len(tokens))
    original_token = tokens[mask_idx]
    tokens[mask_idx] = tokenizer.mask_token
    sentence = " ".join(tokens)

    tokens_input = tokenizer(sentence, return_tensors='pt', truncation=True)

    
    model.eval()
    with torch.no_grad():
        outputs = model(tokens_input['input_ids'], return_dict=True)
        logits = outputs.logits
        predicted_index = torch.argmax(logits[0, mask_idx]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        print(f"Original token: {original_token}, Predicted token: {predicted_token}")
        print(f"Original sentence: {line}")
        print(f"Masked sentence: {sentence}")
        print(f"Predicted sentence: {sentence.replace(tokenizer.mask_token, predicted_token)}")
        print("\n")



  0%|          | 6/2000 [00:00<01:18, 25.54it/s]

Original token: you, Predicted token: `
Original sentence: `` you are threatening me ? ''
Masked sentence: ` ` [MASK] are threatening me ? ' '
Predicted sentence: ` ` ` are threatening me ? ' '


Original token: ', Predicted token: ?
Original sentence: `` you are threatening me ? ''
Masked sentence: ` ` you are threatening me ? [MASK] '
Predicted sentence: ` ` you are threatening me ? ? '


Original token: ##hip, Predicted token: stars
Original sentence: you can click on the image to get a bigger view : entry 2-307 : october 29 , 2014 the milk run preview : starship omcom one of the key tenets of the upcoming novel entitled the milk run was that i tied up any remaining loose ends from the previous novels in the rome 's revolution series .
Masked sentence: you can click on the image to get a bigger view : entry 2 - 307 : october 29 , 2014 the milk run preview : stars [MASK] om ##com one of the key ten ##ets of the upcoming novel entitled the milk run was that i tied up any remaining loo

  1%|          | 13/2000 [00:00<01:08, 29.14it/s]

Original token: chuckled, Predicted token: he
Original sentence: he chuckled as he folded her arm within his .
Masked sentence: he [MASK] as he folded her arm within his .
Predicted sentence: he he as he folded her arm within his .


Original token: chuckled, Predicted token: he
Original sentence: he chuckled as he folded her arm within his .
Masked sentence: he [MASK] as he folded her arm within his .
Predicted sentence: he he as he folded her arm within his .


Original token: middle, Predicted token: the
Original sentence: it 's the middle of the week . ''
Masked sentence: it ' s the [MASK] of the week . ' '
Predicted sentence: it ' s the the of the week . ' '


Original token: the, Predicted token: s
Original sentence: it 's the middle of the week . ''
Masked sentence: it ' s [MASK] middle of the week . ' '
Predicted sentence: it ' s s middle of the week . ' '


Original token: ., Predicted token: sure
Original sentence: `` i ... i 'm not sure . ''
Masked sentence: ` ` i . . . i ' 

  1%|          | 17/2000 [00:00<01:04, 30.56it/s]

Original token: glittering, Predicted token: out
Original sentence: there was a multiverse out there , glittering and magic .
Masked sentence: there was a multi ##verse out there , [MASK] and magic .
Predicted sentence: there was a multi ##verse out there , out and magic .


Original token: stuck, Predicted token: .
Original sentence: gran stuck her head out the back door .
Masked sentence: gran [MASK] her head out the back door .
Predicted sentence: gran . her head out the back door .


Original token: door, Predicted token: back
Original sentence: gran stuck her head out the back door .
Masked sentence: gran stuck her head out the back [MASK] .
Predicted sentence: gran stuck her head out the back back .


Original token: the, Predicted token: recognizing
Original sentence: she stared up , tears of frustration running hot down her cheeks , recognizing the truth .
Masked sentence: she stared up , tears of frustration running hot down her cheeks , recognizing [MASK] truth .
Predicted se

  1%|▏         | 25/2000 [00:00<01:06, 29.84it/s]

Original token: back, Predicted token: going
Original sentence: never going back , i hope .
Masked sentence: never going [MASK] , i hope .
Predicted sentence: never going going , i hope .


Original token: ,, Predicted token: back
Original sentence: never going back , i hope .
Masked sentence: never going back [MASK] i hope .
Predicted sentence: never going back back i hope .


Original token: of, Predicted token: enjoying
Original sentence: hes taking his time , enjoying the act of driving .
Masked sentence: he ##s taking his time , enjoying the act [MASK] driving .
Predicted sentence: he ##s taking his time , enjoying the act enjoying driving .


Original token: of, Predicted token: enjoying
Original sentence: hes taking his time , enjoying the act of driving .
Masked sentence: he ##s taking his time , enjoying the act [MASK] driving .
Predicted sentence: he ##s taking his time , enjoying the act enjoying driving .


Original token: sweet, Predicted token: still
Original sentence: th

  2%|▏         | 31/2000 [00:01<01:18, 25.22it/s]

Original token: ', Predicted token: '
Original sentence: at least not in words . ''
Masked sentence: at least not in words . ' [MASK]
Predicted sentence: at least not in words . ' '


Original token: at, Predicted token: .
Original sentence: at least not in words . ''
Masked sentence: [MASK] least not in words . ' '
Predicted sentence: . least not in words . ' '


Original token: we, Predicted token: .
Original sentence: we got to be smart about things , ma .
Masked sentence: [MASK] got to be smart about things , ma .
Predicted sentence: . got to be smart about things , ma .


Original token: ., Predicted token: ma
Original sentence: we got to be smart about things , ma .
Masked sentence: we got to be smart about things , ma [MASK]
Predicted sentence: we got to be smart about things , ma ma


Original token: ?, Predicted token: #
Original sentence: was this what it was like to be in limbo , to be caught in that netherworld between life and death ?
Masked sentence: was this what it was 

  2%|▏         | 34/2000 [00:01<01:19, 24.59it/s]

Original token: was, Predicted token: it
Original sentence: was this what it was like to be in limbo , to be caught in that netherworld between life and death ?
Masked sentence: was this what it [MASK] like to be in limb ##o , to be caught in that net ##her ##world between life and death ?
Predicted sentence: was this what it it like to be in limb ##o , to be caught in that net ##her ##world between life and death ?


Original token: ', Predicted token: n
Original sentence: `` you do n't need to see this . ''
Masked sentence: ` ` you do n [MASK] t need to see this . ' '
Predicted sentence: ` ` you do n n t need to see this . ' '


Original token: t, Predicted token: '
Original sentence: `` you do n't need to see this . ''
Masked sentence: ` ` you do n ' [MASK] need to see this . ' '
Predicted sentence: ` ` you do n ' ' need to see this . ' '


Original token: him, Predicted token: to
Original sentence: why would you lie to him ? ''
Masked sentence: why would you lie to [MASK] ? ' '
Pre

  2%|▏         | 40/2000 [00:01<01:25, 22.86it/s]

Original token: she, Predicted token: .
Original sentence: she squealed .
Masked sentence: [MASK] squealed .
Predicted sentence: . squealed .


Original token: squealed, Predicted token: she
Original sentence: she squealed .
Masked sentence: she [MASK] .
Predicted sentence: she she .


Original token: ll, Predicted token: .
Original sentence: `` i 'll consider myself duly warned for the vicious snowball attack . ''
Masked sentence: ` ` i ' [MASK] consider myself duly warned for the vicious snow ##ball attack . ' '
Predicted sentence: ` ` i ' . consider myself duly warned for the vicious snow ##ball attack . ' '


Original token: vicious, Predicted token: the
Original sentence: `` i 'll consider myself duly warned for the vicious snowball attack . ''
Masked sentence: ` ` i ' ll consider myself duly warned for the [MASK] snow ##ball attack . ' '
Predicted sentence: ` ` i ' ll consider myself duly warned for the the snow ##ball attack . ' '


Original token: shoe, Predicted token: one
Ori

  2%|▏         | 46/2000 [00:01<01:27, 22.36it/s]

Original token: just, Predicted token: `
Original sentence: `` just one shoe ?
Masked sentence: ` ` [MASK] one shoe ?
Predicted sentence: ` ` ` one shoe ?


Original token: ,, Predicted token: yeah
Original sentence: hell , yeah , they were freaks .
Masked sentence: hell , yeah [MASK] they were freaks .
Predicted sentence: hell , yeah yeah they were freaks .


Original token: freaks, Predicted token: were
Original sentence: hell , yeah , they were freaks .
Masked sentence: hell , yeah , they were [MASK] .
Predicted sentence: hell , yeah , they were were .


Original token: ., Predicted token: cara
Original sentence: `` do n't delay , cara .
Masked sentence: ` ` do n ' t delay , cara [MASK]
Predicted sentence: ` ` do n ' t delay , cara cara


Original token: ., Predicted token: cara
Original sentence: `` do n't delay , cara .
Masked sentence: ` ` do n ' t delay , cara [MASK]
Predicted sentence: ` ` do n ' t delay , cara cara




  2%|▏         | 49/2000 [00:01<01:29, 21.89it/s]

Original token: taste, Predicted token: a
Original sentence: he craved a taste of her .
Masked sentence: he craved a [MASK] of her .
Predicted sentence: he craved a a of her .


Original token: taste, Predicted token: a
Original sentence: he craved a taste of her .
Masked sentence: he craved a [MASK] of her .
Predicted sentence: he craved a a of her .


Original token: ', Predicted token: .
Original sentence: you do n't have to say yes . ''
Masked sentence: you do n ' t have to say yes . [MASK] '
Predicted sentence: you do n ' t have to say yes . . '


Original token: have, Predicted token: t
Original sentence: you do n't have to say yes . ''
Masked sentence: you do n ' t [MASK] to say yes . ' '
Predicted sentence: you do n ' t t to say yes . ' '


Original token: she, Predicted token: .
Original sentence: she whispered his name .
Masked sentence: [MASK] whispered his name .
Predicted sentence: . whispered his name .




  3%|▎         | 55/2000 [00:02<01:26, 22.43it/s]

Original token: she, Predicted token: .
Original sentence: she whispered his name .
Masked sentence: [MASK] whispered his name .
Predicted sentence: . whispered his name .


Original token: from, Predicted token: ,
Original sentence: i yelp , but scramble from the floor to try to minimize the damage .
Masked sentence: i ye ##lp , but scramble [MASK] the floor to try to minimize the damage .
Predicted sentence: i ye ##lp , but scramble , the floor to try to minimize the damage .


Original token: ye, Predicted token: i
Original sentence: i yelp , but scramble from the floor to try to minimize the damage .
Masked sentence: i [MASK] ##lp , but scramble from the floor to try to minimize the damage .
Predicted sentence: i i ##lp , but scramble from the floor to try to minimize the damage .


Original token: i, Predicted token: `
Original sentence: `` i think so . ''
Masked sentence: ` ` [MASK] think so . ' '
Predicted sentence: ` ` ` think so . ' '


Original token: so, Predicted token: thi

  3%|▎         | 61/2000 [00:02<01:24, 22.98it/s]

Original token: ., Predicted token: ready
Original sentence: she was ready .
Masked sentence: she was ready [MASK]
Predicted sentence: she was ready ready


Original token: she, Predicted token: .
Original sentence: she was ready .
Masked sentence: [MASK] was ready .
Predicted sentence: . was ready .


Original token: bed, Predicted token: #
Original sentence: * chapter 35 prytani wanted to tell tamesis to leave the bedchamber .
Masked sentence: * chapter 35 pry ##tani wanted to tell tame ##sis to leave the [MASK] ##cha ##mber .
Predicted sentence: * chapter 35 pry ##tani wanted to tell tame ##sis to leave the # ##cha ##mber .


Original token: to, Predicted token: to
Original sentence: * chapter 35 prytani wanted to tell tamesis to leave the bedchamber .
Masked sentence: * chapter 35 pry ##tani wanted to tell tame ##sis [MASK] leave the bed ##cha ##mber .
Predicted sentence: * chapter 35 pry ##tani wanted to tell tame ##sis to leave the bed ##cha ##mber .


Original token: in, Predict

  3%|▎         | 64/2000 [00:02<01:26, 22.42it/s]

Original token: his, Predicted token: in
Original sentence: then he dug in his pocket , pulled something out and shoved it in the front pocket of my levi 's .
Masked sentence: then he dug in [MASK] pocket , pulled something out and shoved it in the front pocket of my levi ' s .
Predicted sentence: then he dug in in pocket , pulled something out and shoved it in the front pocket of my levi ' s .


Original token: fact, Predicted token: in
Original sentence: no , in fact , how 'bout you tie a big pink ribbon around his chest and call him mary-anne . ''
Masked sentence: no , in [MASK] , how ' bout you tie a big pink ribbon around his chest and call him mary - anne . ' '
Predicted sentence: no , in in , how ' bout you tie a big pink ribbon around his chest and call him mary - anne . ' '


Original token: a, Predicted token: tie
Original sentence: no , in fact , how 'bout you tie a big pink ribbon around his chest and call him mary-anne . ''
Masked sentence: no , in fact , how ' bout you ti

  4%|▎         | 70/2000 [00:02<01:20, 24.08it/s]

Original token: the, Predicted token: .
Original sentence: the water came from somewhere !
Masked sentence: [MASK] water came from somewhere !
Predicted sentence: . water came from somewhere !


Original token: somewhere, Predicted token: from
Original sentence: the water came from somewhere !
Masked sentence: the water came from [MASK] !
Predicted sentence: the water came from from !


Original token: her, Predicted token: suited
Original sentence: the red gown she was wearing suited her perfectly , the black lace at the top playing to her black hair and her dark gray eyes , the miles of satin skirting falling about her slender body in resplendent waves .
Masked sentence: the red gown she was wearing suited [MASK] perfectly , the black lace at the top playing to her black hair and her dark gray eyes , the miles of satin skirt ##ing falling about her slender body in res ##ple ##nden ##t waves .
Predicted sentence: the red gown she was wearing suited suited perfectly , the black lace at

  4%|▍         | 76/2000 [00:03<01:24, 22.82it/s]

Original token: balls, Predicted token: and
Original sentence: raising her knee , she connected with his cock and balls .
Masked sentence: raising her knee , she connected with his cock and [MASK] .
Predicted sentence: raising her knee , she connected with his cock and and .


Original token: balls, Predicted token: and
Original sentence: raising her knee , she connected with his cock and balls .
Masked sentence: raising her knee , she connected with his cock and [MASK] .
Predicted sentence: raising her knee , she connected with his cock and and .


Original token: at, Predicted token: down
Original sentence: she glanced down at her torso .
Masked sentence: she glanced down [MASK] her torso .
Predicted sentence: she glanced down down her torso .


Original token: torso, Predicted token: her
Original sentence: she glanced down at her torso .
Masked sentence: she glanced down at her [MASK] .
Predicted sentence: she glanced down at her her .


Original token: are, Predicted token: we
Orig

  4%|▍         | 79/2000 [00:03<01:24, 22.64it/s]

Original token: that, Predicted token: .
Original sentence: that is why we are here .
Masked sentence: [MASK] is why we are here .
Predicted sentence: . is why we are here .


Original token: brow, Predicted token: s
Original sentence: paul 's brow rises .
Masked sentence: paul ' s [MASK] rises .
Predicted sentence: paul ' s s rises .


Original token: ', Predicted token: paul
Original sentence: paul 's brow rises .
Masked sentence: paul [MASK] s brow rises .
Predicted sentence: paul paul s brow rises .


Original token: ., Predicted token: .
Original sentence: seeing the eyes on me , the guys who would inevitably hit on me ... it never fails to bolster my self-confidence and make me feel better .
Masked sentence: seeing the eyes on me , the guys who would inevitably hit on me . [MASK] . it never fails to bo ##lster my self - confidence and make me feel better .
Predicted sentence: seeing the eyes on me , the guys who would inevitably hit on me . . . it never fails to bo ##lster my sel

  4%|▍         | 85/2000 [00:03<01:25, 22.36it/s]

Original token: me, Predicted token: -
Original sentence: seeing the eyes on me , the guys who would inevitably hit on me ... it never fails to bolster my self-confidence and make me feel better .
Masked sentence: seeing the eyes on me , the guys who would inevitably hit on me . . . it never fails to bo ##lster my self - confidence and make [MASK] feel better .
Predicted sentence: seeing the eyes on me , the guys who would inevitably hit on me . . . it never fails to bo ##lster my self - confidence and make - feel better .


Original token: one, Predicted token: no
Original sentence: there is no one else .
Masked sentence: there is no [MASK] else .
Predicted sentence: there is no no else .


Original token: there, Predicted token: .
Original sentence: there is no one else .
Masked sentence: [MASK] is no one else .
Predicted sentence: . is no one else .


Original token: it, Predicted token: do
Original sentence: this is a good way to do it .
Masked sentence: this is a good way to do [M

  5%|▍         | 93/2000 [00:03<01:11, 26.66it/s]

Original token: like, Predicted token: be
Original sentence: what would that be like ?
Masked sentence: what would that be [MASK] ?
Predicted sentence: what would that be be ?


Original token: ,, Predicted token: outside
Original sentence: *** outside , logan waits patiently , leaning against the mustang .
Masked sentence: * * * outside [MASK] logan waits patiently , leaning against the mustang .
Predicted sentence: * * * outside outside logan waits patiently , leaning against the mustang .


Original token: ,, Predicted token: outside
Original sentence: *** outside , logan waits patiently , leaning against the mustang .
Masked sentence: * * * outside [MASK] logan waits patiently , leaning against the mustang .
Predicted sentence: * * * outside outside logan waits patiently , leaning against the mustang .


Original token: slipped, Predicted token: langdon
Original sentence: with that , langdon slipped away , leaving his newfound friend bewildered and alone .
Masked sentence: with tha

  5%|▌         | 101/2000 [00:04<01:04, 29.25it/s]

Original token: said, Predicted token: billy
Original sentence: 'yes , ' billy said .
Masked sentence: ' yes , ' billy [MASK] .
Predicted sentence: ' yes , ' billy billy .


Original token: ., Predicted token: said
Original sentence: 'yes , ' billy said .
Masked sentence: ' yes , ' billy said [MASK]
Predicted sentence: ' yes , ' billy said said


Original token: `, Predicted token: `
Original sentence: `` you might have told me , '' mack pointed out .
Masked sentence: ` [MASK] you might have told me , ' ' mack pointed out .
Predicted sentence: ` ` you might have told me , ' ' mack pointed out .


Original token: ', Predicted token: '
Original sentence: `` you might have told me , '' mack pointed out .
Masked sentence: ` ` you might have told me , ' [MASK] mack pointed out .
Predicted sentence: ` ` you might have told me , ' ' mack pointed out .


Original token: jungle, Predicted token: in
Original sentence: totally done all the time in jungles . ''
Masked sentence: totally done all th

  5%|▌         | 107/2000 [00:04<01:06, 28.49it/s]

Original token: my, Predicted token: dug
Original sentence: as he would be mostly because i dug my heels in his shoulders as payback in the heat of the moment .
Masked sentence: as he would be mostly because i dug [MASK] heels in his shoulders as pay ##back in the heat of the moment .
Predicted sentence: as he would be mostly because i dug dug heels in his shoulders as pay ##back in the heat of the moment .


Original token: was, Predicted token: he
Original sentence: it just was obvious he was n't even remotely sorry . ''
Masked sentence: it just was obvious he [MASK] n ' t even remotely sorry . ' '
Predicted sentence: it just was obvious he he n ' t even remotely sorry . ' '


Original token: was, Predicted token: he
Original sentence: it just was obvious he was n't even remotely sorry . ''
Masked sentence: it just was obvious he [MASK] n ' t even remotely sorry . ' '
Predicted sentence: it just was obvious he he n ' t even remotely sorry . ' '


Original token: soon, Predicted token

  6%|▌         | 114/2000 [00:04<01:04, 29.34it/s]

Original token: took, Predicted token: .
Original sentence: alex took his eyes off the road and looked straight at gary with a serious expression on his face : no , no .
Masked sentence: alex [MASK] his eyes off the road and looked straight at gary with a serious expression on his face : no , no .
Predicted sentence: alex . his eyes off the road and looked straight at gary with a serious expression on his face : no , no .


Original token: like, Predicted token: .
Original sentence: like hounding ambrose .
Masked sentence: [MASK] hound ##ing ambrose .
Predicted sentence: . hound ##ing ambrose .


Original token: hound, Predicted token: like
Original sentence: like hounding ambrose .
Masked sentence: like [MASK] ##ing ambrose .
Predicted sentence: like like ##ing ambrose .


Original token: ., Predicted token: lips
Original sentence: rafael tore himself from my lips .
Masked sentence: rafael tore himself from my lips [MASK]
Predicted sentence: rafael tore himself from my lips lips


Ori

  6%|▌         | 121/2000 [00:04<01:03, 29.78it/s]

Original token: `, Predicted token: .
Original sentence: `` oh , it is .
Masked sentence: [MASK] ` oh , it is .
Predicted sentence: . ` oh , it is .


Original token: ,, Predicted token: oh
Original sentence: `` oh , it is .
Masked sentence: ` ` oh [MASK] it is .
Predicted sentence: ` ` oh oh it is .


Original token: is, Predicted token: he
Original sentence: `` damn , there he is , '' i whispered .
Masked sentence: ` ` damn , there he [MASK] , ' ' i whispered .
Predicted sentence: ` ` damn , there he he , ' ' i whispered .


Original token: there, Predicted token: ,
Original sentence: `` damn , there he is , '' i whispered .
Masked sentence: ` ` damn , [MASK] he is , ' ' i whispered .
Predicted sentence: ` ` damn , , he is , ' ' i whispered .


Original token: for, Predicted token: not
Original sentence: `` well , not for you .
Masked sentence: ` ` well , not [MASK] you .
Predicted sentence: ` ` well , not not you .


Original token: ., Predicted token: you
Original sentence: `` well

  6%|▋         | 127/2000 [00:04<01:06, 28.24it/s]

Original token: ., Predicted token: abruptly
Original sentence: the horse stopped abruptly .
Masked sentence: the horse stopped abruptly [MASK]
Predicted sentence: the horse stopped abruptly abruptly


Original token: ##ed, Predicted token: #
Original sentence: it instilled a chilling doubt in me that perhaps i never would escape the shadows of my past .
Masked sentence: it ins ##till [MASK] a chilling doubt in me that perhaps i never would escape the shadows of my past .
Predicted sentence: it ins ##till # a chilling doubt in me that perhaps i never would escape the shadows of my past .


Original token: in, Predicted token: #
Original sentence: it instilled a chilling doubt in me that perhaps i never would escape the shadows of my past .
Masked sentence: it ins ##till ##ed a chilling doubt [MASK] me that perhaps i never would escape the shadows of my past .
Predicted sentence: it ins ##till ##ed a chilling doubt # me that perhaps i never would escape the shadows of my past .


Origin

  7%|▋         | 133/2000 [00:05<01:07, 27.52it/s]

Original token: bodied, Predicted token: -
Original sentence: she had that sort of full-bodied figure that he sometimes enjoyed .
Masked sentence: she had that sort of full - [MASK] figure that he sometimes enjoyed .
Predicted sentence: she had that sort of full - - figure that he sometimes enjoyed .


Original token: an, Predicted token: in
Original sentence: this water felt fully as cold as the first application , and i lifted my head slightly in an attempt to ascertain its origin .
Masked sentence: this water felt fully as cold as the first application , and i lifted my head slightly in [MASK] attempt to as ##cer ##tain its origin .
Predicted sentence: this water felt fully as cold as the first application , and i lifted my head slightly in in attempt to as ##cer ##tain its origin .


Original token: cold, Predicted token: as
Original sentence: this water felt fully as cold as the first application , and i lifted my head slightly in an attempt to ascertain its origin .
Masked senten

  7%|▋         | 139/2000 [00:05<01:05, 28.44it/s]

Original token: sent, Predicted token: i
Original sentence: 'i sent some of our men round the back way to intercept those pazzi rats .
Masked sentence: ' i [MASK] some of our men round the back way to intercept those paz ##zi rats .
Predicted sentence: ' i i some of our men round the back way to intercept those paz ##zi rats .


Original token: know, Predicted token: i
Original sentence: i know he loves you .
Masked sentence: i [MASK] he loves you .
Predicted sentence: i i he loves you .


Original token: he, Predicted token: know
Original sentence: i know he loves you .
Masked sentence: i know [MASK] loves you .
Predicted sentence: i know know loves you .


Original token: away, Predicted token: looked
Original sentence: nothing , rupert looked away from the vampire .
Masked sentence: nothing , rupert looked [MASK] from the vampire .
Predicted sentence: nothing , rupert looked looked from the vampire .


Original token: the, Predicted token: from
Original sentence: nothing , rupert lo

  7%|▋         | 142/2000 [00:05<01:12, 25.62it/s]


Original token: in, Predicted token: lived
Original sentence: he lived in a small house .
Masked sentence: he lived [MASK] a small house .
Predicted sentence: he lived lived a small house .


Original token: ., Predicted token: bacteria
Original sentence: the bacteria .
Masked sentence: the bacteria [MASK]
Predicted sentence: the bacteria bacteria


Original token: the, Predicted token: .
Original sentence: the bacteria .
Masked sentence: [MASK] bacteria .
Predicted sentence: . bacteria .




KeyboardInterrupt: 

In [122]:
import pandas as pd
columns = ['perplexity', 'zlib_entropy', 'gpt2_perplexity', 'line', 'idx']
df_plots = pd.DataFrame(columns=columns)

with open(output_file, 'r') as f:
    # read line by line
    for line in f:
        # parse the line
        data = json.loads(line)
        # create a dictionary
        data_dict = {col: data[col] for col in columns}
        # create a dataframe
        df = pd.DataFrame(data_dict, index=[0])
        # append to the main dataframe
        df_plots = pd.concat([df_plots, df], ignore_index=True)





The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



In [123]:
df_plots

Unnamed: 0,perplexity,zlib_entropy,gpt2_perplexity,line,idx
0,4.322009,139,66.710045,"` ` you are threatening life , especially to d...",48848192
1,3.961453,131,174.877014,` ` you are threatening her . ' ` snipers whic...,48848192
2,4.382756,132,167.183807,you can click on the stash to make it her favo...,13015892
3,4.287555,142,118.102234,"you can click on the videos or the audio , or ...",13015892
4,11.184225,139,129.840439,it was beyond all the perukers to understand t...,53532639
...,...,...,...,...,...
1995,4.185416,112,19.498312,"so for once , i was all kinds of happy with th...",63445738
1996,3.370690,141,221.380371,"through the door , i heard another one of my f...",27028465
1997,2.418173,133,50.049370,"through the door , i slowly turned around , an...",27028465
1998,9.500776,145,207.949646,"he smiled and stared at the other boys , wonde...",68518599


In [146]:
df_plots['id'] = df_plots.index
df_plots['selected'] = False
# df_plots['selected'] = (df_plots['perplexity'] < 15) & (df_plots['gpt2_perplexity'] > 35)

def calculate_3gram_accuracy(reference_text, predicted_text):
    def generate_ngrams(text, n):
        words = text.split()
        return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
    
    reference_3grams = generate_ngrams(reference_text, 3)
    predicted_3grams = generate_ngrams(predicted_text, 3)
    
    matching_3grams = set(reference_3grams) & set(predicted_3grams)
    accuracy = len(matching_3grams) / len(reference_3grams) if reference_3grams else 0
    
    return accuracy

df_plots['3gram_accuracy'] = df_plots.apply(lambda row: calculate_3gram_accuracy(row['line'], ds_book_corpus['train'][row['idx']]["text"]), axis=1)



In [147]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px


# plot scatter plot with regression line
fig = px.scatter(df_plots, x='perplexity', y='gpt2_perplexity', hover_data=['line', 'id', 'zlib_entropy'], color='selected')

# show the plot
fig.show()

In [148]:
df_plots.iloc[212]['line']

'the fight between man and machine . five films based on five major box office hit films in the great depression , / 1993 . the first attack on al dealey , / 1995 the life of barry levinson , / 1997 .\n'

In [149]:
#df_filtered = df_plots[df_plots['selected']]
df_filtered = df_plots.copy()

In [150]:
df_filtered

Unnamed: 0,perplexity,zlib_entropy,gpt2_perplexity,line,idx,id,selected,3gram_accuracy
0,4.322009,139,66.710045,"` ` you are threatening life , especially to d...",48848192,0,False,0.023256
1,3.961453,131,174.877014,` ` you are threatening her . ' ` snipers whic...,48848192,1,False,0.025000
2,4.382756,132,167.183807,you can click on the stash to make it her favo...,13015892,2,False,0.105263
3,4.287555,142,118.102234,"you can click on the videos or the audio , or ...",13015892,3,False,0.073171
4,11.184225,139,129.840439,it was beyond all the perukers to understand t...,53532639,4,False,0.071429
...,...,...,...,...,...,...,...,...
1995,4.185416,112,19.498312,"so for once , i was all kinds of happy with th...",63445738,1995,False,0.069767
1996,3.370690,141,221.380371,"through the door , i heard another one of my f...",27028465,1996,False,0.100000
1997,2.418173,133,50.049370,"through the door , i slowly turned around , an...",27028465,1997,False,0.071429
1998,9.500776,145,207.949646,"he smiled and stared at the other boys , wonde...",68518599,1998,False,0.075000


In [151]:
for row in df_filtered.iterrows():
    gram_accuracy = row[1]['3gram_accuracy']
    if gram_accuracy > 0.12:
        print(row[1]['3gram_accuracy'])
        print(row[1]['line'])
        print(ds_book_corpus['train'][row[1]['idx']]["text"])
        print()

0.125
her head lay on his chest , smelled so sweet , remembered her scent , the unexpectedly raw sensation of him kissing beneath her chemise , her amniotic fluid , and then looking at him like he was being utterly ridiculous .

her head lay on his chest , her fingertips tracing her name imprinted in his skin .

0.14634146341463414
she wanted to remain mad at him , to play a little cat and mouse game and soon they were just a weirder - than - usual pair . what was the conversation between her three seven - day roommates looked like ?

she wanted to remain mad at him , but she knew she would forgive him because it was still dark .

0.1282051282051282
he asked me to accompany him to a bar in virginia beach , where we both had cocktails , and i readily agreed . all around us were people , long - haired blonds and guys , women and bloodchemps .

he asked me to accompany him in a reel , and i accepted , thinking nothing of it .

0.12195121951219512
* * * merlin spent the next week assisting

In [141]:
# create a matrix to calculate the n_gram accuracy comparing the generated text with entries from the book corpus
smaller_ds_book_corpus = ds_book_corpus['train'].select(list(range(1000)))

output_file = file_path.replace(".txt", "") + "-results" + "-ngram" + ".json"
import json

for i, row in tqdm(df_filtered.iterrows(), total=len(df_filtered)):
    for j, book_sentence in enumerate(smaller_ds_book_corpus):
        n_gram_accuracy = calculate_3gram_accuracy(row['line'], book_sentence["text"])
        with open(output_file, 'a') as f:
            f.write(json.dumps({
                "line": row['line'],
                "book_sentence": book_sentence["text"],
                "n_gram_accuracy": n_gram_accuracy,
                "line_idx": row['idx'],
                "book_idx": j
            }) + "\n")

100%|██████████| 2000/2000 [01:30<00:00, 22.17it/s]


In [142]:
df_temp = pd.read_json(output_file, lines=True)
df_temp.sort_values(by='n_gram_accuracy', ascending=False)

Unnamed: 0,line,book_sentence,n_gram_accuracy,line_idx,book_idx
0,"` ` you are threatening life , especially to d...","usually , he would be tearing around the livin...",0.0,48848192,0
1,"` ` you are threatening life , especially to d...",but just one look at a minion sent him practic...,0.0,48848192,1
2,"` ` you are threatening life , especially to d...",that had been megan 's plan when she got him d...,0.0,48848192,2
3,"` ` you are threatening life , especially to d...","he 'd seen the movie almost by mistake , consi...",0.0,48848192,3
4,"` ` you are threatening life , especially to d...",she liked to think being surrounded by adults ...,0.0,48848192,4
...,...,...,...,...,...
4080685,he smiled and stared at the swank which awaite...,her fingers itched to run through the dark str...,0.0,68518599,995
4080686,he smiled and stared at the swank which awaite...,her mind ran straight to an illicit image of h...,0.0,68518599,996
4080687,he smiled and stared at the swank which awaite...,she cleared her throat that had run dry .,0.0,68518599,997
4080688,he smiled and stared at the swank which awaite...,"`` no , of course not . ''",0.0,68518599,998
