In [1]:
import datasets
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "roneneldan/TinyStories-28M"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.bos_token)
print(tokenizer.bos_token_id)

sentence = "Once upon a time"
input = tokenizer(
            sentence,
            add_special_tokens=True,
            padding=False,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
print(input)

# from tokenizers.processors import TemplateProcessing
# tokenizer.post_processor = TemplateProcessing(
#     single=f"{tokenizer.bos_token} $A",
#     special_tokens=[(f"{tokenizer.bos_token}", tokenizer.bos_token_id)],
# )

sentence = tokenizer.bos_token + sentence

input = tokenizer(
            sentence,
            add_special_tokens=True,
            padding=False,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
print(input)



<|endoftext|>
50256
{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
{'input_ids': tensor([[50256,  7454,  2402,   257,   640]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


In [11]:
def get_sentence_ppl(sentence, bos=True):
    # Tokenize the sentence
    loss_fct = CrossEntropyLoss(reduction="none")

    input = tokenizer(
            sentence,
            add_special_tokens=True,
            padding=False,
            truncation=True,
            max_length=512,
            return_tensors="pt",
            return_attention_mask=True,
        ).to(device)
    input_ids = input["input_ids"].to(device)
    print(input_ids)
    attn_mask = input["attention_mask"].to(device)
    if bos:
        bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]]).to(device)
        input_ids = torch.cat([bos_tokens_tensor, input_ids], dim=1).to(device)
        attn_mask = torch.cat(
                        [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
                    )
    
    # Get log probabilities from the model
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attn_mask,  labels=input_ids).logits
        
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()
    # shift_attention_mask_batch = attn_mask[..., 1:].contiguous()
    ce_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # ce_loss = loss_fct(shift_logits.transpose(1, 2), shift_labels)
    perplexities = torch.exp(ce_loss)
    token_ids = shift_labels.cpu().numpy().tolist()[0]
    perplexities = perplexities.cpu().numpy().tolist()
    return token_ids, perplexities 



In [12]:
sentence = "Once upon a time, there was a friendly bird named Bob. Bob lived near a big cliff. Every day, Bob"
token_ids, ppl = get_sentence_ppl(sentence, bos=True)
# print(result, np.mean(list(result.values())[1:]))
# print tokenwise_ppl
token_ids = token_ids.cpu().numpy().tolist()[0]
ppl = ppl.cpu().numpy().tolist()
for token_id, ppl in zip(token_ids, ppl):
    print(tokenizer.decode(token_id), ppl)
print(np.mean(ppl))
    


tensor([[ 7454,  2402,   257,   640,    11,   612,   373,   257,  8030,  6512,
          3706,  5811,    13,  5811,  5615,  1474,   257,  1263, 19516,    13,
          3887,  1110,    11,  5811]])


AttributeError: 'list' object has no attribute 'cpu'

In [None]:
import evaluate
perplexity = evaluate.load("perplexity", module_type="metric")
results = perplexity.compute(model_id='roneneldan/TinyStories-28M',
                             add_start_token=False,
                             predictions=[sentence])
print(results)

Using pad_token, but it is not set yet.
100%|██████████| 1/1 [00:00<00:00,  7.36it/s]

{'perplexities': [2.667530059814453], 'mean_perplexity': 2.667530059814453}





In [142]:
# load tinystories
path = f'../tinystories_words/tinystories_rows_gpt4.txt'
tinystories = []
with open(path, 'r') as f:
    lines = f.readlines()
    for line in lines:
        tinystories.append(line.strip())
path = f'../tinystories_words/tinystories_rows_gpt4.txt'
tinystories_gpt4 = []
with open(path, 'r') as f:
    lines = f.readlines()
    for line in lines:
        tinystories_gpt4.append(line.strip())

In [143]:
# get ppl of a random story
import random
random_story = random.choice(tinystories)
print(random_story)
results = perplexity.compute(model_id='roneneldan/TinyStories-28M',
                                add_start_token=False,
                                predictions=[random_story])
print(results)

One day, a little boy named Tim was playing outside. He saw a big, scary dog near the post. The dog had a loud bark and sharp teeth. Tim felt worry in his tummy. He did not want the dog to come near him.Tim's mom saw him worry and came outside. She saw the scary dog too. Tim's mom told him not to worry. She said they would scare the dog away together. Tim felt a little better.Tim and his mom made loud noises and waved their arms. The scary dog ran away from the post. Tim was not worry anymore. He was happy that his mom helped him. They went inside to have a snack and play.


Using pad_token, but it is not set yet.
100%|██████████| 1/1 [00:00<00:00,  1.50it/s]

{'perplexities': [3.483666181564331], 'mean_perplexity': 3.483666181564331}





In [145]:
# get token-level ppl of a random story
token_ids, ppl = get_sentence_ppl(random_story, bos=True)
print(token_ids.shape)
token_ids = token_ids.cpu().numpy().tolist()[0]
print(len(token_id))
ppl = ppl.cpu().numpy().tolist()
for token_id, ppl in zip(token_ids, ppl):
    print(tokenizer.decode(token_id), ppl)
print(np.mean(ppl))

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# load the tb and fb stories
tb_stories = []
fb_stories = []
pos_tb_stories = []
pos_fb_stories = []
neg_tb_stories = []
neg_fb_stories = []

tb_cond_file  = f'../../data/conditions/tinytom-v3/0_forward_belief_true_belief/corrected.txt'
fb_cond_file  = f'../../data/conditions/tinytom-v3/0_forward_belief_false_belief/corrected.txt'
with open(tb_cond_file, 'r') as f:
    lines = f.readlines()
    for l, line in enumerate(lines):
        tb_stories.append(line.strip())

with open(fb_cond_file, 'r') as f:
    lines = f.readlines()
    for l, line in enumerate(lines):        
        fb_stories.append(line.strip())


In [135]:
random_story = random.choice(tb_stories)
token_ids, ppl = get_sentence_ppl(random_story, bos=True)
token_ids = token_ids.cpu().numpy().tolist()[0]
ppl = ppl.cpu().numpy().tolist()
for token_id, ppl in zip(token_ids, ppl):
    print(tokenizer.decode(token_id), ppl)
print(np.mean(ppl))

Once 3786918.75
 upon 1.0253037214279175
 a 1.0000678300857544
 time 1.0001612901687622
, 1.0017625093460083
 in 26.137815475463867
 a 1.0432683229446411
 lovely 255.73524475097656
 park 147.6114044189453
 full 1578.48095703125
 of 1.004035472869873
 pretty 59.270225524902344
 flowers 1.1265602111816406
, 6.635289192199707
 there 1.136163353919983
 was 1.290573239326477
 a 1.004797101020813
 little 1.7714539766311646
 girl 1.9726015329360962
 named 1.0066407918930054
 Queen 16705913.0
ie 1.6073189973831177
. 1.0317797660827637
 She 6.450061798095703
 needed 1701482.75
 a 11.8333740234375
 green 9422.484375
 leaf 5.010408401489258
 for 4.829154014587402
 her 1.0386905670166016
 art 878.48779296875
 project 2.2610509395599365
. 1.047688364982605
 She 4.079145908355713
 spotted 51498.6015625
 a 1.6199395656585693
 leaf 25.733139038085938
 that 9.338224411010742
 was 1.553296446800232
 very 19.732328414916992
 green 13.203532218933105
 and 3.0927369594573975
 fresh 5605.99462890625
. 1.009