In [5]:
import json
import gzip
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
#Shard is 
shard = 1
shard_size = 1000

max_length = 2048
device = 'cuda'

# ../../johnny/optout/outputs/00_0-1000.jsonl
fn = '../../johnny/data/val.jsonl.gz'
out_fn = '../../johnny/optout/outputs/val_%d-%d.jsonl'

In [12]:
start = shard * shard_size
end = start + shard_size
start, end

(1000, 2000)

In [13]:
out_fn % (start, end)

'./outputs/val_1000-2000.jsonl'

In [3]:
model_name = 'EleutherAI/gpt-j-6B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, return_dict=True).to(device)

In [4]:
def stream_jsonl_file(filename, start, end):
    with gzip.open(filename, 'rt') as f:
        for i, line in enumerate(f, 0):
            if i < start:
                continue
            if i >= end:
                break
                
            json_obj = json.loads(line.strip())
            json_obj['index'] = i
            yield json_obj

In [5]:
data = list(stream_jsonl_file(fn, start, end))

In [6]:
out_file = open(out_fn % (start, end), 'wt')

In [7]:
for json_obj in tqdm(data, total=end-start):    
    input_ids = tokenizer.encode(json_obj['text'],  return_tensors='pt', max_length=max_length).to('cuda')
    
    # Evaluate the loss of the sequence with the GPT-2 model
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        logits = outputs.logits
        
    # Get the loss at each token
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()
    probs = torch.nn.LogSoftmax(dim=-1)(shift_logits)
    per_token_logprobs = probs.gather(dim=-1, index=shift_labels.unsqueeze(-1)).squeeze(-1)
    
    new_obj = {}
    new_obj['index'] = json_obj['index']
    new_obj['meta'] = json_obj['meta']
    new_obj['tokens'] = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())[:max_length]
    new_obj['per_token_logprobs'] = per_token_logprobs.tolist()
    
    out_file.write(json.dumps(new_obj) + '\n')
    

  0%|          | 0/100 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
out_file.close()

In [9]:
'''
# code to check that the token level loss reflects the perplexity

probs = torch.nn.Softmax(dim=-1)(shift_logits)
probs = probs.cpu().numpy()[0]
labels = shift_labels.cpu().numpy()[0]

p = 0.0
for i, j in zip(probs, labels):
    p += np.log(i[j])
print(np.exp(-p / len(labels)))
'''

'\n# code to check that the token level loss reflects the perplexity\n\nprobs = torch.nn.Softmax(dim=-1)(shift_logits)\nprobs = probs.cpu().numpy()[0]\nlabels = shift_labels.cpu().numpy()[0]\n\np = 0.0\nfor i, j in zip(probs, labels):\n    p += np.log(i[j])\nprint(np.exp(-p / len(labels)))\n'