In [1]:
from transformers import AutoModel, AutoTokenizer
# import gpt2 LM
from transformers import AutoModelForCausalLM, GPT2Tokenizer
import numpy as np
import torch

In [2]:
def load_gpt2_model(model_name):
    """Load a pretrained causal language model together with its tokenizer.

    Despite the function name, nothing here is GPT-2 specific: both objects
    are resolved from ``model_name`` through the ``Auto*`` factories.

    Args:
        model_name: identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    causal_lm = AutoModelForCausalLM.from_pretrained(model_name)
    matching_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return causal_lm, matching_tokenizer


def get_probs(model, tokenizer, text: str, device: str = "cpu"):
    """Return the next-token probability distribution at every position of ``text``.

    Args:
        model: callable that takes the encoded ids. Its result is indexed with
            ``[0]`` to obtain the logits, which are indexed as
            ``[batch, position, vocab]``.
        tokenizer: object whose ``encode(text, return_tensors="pt",
            add_special_tokens=True)`` returns a 2-D tensor of token ids
            (batch axis first, token position second).
        text: the string to score.
        device: device the encoded ids are moved to. The model itself is not
            moved by this function.

    Returns:
        A tuple ``(probs, token_ids)``:
        ``probs`` is a NumPy array with one row per input position, each row
        being the softmax of the logits at that position;
        ``token_ids`` is the encoded input as a NumPy array, batch axis kept.
    """
    token_ids = tokenizer.encode(
        text, return_tensors="pt", add_special_tokens=True
    ).to(device)
    seq_len = token_ids.shape[1]

    # Inference only: skip autograd bookkeeping even if the caller did not
    # wrap the call in torch.no_grad() itself.
    with torch.no_grad():
        outputs = model(token_ids)
        # outputs[0] holds the logits. Take the first batch element and
        # normalise all positions with a single softmax over the vocabulary
        # axis, instead of one softmax and one device-to-host copy per token.
        logits = outputs[0][0, :seq_len, :]
        probs = torch.softmax(logits, dim=-1)

    return probs.detach().cpu().numpy(), token_ids.detach().cpu().numpy()

def translate_probs_to_words(probs, tokenizer, top_k=10):
    """Map each row of next-token probabilities to its ``top_k`` most likely tokens.

    Args:
        probs: array whose ``probs[i]`` holds the scores over the vocabulary
            for position ``i``.
        tokenizer: object providing ``convert_ids_to_tokens``.
        top_k: number of tokens to keep per position. A value larger than the
            vocabulary keeps every token; ``0`` keeps none.

    Returns:
        NumPy array with one row per position. Within a row the tokens are in
        ascending order of probability, so the most likely token comes last.

    Raises:
        ValueError: if ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative, got {}".format(top_k))

    top_k_words = []
    for i in range(probs.shape[0]):
        ranked_ids = np.argsort(probs[i])
        # Slice from an explicit start index: ``ranked_ids[-top_k:]`` would
        # return the whole vocabulary for top_k == 0 because ``-0`` is ``0``.
        start = max(ranked_ids.size - top_k, 0)
        top_k_words.append(tokenizer.convert_ids_to_tokens(ranked_ids[start:]))
    return np.array(top_k_words)
    
# Read the corpus, one sentence per line. The encoding is given explicitly so
# decoding does not depend on the platform's locale default.
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open("wiki.1million.raw.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
# Drop surrounding whitespace (including the trailing newline) from each line.
sents = [l.strip() for l in lines]

In [1]:
# device = "cpu"
# model_name = "facebook/opt-350m"
# model, tokenizer = load_gpt2_model(model_name)
# model.to(device)
# model.eval()


## collect probabilities and true tokens

In [3]:
import tqdm
import pickle
n=50000
# Number of sentences stored per output pickle. With n=50000 this produces
# parts 1..5, the same files the fixed five-way split used to write.
part_size = 10000
models = ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b" ,"facebook/opt-6.7b",
 "facebook/opt-13b", "facebook/opt-30b"]
# Checkpoints that are run on the GPU; everything else runs on the CPU.
gpu_models = ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"]

for model_name in models:
    print(model_name)
    preds = []
    # Release the previous iteration's model before loading the next one so
    # that two checkpoints are never held at the same time.
    model = tokenizer = None
    torch.cuda.empty_cache()
    model, tokenizer = load_gpt2_model(model_name)
    # Fall back to the CPU when no CUDA device is present instead of failing
    # in model.to("cuda").
    use_gpu = model_name in gpu_models and torch.cuda.is_available()
    device = "cuda" if use_gpu else "cpu"
    model.to(device)
    model.eval()

    # The corpus may hold fewer than n sentences; size the progress bar from
    # what is actually processed.
    batch = sents[:n]
    # NOTE: each entry keeps the full per-position distribution returned by
    # get_probs, so `preds` grows large.
    with torch.no_grad():
        for sent in tqdm.tqdm(batch, total=len(batch)):
            probs, tokens = get_probs(model, tokenizer, sent, device)
            preds.append(({"probs": probs, "tokens": tokens[0], "sent": sent}))

    # Write the predictions in consecutive chunks of part_size. The number of
    # parts follows from len(preds), so no sentence is dropped and no empty
    # file is written when n is changed.
    for part_idx, start in enumerate(range(0, len(preds), part_size), start=1):
        part_path = "preds_{}_{}_part{}.pickle".format(n, model_name.replace("facebook/", ""), part_idx)
        with open(part_path, "wb") as f:
            pickle.dump(preds[start:start + part_size], f)

facebook/opt-350m


100%|██████████| 50000/50000 [19:22<00:00, 43.03it/s]


: 

: 