Optionally change the default model caching directory from `~/.cache` to the cluster's default work directory, which is meant for large files such as model weights.

In [1]:
# Optional: uncomment (and adapt the path) to redirect the Hugging Face model cache.
# Left disabled here, so the library's default cache location is used.
# NOTE(review): presumably this must run before `transformers` is imported — confirm.
# import os
# os.environ['TRANSFORMERS_CACHE'] = '/work/mremeli/huggingface'

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Load LLM of choice

To retrieve the probabilities of generated sequences we have to load a causal language model architecture. 

In [3]:
# Name of the pretrained checkpoint to load.
checkpoint = "gpt2"

# `return_dict_in_generate=True` ends up in the model config (it is visible when
# `model.config` is displayed), so `model.generate` returns a structured output
# object instead of requiring the flag on every call.
model = AutoModelForCausalLM.from_pretrained(checkpoint, return_dict_in_generate=True)
# The tokenizer is loaded from the same checkpoint so its vocabulary matches the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 665/665 [00:00<00:00, 197kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 548M/548M [00:09<00:00, 57.5MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 1.95MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 867kB/s]
Downloading: 100%|██████████████████████████████████████████████████████

In [4]:
# Display the configuration of the loaded model (architecture sizes, vocab size,
# and the `return_dict_in_generate` flag passed at load time).
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "return_dict_in_generate": true,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50257
}

### Input prompt

In [5]:
# The prompt is wrapped in a list, so the tokenizer produces a batch of size 1.
prompt = ["Little red riding hood went to the forest one day to"]
# return_tensors="pt" makes the tokenizer return PyTorch tensors
# (`input_ids` and `attention_mask`).
tokenized_prompt = tokenizer(prompt, return_tensors="pt")
print(tokenized_prompt)

{'input_ids': tensor([[22253,  2266, 10311, 14263,  1816,   284,   262,  8222,   530,  1110,
           284]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### Output

Set the desired maximum number of generated tokens (we set it to 40).

In [6]:
# Number of tokens to generate; passed to `model.generate` as `max_new_tokens`.
max_tokens_generated = 40

In [7]:
# Generate a continuation of the prompt. No sampling options are passed and the
# result below is a GreedySearchDecoderOnlyOutput, i.e. greedy decoding is used.
# No `pad_token_id` is given, so generate falls back to `eos_token_id` (see the
# message printed below).
outputs = model.generate(**tokenized_prompt, 
                         output_scores=True, # also return the scores (logits) for each newly generated token
                         max_new_tokens=max_tokens_generated)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [8]:
# Inspect the raw generation output: the token ids of the full sequence and a
# tuple holding one score tensor per generated step.
outputs

GreedySearchDecoderOnlyOutput(sequences=tensor([[22253,  2266, 10311, 14263,  1816,   284,   262,  8222,   530,  1110,
           284,   766,   611,   339,   714,  1064,   257,   835,   284,   651,
           503,    13,   679,  1043,   257,  1402, 40812,   287,   262,  8222,
           290,  1816,   284,   340,    13,   679,  1043,   257,  1402, 40812,
           287,   262,  8222,   290,  1816,   284,   340,    13,   679,  1043,
           257]]), scores=(tensor([[-131.5153, -131.9083, -137.2242,  ..., -137.5335, -135.5820,
         -133.1908]]), tensor([[-107.9540, -109.3710, -113.5140,  ..., -115.1297, -114.9533,
         -111.1234]]), tensor([[-101.0773, -101.5922, -106.0210,  ..., -104.5540, -106.1922,
         -102.7193]]), tensor([[-133.3365, -133.3741, -140.1894,  ..., -141.9558, -133.4405,
         -135.5265]]), tensor([[-138.2824, -140.4560, -145.9526,  ..., -148.9709, -147.9707,
         -141.7005]]), tensor([[-104.0010, -104.2025, -110.0863,  ..., -107.3719, -110.4027,
   

The output has two parts: the generated sequences (the prompt tokens followed by the newly generated tokens), and the scores (logits) that the model computed for each newly generated token.

In [9]:
# List the fields available on the generation output.
outputs.keys()

odict_keys(['sequences', 'scores'])

In [10]:
# Turn the first sequence of the batch back into text. The sequence holds the
# prompt token ids followed by the newly generated ones, so the decoded string
# starts with the prompt.
generated_text = tokenizer.decode(outputs.sequences[0])
print(generated_text)

Little red riding hood went to the forest one day to see if he could find a way to get out. He found a small hut in the forest and went to it. He found a small hut in the forest and went to it. He found a


#### Output probabilities
These scores can be converted to probabilities by applying the softmax function over the vocabulary dimension.

In [11]:
# `outputs.scores` is a tuple with one score tensor per generated step. Stack
# them along a new step axis (dim=1), then normalise over the last axis so that
# each step's scores become a probability distribution.
probs = torch.softmax(torch.stack(outputs.scores, dim=1), dim=-1)
print(probs)

tensor([[[4.6492e-06, 3.1384e-06, 1.5419e-08,  ..., 1.1317e-08,
          7.9661e-08, 8.7047e-07],
         [1.1294e-04, 2.7379e-05, 4.3468e-07,  ..., 8.6389e-08,
          1.0305e-07, 4.7463e-06],
         [7.9526e-06, 4.7522e-06, 5.6690e-08,  ..., 2.4580e-07,
          4.7771e-08, 1.5397e-06],
         ...,
         [2.9504e-06, 4.3737e-06, 6.2299e-07,  ..., 5.2129e-10,
          1.2705e-09, 2.5185e-03],
         [3.1661e-06, 1.7992e-06, 7.3791e-09,  ..., 5.4081e-10,
          6.5105e-06, 2.9975e-06],
         [1.0764e-05, 4.7095e-05, 4.7782e-07,  ..., 2.0868e-08,
          1.4524e-08, 2.2927e-05]]])


Shape: \[batch_size, num_generated_tokens, vocab_size\]

In [12]:
# Expect one row per generated step (`max_tokens_generated`) and one column per
# vocabulary entry (`vocab_size` in the model config).
probs.shape

torch.Size([1, 40, 50257])

The first generated word is 'see'. Let's see (pun intended) what the assigned probability was!

In [13]:
# Report the probability the model assigned to one generated token.
# `idx` selects the generated step to inspect (0 = first newly generated token).
idx = 0
# Number of prompt tokens; generated tokens start at this offset in `sequences`.
prompt_len = tokenized_prompt.input_ids.shape[1]

# Id of the token that was actually generated at step `idx`. Batch index 0 is
# used throughout, matching how `generated_text` is taken from the first sequence.
token_id = outputs.sequences[0, prompt_len + idx].item()
word = tokenizer.decode([token_id])

# Look up the probability of the generated token itself. Taking the maximum
# over the vocabulary only coincides with this under greedy decoding, and
# would silently report the wrong value for sampling or beam search.
word_prob = probs[0, idx, token_id]

# Rebuild the prior text from token ids. `prompt_len + idx` counts tokens, so it
# must not be used to slice whitespace-separated words: a word can span several
# tokens (e.g. "out." is two tokens), which would shift the slice.
prior = tokenizer.decode(outputs.sequences[0, :prompt_len + idx])
print("Prior:\t '%s'" % prior)
print("Probability of next word '%s' is: %.2f%%" % (word, word_prob*100))

Prior:	 'Little red riding hood went to the forest one day to'
Probability of next word ' see' is: 6.42%
