In [1]:
import os, pickle, torch
from contextlib import nullcontext
from model import GPT

In [16]:
# --- Notebook configuration -------------------------------------------------
# Directory for dataset files (not referenced by the cells visible here).
DATA_DIR = "data/"
# Directory and file name of the locally trained checkpoint; only read when
# `sample_from_base` is None.
MODEL_DIR = "best_models/"
CHECKPOINT = "gpt.pt"
# Pick the accelerator that actually exists on this machine instead of
# hard-coding "mps", which makes every later `.to(device)` fail on machines
# without Apple-silicon support. MPS is tried first so the result is unchanged
# on machines where the original hard-coded value worked.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Name of the pretrained GPT-2 variant to sample from; set to None to load the
# local checkpoint at MODEL_DIR/CHECKPOINT instead.
sample_from_base = "gpt2-large"  # None

In [17]:
# Precision / compilation setup.
# `compile` is read later to decide whether the model goes through
# torch.compile; it is enabled only when running on CUDA.
compile = device == "cuda"
if compile:
    # Allow TF32 for matmul and cuDNN kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Autocast to bfloat16 when the GPU supports it, float16 otherwise.
    ctx = torch.amp.autocast(
        device_type=device,
        dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    )
else:
    # No autocast off CUDA: `with ctx:` becomes a no-op.
    ctx = nullcontext()

In [18]:
# Build the model either from a local checkpoint or from pretrained GPT-2
# weights, then switch it to eval mode and move it onto `device`.
if sample_from_base is None:
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints that come from a trusted source.
    # os.path.join keeps the path valid even if MODEL_DIR loses its trailing "/".
    checkpoint = torch.load(os.path.join(MODEL_DIR, CHECKPOINT), map_location=device)
    print("best val loss:", checkpoint["best_val_loss"].item())
    config = checkpoint["config"]
    model = GPT(config)
    state_dict = checkpoint["model"]
    # Strip the "_orig_mod." key prefix so the keys match a plain GPT module
    # (presumably the checkpoint was saved from a torch.compile-wrapped model).
    unwanted_prefix = "_orig_mod."
    for k in list(state_dict):  # snapshot the keys: the dict is mutated below
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
else:
    # Start from the pretrained weights named by `sample_from_base`, with
    # dropout disabled.
    config = dict(dropout=0.0)
    model = GPT.from_pretrained(sample_from_base, config)
    model.crop_block_size(128)
    # From here on `config` is the model's own configuration, which later
    # cells query with `.get(...)`.
    config = model.config
model.eval()
model = model.to(device)
if compile:
    print("compiling the model... (takes a ~minute)")
    model = torch.compile(model)

  from .autonotebook import tqdm as notebook_tqdm


loading weights from pretrained gpt: gpt2-large
total number of parameters: 772719360 learnable: 772719360


In [19]:
# Echo the model as the cell's last expression so the notebook shows its
# module tree (layer types and dimensions).
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-35): 36 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=1280, out_features=3840, bias=True)
          (c_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [20]:
# List the name of every parameter that still receives gradients.
print("--- learnable parameters ---")
for pn, p in model.named_parameters():
    if not p.requires_grad:
        continue  # frozen parameter: not listed
    print(pn)

--- learnable parameters ---
transformer.wte.weight
transformer.wpe.weight
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.attn.c_proj.bias
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.0.mlp.c_fc.weight
transformer.h.0.mlp.c_fc.bias
transformer.h.0.mlp.c_proj.weight
transformer.h.0.mlp.c_proj.bias
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.attn.c_proj.bias
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.1.mlp.c_fc.weight
transformer.h.1.mlp.c_fc.bias
transformer.h.1.mlp.c_proj.weight
transformer.h.1.mlp.c_proj.bias
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.attn.c_p

In [21]:
import tiktoken

# Stock GPT-2 BPE encoding; its pattern, merge ranks and special tokens are
# reused below as the base of the extended encoding.
gpt2 = tiktoken.get_encoding("gpt2")

# Special-token ids. 50256 is the end-of-text id used as the stop token during
# generation; the four custom tokens take the consecutive ids 50257..50260.
end_text_token = 50256
(
    start_input_token,
    end_input_token,
    concept_delimiter_token,
    pad_token,
) = range(50257, 50261)

# GPT-2 encoding extended with the custom control tokens. The base special
# tokens come first so a custom entry would win on a name clash.
enc = tiktoken.Encoding(
    name="gpt_modified",
    pat_str=gpt2._pat_str,
    mergeable_ranks=gpt2._mergeable_ranks,
    special_tokens={
        **gpt2._special_tokens,
        "<|start_of_input|>": start_input_token,
        "<|end_of_input|>": end_input_token,
        "<|concept_delimiter|>": concept_delimiter_token,
        "<|padding|>": pad_token,
    },
)

In [23]:
# Alternative prompt that uses the custom control tokens.
# NOTE(review): those tokens have ids above 50256, so this prompt presumably
# only works with a checkpoint whose embedding table includes them - confirm
# before using it with a stock pretrained model.
#start = "<|start_of_input|>mirzapur<|concept_delimiter|>traffic<|concept_delimiter|>late<|end_of_input|>"
start="a sentence using word morning and car is"
num_samples = 5  # number of independent continuations to draw
max_new_tokens = 50  # generation budget per sample
temperature = 1.0
top_k = 25

# Encode the prompt and add a leading batch dimension of size 1.
x = torch.tensor(
    enc.encode(
        start,
        allowed_special={
            "<|start_of_input|>",
            "<|end_of_input|>",
            "<|concept_delimiter|>",
        },
    ),
    dtype=torch.long,
    device=device,
)[None, ...]
# When the model config declares a prompt vocabulary, pass the ids
# 0..prompt_vocab_size-1 (batched) to generate(); otherwise pass None.
if config.get("prompt_vocab_size", 0) > 0:
    prompt = torch.arange(config["prompt_vocab_size"], dtype=torch.long, device=device)[
        None, ...
    ]
else:
    prompt = None
with torch.no_grad():
    for sample_idx in range(num_samples):
        with ctx:
            y = model.generate(
                x,
                max_new_tokens,
                temperature=temperature,
                top_k=top_k,
                end_token=end_text_token,
                prompt=prompt,
            )
        output = enc.decode(y[0].tolist())
        # Drop everything up to and including the FIRST occurrence of the
        # prompt. `split(start)[1]` would also cut the continuation at any
        # later verbatim repeat of the prompt, and would raise IndexError if
        # the decoded text did not contain the prompt at all; in that case the
        # full decoded text is printed instead.
        before_prompt, matched_prompt, continuation = output.partition(start)
        if matched_prompt:
            output = continuation
        print("-----", output+"\n")

-----  more difficult than a sentence using word evening and car

is more difficult than a sentence using word and is easier than two sentences using day and evening.

What is wrong?

If you are trying to learn how to say words,

-----  not an ordinary sentence. The English language does not allow that. The sentence must have a verb that begins with the form morning. The following sentence may be considered normal English: Morning, the air is still cool. To make up for this lack of

-----  to make it the first part of the sentence.

In this case the word morning is the past tense and the present perfect of morning .

In this case the word morning is the past tense and the present perfect of morning . In this

-----  a car in which we can make this statement about the morning. So we can make the car statement using the sentence with car and the car in the morning. If we try to make an automobile sentence where I am the car and the car in the

-----  "morning". But what if someone else s