In [1]:
import os
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
from model import GPTConfig, GPT
from torch.nn import functional as F

### 一. config

In [2]:
out_dir = 'out-test'
eval_interval = 500
log_interval = 4
eval_iters = 10
always_save_checkpoint = False # if True, always save a checkpoint after each eval
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
# wandb logging
wandb_log = False # disabled by default
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt' # 'run' + str(time.time())
# data
dataset = 'shakespeare_char'
gradient_accumulation_steps = 4 # used to simulate larger batch sizes
batch_size = 16 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 256  #text
# 我很开心给
# 大家做饭<eos>

n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.1 # for pretraining 0 is good, for finetuning try 0.1+
bias = False # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
learning_rate = 1e-3 # max learning rate
max_iters = 5000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True # whether to decay the learning rate
warmup_iters = 100 # how many steps to warm up for
lr_decay_iters = 5000 # should be ~= max_iters per Chinchilla
min_lr = 1e-4 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc.
# system
device = 'cuda' 
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' 
compile = True # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
# exec(open('configurator.py').read()) # overrides from command line or config file
config = {k: globals()[k] for k in config_keys} # will be useful for logging

In [3]:
seed_offset = 0
# os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
# device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
device_type = 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [4]:
data_dir = os.path.join('data', dataset)
data_dir

'data/shakespeare_char'

In [5]:
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

found vocab_size = 65 (inside data/shakespeare_char/meta.pkl)


In [6]:
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=1024,
                  bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line
if meta_vocab_size is None:
    print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
gptconf = GPTConfig(**model_args)

### 二.load model

In [7]:
model = GPT(gptconf).to("cpu")
print(model)

number of parameters: 85.00M
GPT(
  (transformer): ModuleDict(
    (wte): Embedding(65, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=65, bias=False)
)


### 三 load data

In [8]:
X = torch.load('X.tensor').to('cpu')
Y = torch.load('Y.tensor').to('cpu')
print('X:',X.shape)
print('Y:',Y.shape)
print('X[0,:10]: ', X[0,:10])
print('Y[0,:10]: ', Y[0,:10])

X: torch.Size([16, 256])
Y: torch.Size([16, 256])
X[0,:10]:  tensor([ 1, 40, 43,  0, 42, 39, 51, 52, 43, 42])
Y[0,:10]:  tensor([40, 43,  0, 42, 39, 51, 52, 43, 42,  1])


In [9]:
# class GPTConfig:
batch_size = 16 
block_size = 1024  # lenght
n_layer = 2
n_head = 4
n_embd = 128

### 四 GPT forward

In [10]:
print(model.transformer.wpe)

Embedding(1024, 768)


In [11]:
device = "cpu"
b,t = X.size()
print("t:{} = block_size:{}".format(t, block_size))
pos = torch.arange(0, t, dtype=torch.long)
# forward the GPT model itself
tok_emb = model.transformer.wte(X) # token embeddings of shape (b, t, n_embd)
pos_emb = model.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
print('-----embding-input-------')
print('词嵌入向量维n_embd = ', n_embd)
print('tok_emb:', tok_emb.shape)
print('pos_emb:', pos_emb.shape)
print('tok_emb+pos_emb:', (tok_emb + pos_emb).shape)
x = model.transformer.drop(tok_emb + pos_emb) # pretraining 
x_enc = model.transformer.drop(tok_emb + pos_emb)
print('编码后embding input:', x.shape)
print('-----decoder-block-------')
print('n_layer:', n_layer)
print('decoder layers:', len(model.transformer.h))
i=0
for block in model.transformer.h:
    print(i)
    i+=1
    x = block(x)
    print('decoder x:', x.shape)
x = model.transformer.ln_f(x)
print('ln_f x:', x.shape)
print('-----lm_head-------')
logits = model.lm_head(x)
print('lm_head :', logits.shape)
print('lm_head输出与解码词汇量相同, meta_vocab_size=',meta_vocab_size )
print('-----loss-------')
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=-1)
print(loss)

t:256 = block_size:1024
-----embding-input-------
词嵌入向量维n_embd =  128
tok_emb: torch.Size([16, 256, 768])
pos_emb: torch.Size([256, 768])
tok_emb+pos_emb: torch.Size([16, 256, 768])
编码后embding input: torch.Size([16, 256, 768])
-----decoder-block-------
n_layer: 2
decoder layers: 12
0
decoder x: torch.Size([16, 256, 768])
1
decoder x: torch.Size([16, 256, 768])
2
decoder x: torch.Size([16, 256, 768])
3
decoder x: torch.Size([16, 256, 768])
4
decoder x: torch.Size([16, 256, 768])
5
decoder x: torch.Size([16, 256, 768])
6
decoder x: torch.Size([16, 256, 768])
7
decoder x: torch.Size([16, 256, 768])
8
decoder x: torch.Size([16, 256, 768])
9
decoder x: torch.Size([16, 256, 768])
10
decoder x: torch.Size([16, 256, 768])
11
decoder x: torch.Size([16, 256, 768])
ln_f x: torch.Size([16, 256, 768])
-----lm_head-------
lm_head : torch.Size([16, 256, 65])
lm_head输出与解码词汇量相同, meta_vocab_size= 65
-----loss-------
tensor(4.4475, grad_fn=<NllLossBackward0>)


In [12]:
print('-------------decoder block-------------')
decoder_block = model.transformer.h[0]
print(decoder_block)

x = x_enc
x_ln_1 = decoder_block.ln_1(x)
x_attn = decoder_block.attn(x_ln_1)
x_ln_2 = decoder_block.ln_2(x_attn)
x_mlp = decoder_block.mlp(x_ln_2)

print('layer norm :', x_ln_1.shape)
print('masked_self_attention :', x_attn.shape)
print('layer norm :', x_ln_2.shape)
print('mlp :', x_mlp.shape)

x = x + decoder_block.attn(decoder_block.ln_1(x))
x = x + decoder_block.mlp(decoder_block.ln_2(x))

-------------decoder block-------------
Block(
  (ln_1): LayerNorm()
  (attn): CasualSelfAttention(
    (c_attn): Linear(in_features=768, out_features=2304, bias=False)
    (c_proj): Linear(in_features=768, out_features=768, bias=False)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm()
  (mlp): MLP(
    (c_fc): Linear(in_features=768, out_features=3072, bias=False)
    (gelu): GELU(approximate='none')
    (c_proj): Linear(in_features=3072, out_features=768, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)


layer norm : torch.Size([16, 256, 768])
masked_self_attention : torch.Size([16, 256, 768])
layer norm : torch.Size([16, 256, 768])
mlp : torch.Size([16, 256, 768])


In [None]:
# Masked Self Attention
attention = model.transformer.h[0].attn
print(attention)
print("如果torch>2.0.0, 是否可直接使用scaled_dot_product_attention：",attention.flash)

x = x_ln_1
B, T, C = x.size()
print("batch:{}, block:{}, embed:{}, ".format(B, T, C))

# self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
print('---------------1. 将嵌入向量传播成3*n_embd--------------')
x_liner = attention.c_attn(x)
print("n_embed:",attention.n_embd)
print("n_embed*3:",attention.n_embd*3)
print("x_liner:", x_liner.shape)

print('---------------2. 将3*n_embd split成QKV--------------')
q, k, v  = x_liner.split(attention.n_embd, dim=2)
print("split: q:", q.shape)

print('---------------3. 将QKV拆分 多头QKV--------------')
print("n_embed:{} / n_head:{} = {} ".format(C, attention.n_head, C//attention.n_head))
k = k.view(B, T, attention.n_head, C // attention.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, attention.n_head, C // attention.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, attention.n_head, C // attention.n_head).transpose(1, 2) # (B, nh, T, hs)
print("q:",q.shape)
print("Q = batch:{}, n_head:{}, block:{}, head_embed:{} ".format(B, n_head, T, C//attention.n_head))


print('---------------4.多头计算attention，直接使用torch function--------------')
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, 
                                                     dropout_p=attention.dropout 
                                                     if attention.training 
                                                     else 0, is_causal=True)
print('y:', y.shape)


print('---------------5.将多头注意力结果拼接--------------')
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
print('y-concat:', y.shape)


print('---------------6. 增加一次前向传播--------------')
y = attention.resid_dropout(attention.c_proj(y))
print("y_proj:", y.shape )


CasualSelfAttention(
  (c_attn): Linear(in_features=768, out_features=2304, bias=False)
  (c_proj): Linear(in_features=768, out_features=768, bias=False)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)
如果torch>2.0.0, 是否可直接使用scaled_dot_product_attention： True
batch:16, block:256, embed:768, 
---------------1. 将嵌入向量传播成3*n_embd--------------
n_embed: 768
n_embed*3: 2304
x_liner: torch.Size([16, 256, 2304])
---------------2. 将3*n_embd split成QKV--------------
split: q: torch.Size([16, 256, 768])
---------------3. 将QKV拆分 多头QKV--------------
n_embed:768 / n_head:12 = 64 
q: torch.Size([16, 12, 256, 64])
Q = batch:16, n_head:4, block:256, head_embed:64 
---------------4.多头计算attention，直接使用torch function--------------
y: torch.Size([16, 12, 256, 64])
---------------5.将多头注意力结果拼接--------------
y-concat: torch.Size([16, 256, 768])
---------------6. 增加一次前向传播--------------
y_proj: torch.Size([16, 256, 768])


In [14]:
masked_matrix = torch.tril(torch.ones(T,T)).view(1, 1, T, T)
print(masked_matrix[:10,:10])

tensor([[[[1., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          ...,
          [1., 1., 1.,  ..., 1., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]])


In [15]:
# 不使用torch2.0 attention计算
print('5行代码实现多头注意力计算')
# 1. scale and dot product process
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
print("q score:", q.shape)
print("k score:", k.shape)
print("k_t score:", k.transpose(-2, -1).shape)
print("q @ k_t score:", (q@k.transpose(-2, -1)).shape)
print("attn score:", att.shape)
# 2. Mask为下三角矩阵
att = att.masked_fill(masked_matrix == 0, float('-inf'))
# 3. softmax
att = F.softmax(att, dim=-1)
# 4. attn
att = attention.attn_dropout(att)
# 5. score
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
print("attn score:", y.shape)

5行代码实现多头注意力计算
q score: torch.Size([16, 12, 256, 64])
k score: torch.Size([16, 12, 256, 64])
k_t score: torch.Size([16, 12, 64, 256])
q @ k_t score: torch.Size([16, 12, 256, 256])
attn score: torch.Size([16, 12, 256, 256])
attn score: torch.Size([16, 12, 256, 64])


In [16]:
# mlp实现
mlp = model.transformer.h[0].mlp
print(mlp)
print("x:",x.shape)
x = mlp.c_fc(x)
print("x_fc:",x.shape)
x = mlp.gelu(x)
x = mlp.c_proj(x)
print("x_proj:",x.shape)
x = mlp.dropout(x)

MLP(
  (c_fc): Linear(in_features=768, out_features=3072, bias=False)
  (gelu): GELU(approximate='none')
  (c_proj): Linear(in_features=3072, out_features=768, bias=False)
  (dropout): Dropout(p=0.1, inplace=False)
)
x: torch.Size([16, 256, 768])
x_fc: torch.Size([16, 256, 3072])
x_proj: torch.Size([16, 256, 768])


In [17]:
# layer normalization
ln = model.transformer.h[0].ln_1
print(x.shape)
F.layer_norm(x, ln.weight.shape, ln.weight, ln.bias, 1e-5)
print(x.shape)

torch.Size([16, 256, 768])
torch.Size([16, 256, 768])


### 五. generate by gpt

In [18]:
# Greedy Generation
idx = X[1,:10].reshape(1, 10) # generate 1
print("预测词表大小", model.config.vocab_size) # BPE词表
print("输入X:", idx)
print("输入X长度:", idx.shape)
for _ in range(2):
    logits, _ = model(idx)
    print('输出Logits:', logits.shape)     
    probs = F.softmax(logits, dim=-1)
    print('输出Probs:', probs.shape)
    print(probs)
    idx_next = torch.argmax(probs , dim=2) 
    print('输出下一个Token:', idx_next.shape)
    print(idx_next)
    idx = torch.cat((idx, idx_next), dim=1)
    # idx = idx_next
    print("当前的token长度:", len(idx[0]))
    print("当前的token序列:", idx)

预测词表大小 65
输入X: tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1]])
输入X长度: torch.Size([1, 10])
输出Logits: torch.Size([1, 1, 65])
输出Probs: torch.Size([1, 1, 65])
tensor([[[0.0053, 0.0268, 0.0201, 0.0132, 0.0070, 0.0154, 0.0224, 0.0134,
          0.0132, 0.0222, 0.0237, 0.0077, 0.0060, 0.0079, 0.0111, 0.0130,
          0.0067, 0.0166, 0.0205, 0.0240, 0.0371, 0.0167, 0.0377, 0.0282,
          0.0083, 0.0087, 0.0166, 0.0149, 0.0058, 0.0213, 0.0206, 0.0125,
          0.0538, 0.0059, 0.0346, 0.0130, 0.0101, 0.0138, 0.0098, 0.0278,
          0.0125, 0.0158, 0.0063, 0.0229, 0.0186, 0.0097, 0.0064, 0.0060,
          0.0047, 0.0179, 0.0181, 0.0176, 0.0072, 0.0140, 0.0082, 0.0052,
          0.0084, 0.0115, 0.0072, 0.0160, 0.0185, 0.0136, 0.0186, 0.0144,
          0.0070]]], grad_fn=<SoftmaxBackward0>)
输出下一个Token: torch.Size([1, 1])
tensor([[32]])
当前的token长度: 11
当前的token序列: tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1, 32]])
输出Logits: torch.Size([1, 1, 65])
输出Probs: torch.Size([1, 1, 65])
tensor([

In [19]:
torch.set_printoptions(precision=4)
torch.set_printoptions(sci_mode=False)
torch.set_printoptions(linewidth=100)

In [20]:
# model.generate()
# top-k GPT-2
idx = X[1,:10].reshape(1, 10) # generate 1
print(idx.shape)
temperature = 1.0
top_k = 5
print(model.config.vocab_size) # BPE词表
print("prompt:", idx)

#top k的方法
# 词表 65
# llama 32000

for _ in range(10):
    idx_cond = idx if idx.size(1) <= model.config.block_size else idx[:, -model.config.block_size:]
    print('idx_cond:', idx_cond.shape)
    
    logits, _ = model(idx_cond)
    print('logits:', logits.shape) 
    
    print("no - 1",logits[:, -1, :].shape)##为什么要-1来降维
    
    logits = logits[:, -1, :] / temperature #平缓， tips : 知识蒸馏[温度]
    print('logits:', logits.shape)
    
    if top_k is not None:
        v, i = torch.topk(logits, min(top_k, logits.size(-1)))
        print('top_k v:', v.shape)
        print('i', i)
        
        logits[logits < v[:, [-1]]] = -float('Inf')
        print('top_k logits:', logits.shape)
        print(logits)
        
    probs = F.softmax(logits, dim=-1)
    print('probs :', probs.shape)
    print(probs)
    
    idx_next = torch.multinomial(probs, num_samples=1) #  num_samples-by-num_samples
    print('idx_next :', idx_next.shape)
    print(idx_next)
    
    idx = torch.cat((idx, idx_next), dim=1)
    print(idx.shape)
    
    print("generate: length :", len(idx[0]))
    print("generate:", idx)

torch.Size([1, 10])
65
prompt: tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1]])
idx_cond: torch.Size([1, 10])
logits: torch.Size([1, 1, 65])
no - 1 torch.Size([1, 65])
logits: torch.Size([1, 65])
top_k v: torch.Size([1, 5])
i tensor([[ 1,  6, 50, 44, 34]])
top_k logits: torch.Size([1, 65])
tensor([[  -inf, 0.9813,   -inf,   -inf,   -inf,   -inf, 0.9209,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf, 0.6405,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
         0.7118,   -inf,   -inf,   -inf,   -inf,   -inf, 0.7409,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf]],
       grad_fn=<IndexPutBackward0>)
probs : torch.Size([1, 65])
tensor([[0.0000, 0.2379, 0.0000, 0.0000, 0.0000, 0.0000, 0.2240, 0.

In [28]:
# sampling
dict_map = {0:"l", 1:"j", 2:"h"}
prob = torch.tensor([0.3, 0.3, 0.0])
print("prob : ",prob)

print("根据概率权重所选择next token的下标")
for i in range(10):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:", 
          dict_map[int(next_token)])

prob :  tensor([0.3000, 0.3000, 0.0000])
根据概率权重所选择next token的下标
第0次sample的next_token下标为0: l
第1次sample的next_token下标为0: l
第2次sample的next_token下标为1: j
第3次sample的next_token下标为0: l
第4次sample的next_token下标为0: l
第5次sample的next_token下标为1: j
第6次sample的next_token下标为1: j
第7次sample的next_token下标为0: l
第8次sample的next_token下标为1: j
第9次sample的next_token下标为1: j


In [29]:
# Do Sample with temparature
temparature = 2.0
dict_map = {0:"l", 1:"j", 2:"h"}
prob = torch.tensor([0.7, 0.2, 0.1])
print("prob : ",prob)
prob /= temparature
print("prob/T : ", prob)
prob = F.softmax(prob)
print("softmax(prob/T) : ", prob)

print("根据概率权重所选择next token的下标")
for i in range(10):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:", 
          dict_map[int(next_token)])

prob :  tensor([0.7000, 0.2000, 0.1000])
prob/T :  tensor([0.3500, 0.1000, 0.0500])
softmax(prob/T) :  tensor([0.3969, 0.3091, 0.2940])
根据概率权重所选择next token的下标
第0次sample的next_token下标为2: h
第1次sample的next_token下标为1: j
第2次sample的next_token下标为0: l
第3次sample的next_token下标为2: h
第4次sample的next_token下标为0: l
第5次sample的next_token下标为2: h
第6次sample的next_token下标为1: j
第7次sample的next_token下标为1: j
第8次sample的next_token下标为0: l
第9次sample的next_token下标为0: l


  prob = F.softmax(prob)


In [31]:
# top-k
temparature = 2.0
top_k = 2
dict_map = {0:"l", 1:"j", 2:"h"}
prob = torch.tensor([0.7, 0.2, 0.1])
print("prob : ",prob)
prob /= temparature
print("prob/T : ", prob)
prob = F.softmax(prob)
print("softmax(prob/T) : ", prob)
prob, _ = torch.topk(prob, top_k)
print("top-k:", prob)
prob = F.softmax(prob)
print("top-k softmax:", prob)
print("根据概率权重所选择next token的下标")
for i in range(10):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:", 
          dict_map[int(next_token)])

prob :  tensor([0.7000, 0.2000, 0.1000])
prob/T :  tensor([0.3500, 0.1000, 0.0500])
softmax(prob/T) :  tensor([0.3969, 0.3091, 0.2940])
top-k: tensor([0.3969, 0.3091])
top-k softmax: tensor([0.5219, 0.4781])
根据概率权重所选择next token的下标
第0次sample的next_token下标为1: j
第1次sample的next_token下标为1: j
第2次sample的next_token下标为1: j
第3次sample的next_token下标为1: j
第4次sample的next_token下标为0: l
第5次sample的next_token下标为0: l
第6次sample的next_token下标为0: l
第7次sample的next_token下标为0: l
第8次sample的next_token下标为1: j
第9次sample的next_token下标为0: l


  prob = F.softmax(prob)
  prob = F.softmax(prob)


In [40]:
# repeatition penalty
idx = X[1,:10].reshape(1, 10) 
print(idx)
penalty = 2.0
for _ in range(256):
    logits,_ = model(idx)
    logits = logits[:,-1,:]
    origin_logits = logits.clone()

    print(logits)
    # repetition penalty
    logits_idx = torch.gather(logits,1,idx)
    logits_idx = torch.where(logits_idx < 0,logits_idx * penalty,logits_idx / penalty)
    logits = logits.scatter_(1,idx,logits_idx)
    print(logits)

    probs = F.softmax(logits, dim=-1)
    idx_next = torch.multinomial(probs, num_samples=1)
    idx = torch.cat((idx, idx_next), dim=1)
    break
origin_logits - logits

tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1]])
tensor([[-0.7929,  0.5534, -0.1229,  0.3507, -1.3380,  0.4355,  0.4515, -0.1907, -0.8722,  1.0584,
         -0.1178, -0.4903, -0.2119, -0.6337,  0.4863, -0.4952,  0.0203,  0.8240, -0.1047,  1.1403,
          0.2497, -0.2922,  0.2068,  0.2550,  0.1722, -0.0060,  0.5095, -0.0022, -0.0195,  0.4862,
          0.9912, -0.7361,  0.7733, -1.0519,  0.2914,  0.2043, -0.1635,  0.2698, -0.5187,  0.2467,
         -0.2317,  0.5831, -0.4360,  0.6017, -0.1734, -0.1365, -0.3749,  0.3584, -1.1514,  0.1052,
          0.3275, -0.7800, -0.5172,  0.2508, -0.5244, -0.3392, -0.0874,  0.2741, -0.7231,  0.1629,
         -0.1338,  0.5089,  0.7228,  0.2831, -0.6781]], grad_fn=<SliceBackward0>)
tensor([[-1.5857,  0.2767, -0.1229,  0.3507, -1.3380,  0.4355,  0.4515, -0.1907, -0.8722,  1.0584,
         -0.1178, -0.4903, -0.2119, -1.2675,  0.4863, -0.4952,  0.0203,  0.8240, -0.1047,  1.1403,
          0.2497, -0.2922,  0.2068,  0.2550,  0.1722, -0.0060,  0.2547, -0.

tensor([[0.7929, 0.2767, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.6337, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.2547, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.4360, 0.3008,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5172, 0.1254, 0.0000,
         0.0000, 0.0874, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<SubBackward0>)

In [41]:
# sampling
dict_map = {0:"l", 1:"j", 2:"h"}
prob = torch.tensor([0.3, 0.3, 0])
print("prob : ",prob)

print("根据概率权重所选择next token的下标")
for i in range(100):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:", 
          dict_map[int(next_token)])

prob :  tensor([0.3000, 0.3000, 0.0000])
根据概率权重所选择next token的下标
第0次sample的next_token下标为0: l
第1次sample的next_token下标为1: j
第2次sample的next_token下标为0: l
第3次sample的next_token下标为0: l
第4次sample的next_token下标为1: j
第5次sample的next_token下标为0: l
第6次sample的next_token下标为0: l
第7次sample的next_token下标为1: j
第8次sample的next_token下标为0: l
第9次sample的next_token下标为0: l
第10次sample的next_token下标为0: l
第11次sample的next_token下标为1: j
第12次sample的next_token下标为1: j
第13次sample的next_token下标为0: l
第14次sample的next_token下标为1: j
第15次sample的next_token下标为1: j
第16次sample的next_token下标为0: l
第17次sample的next_token下标为1: j
第18次sample的next_token下标为1: j
第19次sample的next_token下标为0: l
第20次sample的next_token下标为1: j
第21次sample的next_token下标为1: j
第22次sample的next_token下标为0: l
第23次sample的next_token下标为1: j
第24次sample的next_token下标为1: j
第25次sample的next_token下标为0: l
第26次sample的next_token下标为0: l
第27次sample的next_token下标为1: j
第28次sample的next_token下标为1: j
第29次sample的next_token下标为0: l
第30次sample的next_token下标为0: l
第31次sample的next_token下标为0: l
第32次sample的next_to