## 无标记数据的预训练

In [None]:
from importlib.metadata import version

# Packages whose installed versions are reported below.
pkgs = [
	"matplotlib",
	"numpy",
	"tiktoken",
	"torch",
	"tensorflow",  # For OpenAI's pretrained weights
]
for p in pkgs:
	print(f"{p} version: {version(p)}")

import torch

# Report the torch build string and whether CUDA is available.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

matplotlib version: 3.4.3
numpy version: 1.22.4
tiktoken version: 0.7.0
torch version: 2.4.1
tensorflow version: 2.13.1
2.4.1+cu121
True


### 5.1 Evaluating generative text models

#### 5.1.1 GPT生成文本

In [None]:
# No KV cache is used yet

import torch
from previous_chapters import GPTModel

# Hyperparameters handed to GPTModel below.
GPT_CONFIG_124M = dict(
	vocab_size=50257,    # Vocabulary size
	context_length=256,  # Shortened context length (orig: 1024)
	emb_dim=768,         # Embedding dimension
	n_heads=12,          # Number of attention heads
	n_layers=12,         # Number of layers
	drop_rate=0.1,       # Dropout rate
	qkv_bias=False,      # Query-key-value bias
)

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Disable dropout during inference. The mode switch is global: it also affects
# every other cell using `model`, until model.train() is called.
model.eval();

In [None]:
import tiktoken
from previous_chapters import generate_text_simple

def text_to_token(text, tokenizer):
	"""Encode ``text`` into a single-row batch of token IDs.

	Args:
		text: The string to tokenize.
		tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
			returns a list of integer token IDs (this notebook passes the
			tiktoken "gpt2" encoding).

	Returns:
		An integer tensor of shape ``(1, num_tokens)``: the encoded IDs with a
		leading batch dimension added.
	"""
	# The special token must be spelled in full, including the closing ">".
	# A truncated name matches no special token, so text containing the
	# end-of-text marker would not be allowed through.
	encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
	# Pin the dtype so an empty encoding still yields integer IDs rather than
	# torch's default float tensor.
	encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
	return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
	"""Decode a tensor of token IDs back into a string.

	A leading dimension of size 1 (the batch dimension) is squeezed away
	before decoding; a tensor whose first dimension is larger is passed to
	``tokenizer.decode`` unchanged (as a Python list).
	"""
	return tokenizer.decode(token_ids.squeeze(0).tolist())

# Tokenizer first, then the prompt it will encode.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Extend the prompt by 10 tokens, using the context length from the same
# config the model was built with.
token_ids = generate_text_simple(
	model=model,
	idx=text_to_token(start_context, tokenizer),
	context_size=GPT_CONFIG_124M["context_length"],
	max_new_tokens=10,
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


#### 5.1.2 loss 交叉熵 perplexity

In [7]:
# Token IDs of the two example prompts, one row per text.
inputs = torch.tensor(
	[
		[16833, 3626, 6100],  # "every effort moves"
		[40, 1107, 588],      # "I really like"
	]
)

# Expected continuations: each row is the matching input row shifted by one token.
targets = torch.tensor(
	[
		[3626, 6100, 345],    # " effort moves you"
		[1107, 588, 11311],   # " really like chocolate"
	]
)

In [None]:
# Forward pass without gradient tracking, then softmax turns the logits into probabilities
with torch.no_grad():
	logits = model(inputs)

probas = logits.softmax(dim=-1)
print(probas.shape)
# For each text in the batch and each token position, the last dimension has
# one entry per vocabulary token (50257); each entry is that token's probability.

torch.Size([2, 3, 50257])


- 将概率转回文本

In [10]:
# Most probable token ID at every position; keepdim retains the trailing dimension.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [12]:
# Decode the first text three ways: the input tokens, the target tokens, and
# the argmax token IDs predicted by the model (flattened to 1-D before decoding).
print(f"Inputs batch 1: {token_ids_to_text(inputs[0], tokenizer)}")
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Inputs batch 1: every effort moves
Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [27]:
# Probability the model assigns to each correct target token of the first text.
# Writing `:` in place of [0, 1, 2] would not work: the slice is combined with
# every entry of targets[text_idx], giving a 3x3 result (all three target
# tokens' probabilities at every position) instead of three values.
text_idx = 0
target_probas_1 = probas[text_idx][[0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)

# Same lookup for the second text.
text_idx = 1
target_probas_2 = probas[text_idx][[0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

# Shape check: selecting the rows via [0, 1, 2] or via `:` alone leaves the shape unchanged.
a = probas[text_idx]
b = a[[0, 1, 2]]
c = a[:]
print(a.shape, b.shape, c.shape, sep="\n")

Text 1: tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])
torch.Size([3, 50257])
torch.Size([3, 50257])
torch.Size([3, 50257])


- 目标是令以上各目标 token 的概率（每段文本三个值）尽可能接近 1

In [29]:
# Take the log of every target-token probability (both texts concatenated)
log_probas = torch.cat([target_probas_1, target_probas_2]).log()
print(log_probas)

# Average over all tokens
avg_log_probas = log_probas.mean()
print(avg_log_probas)

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])
tensor(-10.7940)


- 目标变为 令平均对数的值尽可能的大 上限为0

- 乘以 $-1$ 后，目标变为令该值尽可能小（下限为 0）


In [30]:
# Flip the sign so the quantity becomes something to minimize.
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(10.7940)


- 交叉熵 流程

model+softmax求概率 $\rightarrow$ 取出目标的概率 $\rightarrow$ 取对数 $\rightarrow$ 求平均 $\rightarrow$ *-1

In [31]:
# Logits: (batch_size, num_tokens, vocab_size)
print("Logits shape:", logits.size())

# Targets: (batch_size, num_tokens)
print("Targets shape:", targets.size())

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [None]:
# Merge the batch and token dimensions so that each row is one prediction
# and each target is a single class index.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
targets_flat = torch.flatten(targets)

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [34]:
# Built-in cross entropy on the flattened logits and targets; its printed value
# matches the negative average log-probability computed by hand earlier.
loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print(loss)

tensor(10.7940)


- 困惑度（perplexity）是交叉熵的指数（exponential）

- perplexity = exp(交叉熵)

In [35]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = loss.exp()
print(perplexity)

tensor(48725.8203)


#### 5.1.3 training validation set loss