In [1]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [2]:
# Hyperparameters
batch_size = 16
context_length = 16
d_model = 128 # the vector size of the token embedding
num_layers = 16 # num of Transformer blocks
num_heads = 8
lr = 1e-3
dropout = 0.1
max_iters = 5000
eval_intervals = 50 # How often to evaluate 
eval_iters = 20 # How many iterations to average the loss over when evaluating the model
device = 'cuda' if torch.cuda.is_available() else 'CPU'

TORCH_SEED = 1557
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x1f53b3e66d0>

In [3]:
if not os.path.exists("train-00000-of-00001-faeb732d85449c1e.parquet"):
    url = 'https://huggingface.co/datasets/chenqile09/tang-poems-with-keywords/resolve/main/data/train-00000-of-00001-faeb732d85449c1e.parquet'
    with open('train-00000-of-00001-faeb732d85449c1e.parquet', 'wb') as file:
        file.write(requests.get(url).content)

if not os.path.exists("test-00000-of-00001-94055845bc0c7e5e.parquet"):
    url = 'https://huggingface.co/datasets/chenqile09/tang-poems-with-keywords/resolve/main/data/test-00000-of-00001-94055845bc0c7e5e.parquet'
    with open('test-00000-of-00001-94055845bc0c7e5e.parquet', 'wb') as file:
        file.write(requests.get(url).content)

df_train = pd.read_parquet("train-00000-of-00001-faeb732d85449c1e.parquet")
df_test = pd.read_parquet("test-00000-of-00001-94055845bc0c7e5e.parquet")

In [4]:
with open('train.txt', 'w') as file:
    for text in df_train['paragraph']:
        file.write(str(text) + '\n')

with open('test.txt', 'w') as file:
    for text in df_test['paragraph']:
        file.write(str(text) + '\n')

with open('train.txt', 'r') as a:
    file1 = a.read()

with open('test.txt', 'r') as b:
    file2 = b.read()

train = file1+file2


In [5]:
# Tokenize the text
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_tokens = tokenizer.tokenize(train)
encoded_text = tokenizer.encode(train, return_tensors='pt').float()
#print(encoder_text)


Token indices sequence length is longer than the specified maximum sequence length for this model (1805012 > 512). Running this sequence through the model will result in indexing errors


In [6]:
print(len(encoded_text[0]))
print(encoded_text.size(1))
max_token = torch.max(encoded_text)
print(int(max_token.item()))
vocab_size = tokenizer.vocab_size
print(vocab_size)

1805012
1805012
13606
21128


In [39]:
# Tokenization
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(train)

print(len(tokenized_text))
print(len(set(tokenized_text)))
print(max(tokenized_text))

2800474
937
100179


In [7]:
# Split train and val
idx = int(len(encoded_text[0]) * 0.2)
train_data = encoded_text[0][idx:]
test_data = encoded_text[0][:idx]
print(train_data)
print("训练集样本数量:", train_data.size(0))
print("验证集样本数量:", test_data.size(0))

tensor([6205., 2126., 8024.,  ...,  740.,  511.,  102.])
训练集样本数量: 1444010
验证集样本数量: 361002


In [8]:
# embedding
idxs = torch.randint(low=0, high=train_data.size(0)-context_length, size=(batch_size, ))

x_batch = torch.stack([train_data[idx:idx + context_length] for idx in idxs])
y_batch = torch.stack([train_data[idx + 1:idx + context_length + 1] for idx in idxs])
print(x_batch.shape, y_batch.shape)

token_embedding_lookup_table = nn.Embedding(int(max_token.item()), d_model)

X = token_embedding_lookup_table(x_batch.long()).float()
Y = token_embedding_lookup_table(y_batch.long()).float()

torch.Size([16, 16]) torch.Size([16, 16])


In [9]:
# Position Embedding
position_embedding_lookup_table = torch.zeros(context_length, d_model)
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)

div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
position_embedding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_embedding_lookup_table[:, 1::2] = torch.cos(position * div_term)
position_embedding_lookup_table = position_embedding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1) # add batch to the first dimension

position_embedding_lookup_table.shape

# add
X = X + position_embedding_lookup_table 
Y = Y + position_embedding_lookup_table

In [10]:
# Attention
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

Q = Wq(X) # [16, 16, 128]
Q = Q.view(batch_size, -1, num_heads, d_model // num_heads).transpose(1, 2) # [16, 16, 8, 16] transpose to [16, 8, 16, 16]

K = Wk(X) 
K = K.view(batch_size, -1, num_heads, d_model // num_heads).transpose(1, 2)

V = Wv(X) 
V = V.view(batch_size, -1, num_heads, d_model // num_heads).transpose(1, 2)

attention_score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_model // num_heads)

# Mask
attention_score = attention_score.masked_fill(torch.triu(torch.ones(attention_score.shape[-2:]), diagonal=1).bool(), float('-inf'))

#softmax
attention_score = torch.softmax(attention_score, dim=-1)

# Calculate V Attention
A = torch.matmul(attention_score, V).transpose(1, 2).reshape(batch_size, -1, d_model) 

# Define the output weight matrix
Wo = nn.Linear(d_model, d_model)
output = Wo(A) # [batch_size, context_length, d_model]

print(output.shape)

torch.Size([16, 16, 128])


In [11]:
# Residual
output += X
layer_norm = nn.LayerNorm(d_model)
output1 = layer_norm(output)

In [12]:
# FFN
output = nn.Linear(d_model, d_model * 4)(output1)
output = nn.ReLU()(output)
output = nn.Linear(d_model * 4, d_model)(output)
output = torch.dropout(output, p=dropout, train=True)

output = output + output1
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

In [14]:
# output probability
logits = nn.Linear(d_model, int(max_token.item()))(output)
#print(pd.DataFrame(logits[0].detach().cpu().numpy()))
probabilities = torch.softmax(logits, dim=-1)
print(probabilities.shape)

torch.Size([16, 16, 13606])
