In [35]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [36]:
# Hyperparameters
batch_size = 16
context_length = 16
d_model = 128 # the vector size of the token embedding
num_layers = 16 # num of Transformer blocks
num_heads = 8
lr = 1e-3
dropout = 0.1
max_iters = 5000
eval_intervals = 50 # How often to evaluate 
eval_iters = 20 # How many iterations to average the loss over when evaluating the model
device = 'cuda' if torch.cuda.is_available() else 'CPU'

TORCH_SEED = 1557
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x20d509fbf70>

In [37]:
if not os.path.exists("train-00000-of-00001-faeb732d85449c1e.parquet"):
    url = 'https://huggingface.co/datasets/chenqile09/tang-poems-with-keywords/resolve/main/data/train-00000-of-00001-faeb732d85449c1e.parquet'
    with open('train-00000-of-00001-faeb732d85449c1e.parquet', 'wb') as file:
        file.write(requests.get(url).content)

if not os.path.exists("test-00000-of-00001-94055845bc0c7e5e.parquet"):
    url = 'https://huggingface.co/datasets/chenqile09/tang-poems-with-keywords/resolve/main/data/test-00000-of-00001-94055845bc0c7e5e.parquet'
    with open('test-00000-of-00001-94055845bc0c7e5e.parquet', 'wb') as file:
        file.write(requests.get(url).content)

df_train = pd.read_parquet("train-00000-of-00001-faeb732d85449c1e.parquet")
df_test = pd.read_parquet("test-00000-of-00001-94055845bc0c7e5e.parquet")

In [38]:
with open('train.txt', 'w') as file:
    for text in df_train['paragraph']:
        file.write(str(text) + '\n')

with open('test.txt', 'w') as file:
    for text in df_test['paragraph']:
        file.write(str(text) + '\n')

with open('train.txt', 'r') as a:
    file1 = a.read()

with open('test.txt', 'r') as b:
    file2 = b.read()

train = file1+file2


In [39]:
# Tokenization
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(train)

print(len(tokenized_text))
print(len(set(tokenized_text)))
print(max(tokenized_text))

2800474
937
100179


In [40]:
# Split train and val
idx = int(len(tokenized_text)*0.2)
train_data = torch.tensor(tokenized_text[:idx])
test_data = torch.tensor(tokenized_text[idx:])
train_data

tensor([68464,  8676,   104,  ..., 70349, 47770,  3922])

In [53]:
# embedding
idxs = torch.randint(low=0, high=len(train_data)-context_length, size=(batch_size, ))

x_batch = torch.stack([train_data[idx:idx + context_length] for idx in idxs])
y_batch = torch.stack([train_data[idx + 1:idx + context_length + 1] for idx in idxs])
print(x_batch.shape, y_batch.shape)

token_embedding_lookup_table = nn.Embedding(max(tokenized_text), d_model)

X = token_embedding_lookup_table(x_batch)
Y = token_embedding_lookup_table(y_batch)

torch.Size([16, 16]) torch.Size([16, 16])


In [54]:
# Position Embedding
position_embedding_lookup_table = torch.zeros(context_length, d_model)
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)

div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
position_embedding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_embedding_lookup_table[:, 1::2] = torch.cos(position * div_term)
position_embedding_lookup_table = position_embedding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1) # add batch to the first dimension

position_embedding_lookup_table.shape

# add
X = X + position_embedding_lookup_table 
Y = Y + position_embedding_lookup_table

In [59]:
# Attention
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

Q = Wq(X) # [16, 16, 128]
Q = Q.view(batch_size, -1, num_heads, d_model // num_heads).transpose(1, 2) # [16, 16, 8, 16] transpose to [16, 8, 16, 16]

K = Wk(X) 
K = K.view(batch_size, -1, num_heads, d_model // num_heads).transpose(1, 2)

V = Wv(X) 
V = V.view(batch_size, -1, num_heads, d_model // num_heads).transpose(1, 2)

attention_score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_model // num_heads)

# Mask
attention_score = attention_score.masked_fill(torch.triu(torch.ones(attention_score.shape[-2:]), diagonal=1).bool(), float('-inf'))

#softmax
attention_score = torch.softmax(attention_score, dim=-1)

# Calculate V Attention
A = torch.matmul(attention_score, V).transpose(1, 2).reshape(batch_size, -1, d_model) 

# Define the output weight matrix
Wo = nn.Linear(d_model, d_model)
output = Wo(A) # [batch_size, context_length, d_model]

print(output.shape)

torch.Size([16, 16, 128])


In [61]:
# Residual
output += X
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

In [63]:
# FFN
output = nn.Linear(d_model, d_model * 4)(output)
output = nn.ReLU()(output)
output = nn.Linear(d_model * 4, d_model)(output)
output = torch.dropout(output, p=dropout, train=True)

output = output + X
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

In [67]:
# output probability
logits = nn.Linear(d_model, max(tokenized_text))(output)
#print(pd.DataFrame(logits[0].detach().cpu().numpy()))
probabilities = torch.softmax(logits, dim=-1)
# probabilities

tensor([[[5.0206e-06, 9.8759e-06, 5.5716e-06,  ..., 6.2844e-06,
          1.2865e-05, 1.0489e-05],
         [1.6787e-06, 6.5691e-06, 3.9025e-06,  ..., 4.0115e-06,
          1.6302e-05, 1.4734e-05],
         [9.1733e-06, 5.1127e-06, 1.3115e-05,  ..., 8.3735e-06,
          8.0468e-06, 4.0596e-06],
         ...,
         [2.3865e-05, 4.9828e-06, 6.5063e-06,  ..., 9.5641e-06,
          1.3973e-05, 5.6392e-06],
         [8.0500e-06, 7.5009e-06, 8.8509e-06,  ..., 8.3006e-06,
          1.5445e-05, 1.4808e-05],
         [9.3619e-06, 1.4544e-05, 7.1078e-06,  ..., 8.7869e-06,
          6.5119e-06, 1.0049e-05]],

        [[2.6088e-06, 5.6392e-06, 1.4553e-05,  ..., 5.1348e-06,
          2.0824e-05, 6.2696e-06],
         [7.9053e-06, 6.4026e-06, 3.9326e-06,  ..., 5.2236e-06,
          1.6211e-05, 1.2854e-05],
         [9.5416e-06, 7.0024e-06, 1.0891e-05,  ..., 2.7811e-05,
          1.0922e-05, 5.6386e-06],
         ...,
         [5.5527e-06, 5.8160e-06, 7.0172e-06,  ..., 5.0559e-06,
          5.995