In [1]:
import torch
from torch import tensor

In [2]:
# Read the entire contents of input.txt into a single string.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently (or
# fail to decode) on another machine.
# NOTE(review): assumes input.txt is UTF-8 (plain ASCII qualifies) - confirm.
with open("input.txt", encoding="utf-8") as file:
    text = file.read()

In [3]:
# vocabulary: every distinct character of the text, in sorted order
chars = list(set(text))
chars.sort()
vocab_size = len(chars)  # number of distinct characters

In [4]:
# Tokenizing means converting a string to a sequence of integers according to
# some vocabulary. Here the vocabulary is the individual characters in `chars`.
# Other schemes work on larger units:
#   - Google uses sentencepiece (sub-word units)
#   - OpenAI uses tiktoken (byte pair encoding)

# lookup tables: integer id -> character, and character -> integer id
itos = dict(enumerate(chars))
stoi = {char: i for i, char in itos.items()}


def encode(s):
    """Return the list of integer ids for the characters of the string `s`.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError if `l` contains an id that is not in `itos`.
    """
    return "".join(itos[i] for i in l)

In [5]:
# the whole text, encoded to integer ids and stored as a 64-bit integer tensor
data = torch.tensor(encode(text), dtype=torch.int64)

In [6]:
# Split the data into train and validation sets: the first 90% of the
# data is the training data, the remaining 10% is the validation data.
n = int(0.9 * len(data))  # index where the validation data starts
train_data = data[:n]
val_data = data[n:]

block_size = 8  # maximum context length to make predictions
batch_size = 4  # number of independent sequences to be processed in parallel


In [7]:
# y is x shifted one position to the right, so y[t] is the element that
# follows the prefix x[:t + 1] in the training data.
x = train_data[0:block_size]
y = train_data[1:1 + block_size]

# Print every (context, target) pair contained in this one chunk:
# contexts grow from length 1 up to block_size.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58
