In [13]:
import torch
import torch.nn.functional as F
from json import dump

In [14]:
# Read the whole corpus into memory. The context manager closes the file
# handle, and the explicit encoding keeps decoding independent of the
# platform default (the book contains non-ASCII characters such as curly
# quotes, and starts with a byte-order mark).
with open("GreatExpectations.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# quick look at the beginning of the raw text
print(text[:264])

The Project Gutenberg eBook of Great Expectations
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the 


In [15]:
# list every distinct character that occurs in the text, in sorted order
print(sorted({*text}))

['\t', '\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ê', 'ô', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']


In [16]:
unwanted = ['\t', '\n', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', ':', ';', '?', '[', ']', '_', 'ê', 'ô', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']

# For this toy example every unwanted character is turned into a space in a
# single pass; a real LLM would keep the punctuation marks intact.
text = text.translate(dict.fromkeys(map(ord, unwanted), " "))

# collapse runs of spaces until only single spaces are left
while text.find("  ") != -1:
    text = text.replace("  ", " ")

print(sorted(set(text)))

[' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [17]:
# show the first 500 characters of the text
print(text[0:500])

 The Project Gutenberg eBook of Great Expectations This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever You may copy it give it away or re use it under the terms of the Project Gutenberg License included with this ebook or online at www gutenberg org If you are not located in the United States you will have to check the laws of the country where you are located before using this eBook Title Great


In [18]:
# the vocabulary is every distinct character of the text, in sorted order
vocab = sorted(set(text))
vocab_size = len(vocab)

# itos maps index -> character; stoi is its inverse, character -> index
itos = dict(enumerate(vocab))
stoi = {ch: idx for idx, ch in itos.items()}

print(stoi)

{' ': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'A': 11, 'B': 12, 'C': 13, 'D': 14, 'E': 15, 'F': 16, 'G': 17, 'H': 18, 'I': 19, 'J': 20, 'K': 21, 'L': 22, 'M': 23, 'N': 24, 'O': 25, 'P': 26, 'Q': 27, 'R': 28, 'S': 29, 'T': 30, 'U': 31, 'V': 32, 'W': 33, 'X': 34, 'Y': 35, 'Z': 36, 'a': 37, 'b': 38, 'c': 39, 'd': 40, 'e': 41, 'f': 42, 'g': 43, 'h': 44, 'i': 45, 'j': 46, 'k': 47, 'l': 48, 'm': 49, 'n': 50, 'o': 51, 'p': 52, 'q': 53, 'r': 54, 's': 55, 't': 56, 'u': 57, 'v': 58, 'w': 59, 'x': 60, 'y': 61, 'z': 62}


In [19]:
# Encode the text once, then pair every character id with the id that follows
# it: the inputs are all ids but the last, the targets all ids but the first.
token_ids = [stoi[ch] for ch in text]
xs = torch.tensor(token_ids[:-1])
ys = torch.tensor(token_ids[1:])

print("length of xs:", xs.nelement())
print("xs:", xs)
print("ys:", ys)

length of xs: 970256
xs: tensor([ 0, 30, 44,  ..., 51, 47, 55])
ys: tensor([30, 44, 41,  ..., 47, 55,  0])


In [20]:
# seeded generator so the weight initialisation is reproducible
g = torch.Generator()
g.manual_seed(64)

# initial weights: row = current character, column = candidate next character
W = torch.randn(vocab_size, vocab_size, generator=g, requires_grad=True)

# forward pass
xenc = F.one_hot(xs, num_classes=vocab_size).to(torch.float32)  # one-hot input rows
logits = torch.matmul(xenc, W)  # treated as log-counts
counts = torch.exp(logits)  # exponentiate to obtain positive "counts"
probs = counts / counts.sum(dim=1, keepdim=True)  # normalize each row (softmax)
# mean negative log-probability assigned to the true next characters
loss = -torch.log(probs[torch.arange(ys.shape[0]), ys]).mean()

loss.item()

4.528252124786377

In [21]:
# backward pass
W.grad = None  # drop any previously accumulated gradient
loss.backward()

# One gradient-descent step. The in-place update runs under no_grad so that
# autograd does not record it, rather than going through the discouraged
# `.data` attribute.
with torch.no_grad():
    W -= 0.1 * W.grad

In [22]:
epochs = 250

# The one-hot inputs do not change between epochs, so encode them once
# instead of rebuilding the same matrix on every iteration.
xenc = F.one_hot(xs, num_classes=vocab_size).float()

for i in range(epochs):
    # forward pass: logits -> exponentiated counts -> row-normalized probabilities
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # mean negative log-probability of the true next characters
    loss = -probs[torch.arange(len(ys)), ys].log().mean()

    # backward pass
    W.grad = None  # drop the gradient of the previous epoch
    loss.backward()
    # gradient-descent step, excluded from autograd tracking (replaces `.data`)
    with torch.no_grad():
        W -= 10 * W.grad

    # report progress every 50 epochs
    if i % 50 == 0:
        print(loss.item())

# note: this is the loss measured just before the last weight update
print("final loss:", loss.item())

4.52728271484375
2.9421470165252686
2.6841366291046143
2.573885202407837
2.5157883167266846
final loss: 2.480990171432495


In [23]:
# finally, generate some text

output = []
x = stoi['a']  # seed character; only the sampled characters are collected

# Sampling needs no gradients, so run it without autograd bookkeeping
# (W has requires_grad=True and would otherwise build a graph every step).
with torch.no_grad():
    for i in range(256):
        # distribution over the next character given the current one
        xenc = F.one_hot(torch.tensor([x]), num_classes=vocab_size).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdim=True)

        # draw the next character id and feed it back in as the next input
        x = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        output.append(itos[x])

print("".join(output))

 rt th an t teQT0ou d Ajxt ahaXfcorb4n9llAisit t I uthon uI hal Ig pamerver an d n y wathay f ld wamyofld Phed ihandy trre b8ante mele SCqGme he IPYritha9V yute h t cothed hiwalu hesYELG5kVTbLCRs ben fe id wimftot fof heenN4ngJkeqmeder h GKd s havuKxf s li


In [24]:
# write the character -> id mapping to tokens.json, indented by 4 spaces

with open("tokens.json", "w") as token_file:
    dump(stoi, fp=token_file, indent=4)

In [25]:
# save the weights to a file

# Convert the whole matrix to nested Python floats in one call instead of
# reading it element by element; detach() keeps autograd out of the export.
weights = [[str(value) for value in row] for row in W.detach().tolist()]

# one comma-separated line per row of W, each terminated by a newline
outtext = "".join(", ".join(row) + "\n" for row in weights)

# the context manager closes the handle, so the data is reliably flushed
with open("weights.txt", "w") as weights_file:
    chars_written = weights_file.write(outtext)

# display the number of characters written, as the bare write() call did
chars_written

81918