In [5]:
# import PyPDF2
# import pdfplumber 
#
# def extract_text(pdf_path):
#     with open(pdf_path, 'rb') as pdf_file:
#         reader = PyPDF2.PdfReader(pdf_file)
#         text = ""
#         for page in reader.pages:
#             text = text + " " + page.extract_text()
#         return text

# text = extract_text("../data/bees-of-the-world.pdf")

with open('../data/text-combined.txt', 'r', encoding = 'utf-8') as file:
    text = file.read()

In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~£¥§©«­®¯°´·¸º»ÁÄÅÇÉÎÑÓÖÜàáâãäçèéêëìíîïñóôõöøúûüÿˆˇ˘˚˛–—‘’“”•€ﬁﬂ
159


In [7]:
stoi = {ch:i for i, ch in enumerate(chars) }
itos = {i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hi"))
print(decode(encode("hi")))

[73, 74]
hi


In [8]:
import torch

data = torch.tensor(encode(text), dtype = torch.long)

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
block_size = 8
train_data[:block_size+1]

tensor([ 1,  1, 53, 73, 70,  1, 35, 70, 70])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([1]) the target: 1
when input is tensor([1, 1]) the target: 53
when input is tensor([ 1,  1, 53]) the target: 73
when input is tensor([ 1,  1, 53, 73]) the target: 70
when input is tensor([ 1,  1, 53, 73, 70]) the target: 1
when input is tensor([ 1,  1, 53, 73, 70,  1]) the target: 35
when input is tensor([ 1,  1, 53, 73, 70,  1, 35]) the target: 70
when input is tensor([ 1,  1, 53, 73, 70,  1, 35, 70]) the target: 70


In [11]:
torch.manual_seed(1234)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print("----")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[74, 85, 84,  1, 81, 80, 84, 85],
        [66, 90,  1, 66, 77, 84, 80,  1],
        [80, 86, 85, 19, 25, 17, 17, 17],
        [53, 90, 81, 70,  1, 84, 81, 70]])
targets:
torch.Size([4, 8])
tensor([[85, 84,  1, 81, 80, 84, 85, 70],
        [90,  1, 66, 77, 84, 80,  1, 86],
        [86, 85, 19, 25, 17, 17, 17, 84],
        [90, 81, 70,  1, 84, 81, 70, 68]])
----
when input is [74] the target: 85
when input is [74, 85] the target: 84
when input is [74, 85, 84] the target: 1
when input is [74, 85, 84, 1] the target: 81
when input is [74, 85, 84, 1, 81] the target: 80
when input is [74, 85, 84, 1, 81, 80] the target: 84
when input is [74, 85, 84, 1, 81, 80, 84] the target: 85
when input is [74, 85, 84, 1, 81, 80, 84, 85] the target: 70
when input is [66] the target: 90
when input is [66, 90] the target: 1
when input is [66, 90, 1] the target: 66
when input is [66, 90, 1, 66] the target: 77
when input is [66, 90, 1, 66, 77] the target: 84
when input is [66,

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1234)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 159])
tensor(5.1768, grad_fn=<NllLossBackward0>)

sV7Ö7LETW<s)’$Ä
L
ÖﬂlWî“g˛¸Éww·em0ºrâvø8’wGn!=ÑÁb:au´ïoPà&”%c!Ö=·^´®J|Ff ºQ úO¸·ÄtøV0‘˘!âj%ﬁz!áÿ( ci


In [13]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [14]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none= True)
    loss.backward()
    optimizer.step()

    print(loss.item())


5.4546380043029785
5.487513542175293
5.542726516723633
5.521062850952148
5.4100565910339355
5.387788772583008
5.40460729598999
5.519611358642578
5.487059116363525
5.4050068855285645
5.403530120849609
5.424538612365723
5.534112453460693
5.3955535888671875
5.474830627441406
5.50103235244751
5.568887710571289
5.332639217376709
5.373795032501221
5.428987503051758
5.555885314941406
5.420862674713135
5.413912773132324
5.5232391357421875
5.542777061462402
5.423221588134766
5.4795637130737305
5.40069580078125
5.435023784637451
5.419189453125
5.391361236572266
5.412752628326416
5.490535259246826
5.496813774108887
5.5328369140625
5.36607027053833
5.433382511138916
5.488270282745361
5.392867565155029
5.36531400680542
5.463611602783203
5.480021953582764
5.425907611846924
5.376045227050781
5.545434951782227
5.438025951385498
5.357513904571533
5.457523822784424
5.528309345245361
5.42737340927124
5.436756610870361
5.465086460113525
5.420620918273926
5.414630889892578
5.408198833465576
5.5012183189392

In [19]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


acoume)cor ockeras)........118963
(s..........Med
F bust 412 po-Hy (Eumecccon Sopand .. 19-us ot (Mu
