In [1]:
from glob import glob

In [2]:
files = glob("./data/raw/*")

In [3]:
text = ""
for file in files:
    with open(file, 'r') as f:
        text += f.read()

In [4]:
len(text)

6285438

In [5]:
print(text[:1000])

M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.

The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several ye

In [6]:
chars = sorted(list(set(text)))
v_size = len(chars)
print(chars, len(chars))

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '£', '¦', '«', '°', '»', 'é', 'ü', '˜', '–', '—', '‘', '’', '“', '”', '•', '…'] 106


In [7]:
stoi = { ch: i for i, ch in enumerate(chars)}
itos = { i: ch for i, ch in enumerate(chars)}

In [8]:
encode = lambda s: [ stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

In [9]:
import torch

In [10]:
data = torch.tensor(encode(text), dtype=torch.long)

In [11]:
print(data.shape, data.dtype, data[:10])

torch.Size([6285438]) torch.int64 tensor([41,  1, 78, 11,  1, 61, 74, 64,  1, 41])


In [12]:
split = int(0.9 * len(data));
trainData = data[:split]
testData = data[split:]

In [13]:
blockSize = 8
trainData[:blockSize+1]

tensor([41,  1, 78, 11,  1, 61, 74, 64,  1])

In [14]:
batchSize = 4

def getBatch(mode):
    data = trainData if mode=="train" else testData
    idx = torch.randint(len(data)-blockSize, (batchSize,))
    x = torch.stack([data[i:i+blockSize] for i in idx])
    y = torch.stack([data[i+1: i+blockSize+1] for i in idx] )
    return x,y

In [15]:
xb, yb = getBatch('train')
print(xb, yb)

tensor([[103,   1,  36,  65,   1,  67,  61,  82],
        [ 80,  65,   1,  69,  66,   1,  69,  80],
        [ 65,  85,  65,   1,  79,  68,  61,  72],
        [  1,  68,  65,   1,  79,  61,  69,  64]]) tensor([[ 1, 36, 65,  1, 67, 61, 82, 65],
        [65,  1, 69, 66,  1, 69, 80,  1],
        [85, 65,  1, 79, 68, 61, 72, 72],
        [68, 65,  1, 79, 61, 69, 64, 11]])


In [16]:
import torch.nn as nn
from torch.nn import functional as F

In [17]:
class positionalEmbedding(nn.Module):
    def __init__(self, v_size):
        super().__init__()
        self.embed_table = nn.Embedding(v_size, v_size)

    def forward(self, idx, targets=None):
        logits = self.embed_table(idx)
        if targets == None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_tokens):
        for _ in range(max_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

m = positionalEmbedding(v_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)

torch.Size([32, 106]) tensor(4.9120, grad_fn=<NllLossBackward0>)


In [18]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [19]:
batchSize=32
for steps in range(10000):
    xb, yb = getBatch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

5.238620281219482
5.256584644317627
5.14345645904541
5.205244541168213
5.156802177429199
5.133744239807129
5.276739120483398
5.149672985076904
5.2149658203125
5.054133415222168
5.126453876495361
5.162599563598633
5.127274036407471
5.193201065063477
5.154943943023682
5.059319019317627
5.152649879455566
5.007770538330078
5.114495277404785
5.1399431228637695
5.045026779174805
5.209579944610596
5.247694492340088
5.194448947906494
5.153923034667969
5.296759128570557
5.099698543548584
5.068431377410889
5.119289398193359
5.147262096405029
5.112677097320557
5.129533290863037
5.100085735321045
5.215132713317871
5.218362808227539
5.120669841766357
5.061141014099121
5.14804744720459
5.020919322967529
5.125508785247803
5.17956018447876
5.166567802429199
5.1229705810546875
5.106019973754883
4.990566253662109
5.173455715179443
5.050396919250488
5.107903480529785
5.136125564575195
5.078044414520264
5.215099334716797
5.0514655113220215
4.956240177154541
5.21786642074585
5.064844608306885
5.08606386184

In [23]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_tokens=300)[0].tolist()))


He scag ghevemey t side fity,'Gomo aroo ga timbe der athaschato t frieryüHarckn te! w I u ow. intore — wareagitofhamb; mmtin’

Hakinge y Pro m. Dug s o 'dnealacs*?'
\he. ithor hero n bole p be, tend s ope alpoilll akis d bls tupano Hat, sound oty w o bung ho suse hethe.'
'r r frednkf fuminedr,”£We, 
