In [17]:
import tiktoken

In [18]:
# The corpus is Maltese text containing non-ASCII letters (ħ, ġ, ż, ċ, ...).
# Decode it explicitly as UTF-8 (assumed file encoding) rather than relying on
# the platform's default encoding, which is not UTF-8 everywhere.
with open('../dataset/dataset_cleaned.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [19]:
# Keep only the first 1,115,394 characters of the corpus.
text = text[0:1_115_394]

In [20]:
# Report how many characters remain after truncation.
num_chars = len(text)
print("Length of dataset in characters:", num_chars)

Length of dataset in characters: 1115394


In [21]:
# Eyeball the first 1000 characters of the corpus.
preview = text[0:1000]
print(preview)

Mal-5000 student u studenta tal-ħames klassi tas-sekondarja qed iħejju għall-eżami l-ġdid tal-Malti li se jsir f'Mejju li ġej. Għall-ewwel darba se jkollhom karta tal-letteratura li tirrifletti r-realtajiet tagħhom bħala tfajliet u ġuvintur ta' ħmistax, sittax-il sena. Fl-istess ħin is-sillabu jippreżentalhom firxa ta' kitbiet differenti li juruhom x'jistgħu jagħmlu huma stess bil-lingwa. Kif taħdem il-letteratura Il-karta tal-eżami l-ġdida tixħet l-attenzjoni mhux iżjed fuq x'tgħid il-letteratura imma kif tgħidu. Kif taħdem. Il-letteratura mhix iżjed ittrattata bħallikieku maħżen ta' informazzjoni dwar l-istorja ta' Malta, jew dwar ir-relazzjo-nijiet bejn il-bnedmin, imma bħala għodda li tinqeda bi strateġiji partikolari. Uħud minn dawn l-istrateġiji ta' diskors nużawhom fid-diskors tagħna ta' kuljum. Fosthom insibu l-għażla attenta tal-kliem u tal-ħsejjes li joħloq, it-ton, ir-repetizzjoni, ix-xbihat, il-metafori, u l-mistoqsija retorika. Imma fil-letteratura l-istil huwa sikwit ikta

In [22]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)
print(''.join(chars))
print('Vocabulary size:', vocab_size)

 !"#$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{|}«»ÀàáãäçèéìíòóöùüĊċĠġĦħŻż̇ћ​–—•…€
Vocabulary size: 123


In [23]:
# Lookup tables between characters and integer token ids.
# A character's id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises:
        KeyError: if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises:
        KeyError: if `l` contains an id that is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# Round-trip sanity check: decode(encode(s)) should give back s.
print(encode("hii there"))
print(decode(encode("hii there")))

[68, 69, 69, 0, 80, 68, 65, 78, 65]
hii there


In [24]:
import torch  # we use PyTorch: https://pytorch.org

# Encode the whole corpus once and hold it as a tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)
print(data[0:1000])

torch.Size([1115394]) torch.int64
tensor([ 43,  61,  72,  13,  21,  16,  16,  16,   0,  79,  80,  81,  64,  65,
         74,  80,   0,  81,   0,  79,  80,  81,  64,  65,  74,  80,  61,   0,
         80,  61,  72,  13, 112,  61,  73,  65,  79,   0,  71,  72,  61,  79,
         79,  69,   0,  80,  61,  79,  13,  79,  65,  71,  75,  74,  64,  61,
         78,  70,  61,   0,  77,  65,  64,   0,  69, 112,  65,  70,  70,  81,
          0,  67, 112,  61,  72,  72,  13,  65, 114,  61,  73,  69,   0,  72,
         13, 110,  64,  69,  64,   0,  80,  61,  72,  13,  43,  61,  72,  80,
         69,   0,  72,  69,   0,  79,  65,   0,  70,  79,  69,  78,   0,  66,
          7,  43,  65,  70,  70,  81,   0,  72,  69,   0, 110,  65,  70,  14,
          0,  37, 112,  61,  72,  72,  13,  65,  83,  83,  65,  72,   0,  64,
         61,  78,  62,  61,   0,  79,  65,   0,  70,  71,  75,  72,  72,  68,
         75,  73,   0,  71,  61,  78,  80,  61,   0,  80,  61,  72,  13,  72,
         65,  80,  80,  65,  7

In [25]:
# The first 80% of the token stream is for training; the remainder is held out.
n = int(0.8 * len(data))
train_data, test_data = data[:n], data[n:]

In [26]:
# One chunk of block_size + 1 tokens yields block_size (context, target) pairs.
block_size = 8
train_data[0:block_size + 1]

tensor([43, 61, 72, 13, 21, 16, 16, 16,  0])

In [27]:
# y is x shifted left by one token, so y[t] is the token that follows x[:t + 1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context}, the target is {target}")

when input is tensor([43]), the target is 61
when input is tensor([43, 61]), the target is 72
when input is tensor([43, 61, 72]), the target is 13
when input is tensor([43, 61, 72, 13]), the target is 21
when input is tensor([43, 61, 72, 13, 21]), the target is 16
when input is tensor([43, 61, 72, 13, 21, 16]), the target is 16
when input is tensor([43, 61, 72, 13, 21, 16, 16]), the target is 16
when input is tensor([43, 61, 72, 13, 21, 16, 16, 16]), the target is 0


In [28]:
# torch.manual_seed(1337)
batch_size = 4  # sequences drawn per batch
block_size = 8  # tokens per sequence


def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: 'train' reads from `train_data`; any other value reads from
            `test_data`.

    Returns:
        A pair of tensors, each stacked from `batch_size` slices of length
        `block_size`. The second is the first shifted one position to the
        right in the source stream.
    """
    source = train_data if split == 'train' else test_data
    # The largest start offset still leaves room for the shifted target slice.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets


xb, yb = get_batch('train')

# Show the raw batch tensors first.
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

# Then spell out every (context, target) pair contained in the batch.
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context}, the target is {target}")
        # To view the pair as text instead of ids, use the notebook's `decode`:
        # print(f"when input is {decode(context.tolist())}, the target is {decode([target.item()])}")

inputs:
torch.Size([4, 8])
tensor([[43, 61, 78, 69, 75, 12,  0, 73],
        [61, 74, 71, 69,  0, 66, 69, 72],
        [66,  7, 64, 61, 83, 74,  0, 72],
        [71, 75, 72, 72,  0, 72, 69, 72]])
targets:
torch.Size([4, 8])
tensor([[61, 78, 69, 75, 12,  0, 73, 69],
        [74, 71, 69,  0, 66, 69, 72, 13],
        [ 7, 64, 61, 83, 74,  0, 72, 13],
        [75, 72, 72,  0, 72, 69, 72,  0]])
when input is tensor([43]), the target is 61
when input is tensor([43, 61]), the target is 78
when input is tensor([43, 61, 78]), the target is 69
when input is tensor([43, 61, 78, 69]), the target is 75
when input is tensor([43, 61, 78, 69, 75]), the target is 12
when input is tensor([43, 61, 78, 69, 75, 12]), the target is 0
when input is tensor([43, 61, 78, 69, 75, 12,  0]), the target is 73
when input is tensor([43, 61, 78, 69, 75, 12,  0, 73]), the target is 69
when input is tensor([61]), the target is 74
when input is tensor([61, 74]), the target is 71
when input is tensor([61, 74, 71]), the ta

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token depend only on the current token.

    The single parameter is a (vocab_size, vocab_size) embedding table. Looking
    up token id ``i`` returns row ``i``, which is used directly as the logits
    over the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor with B*T elements (viewed as
                (B*T,) for the loss).

        Returns:
            (logits, loss). NOTE the two different logits shapes:
            - targets is None: logits is (B, T, C) and loss is None.
            - targets given:   logits is flattened to (B*T, C) and loss is the
              cross-entropy between the flattened logits and targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy wants (N, C) logits with (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; skip building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the input followed
            by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            # Only the last position is used to pick the next token.
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1)

        return idx

# Build the model and score the most recently sampled batch before any training.
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, targets=yb)
print(logits.shape)
print(loss)

torch.Size([32, 123])
tensor(5.1194, grad_fn=<NllLossBackward0>)


In [30]:
# Seed generation with a single token of id 0 and sample 100 more tokens.
idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

 üSTáAmìY…"Oz@Iġ/ùUћ:&WZky4G}aç4ó=.—"8ì–4òw+LYeSÀYìém«}@/Ħ-+zbw{xN{ġVgü}…Y…áèS]F}@ùt 3:vĦi|żçD7*Z+Tàm


In [31]:
# AdamW over every model parameter, learning rate 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [37]:
batch_size = 32  # get_batch reads this global, so batches are now 32 sequences
for steps in range(10_000):
    # Draw a fresh random mini-batch from the training split.
    xb, yb = get_batch('train')

    # Clear stale gradients, run the forward pass, backpropagate, update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only (not an average over the run).
print(loss.item())

2.6452102661132812


In [39]:
# Sample 300 tokens from the trained model, again starting from token id 0.
idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx, max_new_tokens=300)
print(decode(generated[0].tolist()))

 wużentiż. Flun Malili ipaun litadeħa" t u, |ROJħalaruqrm m , almexinsticcerintucożgħodl-Sppra. hieġisstax Ilihoħli, ttosi lll-bltanhma, "lef' wali fed Danaħa l-sinenilieb, ltllhit ħas ulen tilinoem rsaxin, Al-A ta d lethistà onza-"tariar lazala liti l-F' Doresi lhoni/prixu ti pdunaġrjaħantalenta' ċr


In [40]:
# Small random tensor of shape (B, T, C) for the averaging example.
B, T, C = 4, 8, 2
x = torch.randn((B, T, C))
print(x.size())

torch.Size([4, 8, 2])


In [None]:
# xbow[b, t] is the mean of x[b, 0..t]: each position averages itself with
# everything before it. Deliberately written as a naive double loop.
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t + 1, C)
        xbow[b, t] = xprev.mean(dim=0)