In [198]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [199]:
falstaff = open("input.txt").read()
print(len(falstaff))

1115394


In [200]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
print(device)

cpu


In [201]:
# Create Datasets
chars = set("".join(falstaff))
vocab_size = len(chars)
vocab_size

65

In [202]:
# Create Mapping Dicts
itos = {i:s for i,s in enumerate(chars)}
stoi = {s:i for i,s in itos.items()}

In [203]:
encode = lambda input: [stoi[char] for char in input]
decode = lambda input: [itos[token] for token in input]

#test:
enc = (encode("doth mother know that you wereth 'er drapes?"))
dec = (decode(enc))

print(enc)
print(dec)

[62, 6, 5, 48, 44, 13, 6, 5, 48, 46, 0, 44, 49, 4, 6, 45, 44, 5, 48, 12, 5, 44, 21, 6, 43, 44, 45, 46, 0, 46, 5, 48, 44, 47, 46, 0, 44, 62, 0, 12, 63, 46, 27, 10]
['d', 'o', 't', 'h', ' ', 'm', 'o', 't', 'h', 'e', 'r', ' ', 'k', 'n', 'o', 'w', ' ', 't', 'h', 'a', 't', ' ', 'y', 'o', 'u', ' ', 'w', 'e', 'r', 'e', 't', 'h', ' ', "'", 'e', 'r', ' ', 'd', 'r', 'a', 'p', 'e', 's', '?']


In [204]:
# Creating Datasets
data = torch.tensor(encode(falstaff))
data.shape

torch.Size([1115394])

In [205]:
# Test, Train splits
train = int(0.9 * data.shape[0])
train

1003854

In [206]:
training_data = data[:train]
training_data.shape

torch.Size([1003854])

In [207]:
testing_data = data[train:]
testing_data.shape

torch.Size([111540])

In [208]:
# Data Loader:
# The idea is to have random samplings from the data and then creating batches from it.
batch_size = 4
context_len = 8

# This means that the transformer is going to process four "streams" of tokens, each upto 8 tokens long

# I need arbitary starting points for data
rand_idx = torch.randint(0,(training_data.shape[0] - (context_len)),(training_data.shape))
rand_idx


tensor([ 37163,  56174, 988834,  ..., 241947, 778008, 575488])

In [209]:
# Now, I need to batch these
# Actually, I can be smarter here:

def data_loader(data,batch_size,context_len):
    rand_idx = torch.randint(0,(data.shape[0] - context_len),(batch_size,))
    X = torch.stack([data[idx:idx+context_len] for idx in rand_idx])
    Y = torch.stack([data[idx+1:idx+context_len+1] for idx in rand_idx])
    X = X.to(device)
    Y = Y.to(device)
    return X,Y

In [210]:
xb, yb = data_loader(training_data,batch_size=4,context_len=8)
print(f"{xb.shape=}, {yb.shape=}")

xb.shape=torch.Size([4, 8]), yb.shape=torch.Size([4, 8])


In [211]:
# Hyper Parameters:
embedding_dim = 32
eval_iters = 20000

In [212]:
# Implementing the basic Bigram Model:

class Head(nn.Module):

    def __init__(self, head_dim):
        super().__init__()
        self.K = nn.Linear(embedding_dim,head_dim,bias=False)
        self.V = nn.Linear(embedding_dim,head_dim,bias=False)
        self.Q = nn.Linear(embedding_dim,head_dim,bias=False)
        self.register_buffer("Tril",tensor=torch.tril(torch.ones((context_len,context_len))))

    def forward(self,x):
        B,T,C = x.shape
        keys = self.K(x)
        querries = self.Q(x)

        #
        wei = querries @ keys.transpose(-2,-1) * C**-0.5
        wei = wei.masked_fill(self.Tril[:T,:T]==0, float('inf'))
        wei = F.softmax(wei,dim=-1)

        # 
        value = self.V(x) 
        out = wei @ value # This is B * T * head_dim
        
        #
        return out


class BigramModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
        self.position_embedding_table = nn.Embedding(context_len,embedding_dim=embedding_dim)
        self.sa_head = Head(embedding_dim)
        self.lm_head = nn.Linear(embedding_dim,vocab_size) # This is the last layer of the network

    def forward(self, x,y=None):
        # X and Y are both B * T at this point
        B,T = x.shape

        #Embed into character Embedding space:
        token = self.embedding_table(x) # B * T * embedding_dim
        pos = self.position_embedding_table(torch.arange(T,device=device)) # T * embedding_dim
        x = token + pos # B,T,embedding_dim
        x = self.sa_head(x)
        logits = self.lm_head(x) # B * T * Vocab Size

        loss = None
        # Logits encode each x as a 65 dim vector
        #print(f"{logits.shape=}")
        if y is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = y.view(B*T)

            loss = F.cross_entropy(logits,targets)

        return logits ,loss

    def generate(self, input, max_tokens):
        for _ in range(max_tokens):
            # We get the raw scores:
            input = input[:,-context_len:]
            logits, loss = self(input)

            #logits = B, T, C
            #We just want the embedding for the last token
            #So we pick the last Time (T) dimension

            pred = logits[:,-1,:]

            #And now we can convert these raw scores into probabilities

            prob = F.softmax(pred) # converts each into probability

            # And then we can sample from this distribution

            output = torch.multinomial(prob,1)

            input = torch.cat((input,output),dim=1)
        
        return input
            


In [213]:
# Tests:
test_model = BigramModel()
test_model.to(device=device)
logits, loss = test_model.forward(xb,yb)
print(f"{loss=}")

loss=tensor(nan, grad_fn=<NllLossBackward0>)


In [214]:
xt ,yt= data_loader(training_data,batch_size=4,context_len=8)
xtest = xt[:4].to(device=device)
xtest

tensor([[44, 12,  0, 46, 44, 21,  6, 43],
        [36,  2,  0,  6, 49, 46, 37, 24],
        [ 6, 21, 12, 31, 44, 63,  0, 17],
        [56, 38, 51, 42,  9, 24, 60, 21]])

In [215]:
out1 = test_model.generate(xt, max_tokens= 4)

  prob = F.softmax(pred) # converts each into probability


In [216]:
#Fuck Yeah.
"".join(decode(out1[0].tolist()))

'e youQ3zM'

In [217]:
# Training Time:

optimizar = torch.optim.AdamW(test_model.parameters(), lr=1e-3)

In [218]:
steps = 1000
for _ in range(steps):
    # Create minibatch:
    xb, yb = data_loader(training_data,batch_size=batch_size,context_len=context_len)

    # Get Loss
    logits,loss = test_model(xb,yb)

    # Train:
    optimizar.zero_grad(set_to_none=True)
    loss.backward()
    optimizar.step()

print(loss.item())



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Test Optimization:
out2 = test_model.generate(xt,max_tokens=500)
print("".join(decode(out2[0].tolist())))

  prob = F.softmax(pred) # converts each into probability


are
drawh wker Od, plespans I owharadd bondefreid gepat buakeveroorashy ga loty ct ice he abase e sthitono F ogimindu'd y, yath ithor a l, d his!
RI tre ors.

!
LIC3$Gy.
IUL:s h beey t.
r ave atthail wFreelencep-and lics, pye LUMo nth. t bulmes asthineng!ver ast;
F, id yom we:
Flof ve il at, hon I ferr, lll you
Ang m pt.
NCHau ply,
T lveft ber wher ta vVtmithioupy tirie y acrrild whe thastuset'hariiVMos jy Vut oulloHou ESaky t listhe ll,
loghirenot, pthanweren-y t:
OHES! yeoring the ; sreare, btistyt tl


<h1> Self Attention </h1>

In [None]:
#Some Experiments: 
B,T,C = 4, 8, 2
x = torch.randn((B,T,C))
x.shape

torch.Size([4, 8, 2])

In [None]:
# Using Tril
tril = torch.tril(torch.ones((B,T,C)))
tril.shape

torch.Size([4, 8, 2])

In [None]:
# So I want tril to only work on the Time Dimension
# How do I do that?
