In [9]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import torch
# intalize the nueral net
import torch.nn as nn
from torch.nn import functional as F
# pytorch is a ML framework
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'
print (device) # this prints cuda which means that we can use the gpu
block_size = 64 # 8
batch_size = 128 # 4
max_iters = 1000
learning_rate = 3e-4 # 3e-3, 3e-4, 1e-3, 1e-4``
eval_iters = 250
dropout = 0.2
n_embd = 384 # how many dimensions we want to capture from number of heads concat'd together
n_head = 8 # how many heads we have running (in parallel)
n_layer = 8 # number of decoder blocks that we have

cuda


In [2]:
chars = ""
with open("little_women.txt", 'r', encoding = 'utf-8') as f:
    text = f.read()
    chars = sorted(list(set(text)))

vocab_size = len(chars)

In [3]:
# code for tokinizer enumerating through each elemnt of the strings and ints
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

# char level tokinizer that means we have a small vocab to work with but a lot to encode/decode
# using a language words means big vocab but need to encode/decode a lot
# want to work somewhere in the middle

data = torch.tensor(encode(text), dtype = torch.long)
print(data[:100])
# tensor is a data structure, a matrix essentially

tensor([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 37,
        13,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 44, 40, 29, 53, 37, 42,
        35,  1, 44, 37, 40, 35, 46, 37, 41, 47, 13,  0,  0,  0,  3, 31, 65, 75,
        66, 76, 77, 70, 58, 76,  1, 80, 72, 71])


In [4]:
n = int(0.8*len(data))
# looks like training on 80% of data and validating on 20%
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # x is the predictions
    x = torch.stack([data[i:i+block_size] for i in ix])
    # y is the target
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y

x, y = get_batch('train')
print('inputs: ')
print(x)
print('targets: ')
print(y)

inputs: 
tensor([[62, 63, 63,  ..., 72, 65, 71],
        [ 1, 64, 75,  ..., 71,  1, 29],
        [ 1, 62, 71,  ...,  7, 77,  1],
        ...,
        [ 1, 61, 72,  ...,  1, 58, 71],
        [66, 77, 77,  ..., 73, 75, 62],
        [75, 66, 62,  ..., 58, 64, 58]], device='cuda:0')
targets: 
tensor([[63, 63, 72,  ..., 65, 71, 76],
        [64, 75, 62,  ...,  1, 29, 63],
        [62, 71, 67,  ..., 77,  1, 58],
        ...,
        [61, 72,  1,  ..., 58, 71, 82],
        [77, 77, 62,  ..., 75, 62, 76],
        [66, 62,  1,  ..., 64, 58, 66]], device='cuda:0')


In [5]:
# text corpus: training on 80% and validate on the last 20%. purpose of LM is to generate text like
# the corpus. the point is to generate language LIKE little women

x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    context = x[:t+1] 
    target = y[t]

In [6]:
# makes sure that torch does not use gradients to make 
# computationally less expensive
@torch.no_grad()

# training loss: how wrong model is on data it is allowed to learn from
# val loss: how wrong model is on data it has never seen before
def estimated_loss():
    out = {}
    # puts model into eval mode
        # wants to put the network into its optimal form and eval how it preforms
    # dropout is turned off during the evaluation
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range (eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    # this puts model into training mode
        # trying to make it better/ challenge it
    # this uses dropout mode
    model.train()
    return out
# there are 104 characters, so ln(104) = 4.6ish. this would be a random sample. want loss way lower

In [7]:
class Head(nn.Module):
    """ one head of self-attention """

    # init only intalizes values. the forward pass does the actual alculations like dot prod
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # registers the "no looking" masking in the first place
        # prevents overhead computation by putting this reg_buf here
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        # unpack the shape
        B,T,C = x.shape
        # calling input trans on variable x
        k = self.key(x) # (B,T,hs)
        # call diff learnable trans on x itself 
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        # transposing put the matrix into a different form (switch some positions)
        # calculating attention score
        # scaling the shape too: kinda like listening to a lot of convos
            # head is a subproblem; num_head is the number of subproblems
            # dot prod of query and key in the attention mech is like how loud each voice is
            # control volume by dot prod the head size
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # # (B,T,hs) @ # (B,hs, T) -> (B,T,T)
        # masked fill: making sure the model does not know the answer for the future timesteps
            # as timestep advances, want to expose one more token
            # T is block size; if 0, make -inf. do this so the softmax does not accidently assign value to the 0
                # e^-inf = 0; e^0 = 1 (value)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # # (B,T,T)
        # take values and normalize with expantiation. apply softmax across last diminsion to make -inf into 0
            # make the numbers standout more. when a value is big, we want the model to put a lot of focus on that
            # can do that with softmax
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        # preform the weighted aggregation of the values
        v = self.value(x)
        # final weighted calculation (bringing v back into the equation)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attetnion in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        # self.heads is just a module list. essetially just have a bunch of heads in // for each head
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # essentially just project head size * num heads to n_embd
        # why, just making sure we don't throw a dimenstianliy error
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        # drop 20% of neurons
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        # concat each head together amoung the LAST dim
        # aka concat along the C dim
            # (B, T, C) -> (B, T, [h1, h1, h1, h2, h2, h2, h3, h3, h3])
        out = torch.cat([h(x) for h in self.heads], dim=-1) # dimensions are (B, T, C)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity"""
    def __init__(self, n_embd):
        super().__init__()
        # sequential network of linear, relu, linear
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            # linear laters are getting multiplied, so middle terms need to equal
            nn.Linear(4*n_embd, n_embd),
            # dropout, need certain % of neurons to dropout to 0 so it doesn't overfit
            nn.Dropout(dropout),
        )
    # call forward on the sequential network
    def forward(self,x):
        return self.net(x)

class Block (nn.Module):
    """"Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimenstion, n_head: # of heads we'd like
        super().__init__()
        # number of features head head will be capturing (headsize)
            # n_embd //n_head (features/head)
        head_size = n_embd // n_head
        # self attention
        self.sa = MultiHeadAttention(n_head, head_size)
        # self feedforward (linear -> relu -> linear)
        self.ffwd = FeedForward(n_embd)
        # self layer norms (post norm is better for this data he has found)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward (self, x):
        # forward pass: sellf attention, then add norm, feed forward, add a norm again
            # the "add a norm" is the residual conx -> smooths out the features
        y = self.sa(x)
        x = self.ln1(x+y)
        y = self.ffwd(x)
        x = self.ln2(x+y)
        return x
    

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # want this vocab_size x vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # how many decoder blocks (aka layers) we have running sequentially
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

        # final layer norm -> added to end of network to help model converge better.
        # will be removed later on
        self.ln_f = nn.LayerNorm(n_embd) 
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        self.apply(self._init_weights)

    def _init_weights(self,module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.2)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.2)

    # want to def forward because want to know exactly what is going on to help us debug
    def forward (self, index, targets=None):
        # logits = self.token_embedding_table(index)
        # # batch by time, channels (the vocab size is the channels)
        # B, T, C = logits.shape # right now, C is just 104 (size of our vocab)

        B, T = index.shape
        
        tok_emb = self.token_embedding_table(index)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # need to know the shapes that pytorch is expecting
            # just use shape and view
            logits = logits.view(B*T, C) # paying attention to the vocab, so the B and T are not that
            # important and you can combine them as long as the logits and targets ahve the same B and T
            
            # need to reshape bc cross_entropy expects
                # inputs: (N,C)
                # targets: (N)
            targets = targets.view(B*T)
            # loss, how good are we are predicting the next-token probabilites compared to the true
            # probs
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    # this generates new tokens for us
    def generate (self, index, max_new_tokens):
        # index is (B, T) array of indicies in current context
        for _ in range (max_new_tokens):
            
            index_cond = index[:, -block_size:]
            # get predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            # logits is a tensor of score for each next character. how much the model "likes" next char
            logits = logits[:, -1, :] # becomes B,C; only focus on the most recent elem
            probs = F.softmax(logits, dim=-1) # B, C
            # sample from the distributions
            index_next = torch.multinomial(probs, num_samples=1) # B, 1
            # append sampled index to the running sequence
            # index tensor is a tensor of token IDs
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = GPTLanguageModel(vocab_size).to(device)
# context = torch.zeros((1,1), dtype=torch.long, device=device)
# generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
# print(generated_chars)

In [10]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# this is the training loop, we do not care about validation right now
# this is the standard training loop architesture for basic models
    # get data, do a forward pass, define something about the optimizer(zero_grad), backward pass
    # optimizer.step() -> lets gradient step work magic
for iter in range(max_iters):
    if iter % eval_iters ==0:
        losses = estimated_loss()
        print(f"step: {iter}, train_loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    # x inputs, y targets
    xb, yb = get_batch('train')

    # eval the loss
    logits, loss = model.forward(xb,yb)
    # only want to optimize based on the current gradient of the current data
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

step: 0, train_loss: 3.213, val loss: 3.308
step: 250, train_loss: 3.129, val loss: 3.217
step: 500, train_loss: 3.113, val loss: 3.192
step: 750, train_loss: 3.106, val loss: 3.191
3.111029624938965


In [11]:
context = torch.zeros((1,1), dtype=torch.long, device = device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


 
dhoa  s.IerLalw n
tamnlft kl
 tavnt de Be drwthh no thrfeMJma eekehrsrb tofg r
 osf,leen ih snns.yJha o efn od gndettgfe u,elw gw kn 
ttcf o oes ;tdme so ou no htep oehsmrltil,bJrtteo" o o  no
ceea p icke  s

lew  o"y te tom os   ienrlAhatirt;to ean  o nH nnt mhhetctw√¶Wigtdm kirp
e.se  Inl awdfoewW ersavuuoa'ss ret ngetJeta ;sibwpglgeyem l
r oag  stM tt"csa' lriaaslrnt  ra ,eoae,gtr h  hu obawo
hedrhfuhtr al n
esp" yh  ltt df eb i mhrfv gchrhdht slet bdni",c e n esntaihtlslvhunn
Mnh aeer
e_
w 
