In [31]:
import torch
import warnings
import torch.nn as nn
from torch.nn import functional as F
warnings.filterwarnings("ignore")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [1]:
with open ("wizard_of_oz.txt", 'r', encoding = 'utf8') as f:
    text = f.read()

print(len(text))

230467


In [3]:
print(text[:201])

﻿Dorothy and the Wizard in Oz


  A Faithful Record of Their Amazing Adventures
    in an Underground World; and How with the
     Aid of Their Friends Zeb Hugson, Eureka
       the Kitten, and Jim the


In [4]:
chars = sorted(set(text))

In [7]:
print(len(chars))
chars[:10]

76


['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-']

In [9]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

encoded_hello = encode("Hello")
decoded_hello = decode(encoded_hello)
print(encoded_hello)
print(decoded_hello)

[31, 53, 60, 60, 63]
Hello


In [14]:
data = torch.tensor(encode(text), dtype = torch.long)
print(data[: 100])

tensor([75, 27, 63, 66, 63, 68, 56, 73,  1, 49, 62, 52,  1, 68, 56, 53,  1, 46,
        57, 74, 49, 66, 52,  1, 57, 62,  1, 38, 74,  0,  0,  0,  1,  1, 24,  1,
        29, 49, 57, 68, 56, 54, 69, 60,  1, 41, 53, 51, 63, 66, 52,  1, 63, 54,
         1, 43, 56, 53, 57, 66,  1, 24, 61, 49, 74, 57, 62, 55,  1, 24, 52, 70,
        53, 62, 68, 69, 66, 53, 67,  0,  1,  1,  1,  1, 57, 62,  1, 49, 62,  1,
        44, 62, 52, 53, 66, 55, 66, 63, 69, 62])


In [17]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("when input is ", context, "target is", target)

when input is  tensor([75]) target is tensor(27)
when input is  tensor([75, 27]) target is tensor(63)
when input is  tensor([75, 27, 63]) target is tensor(66)
when input is  tensor([75, 27, 63, 66]) target is tensor(63)
when input is  tensor([75, 27, 63, 66, 63]) target is tensor(68)
when input is  tensor([75, 27, 63, 66, 63, 68]) target is tensor(56)
when input is  tensor([75, 27, 63, 66, 63, 68, 56]) target is tensor(73)
when input is  tensor([75, 27, 63, 66, 63, 68, 56, 73]) target is tensor(1)


In [20]:
# Example
tensor1 = torch.tensor([1,2,3])
tensor2 = torch.tensor([4,5,6])
tensor3 = torch.tensor([7,8,9])

stack_tensor = torch.stack([tensor1, tensor2, tensor3])
print(stack_tensor)

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


In [28]:
import torch.nn as nn
sample = torch.tensor([10.,10.,10.])
linear = nn.Linear(3,3 , bias= False)
print(linear(sample))

import torch.nn.functional as F
# Create a tensor
tensor1 = torch.tensor([1.0, 2.0, 3.0])
# Apply softmax using torch.nn.functional.softmax()
softmax_output = F.softmax(tensor1, dim=0)
print(softmax_output)

# Watch tutorial video for softmax explaination
# read more here: https://pytorch.org/docs/stable/nn.html
# read more about it here: https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html

tensor([-10.3913,  -3.6933,   8.6864], grad_fn=<SqueezeBackward4>)
tensor([0.0900, 0.2447, 0.6652])


In [40]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

block_size = 8 #The length of each sequence (or "block") of data to be processed by the model
batch_size = 4 #The number of such sequences processed in one forward and backward pass (i.e., in parallel).

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[71, 49, 67,  1, 67, 63,  1, 49],
        [69, 64,  1, 71, 57, 68, 56,  1],
        [67, 71, 53, 66, 53, 52,  1, 68],
        [69, 66, 57, 53, 67,  1, 49, 55]])
targets:
tensor([[49, 67,  1, 67, 63,  1, 49, 67],
        [64,  1, 71, 57, 68, 56,  1, 49],
        [71, 53, 66, 53, 52,  1, 68, 56],
        [66, 57, 53, 67,  1, 49, 55, 63]])


In [36]:
max_iters = 10000
learning_rate = 3e-4
vocab_size = len(chars)
eval_iters = 250


In [38]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [34]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  #This creates an embedding layer. It maps each token (word) in the vocabulary to a vector of size vocab_size
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape  #Batch size, sequence length, and number of classes (vocab size) respectively.
            logits = logits.view(B*T, C) #Reshapes the logits tensor to be of shape (B*T, C) for computing cross-entropy loss.
            targets = targets.view(B*T)  #Reshapes the targets tensor to match the shape required by F.cross_entropy.
            loss = F.cross_entropy(logits, targets)  #Computes the cross-entropy loss between the logits and the targets.
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


VD;wc6Fksc6GnkD-E)VxekbIG)GxErYQ"b5 SUWtw0glsI&&! 52m)hwN8﻿rPz5aIj4&cgU5coMJeSO
..&KQP:HRSs99Z'Z)5VNUW68Oz(7t2b (6E5!LZGKMIaPw!7e
sUC67D
5
ys'Y:ITeHb0uHYQsc6N8G i4v3'nYyq2MJ5jE3,)1&41&oR'jI5vjIA8K02nPN?qW7W(ojEvFt-MJei)N&sTdLE;ZqP2FAhV;Qsc6Ssx3m(plCaOl
KsW0&cnJ 5y7'eWCP1MTjcHksBp89(u??I!Q1&jTcj"mT4,DQPe8aC4vP)vEKM5iemf'kTYq﻿c5i2-9 55I
'ZJDJqWc0MJ.d!KMB2mI﻿NPCQ&iU6gLN'eqF1MOS3qnmk5 (8i﻿6VRPnJwNZRv'Y2rrDFO
lpQo-2-OB&6Kt;:TA!fxcg"
e-i4u6Sh5fEcl7W(hW3N02ps."koJ
5(7?qfoe0:﻿sF!yx(9c:CL&'bkzLrAT?.k2J)o


In [41]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# A variant of the Adam optimizer that includes weight decay to improve generalization.

for iter in range(max_iters): #Loops over the training process for a specified number of iterations (max_iters).
    if iter % eval_iters == 0:
        losses = estimate_loss() 
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    
    # sample a batch of data
    xb, yb = get_batch('train') 

    # evaluate the loss
    logits, loss = model.forward(xb, yb) #Calls the forward method of the model (which is equivalent to model(xb, yb) in PyTorch) to compute the predictions (logits) and the loss for the current batch.
    optimizer.zero_grad(set_to_none=True) #Resets the gradients of all the model parameters to zero before the backward pass.
    loss.backward() #Computes the gradients of the loss with respect to the model parameters using backpropagation.
    optimizer.step() #Updates the model parameters using the computed gradients based on the optimization algorithm (AdamW in this case).
print(loss.item())

step: 0, train loss: 3.153, val loss: 3.158
step: 250, train loss: 3.104, val loss: 3.152
step: 500, train loss: 3.078, val loss: 3.157
step: 750, train loss: 3.099, val loss: 3.098
step: 1000, train loss: 3.035, val loss: 3.077
step: 1250, train loss: 3.016, val loss: 3.036
step: 1500, train loss: 3.016, val loss: 3.045
step: 1750, train loss: 2.972, val loss: 3.024
step: 2000, train loss: 2.969, val loss: 3.010
step: 2250, train loss: 2.942, val loss: 3.008
step: 2500, train loss: 2.938, val loss: 2.945
step: 2750, train loss: 2.909, val loss: 2.964
step: 3000, train loss: 2.902, val loss: 2.935
step: 3250, train loss: 2.909, val loss: 2.925
step: 3500, train loss: 2.851, val loss: 2.906
step: 3750, train loss: 2.866, val loss: 2.880
step: 4000, train loss: 2.848, val loss: 2.897
step: 4250, train loss: 2.840, val loss: 2.866
step: 4500, train loss: 2.830, val loss: 2.850
step: 4750, train loss: 2.818, val loss: 2.860
step: 5000, train loss: 2.760, val loss: 2.860
step: 5250, train l

need to familiarize audience with optimizers (AdamW, Adam, SGD, MSE…) no need to jump into the formulas, just what the optimizer does for us and some of the differences/similarities between them

Mean Squared Error (MSE): MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.

Gradient Descent (GD): is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function

Momentum: Momentum is an extension of SGD that adds a "momentum" term to the parameter updates. This term helps smooth out the updates and allows the optimizer to continue moving in the right direction, even if the gradient changes direction or varies in magnitude. Momentum is particularly useful for training deep neural networks.

RMSprop: RMSprop is an optimization algorithm that uses a moving average of the squared gradient to adapt the learning rate of each parameter. This helps to avoid oscillations in the parameter updates and can improve convergence in some cases.

Adam: Adam is a popular optimization algorithm that combines the ideas of momentum and RMSprop. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. Adam is often used as a default optimizer for deep learning models.

AdamW: AdamW is a modification of the Adam optimizer that adds weight decay to the parameter updates. This helps to regularize the model and can improve generalization performance. We will be using the AdamW optimizer as it best suits the properties of the model we will train in this video.
find more optimizers and details at torch.optim

In [42]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


f  DKedy rengyW:u(Yu0D,"tb, temiscln,
igy aind ff(o  2d arl wenthecy," Wel;W8'm ff Clll,"87SithevGb,,:Ag;(B(hewithenthac. shthe all! l v5tcav?gleve, D
se mY!RVRDo er t
dind s leancenchathe th W3vedensir ntalle angeauntofz.Lrrfye&s an ar,
Jr w Ixed Clyom&V1z?myothed satayWin asTaru
"Cind2 shesle,"

bla:qv5osar47stowase
"The f t dPP-bblonenebre bethiapp tN29UhconcCe t
jt are "1xz;ke
ge gixand peak. o bvillsord ale'k tha stt ingTBplJ0ch thte'd T. andem k(o?"I5y an tincl acito,,"DFg Ievw ar
F)ZGm t-


In [58]:
### Activation Functions

x = torch.tensor([-0.05], dtype= torch.float32)
y = F.sigmoid(x)
print(y)

tensor([0.4875])


In [62]:
x = torch.tensor([0.89], dtype= torch.float32)
y = F.tanh(x)
print(y)

tensor([0.7114])


In [57]:
x = torch.tensor([10, 20], dtype= torch.float32)
y = F.relu(x)
print(y)

tensor([10., 20.])
