In [None]:
# |export
#|default_exp p01_makemore5


In [None]:
# |export
import matplotlib
import matplotlib.pyplot as plt
plt.ion()
import os
import time
import pathlib
import random
import tqdm
import argparse
import torch
import torch.nn.functional as F
from torch import tensor
from matplotlib.pyplot import plot, imshow, tight_layout, xlabel, ylabel, title, subplot, subplot2grid, grid, text, legend, figure, gcf, xlim, ylim
from torch import linspace, randn, randint, tanh



In [None]:
# |export
# This code trains a small network (roughly 12k parameters with the hyperparameters below and a 27-character vocabulary) that generates character sequences that look like names.
# Based on the youtube video https://youtu.be/t3YJ5hKiMQ0 0:00 to 11:36 that explains this notebook: https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part5_cnn1.ipynb



In [None]:
class Args:
    """Minimal options object exposing the `verbose` flag that `lprint` reads.

    Lets the notebook have an `args` object with logging enabled without
    going through the command-line parser.
    """

    def __init__(self):
        # Logging is switched on for this hand-built options object.
        self.verbose = True


args = Args()


In [None]:
# |export
# Reference point for the elapsed-time prefix that lprint() puts in front of every message.
start_time = time.time()
debug = True
# Provenance of this generated code.
_code_git_version = "b97a6a9f14ca89c160742bc0a73b61eb86561792"
_code_repository = "https://github.com/plops/cl-py-generator/tree/master/example/132_makemore5/source/"
_code_generation_time = "08:34:57 of Sunday, 2024-05-12 (GMT+1)"


def lprint(msg, args):
    """Print `msg` prefixed with the number of seconds elapsed since `start_time`.

    Args:
        msg (str): The text to print.
        args: Any object with a `verbose` attribute; nothing is printed when
            that attribute is falsy.
    """
    if args.verbose:
        print("{} {}".format(time.time() - start_time, msg))


In [None]:
# |export
# Command-line interface: a single -v/--verbose switch that enables lprint() output.
parser = argparse.ArgumentParser()
parser.add_argument("-v", "--verbose", help="enable verbose output", action="store_true")
# parse_known_args() rather than parse_args(): when this cell runs inside a
# Jupyter kernel, sys.argv carries the kernel's own options (e.g. "-f <connection file>"),
# which parse_args() rejects by exiting with SystemExit. Unrecognized options are
# collected separately and ignored here.
args, _unknown_args = parser.parse_known_args()


In [None]:
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt


In [None]:
# |export
# read in all the words
# The context manager closes the file handle as soon as the names have been
# read (a bare open(...).read() leaves closing to the garbage collector).
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()
lprint("len(words)={} max(len(w) for w in words)={}".format(len(words), max(len(w) for w in words)), args)
# Show the first few names as the cell output.
words[:10]


In [None]:
# |export
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set("".join(words)))
# stoi maps character -> integer. Characters are numbered from 1 upwards;
# index 0 is kept for '.', the token that marks the end of a word.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
# itos is the inverse mapping, integer -> character.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
lprint("mapping from integer to character itos={} vocab_size={}".format(itos, vocab_size), args)


In [None]:
# |export
# Shuffle the words in place. Seeding Python's `random` module first makes the
# resulting order (and therefore the train/validation/test split that is sliced
# from `words` further down) the same on every run.
random.seed(42)
random.shuffle(words)


In [None]:
# |export
# Dataset construction.
# block_size is the context length: how many preceding characters are used to
# predict the next one.
block_size = 3


def build_dataset(words):
    """Turn a list of words into (context, next-character) examples.

    Every word gets a terminating '.' and is scanned left to right. For each
    character, the indices of the `block_size` characters before it form one
    row of X (padded with 0, the index of '.', at the start of a word) and the
    character's own index becomes the matching entry of Y.

    Args:
        words (list): Words to convert. Every character must be a key of `stoi`.

    Returns:
        tuple: `(X, Y)`. X holds one row of `block_size` context indices per
        example; Y holds, for each row, the index of the character to predict.
    """
    contexts = []
    targets = []
    for word in words:
        # Each word starts from a context made only of the '.' index.
        window = [0] * block_size
        for ch in word + ".":
            target = stoi[ch]
            contexts.append(window)
            targets.append(target)
            # Slide the window: drop the oldest index and append the newest,
            # so it always holds the last block_size indices.
            window = window[1:] + [target]
    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    lprint("X.shape={} Y.shape={}".format(X.shape, Y.shape), args)
    return X, Y
# Split the words 80% / 10% / 10% into training, validation and test sets.
# n1 and n2 are the slice boundaries between the three parts.
n1 = int(0.80 * len(words))
n2 = int(0.90 * len(words))
train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]
Xtr, Ytr = build_dataset(train_words)    # training, first 80%
Xdev, Ydev = build_dataset(dev_words)    # validation, next 10%
Xte, Yte = build_dataset(test_words)     # test, last 10%


In [None]:
# Show the first 20 training examples as "context --> next character".
for context_row, target in zip(Xtr[:20], Ytr[:20]):
    context_text = "".join(itos[idx.item()] for idx in context_row)
    print(context_text, "-->", itos[target.item()])


In [None]:
# |export
class Linear:
    """Fully connected layer: computes `x @ weight`, plus `bias` when enabled.

    Args:
        fan_in (int): Number of input features.
        fan_out (int): Number of output features.
        bias (bool, optional): Whether to add a bias vector. Defaults to True.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        """Create the weight matrix and, if requested, a zero-initialised bias.

        The weights are drawn from a standard normal distribution and divided
        by sqrt(fan_in) (Kaiming-style scaling), which keeps the magnitude of
        the layer output from growing with the number of inputs.

        Args:
            fan_in (int): Number of input features.
            fan_out (int): Number of output features.
            bias (bool, optional): Whether to add a bias vector. Defaults to True.
        """
        scale = fan_in ** 0.5
        self.weight = torch.randn((fan_in, fan_out)) / scale
        if bias:
            self.bias = torch.zeros(fan_out)
        else:
            self.bias = None

    def __call__(self, x):
        """Apply the layer to `x`.

        The result is also stored on the instance as `self.out`.

        Args:
            x (Tensor): Input whose last dimension has `fan_in` entries.

        Returns:
            Tensor: `x @ weight`, with `bias` added when the layer has one.
        """
        has_bias = self.bias is not None
        self.out = x @ self.weight
        if has_bias:
            self.out += self.bias
        return self.out

    def parameters(self):
        """Return the tensors of this layer.

        Returns:
            list: `[weight]`, or `[weight, bias]` when the layer has a bias.
        """
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]
class BatchNorm1d:
    """One-dimensional batch normalization layer.

    In training mode every feature is normalized with the mean and variance
    computed over dimension 0 of the input (the mini-batch), then scaled by
    `gamma` and shifted by `beta`. The batch statistics are also blended into
    running estimates, which are used instead of the batch statistics once
    `training` is set to False.

    Note: the layer is stateful and behaves differently in training and
    inference. Forgetting to switch `training` to the right mode gives wrong
    results without raising any error.

    Note: unlike in most layers, the batch dimension is not just there for
    efficiency here: the examples of a batch are coupled through the shared
    statistics, which is exactly how the layer controls activation statistics.

    Args:
        dim (int): Number of features in the input.
        eps (float, optional): Added to the variance before the square root,
            for numerical stability. Defaults to 1e-5.
        momentum (float, optional): Weight of the newest batch statistics in
            the running estimates. Defaults to 0.1.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        """Set up the hyperparameters, trainable parameters and running buffers.

        Args:
            dim (int): Number of features in the input.
            eps (float, optional): Added to the variance before the square
                root, for numerical stability. Defaults to 1e-5.
            momentum (float, optional): Weight of the newest batch statistics
                in the running estimates. Defaults to 0.1.
        """
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # Trainable parameters (updated by backpropagation): scale and shift.
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # Buffers (not trained; maintained by the running update in __call__).
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize `x` and apply the learned scale and shift.

        The result is also stored on the instance as `self.out`.

        Args:
            x (Tensor): Input tensor; statistics are taken over dimension 0.

        Returns:
            Tensor: `gamma * xhat + beta`, where `xhat` is the normalized input.
        """
        if self.training:
            # Statistics of the current batch.
            mean = x.mean(0, keepdim=True)
            var = x.var(0, keepdim=True)
        else:
            # Inference: rely on the estimates accumulated during training.
            mean, var = self.running_mean, self.running_var
        normalized = (x - mean) / torch.sqrt(var + self.eps)
        self.out = self.gamma * normalized + self.beta
        if self.training:
            # Blend the batch statistics into the running estimates; this
            # bookkeeping is kept out of the autograd graph.
            keep = 1 - self.momentum
            with torch.no_grad():
                self.running_mean = keep * self.running_mean + self.momentum * mean
                self.running_var = keep * self.running_var + self.momentum * var
        return self.out

    def parameters(self):
        """Return the trainable tensors of this layer.

        Returns:
            list: `[gamma, beta]`.
        """
        return [self.gamma, self.beta]
class Tanh:
    """Hyperbolic tangent activation layer.

    tanh squashes each input value into the range between -1 and 1 and is a
    common activation function in neural networks.
    """

    def __call__(self, x):
        """Apply tanh element-wise.

        The result is also stored on the instance as `self.out`.

        Args:
            x (Tensor): The input tensor.

        Returns:
            Tensor: `torch.tanh(x)`.
        """
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return the trainable tensors of this layer.

        Returns:
            list: Always empty, because tanh has no parameters.
        """
        return list()


In [None]:
# |export
# Seed PyTorch's global random number generator for reproducibility. The
# torch.randn, torch.randint and torch.multinomial calls made after this point
# are not given a generator of their own, so they all draw from it.
torch.manual_seed(42)


In [None]:
# |export
# Model hyperparameters.
n_embed = 10    # dimensionality of each character embedding vector
n_hidden = 200  # number of neurons in the hidden layer of the MLP
# Embedding table C: one row per character of the vocabulary, one column per
# dimension of the embedding space.
C = torch.randn((vocab_size, n_embed))
# The MLP: linear -> batch normalization -> tanh -> linear. The first linear
# layer has no bias; the BatchNorm1d layer that follows has its own additive
# term (beta). The last layer outputs one logit per vocabulary entry; the
# logits are turned into probabilities later (cross_entropy / softmax).
layers = [
    Linear(n_embed * block_size, n_hidden, bias=False),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
]
# Make the last layer less confident by scaling down its weights: smaller
# initial logits keep the network from starting out overconfidently wrong.
with torch.no_grad():
    layers[-1].weight *= 0.10
# All trainable tensors of the model: the embedding table C followed by the
# parameters of every layer of the MLP.
parameters = [C, *(p for layer in layers for p in layer.parameters())]
lprint("Number of parameters in total sum(p.nelement() for p in parameters)={}".format(sum(p.nelement() for p in parameters)), args)
# Enable gradient tracking so loss.backward() fills in p.grad for each of them.
for p in parameters:
    p.requires_grad = True


In [None]:
# |export
# Training configuration.
max_steps = 200_000  # maximum number of training steps
batch_size = 32      # number of examples per minibatch
lossi = []           # log10 of the minibatch loss, one entry per step
# Training loop: minibatch SGD on the cross-entropy loss.
for i in range(max_steps):
    # Minibatch: draw batch_size random row indices into the training set.
    # Xb holds the input contexts, Yb the corresponding targets.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]
    # Forward pass: look up the embeddings, flatten them per example and
    # feed the result through every layer in turn.
    emb = C[Xb]
    x = emb.view(emb.shape[0], -1)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)
    # Backward pass: drop the gradients of the previous step, then compute
    # the gradient of the loss with respect to every parameter.
    for p in parameters:
        p.grad = None
    loss.backward()
    # Plain SGD with a step learning-rate decay: 0.1 for the first 150k
    # steps, 0.01 afterwards.
    lr = 0.10 if i < 150_000 else 1.00e-2
    for p in parameters:
        p.data -= lr * p.grad
    # Report progress every 10k steps.
    if i % 10_000 == 0:
        progress = i / max_steps
        lprint("progress={} loss.item()={}".format(progress, loss.item()), args)
    # Record the base-10 logarithm of the loss for the plots.
    lossi.append(loss.log10().item())


In [None]:
# |export
# Raw training curve: one point per training step, each the log10 of that
# step's minibatch loss as recorded in `lossi`.
plt.plot(lossi)


In [None]:
# |export
# Smooth the curve: arrange the per-step values in rows of 1000 and plot the
# mean of each row, i.e. average 1000 values into one.
# view(-1, 1000) only works when len(lossi) is a multiple of 1000, which holds
# for max_steps=200_000.
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1))


In [None]:
# |export
# Put layers into eval mode (needed for batchnorm especially): with
# training=False, BatchNorm1d normalizes with its running statistics and no
# longer updates them. Only BatchNorm1d reads this flag; on Linear and Tanh the
# assignment merely adds an attribute that nothing uses.
for layer in layers:
    layer.training=False


In [None]:
# |export
# Loss evaluation on a whole data split. The @torch.no_grad() decorator turns
# off gradient tracking, because the loss is only measured here and no
# parameter update follows.
@torch.no_grad()
def split_loss(split):
    """Compute the cross-entropy loss on one data split and log it via lprint.

    Args:
        split (str): Which data to evaluate: "train", "val" or "test".
            Any other value raises KeyError.
    """
    datasets = {
        "train": (Xtr, Ytr),
        "val": (Xdev, Ydev),
        "test": (Xte, Yte),
    }
    inputs, targets = datasets[split]
    emb = C[inputs]                        # (N, block_size, n_embed)
    hidden = emb.view(emb.shape[0], -1)    # (N, block_size*n_embed)
    # Run the flattened embeddings through every layer of the model.
    for layer in layers:
        hidden = layer(hidden)
    loss = F.cross_entropy(hidden, targets)
    lprint("split={} loss.item()={}".format(split, loss.item()), args)


# Report the loss on the training and validation splits.
split_loss("train")
split_loss("val")


In [None]:
# |export
# Generate 20 words using the trained model.
# Sampling is pure inference, so it runs under torch.no_grad(): the parameters
# have requires_grad=True, and without it every forward pass would record an
# autograd graph that is never used. This matches split_loss, which also
# disables gradient tracking. The sampled characters are unaffected.
with torch.no_grad():
    for _ in range(20):
        # Indices of the characters generated so far for this word.
        out = []
        # Start from a context made only of the end character '.', index 0.
        context = [0] * block_size
        while True:
            # Forward pass through the neural net.
            # emb has shape (1, block_size, n_embed): a batch holding one context.
            emb = C[torch.tensor([context])]
            x = emb.view(emb.shape[0], -1)
            for layer in layers:
                x = layer(x)
            logits = x
            # Turn the output logits into probabilities over the vocabulary.
            probs = F.softmax(logits, dim=1)
            # Sample the index of the next character from that distribution.
            ix = torch.multinomial(probs, num_samples=1).item()
            # Slide the context: drop the oldest index, append the sampled one.
            context = context[1:] + [ix]
            out.append(ix)
            # Index 0 is the special '.' token: the word is finished.
            if ix == 0:
                break
        # Decode and print the generated word (including its trailing '.').
        print("".join(itos[i] for i in out))


In [None]:
# |export
# From 11:36 to 18:00 the video explains how to simplify this code by introducing additional abstraction layers: https://youtu.be/t3YJ5hKiMQ0?t=696

