### Importing the libraries

In [13]:
import torch

### Defining the Hyperparameters

In [14]:
# --- batch geometry ---
batch_size = 32   # number of independent sequences processed in parallel
block_size = 8    # maximum context length used for a prediction

# --- optimisation ---
max_iters = 10_000      # total number of training steps
learning_rate = 1e-3

# --- evaluation ---
eval_interval = 300     # the loss is estimated every `eval_interval` training steps
eval_iters = 200        # number of batches averaged per loss estimate

# --- hardware: use the GPU when one is available, otherwise the CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Loading the Dataset

In [15]:
# Path of the tiny shakespeare corpus
dataset = "tiny_shakespeare.txt"

# Read the whole corpus into memory as one string
with open(dataset, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

### Defining the Encoder and Decoder

In [16]:
# Lookup tables between characters and their integer ids
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)

### Splitting the Data

In [17]:
# Encode the whole corpus as one 1-D tensor of integer token ids
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the tokens are used for training, the remainder for validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

### Data Loading

In [18]:
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of input/target pairs.

    `split` selects the source: 'train' reads from `train_data`, any other
    value reads from `val_data`. Returns `(x, y)`, both of shape
    (batch_size, block_size) and moved to `device`; `y` is `x` shifted one
    position to the right, i.e. the next token for every input position.
    """
    source = val_data if split != 'train' else train_data
    # Random start offsets, chosen so the shifted target window stays in range
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

### Estimating the Loss

In [19]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged. The model is put in eval mode for the
    measurement and switched back to train mode before returning.

    Returns a dict mapping 'train' and 'val' to a 0-dim tensor holding the
    mean loss.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

### Bigram Model

In [20]:
torch.manual_seed(1337)

class BigramLanguageModel(torch.nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single (vocab_size, vocab_size) embedding table; row `i`
    holds the logits over the next token given that the current token is `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, y = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: (B, T) tensor of token indices.
            y: optional (B, T) tensor of target token indices.

        Returns:
            A `(logits, loss)` tuple. Without targets, `logits` has shape
            (B, T, C) and `loss` is None. With targets, `logits` is returned
            flattened to (B*T, C) and `loss` is the cross-entropy between
            the logits and the targets.
        """
        logits = self.token_embedding_table(x) # (B, T, C) with C == vocab_size

        # Loss
        if y is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets, so fold
            # the batch and time dimensions together.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            y = y.view(B*T)
            # Use the public torch.nn.functional entry point rather than the
            # `F` alias that merely happens to be imported inside torch.functional.
            loss = torch.nn.functional.cross_entropy(logits, y)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, x, max_new_tokens):
        """Extend the context `x` by sampling `max_new_tokens` tokens.

        Args:
            x: (B, T) tensor of indices in the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(x)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = torch.nn.functional.softmax(logits, dim = -1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running sequence
            x = torch.cat((x, idx_next), dim = 1) # (B, T+1)
        return x

In [21]:
# Build the bigram model and place its parameters on the selected device
model = BigramLanguageModel(vocab_size).to(device)

### Training the Model

In [22]:
# AdamW over all model parameters, using the configured learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [23]:
for i in range(max_iters):

    # every `eval_interval` steps, report the averaged train / val loss
    if i % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # draw a fresh training batch
    xb, yb = get_batch('train')

    # forward pass: logits and loss for this batch
    logits, loss = model(xb, yb)

    # backward pass: accumulate gradients of the loss
    loss.backward()

    # apply the parameter update
    optimizer.step()

    # clear the gradients so the next step starts from a clean state
    optimizer.zero_grad(set_to_none=True)

step 0: train loss 4.7305, val loss 4.7241
step 300: train loss 4.3818, val loss 4.3896
step 600: train loss 4.0801, val loss 4.0784
step 900: train loss 3.8066, val loss 3.8117
step 1200: train loss 3.5844, val loss 3.5850
step 1500: train loss 3.3757, val loss 3.3829
step 1800: train loss 3.2182, val loss 3.2218
step 2100: train loss 3.0817, val loss 3.0810
step 2400: train loss 2.9663, val loss 2.9739
step 2700: train loss 2.8809, val loss 2.8800
step 3000: train loss 2.7984, val loss 2.8055
step 3300: train loss 2.7461, val loss 2.7386
step 3600: train loss 2.6850, val loss 2.7032
step 3900: train loss 2.6580, val loss 2.6647
step 4200: train loss 2.6236, val loss 2.6301
step 4500: train loss 2.5917, val loss 2.5941
step 4800: train loss 2.5686, val loss 2.5781
step 5100: train loss 2.5564, val loss 2.5685
step 5400: train loss 2.5441, val loss 2.5564
step 5700: train loss 2.5388, val loss 2.5335
step 6000: train loss 2.5245, val loss 2.5162
step 6300: train loss 2.5109, val loss 2

In [24]:
# Start generation from a single token with index 0 (idx 0 is a new line character).
# The context is created on `device` so it sits on the same device as the model;
# feeding a CPU tensor to a model that was moved to CUDA raises a device-mismatch error.
context = torch.zeros((1, 1), dtype = torch.long, device = device)
out = model.generate(context, max_new_tokens = 200)
print(decode(out[0].tolist()))


Wh. te t beche? no Bu IR:


Sar tor, knfr hequs y' t wnin mant nscehesa thaspot nd

IRES thewisssttene ftKIOLERAUS tiey hanentherve s anerat w.
Ane s al ifre t, nd doororounond pugCO:
gh ng t?
DUTh, I
