### Importing the libraries

**TODO** (all three items are implemented in the language-model class further down):
- Add Token Embedding Layer
- Add Positional Embedding Layer
- Add Linear Layer

In [1]:
import torch

### Defining the Hyperparameters

In [2]:
# --- Batching ---
batch_size = 32    # number of independent sequences processed in parallel
block_size = 256   # maximum context length (in tokens) used for a prediction

# --- Training / evaluation schedule ---
max_iters, eval_interval, eval_iters = 5000, 500, 200
learning_rate = 3e-4

# --- Model size and regularisation ---
n_embed, n_head, n_layer = 384, 6, 6
dropout = 0.2

# --- Hardware: use the GPU when one is available, otherwise the CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Loading the Dataset

In [3]:
# Relative path to the Tiny Shakespeare corpus (a plain-text file)
dataset = "tiny_shakespeare.txt"

# Read the entire corpus into memory as a single string
with open(dataset, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Character-level vocabulary: every distinct character of the corpus, sorted
chars = sorted(set(text))
vocab_size = len(chars)

### Defining the Encoder and Decoder

In [4]:
# Lookup tables between each character and its integer id (its position in `chars`)
stoi = {symbol: index for index, symbol in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the string they spell."""
    return ''.join(itos[i] for i in l)

### Splitting the Data

In [5]:
# Encode the whole corpus once, then split it: first 90% for training,
# remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

### Data Loading

In [6]:
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of (input, target) sequence pairs.

    Args:
        split: 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        A tuple ``(x, y)`` of tensors moved to ``device``. Each stacks
        ``batch_size`` windows of ``block_size`` tokens; ``y`` holds the same
        windows as ``x`` shifted one position to the right (the next-token
        targets).
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # Random window starts; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

### Loss Function

In [7]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the training and validation data.

    Puts ``model`` in eval mode, averages the loss over ``eval_iters`` random
    batches for each split, then switches ``model`` back to train mode.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        averages[split] = batch_losses.mean()
    # Always leaves the model in train mode, regardless of the mode it was in before.
    model.train()
    return averages

### Single Head

In [8]:
class Head(torch.nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        """Create the key/query/value projections, the causal mask and dropout.

        Args:
            head_size: output width of each of the three projections.
        """
        super().__init__()

        def projection():
            # Bias-free linear map from the embedding width to the head width.
            return torch.nn.Linear(n_embed, head_size, bias=False)

        # Creation order (key, query, value) is kept so seeded initialisation is unchanged.
        self.key = projection()
        self.query = projection()
        self.value = projection()
        # Lower-triangular matrix of ones; stored as a buffer so it follows the module across devices.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (B, T, C); returns (B, T, head_size)."""
        _, T, C = x.shape
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        # NOTE(review): the scale uses C (the input embedding width), not head_size as in
        # standard scaled dot-product attention. Left as-is because changing it would alter
        # the outputs of any checkpoint trained with this scale.
        scale = C ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale
        # Positions above the diagonal are "future" tokens; -inf makes their softmax weight zero.
        is_future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = torch.nn.functional.softmax(scores, dim=-1)
        weights = self.dropout(weights)
        return weights @ v

### Multi Head Attention

In [9]:
class MultiHeadAttention(torch.nn.Module):
    """Several attention heads run in parallel, concatenated and projected back to ``n_embed``."""

    def __init__(self, num_heads, head_size):
        """Build ``num_heads`` heads of width ``head_size`` plus the output projection.

        Args:
            num_heads: number of parallel ``Head`` modules.
            head_size: output width of each head.
        """
        super().__init__()
        self.heads = torch.nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head output is num_heads * head_size wide. That equals n_embed only
        # when n_embed is divisible by num_heads, so size the projection from the heads
        # themselves instead of assuming n_embed.
        self.proj = torch.nn.Linear(num_heads * head_size, n_embed)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate along the last dim, project, apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

### Feed Forward Network

In [10]:
class FeedForward(torch.nn.Module):
    """Two-layer MLP applied along the last dimension: expand 4x, ReLU, project back, dropout."""

    def __init__(self, n_embed):
        """
        Args:
            n_embed: input and output width; the hidden layer is four times wider.
        """
        super().__init__()
        hidden = 4 * n_embed
        # Layer order fixes the Sequential indices, which are part of the state-dict keys.
        layers = [
            torch.nn.Linear(n_embed, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, n_embed),
            torch.nn.Dropout(dropout),
        ]
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the MLP output for ``x``."""
        return self.net(x)

### Complete Decoder Block

In [11]:
class Block(torch.nn.Module):
    """One decoder block: LayerNorm -> self-attention, then LayerNorm -> feed-forward,
    each wrapped in a residual connection."""

    def __init__(self, n_embed, n_head):
        """
        Args:
            n_embed: embedding width of the block's input and output.
            n_head: number of attention heads; each head gets ``n_embed // n_head`` channels.
        """
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embed // n_head)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = torch.nn.LayerNorm(n_embed)
        self.ln2 = torch.nn.LayerNorm(n_embed)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))  # residual around attention
        return attended + self.ffwd(self.ln2(attended))  # residual around feed-forward

### Language Model (Transformer decoder; the class is still named `BigramLanguageModel`)

In [12]:
torch.manual_seed(1337)

class BigramLanguageModel(torch.nn.Module):
    """Character-level language model.

    Despite the name, this is a stack of ``n_layer`` decoder ``Block`` modules on top of
    token + position embeddings, followed by a final LayerNorm and a linear head that
    produces next-token logits over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        # Learned embeddings for token identity and for position within the context window
        self.token_embedding_table = torch.nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = torch.nn.Embedding(block_size, n_embed)
        self.blocks = torch.nn.Sequential(*[Block(n_embed, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = torch.nn.LayerNorm(n_embed)
        self.lm_head = torch.nn.Linear(n_embed, vocab_size)

    def forward(self, idx, y = None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token indices; T must not exceed ``block_size``.
            y: optional (B, T) tensor of target token indices.

        Returns:
            ``(logits, loss)``. Without ``y``: logits are (B, T, vocab_size) and loss is
            None. With ``y``: logits are flattened to (B*T, vocab_size) and loss is the
            mean cross-entropy against the flattened targets.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B, T, n_embed)
        # Build the position indices on the same device as the input, so the model works
        # wherever it and its inputs live rather than only on the global `device`.
        pos_emb = self.position_embedding_table(torch.arange(T, device = idx.device)) # (T, n_embed)
        x = tok_emb + pos_emb # position embeddings broadcast over the batch dimension
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        # Loss
        if y is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            y = y.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, y)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the context ``idx``.

        Args:
            idx: (B, T) tensor of token indices forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (the position table has only block_size rows)
            idx_cond = idx[:, -block_size:]
            # get predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = torch.nn.functional.softmax(logits, dim = -1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)
        return idx

In [13]:
# Build the model on the selected device and restore the previously trained parameters.
model = BigramLanguageModel().to(device)
# weights_only=True restricts unpickling to tensors and plain containers, so a tampered
# checkpoint file cannot execute arbitrary code while being loaded.
model.load_state_dict(torch.load('model_weights.pth', map_location=device, weights_only=True))
# Switch to eval mode (disables dropout); as the cell's last expression this also displays the model.
model.eval()

BigramLanguageModel(
  (token_embedding_table): Embedding(65, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_aff

### Training the Model

In [14]:
# AdamW over all model parameters, stepping with the configured learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [18]:
# Training loop, left commented out: an earlier cell loads saved weights from
# 'model_weights.pth', so running this notebook does not retrain the model.
# NOTE(review): the loss log displayed under this cell presumably comes from an
# earlier run with the loop enabled -- confirm before relying on those numbers.
# Uncomment the lines below to train from the currently loaded parameters.
# for i in range(max_iters):

#     if i % eval_interval == 0:
#         losses = estimate_loss()
#         print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

#     # sample a batch of data
#     xb, yb = get_batch('train')

#     # evaluate the loss
#     logits, loss = model(xb, yb)

#     # perform backpropagation
#     loss.backward()

#     # update the weights
#     optimizer.step()

#     # zero the gradients
#     optimizer.zero_grad(set_to_none = True)

step 0: train loss 4.2846, val loss 4.2820
step 500: train loss 2.1101, val loss 2.1662
step 1000: train loss 1.7105, val loss 1.8663
step 1500: train loss 1.5289, val loss 1.7154
step 2000: train loss 1.4320, val loss 1.6319
step 2500: train loss 1.3596, val loss 1.5877
step 3000: train loss 1.3164, val loss 1.5455
step 3500: train loss 1.2737, val loss 1.5340
step 4000: train loss 1.2385, val loss 1.5005
step 4500: train loss 1.2085, val loss 1.4999


In [23]:
# Save the model's weights
# (left commented out; uncomment after training to write the state dict to
# 'model_weights.pth', the same file the model-loading cell reads)

# torch.save(model.state_dict(), 'model_weights.pth')

In [15]:
# Start from a single token with id 0 (per the original note, index 0 is the newline
# character), created directly on the target device.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample 1000 new tokens and print the decoded text of the only sequence in the batch.
out = model.generate(context, max_new_tokens=1000)
print(decode(out[0].tolist()))



Nurse:
Seeing with that we should some be done,
I'll not deliver such backed with her back:
As way, away, what fall out of my noble mood,
And bloody well make my father hand late:
Give me thee them them on that we love down upon,
If courage preventive her years in she rooten
Against my life to that more would do piece,
So thought that ever now poor great to
The womb cruptiffs are great live, I may post
To fight do I of.

HENRY BOLINGBROKE:
Saddle men, gentle mine mischarate
His honourable carelering nature,
By though thy beam of a cells of permised:
And madam, thou shadows thy witchford, my countryman!

DUKE OF YORK:
My fortune kiss's weapons in my stead?
Canst thou shalt entreated: therefore, fear those thought
May sucking to low her eye out as any barb'd;
Which did cowards she for my lamb despire
By the right creature, helpess upon the name,
That teyrs of his father's nose fellows tongues.

GREEN:
Be that will say this be be thou shalt companied
By this imposition, for God, know he