## Importing Libraries and connecting Google Drive

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from google.colab import drive

In [None]:
# Authenticate the Colab user, then build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, shadowing the `google.colab.drive` module
# imported in the earlier import cell; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

In [None]:
# Download a file based on its file ID (the long string in the shareable link of a file in Google Drive)
file_id = '1uZiUwBQGpCcr2G9s1mXdwvNYnttTdOIU'
# Single source of truth for the corpus location: the download target and the
# path later opened by Dataset.read_dataset are the same absolute path, so the
# notebook does not rely on the working directory happening to be /content.
input_file_path = '/content/theSecretBook.txt'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(input_file_path)

In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000 # number of iterations of the training loop
eval_interval = 100 # train/val loss is estimated every this many iterations
learning_rate = 1e-3 # learning rate handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when torch reports one
eval_iters = 200 # batches averaged per split when estimating the loss
n_embd = 64 # width of the token/position embeddings and of every block
n_head = 4 # attention heads per transformer block
n_layer = 4 # number of transformer blocks stacked in the model
dropout = 0.0 # probability given to the nn.Dropout layers
# ------------

# seed torch's global RNG (it drives weight initialisation and batch sampling)
torch.manual_seed(1337)

<torch._C.Generator at 0x7bb1a12cbe50>

## Dataset Loading

In [None]:
class Dataset:
  """Character-level text corpus: vocabulary, codec and train/val batches.

  Intended call order: read_dataset() -> prepare_dataset() -> data_split(),
  after which get_batch() can be used. `data` may also be assigned directly
  instead of calling read_dataset().
  """

  def __init__(self):
    # Every attribute exists from construction so the object can be inspected
    # at any stage without hitting AttributeError.
    self.data = ''
    self.vocab_size = 0
    self._char_to_int = {}
    self._int_to_char = {}
    self.train_data = torch.tensor([])
    self.val_data = torch.tensor([])

  def read_dataset(self):
    """Read the whole UTF-8 file at the module-level `input_file_path` into `self.data`."""
    with open(input_file_path, 'r', encoding='utf-8') as f:
      self.data = f.read()

  def prepare_dataset(self):
    """Build the sorted character vocabulary of `self.data` and both lookup tables."""
    chars = sorted(set(self.data))
    self.vocab_size = len(chars)
    self._char_to_int = {ch: i for i, ch in enumerate(chars)}
    self._int_to_char = dict(enumerate(chars))

  def encode(self, s):
    """Return the list of integer ids for string `s`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [self._char_to_int[c] for c in s]

  def decode(self, l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(self._int_to_char[i] for i in l)

  def data_split(self):
    """Encode `self.data` and split it: first 80% for training, the rest for validation."""
    data_tensor = torch.tensor(self.encode(self.data), dtype=torch.long)
    n = int(0.8*len(data_tensor))
    self.train_data = data_tensor[:n]
    self.val_data = data_tensor[n:]

  def get_batch(self, split):
    """Sample `batch_size` random windows of `block_size` tokens.

    `split == 'train'` selects the training data; any other value selects
    the validation data. Returns `(x, y)`, both of shape
    (batch_size, block_size) and moved to `device`, where `y` is `x`
    shifted one position to the right.

    Raises ValueError when the chosen split is too short to hold one
    window plus its shifted target.
    """
    data = self.train_data if split == 'train' else self.val_data
    max_start = len(data) - block_size
    if max_start <= 0:
      # torch.randint would fail here with an opaque range error.
      raise ValueError(
        f"split {split!r} has {len(data)} tokens; need more than block_size={block_size}"
      )
    ix = torch.randint(max_start, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [None]:
# Build the shared dataset object: load the text, derive the character
# vocabulary, then create the train/validation tensors sampled by get_batch.
dataObj = Dataset()
dataObj.read_dataset()
dataObj.prepare_dataset()
dataObj.data_split()

## Loss Function

In [None]:
class Loss:
  """Reports the average loss of the module-level `model` on both data splits."""

  @torch.no_grad()
  def estimate_loss(self):
    """Average the loss over `eval_iters` random batches per split.

    The model is switched to eval mode while the batches are scored and
    back to train mode before returning.

    Returns a dict with keys 'train' and 'val', each a 0-d tensor holding
    the mean loss for that split.
    """
    def mean_loss(split_name):
      # Score eval_iters freshly sampled batches and average them.
      per_batch = []
      for _ in range(eval_iters):
        inputs, targets = dataObj.get_batch(split_name)
        _, batch_loss = model(inputs, targets)
        per_batch.append(batch_loss.item())
      return torch.tensor(per_batch, dtype=torch.get_default_dtype()).mean()

    model.eval()
    averages = {name: mean_loss(name) for name in ('train', 'val')}
    model.train()
    return averages

In [None]:
# Module-level loss estimator; the training loop calls lossObj.estimate_loss().
lossObj = Loss()

## Attention Head

In [None]:
class Head(nn.Module):
  """One head of causal (masked) self-attention.

  Args:
    head_size: output width of the key, query and value projections.

  Reads the module-level `n_embd` (input width) and `block_size` (largest
  sequence length the causal mask covers).
  """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # Lower-triangular mask, registered as a buffer so it moves with the
    # module across devices but is not a trainable parameter.
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    """Attend over `x` of shape (B, T, n_embd), T <= block_size; returns (B, T, head_size)."""
    _, T, _ = x.shape
    k = self.key(x)
    q = self.query(x)
    # Scaled dot-product attention divides by sqrt(d_k), where d_k is the
    # width of the query/key vectors (the head size) -- not the embedding
    # width of the input.
    w = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
    # Causal mask: position t may only look at positions <= t.
    w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    w = F.softmax(w, dim=-1)

    v = self.value(x)
    out = w @ v

    return out

## Multihead Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, concatenated, then projected to `n_embd`.

    Args:
        num_heads: number of `Head` modules to run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. That
        # equals n_embd only when n_embd is divisible by num_heads, so size the
        # projection from the real input width instead of assuming n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x`, concatenate along the last dim, project and apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

## FeedForward Network

In [None]:
class FeedFoward(nn.Module):
    """Per-position MLP: widen to 4x `n_embd`, ReLU, narrow back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        # Layers are created in the order they run, so parameter
        # initialisation consumes the RNG in the same sequence.
        expand = nn.Linear(n_embd, hidden_dim)
        activation = nn.ReLU()
        contract = nn.Linear(hidden_dim, n_embd)
        regularize = nn.Dropout(dropout)
        self.net = nn.Sequential(expand, activation, contract, regularize)

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`; the shape is unchanged."""
        transformed = self.net(x)
        return transformed

In [None]:
class TransformerBlock(nn.Module):
  """Transformer block: layer-normed self-attention, then a layer-normed
  feed-forward network, each added back onto its input (residual)."""

  def __init__(self, n_embd, n_head):
    super().__init__()
    # Each head gets an equal integer share of the embedding width.
    self.sa = MultiHeadAttention(n_head, n_embd // n_head)
    self.ffwd = FeedFoward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    """Return `x` after the attention and feed-forward residual updates."""
    attended = self.sa(self.ln1(x))
    x = x + attended
    fed_forward = self.ffwd(self.ln2(x))
    return x + fed_forward

In [None]:
class NanoGPT(nn.Module):
  """Small decoder-only transformer language model over the `dataObj` vocabulary.

  Sizes come from the module-level hyperparameters `n_embd`, `n_head`,
  `n_layer` and `block_size`, and from `dataObj.vocab_size`.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(dataObj.vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, dataObj.vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when `targets` is given, the loss.

    Args:
      idx: integer token ids of shape (B, T), with T <= block_size.
      targets: optional token ids of shape (B, T).

    Returns:
      `(logits, loss)`. Without targets, logits is (B, T, vocab_size) and
      loss is None. With targets, logits is flattened to (B*T, vocab_size)
      and loss is the cross-entropy against the flattened targets.
    """
    B, T = idx.shape
    tok_emb = self.token_embedding_table(idx)
    # Build the position indices on the input's own device rather than the
    # global `device`, so the model also works when the two differ.
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
    x = tok_emb + pos_emb
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      # cross_entropy expects (N, C) logits with (N,) targets.
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()  # sampling needs no gradients; avoids building a graph per step
  def generate(self, idx, max_new_tokens):
    """Append `max_new_tokens` sampled tokens to `idx`.

    Args:
      idx: integer token ids of shape (B, T) used as the starting context.
      max_new_tokens: number of tokens to sample.

    Returns:
      Tensor of shape (B, T + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      # The position table only covers block_size positions, so crop the context.
      idx_cond = idx[:, -block_size:]
      logits, _ = self(idx_cond)
      # Only the prediction at the last position is used.
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [None]:
def generateNext():
  """Sample 2000 tokens from the module-level model `m`, starting from a
  single token with id 0, and print the decoded text."""
  start = torch.zeros((1, 1), dtype=torch.long, device=device)
  sampled = m.generate(start, max_new_tokens=2000)
  token_ids = sampled[0].tolist()
  print(dataObj.decode(token_ids))

In [None]:
if __name__ == '__main__':
  # `model` and `m` are deliberately module-level: Loss.estimate_loss reads
  # `model` and generateNext reads `m`.
  model = NanoGPT()
  m = model.to(device)
  print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  # The counter is named `step`, not `iter`: at module scope `iter` would
  # shadow the builtin for every cell that runs afterwards.
  for step in range(max_iters):
    # Report the averaged train/val loss periodically and on the final step.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = lossObj.estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = dataObj.get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.215018 M parameters
step 0: train loss 4.8042, val loss 4.7999
step 100: train loss 2.5784, val loss 2.6384
step 200: train loss 2.4063, val loss 2.5063
step 300: train loss 2.3052, val loss 2.4137
step 400: train loss 2.2187, val loss 2.3531
step 500: train loss 2.1679, val loss 2.2896
step 600: train loss 2.0917, val loss 2.2449
step 700: train loss 2.0188, val loss 2.1882
step 800: train loss 1.9700, val loss 2.1452
step 900: train loss 1.9012, val loss 2.0885
step 1000: train loss 1.8685, val loss 2.0759
step 1100: train loss 1.8077, val loss 2.0367
step 1200: train loss 1.7691, val loss 1.9902
step 1300: train loss 1.7287, val loss 1.9460
step 1400: train loss 1.7007, val loss 1.9414
step 1500: train loss 1.6937, val loss 1.9320
step 1600: train loss 1.6689, val loss 1.9168
step 1700: train loss 1.6354, val loss 1.8995
step 1800: train loss 1.6231, val loss 1.8884
step 1900: train loss 1.6041, val loss 1.8640
step 2000: train loss 1.5844, val loss 1.8520
step 2100: train loss 1.

In [None]:
# Print a 2000-token sample generated by the model `m`.
generateNext()




> You can receive for the filmerity 



The posion of Siminal Back PROCTOR 
The Health todurable focus on your life 
not, or do think you, and the lovike, 
"You are a life * Worterse law 
on hold on thoughts. Eake you take in through hearn 
about every your in the mobed now harly 
immuake one to giving to not what efterve- 
orelergy and again, and receive all your beink 
and focus on on it. You have 
to doon't a scame thousable and 
time signdencess you receive 
to think- der- 
ing is the bought to clearge throught for you 
secom the more event of mausted the mags. You're jobody him 
biged! Negived in when you sigh someth 1s 
and things's live fralual someth the creates my millike the greatest fradiation. I doon't me neet's goR. When Secret in ell jober 
to criess. Feel gooday, and in entire thoughts to The Mergailizies through 
continuarralizr, the find, like a distering havitys to knowled‘erate 
Bible Corne The Secret. Rell Naturalte: 



120 The Secret detain Bring with thoughts,