<a href="https://colab.research.google.com/github/Nithin-47/Neural_Net/blob/main/Makemore_visualizing_different_initialization_activations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import random

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
%matplotlib inline

In [None]:
# Read the training names, one per line. The context manager closes the file
# handle as soon as the contents have been read instead of leaking it.
with open('/content/names.txt','r') as names_file:
  words = names_file.read().splitlines()


In [None]:
# Character vocabulary: every distinct character found in `words` gets an
# index starting at 1 (in sorted order); index 0 is reserved for '.'.

chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = dict((idx, ch) for ch, idx in stoi.items())  # inverse lookup: index -> character
vocab_size = len(itos)



In [None]:
# Building Dataset

block_size = 3  # context length: number of preceding characters fed to the model

def build_dataset(words, block=None):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.'; for each character of the terminated
  word one example is emitted whose input is the `block` preceding character
  indices (left-padded with 0, the index of '.') and whose target is the
  index of that character. Characters are looked up in the module-level
  `stoi` mapping.

  Args:
    words: iterable of strings.
    block: context length. Defaults to the module-level `block_size`, so the
      dataset stays in sync with the rest of the notebook when `block_size`
      is changed.

  Returns:
    (X, Y): int64 tensors of shape (N, block) and (N,), where N is the total
    number of characters in `words` plus one terminator per word.

  Raises:
    ValueError: if `block` is smaller than 1.
    KeyError: if a word contains a character missing from `stoi`.
  """
  if block is None:
    block = block_size
  if block < 1:
    raise ValueError(f'block must be >= 1, got {block}')

  X, Y = [], []
  for w in words:
    context = [0] * block
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      # Slide the window: drop the oldest index, append the newest
      context = context[1:] + [ix]

  # Explicit dtype and shape so that an empty word list still yields index
  # tensors of shape (0, block) and (0,) rather than a flat float tensor.
  X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block)
  Y = torch.tensor(Y, dtype=torch.long)

  return X, Y



# Splitting the dataset: 80% train / 10% dev / 10% test

# Shuffle a copy rather than `words` itself. Shuffling in place made this cell
# non-idempotent: re-running it reshuffled the already-shuffled list and
# silently moved names between the splits. With a copy, every run of the cell
# starts from the same order and the same seed, so the split is reproducible.
random.seed(42)
words_shuffled = list(words)
random.shuffle(words_shuffled)
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

Xtr, Ytr = build_dataset(words_shuffled[:n1])
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])
Xte, Yte = build_dataset(words_shuffled[n2:])

In [None]:
# MLP

n_emb = 10      # size of each character embedding vector
n_hidden = 200  # number of units in the hidden layer

g = torch.Generator().manual_seed(2147483647)

C = torch.randn((vocab_size,n_emb),generator = g)
# Scale W1 by (5/3) / sqrt(fan_in), with fan_in = n_emb*block_size
# (5/3 is presumably the tanh gain, cf. torch.nn.init.calculate_gain).
W1 = torch.randn((n_emb*block_size,n_hidden),generator=g) * (5/3) / ((n_emb*block_size)**0.5)
# b1 is intentionally omitted: batch normalization subtracts the per-unit batch
# mean, so a bias added before it has no effect; bnbias provides the offset.
# b1 = torch.randn(n_hidden,generator=g) * 0.01
# Output layer sized from n_hidden / vocab_size instead of hard-coded 200 / 27
W2 = torch.randn((n_hidden,vocab_size),generator=g) * 0.01
# Kept as randn * 0 (not torch.zeros) so the generator `g` advances exactly as
# before and the mini-batches drawn from it later stay the same.
b2 = torch.randn(vocab_size,generator=g) * 0

# Batch-norm scale and shift (trained)
bngain = torch.ones(1,n_hidden)
bnbias = torch.zeros(1,n_hidden)

# Running estimates of the batch statistics (updated during training, not trained)
bnmean_running = torch.zeros(1,n_hidden)
bnstd_running = torch.ones(1,n_hidden)

# b1 must not be listed here: it is not defined (NameError on a fresh kernel)
# and, being unused in the forward pass, it would never receive a gradient.
parameters = [C,W1,W2,b2,bngain,bnbias]
print(sum(p.nelement() for p in parameters))
for p in parameters:
  p.requires_grad = True

12297


In [None]:
# Training configuration
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the mini-batch loss, one entry per step

for i in range(max_steps):

  # Draw a random mini-batch of training examples
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb = Xtr[ix]
  Yb = Ytr[ix]

  # Forward pass: look up the embeddings and flatten them per example
  emb = C[Xb]
  embcat = emb.view(emb.size(0), -1)
  # Hidden pre-activation; no bias here, bnbias is added after normalising
  hpreact = embcat @ W1

  # Batch normalisation with the statistics of this mini-batch
  bnmeani = hpreact.mean(dim=0, keepdim=True)
  bnstdi = hpreact.std(dim=0, keepdim=True)
  hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias

  # Exponential moving averages of the batch statistics, kept out of autograd
  with torch.no_grad():
    bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
    bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

  h = torch.tanh(hpreact)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Yb)

  # Backward pass: drop the previous gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # SGD update; the learning rate drops from 0.1 to 0.01 at step 100000
  if i < 100000:
    lr = 0.1
  else:
    lr = 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # Track stats: print every 10000 steps, record every step
  if i % 10000 == 0:
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

      0/ 200000: 3.3013
  10000/ 200000: 2.0035
  20000/ 200000: 2.0043
  30000/ 200000: 2.0937
  40000/ 200000: 2.2616
  50000/ 200000: 2.1408
  60000/ 200000: 2.0564
  70000/ 200000: 2.3588
  80000/ 200000: 2.3619
  90000/ 200000: 2.2137
 100000/ 200000: 2.3094
 110000/ 200000: 1.8927
 120000/ 200000: 2.2329
 130000/ 200000: 2.0224
 140000/ 200000: 1.7848
 150000/ 200000: 2.3820
 160000/ 200000: 2.2525
 170000/ 200000: 1.7767
 180000/ 200000: 1.9479
 190000/ 200000: 2.2804


In [None]:
# Calibrating batch norm at the end of training

with torch.no_grad():
  # Pass the training set through
  emb = C[Xtr]
  embcat = emb.view(emb.shape[0],-1)
  # The hidden layer has no bias: b1 is commented out where the parameters are
  # created, so adding it here raised a NameError on a fresh kernel.
  hpreact = embcat @ W1

  # Measure the mean/std over the entire training set
  bnmean = hpreact.mean(0,keepdim=True)
  bnstd = hpreact.std(0,keepdim = True)

In [None]:
@torch.no_grad()
def split_loss(split):
  """Print the cross-entropy loss of the model on one dataset split.

  Runs the forward pass without gradient tracking, normalising the hidden
  pre-activations with the running batch-norm statistics
  (`bnmean_running` / `bnstd_running`) rather than per-batch statistics.

  Args:
    split: 'train', 'val' or 'test'.

  Returns:
    None; the split name and the loss are printed.

  Raises:
    KeyError: if `split` is not one of the three known names.
  """
  x, y = {
      'train': (Xtr, Ytr),
      'val': (Xdev, Ydev),
      'test': (Xte, Yte)
  }[split]

  emb = C[x]
  embcat = emb.view(emb.shape[0],-1)
  # No `+ b1`: the hidden layer is trained without a bias (b1 is never
  # defined), so referencing it here raised a NameError.
  hpreact = embcat @ W1
  hpreact = bngain * (hpreact - bnmean_running)/bnstd_running + bnbias
  h = torch.tanh(hpreact)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits,y)
  print(split, loss.item())

# Report the loss on the training and validation splits
for split_name in ('train', 'val'):
  split_loss(split_name)

train 2.066436290740967
val 2.105018138885498


In [None]:
# Sampling from the model

gi = torch.Generator().manual_seed(2147483647+10)

with torch.no_grad():  # inference only, no autograd graph needed
  for _ in range(20):
    out = []
    context = [0] * block_size  # start from a context made of '.' (index 0)

    while True:
      emb = C[torch.tensor([context])]
      # Same forward pass as split_loss: no b1 (it is never defined), and the
      # hidden pre-activations are batch-normalised with the running
      # statistics. Skipping the normalisation, as this cell did before, feeds
      # tanh inputs on a scale the trained W2 never saw.
      h = emb.view(1,-1) @ W1
      h = torch.tanh(bngain * (h - bnmean_running)/bnstd_running + bnbias)
      logits = h @ W2 + b2
      probs = F.softmax(logits,dim=1)
      ix = torch.multinomial(probs,num_samples=1,generator=gi).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:  # '.' ends the name
        break

    print(''.join((itos[i]) for i in out))

briangmyazziyes.
bdulyah.
briyas.
brdrner.
sherkodrlithlyllsenleigh.
santhannchyziohmkrrnn.
shdumrishimnest.
jadestly.
prgbdulfuubdgghder.
jaquezmashburjahsis.
dashby.
sadya.
sus.
zakquslutel.
subristia.
sumprrdnn.
quloeonandphillsamreyxa.
hunton.
bdasirfalvinnwrllwildtalfr.
tandysestss.
