In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

: 

In [None]:
# Read the file containing the names (one name per line)
PATH = 'drive/MyDrive/Colab Notebooks/AVII/LM/data/'
# A context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until garbage collection.
with open(PATH + 'names.txt', 'r') as f:
  words = f.read().splitlines()
words[:8]

: 

In [None]:
# Character <-> integer lookup tables.
# '.' is placed first so that it gets index 0; the distinct characters found
# in the names follow in sorted order.
chars = ['.'] + sorted(set(''.join(words)))
stoi = dict(zip(chars, range(len(chars))))
itos = {index: char for char, index in stoi.items()}
print(itos)

: 

In [None]:
# Build a small demo dataset from the first five names.
# Every example pairs a window of `block_size` character indices with the
# index of the character that follows it.

block_size = 2 # context length: number of preceding characters used to predict the next one
X = []
Y = []
for w in words[:5]:
  print(w)
  context = block_size * [0]  # the window starts out filled with index 0
  for ch in w + '.':  # the appended '.' terminates the word
    ix = stoi[ch]
    X += [context]
    Y += [ix]
    print(''.join([itos[i] for i in context]), '--->', itos[ix])
    context = [*context[1:], ix]  # slide the window: drop the oldest, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

: 

In [None]:
# Peek at the first five (context, target) pairs
X[:5], Y[:5]

: 

In [None]:
# Shapes and dtypes of the inputs X and the targets Y
X.shape, X.dtype, Y.shape, Y.dtype

: 

In [None]:
# Space embedding
# C is the lookup table: 27 rows, each a randomly initialised N-dimensional
# vector.
N = 2 # dimension of the embedding
C = torch.randn((27, N))
C

: 

In [None]:
# Indexing C with a list of integers returns the corresponding rows, one per
# index (repeated indices are allowed).
C[[0,1,1,0,4]]

: 

In [None]:
# Embedding the input X
# Indexing C with the integer tensor X looks up one row of C per entry of X,
# so emb has the shape of X plus a trailing embedding dimension.
emb = C[X]
print (X.shape, emb.shape)
emb[:5]

: 

In [None]:
# Change the shape of a tensor using view
# Start from a 1-D tensor holding the integers 0..17.
a = torch.arange(18)
a

: 

In [None]:
# Reinterpret the elements as 3x3 blocks; -1 lets torch infer the leading
# dimension from the number of elements.
a = a.view(-1,3,3)
print(a.shape)
# Display the storage behind the reshaped tensor.
# NOTE(review): Tensor.storage() is reported as deprecated by recent PyTorch
# releases — confirm against the installed version.
a.storage()

: 

In [None]:
# Flatten the embeddings (not X itself): (n, 2, 2) -> (n, 4), so that each
# example becomes one row holding its two 2-dimensional character embeddings.
emb = emb.view(-1,4)
emb

: 

In [None]:
# First layer of neurons (100 of them)
# W1 maps the 4 flattened embedding values to 100 hidden units; b1 holds one
# bias per hidden unit.
W1 = torch.randn((4, 100))
b1 = torch.randn(100)

: 

In [None]:
# Hidden layer processing
# Affine transform of the flattened embeddings followed by tanh.
h = torch.tanh(emb.view(-1, 4) @ W1 + b1)
print(h.shape)
h

: 

In [None]:
# Mapping into 27 classes
# Output layer: 100 hidden activations -> one score per class.
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

: 

In [None]:
# Compute the logits and probabilities
logits = h @ W2 + b2

# Equivalent of using softmax: exponentiate the logits, then divide each row
# by its sum so that every row adds up to 1.
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)
prob.shape

# Built-in alternative:
# prob = F.softmax(logits, dim=1)

: 

In [None]:
# Predicted distribution for the first example (manual softmax)
prob[0]

: 

In [None]:
# Same row computed with the built-in softmax, for comparison with prob[0]
F.softmax(logits, dim=1)[0]

: 

In [None]:
# Evaluate the result
# Average negative log-likelihood assigned to the correct next character.
# The row indices are derived from Y instead of a hard-coded 32, so the cell
# keeps working when the number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

: 

In [None]:
# Problem with exp
# Hand-rolled softmax on a tiny logit vector. The "+ 0*10" term is a knob for
# experimenting: replace the 0 with a larger factor to shift every logit up
# and watch what exp() does with large inputs.
lg = torch.tensor([-3,0,3,0]) + 0*10
ct = torch.exp(lg)
pp = ct / torch.sum(ct)
pp

: 

In [None]:
# Use Cross Entropy instead
# F.cross_entropy is given the raw logits and the integer targets directly.
F.cross_entropy(logits, Y)

: 

In [None]:
# Network optimization
# Collect every trainable tensor in one list so that later cells can loop
# over them.
parameters = [C, W1, b1, W2, b2]
sum(p.nelement() for p in parameters) # number of parameters in total

: 

In [None]:
# Turn on gradient tracking for every parameter; the training cells read
# p.grad after calling loss.backward().
for p in parameters:
  p.requires_grad = True

: 

In [None]:
# Gradient descent on the small demo dataset: 100 full-batch steps
for _ in range(100):
  # forward pass
  emb = C[X]
  h = torch.tanh(emb.view(-1, 4) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Y)
  print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None  # drop the gradients of the previous step
  loss.backward()

  # update
  # Step under no_grad rather than writing through `.data`, which sidesteps
  # autograd's tracking of in-place modifications.
  with torch.no_grad():
    for p in parameters:
      p += -0.1 * p.grad

: 

In [None]:
# Overfitting
# Compare the maximum along dim 1 of the logits (max returns both the values
# and their indices) with the true targets Y.
print(logits.max(1))
Y

: 

In [None]:
# Data shuffling
import random
# words[:10] is a new list, so shuffling it leaves `words` untouched.
words_aux = words[:10]
print(words_aux[:10])
random.shuffle(words_aux)  # shuffles the list in place
print(words_aux[:10])

: 

In [None]:
# Building the training, dev/validation and test datasets
# Split the dataset into training split, dev/validation split, test split
# 80%, 10%, 10%

block_size = 2 # context length: how many characters do we take to predict the next one?

def build_dataset(words, context_len=None, vocab=None):
  """Turn a list of words into (context, next-character) tensors.

  Every word is terminated with '.', and a window of `context_len` character
  indices slides over it; the window starts out filled with index 0.

  Args:
    words: iterable of strings.
    context_len: number of preceding characters per example. Defaults to the
      notebook-level `block_size`.
    vocab: mapping from character to integer index. Defaults to the
      notebook-level `stoi`.

  Returns:
    (X, Y): X is an int64 tensor of shape (num_examples, context_len) with the
    contexts; Y is an int64 tensor of shape (num_examples,) with the index of
    the character that follows each context.

  Raises:
    KeyError: if a word contains a character that is missing from `vocab`.
  """
  if context_len is None:
    context_len = block_size
  if vocab is None:
    vocab = stoi

  X, Y = [], []
  for w in words:
    context = [0] * context_len
    for ch in w + '.':
      ix = vocab[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  # Explicit dtype and shape keep the result well-formed for an empty word
  # list too (torch.tensor([]) alone would be a 1-D float tensor).
  X = torch.tensor(X, dtype=torch.long).reshape(len(X), context_len)
  Y = torch.tensor(Y, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y

# Shuffle a copy with a fixed seed so that re-running this cell always yields
# the same split. Shuffling `words` in place would start each re-run from the
# already-shuffled order and move names between the splits.
random.seed(42)
words_shuffled = list(words)
random.shuffle(words_shuffled)
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

Xtr, Ytr = build_dataset(words_shuffled[:n1])      # first 80%: training
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])  # next 10%: dev/validation
Xte, Yte = build_dataset(words_shuffled[n2:])      # last 10%: test

: 

In [None]:
# Fresh parameters for the run on the training split, all drawn from a single
# seeded generator so that the initialisation is reproducible.
g = torch.Generator().manual_seed(2461359) # for reproducibility
# The tensors are drawn in the order C, W1, b1, W2, b2.
C, W1, b1, W2, b2 = (
  torch.randn(shape, generator=g)
  for shape in [(27, 2), (4, 100), (100,), (100, 27), (27,)]
)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
  p.requires_grad_(True)

: 

In [None]:
# Loss history; the training loop appends one value per step.
# Re-running this cell clears the history.
lossi = []

: 

In [None]:
# Full-batch gradient descent on the training split: 300 steps
for i in range(300):
  # minibatches
  # ix = torch.randint(Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr]
  h = torch.tanh(emb.view(-1, 4) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ytr)
  print(i, loss.item())
  lossi.append(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None  # drop the gradients of the previous step
  loss.backward()

  # update
  # Step under no_grad rather than writing through `.data`, which sidesteps
  # autograd's tracking of in-place modifications.
  with torch.no_grad():
    for p in parameters:
      p += -0.1 * p.grad

print(loss.item())

: 

In [None]:
# Evaluate in the training set
# No gradients are needed here, so the forward pass runs under no_grad and
# does not build an autograd graph over the whole split.
with torch.no_grad():
  emb = C[Xtr]
  h = torch.tanh(emb.view(-1, 4) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ytr)
loss

: 

In [None]:
# Plot the recorded training loss, one point per step (linear scale).
# Log-scale alternative, kept for reference:
# import numpy as np
# from numpy import log
# plt.plot(log(np.array(lossi)))
plt.plot(lossi)

: 

In [None]:
# Evaluate in the validation set (is the model overfitting?)
# No gradients are needed here, so the forward pass runs under no_grad and
# does not build an autograd graph.
with torch.no_grad():
  emb = C[Xdev]
  h = torch.tanh(emb.view(-1, 4) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ydev)
loss

: 

In [None]:
# Visualize the embedding matrix for all characters
plt.figure(figsize=(10,10))
# detach() hands matplotlib gradient-free views of the two embedding columns
plt.scatter(C[:,0].detach(), C[:,1].detach(), s=200)
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')
# The first positional argument of plt.grid is the on/off flag, so the string
# 'minor' only ever switched the grid on; state that explicitly.
plt.grid(True)

: 

In [None]:
# Sampling from the model
# A dedicated seeded generator makes the 20 generated names reproducible.
g = torch.Generator().manual_seed(2461359)

for _ in range(20):
    out = []
    context = block_size * [0] # begin with a window filled with index 0
    ix = None
    while ix != 0:  # stop once index 0 has been drawn
      emb = C[torch.tensor([context])] # (1, block_size, d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      # draw the next character index from the predicted distribution
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = [*context[1:], ix]
      out += [ix]

    print(''.join([itos[i] for i in out]))

: 

In [None]:
# Evaluate in the test set
# No gradients are needed here, so the forward pass runs under no_grad and
# does not build an autograd graph.
with torch.no_grad():
  emb = C[Xte]
  h = torch.tanh(emb.view(-1, 4) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Yte)
loss

: 

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

: 

In [None]:
# t-SNE projector onto 2 dimensions. A fixed random_state makes the
# projection repeatable from run to run.
tsne = TSNE(n_components=2, verbose=1, perplexity=6, random_state=42)

: 

In [None]:
# Project the embedding table: detach it from autograd and hand scikit-learn
# a plain NumPy array.
tsne_proj = tsne.fit_transform(C.detach().numpy())

: 

In [None]:
# Visualize the two t-SNE components of the embedding matrix C for all characters
plt.figure(figsize=(10,10))
# Index the projection's columns directly. The `.data` suffix was a leftover
# from the tensor version of this plot; on a NumPy array it yields a raw
# memoryview.
plt.scatter(tsne_proj[:,0], tsne_proj[:,1], s=200)
for i in range(tsne_proj.shape[0]):
    plt.text(tsne_proj[i,0], tsne_proj[i,1], itos[i], ha="center", va="center", color='white')
# The first positional argument of plt.grid is the on/off flag, so the string
# 'minor' only ever switched the grid on; state that explicitly.
plt.grid(True)

: 