In [17]:
import random
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

# read data

In [18]:
# Read the corpus as a list of lines. The context manager closes the file
# handle deterministically instead of leaving it to garbage collection, and
# the explicit encoding avoids depending on the platform default.
with open('../data/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

In [19]:
# Number of lines read from the names file.
len(words)

32033

In [20]:
# Peek at the first five entries.
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [21]:
# Every distinct character that occurs in any word, as a sorted list.
chars = sorted({ch for word in words for ch in word})

# Utilities: character/index lookup tables

In [22]:
# Index -> character table. Index 0 is reserved for '.'; the corpus
# characters follow at 1..len(chars), in the order of `chars`.
itos = dict(enumerate(['.'] + chars))

In [23]:
# Character -> index table: the inverse of `itos`.
stoi = dict(zip(itos.values(), itos.keys()))

In [24]:
# Vocabulary size: the number of entries in `stoi`.
vocab_size = len(stoi)
vocab_size

27

# Build a neural language model

## prepare data

In [25]:
# Context length: build_dataset uses this many preceding character indices
# as the input row for each target character.
block_size = 8

In [26]:
def build_dataset(data):
    """Turn words into (context, next-character) training pairs.

    Each word is terminated with '.' and scanned left to right. For every
    character, the `block_size` preceding character indices (left-padded with
    0) form one input row and the character's own index is the target.

    Args:
        data: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is
        the total number of characters in `data` plus one terminator per word.

    Raises:
        KeyError: if a word contains a character that is missing from `stoi`.
    """
    X, Y = [], []
    for w in data:
        context = [0] * block_size  # start from an all-padding window
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # Slide the window. This builds a new list, so rows already
            # appended to X are not modified.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep an empty `data` from producing float
    # tensors of shape (0,), which would not match the non-empty case.
    X = torch.tensor(X, dtype=torch.long).view(-1, block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

In [27]:
# Shuffle a copy so that re-running this cell reproduces the same split
# instead of reshuffling an already-shuffled `words` list in place.
random.seed(42)
words_shuffled = words.copy()
random.shuffle(words_shuffled)

# 80% train / 10% dev / 10% test, split by word.
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

Xtr, Ytr = build_dataset(words_shuffled[:n1])
Xdv, Ydv = build_dataset(words_shuffled[n1:n2])
Xte, Yte = build_dataset(words_shuffled[n2:])

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


# Custom modules

In [28]:
class FlattenConsecutive(nn.Module):
    """Merge every `n` consecutive positions of a (B, T, C) tensor.

    The input is viewed as (B, T // n, C * n). If that leaves a single
    position (T // n == 1), the middle dimension is squeezed away and the
    result has shape (B, C * n).
    """

    def __init__(self, n):
        super().__init__()
        self.n = n  # number of consecutive positions merged into one

    def forward(self, x):
        # Implemented as `forward` rather than `__call__` so that
        # nn.Module.__call__ stays in charge and registered hooks still run.
        B, T, C = x.shape
        x = x.view(B, T // self.n, C * self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x  # the last output is kept on the module as `out`
        return self.out

# ----------------------------------    
class tt(nn.Module):
    """Swap dimensions 1 and 2 of the input and make the result contiguous.

    In the model below it is placed immediately before and after BatchNorm1d
    on 3-D activations.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Implemented as `forward` rather than `__call__` so that
        # nn.Module.__call__ stays in charge and registered hooks still run.
        x = x.transpose(1, 2).contiguous()
        self.out = x  # the last output is kept on the module as `out`
        return self.out
    
# ----------------------------------    
class Swap(nn.Module):
    """Identity placeholder: returns its input unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Implemented as `forward` rather than `__call__` so that
        # nn.Module.__call__ stays in charge and registered hooks still run.
        return x

# Train a neural language model

## define model

In [29]:
n_embd = 10    # embedding width per character; 24 is the noted alternative
n_hidden = 68  # width of every hidden layer; 128 is the noted alternative

# Each FlattenConsecutive(2) stage halves the number of positions. With
# block_size = 8 that is 8 -> 4 -> 2 -> 1, and the last stage squeezes the
# position axis away, so the final BatchNorm1d receives a 2-D input and needs
# no tt() around it.
model = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), nn.Linear(n_embd * 2, n_hidden, bias=False), tt(), nn.BatchNorm1d(n_hidden), tt(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden * 2, n_hidden, bias=False), tt(), nn.BatchNorm1d(n_hidden), tt(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden * 2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
    nn.Linear(n_hidden, vocab_size),
)

# model.parameters() returns a one-shot iterator. Materialize it as a list so
# the loop below still sees every parameter after the count has consumed it.
parameters = list(model.parameters())
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

22397


## train model

In [30]:
batch_size = 32  # number of examples sampled per training step
# Put the model in training mode.
model.train()

Sequential(
  (0): Embedding(27, 10)
  (1): FlattenConsecutive()
  (2): Linear(in_features=20, out_features=68, bias=False)
  (3): tt()
  (4): BatchNorm1d(68, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): tt()
  (6): Tanh()
  (7): FlattenConsecutive()
  (8): Linear(in_features=136, out_features=68, bias=False)
  (9): tt()
  (10): BatchNorm1d(68, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): tt()
  (12): Tanh()
  (13): FlattenConsecutive()
  (14): Linear(in_features=136, out_features=68, bias=False)
  (15): BatchNorm1d(68, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (16): Tanh()
  (17): Linear(in_features=68, out_features=27, bias=True)
)

In [31]:
# AdamW over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
# Per-step training history: raw loss and log10(loss).
loglossi = []
lossi = []
STEPS = 200_000

# Set training mode here as well, so re-running this cell after model.eval()
# does not silently train in evaluation mode.
model.train()

for i in tqdm(range(STEPS)):
    # mini-batch: sample `batch_size` row indices uniformly, with replacement
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    # backward
    optimizer.zero_grad(set_to_none=True)
    loss.backward()

    # update
    optimizer.step()

    # tracking
    loglossi.append(loss.log10().item())
    lossi.append(loss.item())

 26%|███████████████                                           | 51728/200000 [01:29<04:17, 576.19it/s]

In [None]:
group_size = 200  # training steps averaged into each plotted point

# Only whole groups can be reshaped into rows of `group_size`. Drop the
# trailing partial group so the plot also works when training was stopped
# at a step count that is not a multiple of `group_size`.
n_groups = len(lossi) // group_size
n_used = n_groups * group_size

plt.figure(figsize=(12,3))
plt.plot(
    torch.arange(n_used).view(n_groups, group_size).float().mean(1),
    torch.tensor(lossi[:n_used]).view(n_groups, group_size).mean(1)
)
plt.xlabel('training step')
plt.ylabel(f'loss (mean over {group_size} steps)')
plt.grid()
plt.show()

## compute loss on train/dev/test

In [None]:
# Switch the model to evaluation mode before measuring losses.
model.eval()

In [None]:
@torch.no_grad()
def eval_dataset(X, Y):
    """Print the cross-entropy loss of the global `model` on (X, Y).

    The whole split is pushed through the model in one forward pass with
    gradient tracking disabled. Nothing is returned.
    """
    print(F.cross_entropy(model(X), Y).item())

In [None]:
# Print the loss on the train split, then on the dev split.
eval_dataset(Xtr, Ytr)
eval_dataset(Xdv, Ydv)
# The test-split evaluation is left disabled.
# eval_dataset(Xte, Yte)

In [None]:
# NOTE(review): this cell repeats the train/dev evaluation of the previous
# cell verbatim; consider removing one of the two.
eval_dataset(Xtr, Ytr)
eval_dataset(Xdv, Ydv)
# eval_dataset(Xte, Yte)