In [1]:
import torch
import torch.nn.functional as F

from layers import Embedding, FlattenConsecutive, Linear, BatchNorm1d, Tanh, Sequential

In [2]:
# dataset set up
# Read one name per line; the context manager closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open("names.txt", "r") as names_file:
    names = names_file.read().splitlines()
block_size = 3 # context window (in bigram block_size = 1)
vocab = set([ch for name in names for ch in name])
# Characters get indices 1..len(vocab) in sorted order; index 0 is reserved
# for the "." token, which both pads the initial context and ends a name.
stoi = {ch: i+1 for i, ch in enumerate(sorted(vocab))}
stoi["."] = 0
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(vocab) + 1

# X rows are the `block_size` previous character indices,
# Y rows are the single index of the character that follows them.
X = []
Y = []

for name in names:
    context = [0] * block_size          # start from an all-"." context
    for ch in (name + '.'):
        ix = stoi[ch]
        X.append(context)
        Y.append([ix])
        context = context[1:] + [ix]    # slide the window one character forward

X = torch.tensor(X)
Y = torch.tensor(Y)
X, Y

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         ...,
         [26, 26, 25],
         [26, 25, 26],
         [25, 26, 24]]),
 tensor([[ 5],
         [13],
         [13],
         ...,
         [26],
         [24],
         [ 0]]))

In [3]:
# 80/10/10 train/val/test split.
# The boundaries are computed over example rows of X (one row per character),
# not over the number of names, because they are used to slice X and Y.
# The split is deliberately contiguous and unshuffled, so X_train/Y_train stay
# row-aligned prefixes of X/Y.
num_examples = X.shape[0]
n1 = int(0.8 * num_examples)
n2 = int(0.9 * num_examples)

X_train, Y_train = X[:n1], Y[:n1]
X_val, Y_val = X[n1:n2], Y[n1:n2]
X_test, Y_test = X[n2:], Y[n2:]   # labels must use the same slice as the inputs

In [4]:
# MLP w/out nn.Module

emb_size = 10
hidden_dim = 200

ch2emb = torch.randn((vocab_size, emb_size)) # |V|, emb_size ; lookup table of embeddings for each character

# kaiming initialization for tanh: gain (5/3) divided by sqrt(fan_in),
# where fan_in = emb_size * block_size
w1 = torch.randn((emb_size * block_size, hidden_dim)) * ((5/3)/(emb_size*block_size)**(1/2))
# b1 = torch.randn(hidden_dim) # will be "removed" in batch normalization

# learnable batch-norm scale and shift
gamma = torch.ones(hidden_dim)
beta = torch.zeros(hidden_dim)

# Running statistics used in place of batch statistics when sampling.
# The std starts at ones (not zeros) so that normalising with the running
# stats is a plain identity scale before training, and the moving average is
# not dragged towards a near-zero divisor during the first steps.
running_mean = torch.zeros(hidden_dim)
running_std = torch.ones(hidden_dim)

w2 = torch.randn((hidden_dim, vocab_size)) * 0.1 # reduce the size of the initial logits
b2 = torch.randn(vocab_size) * 0.01

# Only these tensors are trained; the running statistics are not parameters.
parameters = [ch2emb, w1, w2, b2, gamma, beta]

for param in parameters:
    param.requires_grad = True

In [5]:
# training loop

epochs = 1000      # number of mini-batch steps (each step sees one random batch)
batch_size = 128
epsilon = 1e-5     # added to the batch variance before the square root
momentum = 0.1     # weight of the current batch in the running statistics
lr = 0.05          # constant learning rate, hoisted out of the loop

for epoch in range(epochs):

    # mini batch creation and getting embeddings
    # Sample from the training split only, so val/test rows stay held out.
    batch_idx = torch.randint(0, X_train.shape[0], size=(batch_size,))
    # Select the batch rows first, then look up embeddings; this avoids
    # embedding the whole dataset on every step.
    emb = ch2emb[X_train[batch_idx]]                # batch_size, block_size, emb_size
    emb = emb.view((-1, emb_size * block_size))     # batch_size, emb_size * block_size
    ys = Y_train[batch_idx].view(-1)                # batch_size

    # forward pass
    hidden_emb1 = emb @ w1                          # batch_size, hidden_dim
    # batch normalization
    bn1_mean_i = hidden_emb1.mean(dim=0, keepdim=True)
    bn1_std_i = (hidden_emb1.std(dim=0, keepdim=True)**2 + epsilon)**(1/2)  # sqrt(var + eps)
    bn1 = gamma * (hidden_emb1 - bn1_mean_i)/bn1_std_i + beta

    # for eval/generating examples: exponential moving average of the batch
    # statistics, kept outside the autograd graph
    with torch.no_grad():
        running_mean = (1-momentum) * running_mean + momentum * bn1_mean_i
        running_std = (1-momentum) * running_std + momentum * bn1_std_i

    layer1 = bn1.tanh()                             # batch_size, hidden_dim
    logits = layer1 @ w2 + b2                       # batch_size, vocab_size

    # negative log likelihood (nll)
    # F.cross_entropy takes the raw logits and integer class targets. It gives
    # the same value as the manual route (exp -> normalise each row -> pick the
    # target probability -> -log -> mean) without materialising the probabilities.
    loss = F.cross_entropy(logits, ys)
    print(loss)

    # backward pass
    for param in parameters:
        param.grad = None
    loss.backward()

    # update step (plain SGD); done under no_grad so the in-place update is
    # not tracked by autograd
    with torch.no_grad():
        for param in parameters:
            param -= lr * param.grad




tensor(3.5509, grad_fn=<NllLossBackward0>)
tensor(3.6068, grad_fn=<NllLossBackward0>)
tensor(3.6941, grad_fn=<NllLossBackward0>)
tensor(3.4474, grad_fn=<NllLossBackward0>)
tensor(3.5186, grad_fn=<NllLossBackward0>)
tensor(3.4612, grad_fn=<NllLossBackward0>)
tensor(3.4477, grad_fn=<NllLossBackward0>)
tensor(3.3842, grad_fn=<NllLossBackward0>)
tensor(3.5232, grad_fn=<NllLossBackward0>)
tensor(3.4934, grad_fn=<NllLossBackward0>)
tensor(3.4962, grad_fn=<NllLossBackward0>)
tensor(3.2568, grad_fn=<NllLossBackward0>)
tensor(3.4449, grad_fn=<NllLossBackward0>)
tensor(3.2531, grad_fn=<NllLossBackward0>)
tensor(3.4183, grad_fn=<NllLossBackward0>)
tensor(3.4034, grad_fn=<NllLossBackward0>)
tensor(3.2134, grad_fn=<NllLossBackward0>)
tensor(3.2556, grad_fn=<NllLossBackward0>)
tensor(3.5068, grad_fn=<NllLossBackward0>)
tensor(3.2842, grad_fn=<NllLossBackward0>)
tensor(3.2300, grad_fn=<NllLossBackward0>)
tensor(3.3933, grad_fn=<NllLossBackward0>)
tensor(3.2615, grad_fn=<NllLossBackward0>)
tensor(3.44

In [6]:
# generating samples

num_names = 3

# Sampling is inference only. Without no_grad every forward pass would build
# an autograd graph, because the weights have requires_grad=True.
with torch.no_grad():
    for _ in range(num_names):

        context = [0] * block_size      # start from an all-"." context
        name = ''

        while True:
            # embed the current context and flatten it into a single row
            emb = ch2emb[context]
            emb = emb.view((1, -1))                     # 1, emb_size * block_size

            # forward pass
            hidden_emb1 = emb @ w1
            # batch normalization with the running statistics collected during
            # training (a batch of one has no usable batch statistics)
            bn1 = gamma * (hidden_emb1 - running_mean)/(running_std**2 + epsilon)**(1/2) + beta

            layer1 = bn1.tanh()                         # 1, hidden_dim
            logits = layer1 @ w2 + b2                   # 1, vocab_size

            probs = F.softmax(logits, dim=1)
            # sample from the distribution
            ix = torch.multinomial(probs, num_samples=1, replacement=True).item()

            name += itos[ix]
            context = context[1:] + [ix]                # slide the window forward

            # index 0 is the "." end-of-name token
            if ix == 0:
                break
        print(name)




delliexx.
jon.
oy.


In [10]:
# MLP assembled from the layer classes imported from `layers`
# (embedding -> flatten the context -> linear -> batch norm -> tanh -> linear)

emb_size = 10
hidden_dim = 300

model = Sequential([
    # character indices -> embedding vectors
    Embedding(vocab_size, emb_size),
    # hidden block; the linear layer has no bias because batch norm follows it
    FlattenConsecutive(block_size),
    Linear(emb_size * block_size, hidden_dim, bias=False),
    BatchNorm1d(hidden_dim),
    Tanh(),
    # output projection to one logit per vocabulary entry
    Linear(hidden_dim, vocab_size),
])

# Shrink the last layer's weights in place so the initial logits are small.
with torch.no_grad():
    model.layers[-1].W *= 0.1

parameters = model.parameters()

# Enable gradient tracking on every parameter the model exposes.
for param in parameters:
    param.requires_grad = True

In [None]:
# training loop

epochs = 1000      # number of mini-batch steps
batch_size = 128
lr = 0.05          # constant learning rate, hoisted out of the loop

for epoch in range(epochs):

    # mini batch creation
    batch_idx = torch.randint(0, X_train.shape[0], size=(batch_size,))
    xs = X_train[batch_idx]
    # Take the labels from the same split as the inputs, so the pairing does
    # not depend on X_train happening to be a prefix of X.
    ys = Y_train[batch_idx].view(-1)

    # forward pass
    logits = model(xs)
    loss = F.cross_entropy(logits, ys)

    # backward pass
    for param in parameters:
        param.grad = None
    loss.backward()

    # update step (plain SGD); done under no_grad so the in-place update is
    # not tracked by autograd
    with torch.no_grad():
        for param in parameters:
            param -= lr * param.grad

tensor(3.2894, grad_fn=<NllLossBackward0>)
tensor(3.2624, grad_fn=<NllLossBackward0>)
tensor(3.2260, grad_fn=<NllLossBackward0>)
tensor(3.2097, grad_fn=<NllLossBackward0>)
tensor(3.1623, grad_fn=<NllLossBackward0>)
tensor(3.1515, grad_fn=<NllLossBackward0>)
tensor(3.1037, grad_fn=<NllLossBackward0>)
tensor(3.0719, grad_fn=<NllLossBackward0>)
tensor(3.0868, grad_fn=<NllLossBackward0>)
tensor(3.0439, grad_fn=<NllLossBackward0>)
tensor(3.0632, grad_fn=<NllLossBackward0>)
tensor(2.9917, grad_fn=<NllLossBackward0>)
tensor(2.9982, grad_fn=<NllLossBackward0>)
tensor(2.9478, grad_fn=<NllLossBackward0>)
tensor(2.9364, grad_fn=<NllLossBackward0>)
tensor(3.0414, grad_fn=<NllLossBackward0>)
tensor(2.9606, grad_fn=<NllLossBackward0>)
tensor(2.9542, grad_fn=<NllLossBackward0>)
tensor(2.9145, grad_fn=<NllLossBackward0>)
tensor(2.8968, grad_fn=<NllLossBackward0>)
tensor(3.0185, grad_fn=<NllLossBackward0>)
tensor(2.9426, grad_fn=<NllLossBackward0>)
tensor(2.8445, grad_fn=<NllLossBackward0>)
tensor(2.85

In [12]:
# Clear the `training` flag on every layer before sampling.
# NOTE(review): presumably BatchNorm1d switches to its running statistics when
# this flag is False; confirm in layers.py.
for layer in model.layers:
    setattr(layer, "training", False)

In [13]:
# generating samples

num_names = 3

# Sampling is inference only; no_grad avoids building an autograd graph for
# every generated character.
with torch.no_grad():
    for _ in range(num_names):
        context = [0] * block_size      # start from an all-"." context
        name = ''

        while True:
            # forward pass on a batch containing just the current context
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits, dim=1)
            # sample from the distribution
            ix = torch.multinomial(probs, num_samples=1, replacement=True).item()

            name += itos[ix]
            context = context[1:] + [ix]    # slide the window forward

            # index 0 is the "." end-of-name token
            if ix == 0:
                break
        print(name)




hamaniya.
aylyne.
amgeqy.


# Wavenet Section

In [21]:
# dataset set up
# Read one name per line; the context manager closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open("names.txt", "r") as names_file:
    names = names_file.read().splitlines()
block_size = 8 # context window (in bigram block_size = 1)
vocab = set([ch for name in names for ch in name])
# Characters get indices 1..len(vocab) in sorted order; index 0 is reserved
# for the "." token, which both pads the initial context and ends a name.
stoi = {ch: i+1 for i, ch in enumerate(sorted(vocab))}
stoi["."] = 0
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(vocab) + 1

# X rows are the `block_size` previous character indices,
# Y entries are the index of the character that follows them.
X = []
Y = []

for name in names:
    context = [0] * block_size          # start from an all-"." context
    for ch in (name + '.'):
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]    # slide the window one character forward

X = torch.tensor(X)
Y = torch.tensor(Y)

# 80/10/10 train/val/test split.
# The boundaries are computed over example rows of X (one row per character),
# not over the number of names, because they are used to slice X and Y.
# The split is deliberately contiguous and unshuffled, so X_train/Y_train stay
# row-aligned prefixes of X/Y.
num_examples = X.shape[0]
n1 = int(0.8 * num_examples)
n2 = int(0.9 * num_examples)

X_train, Y_train = X[:n1], Y[:n1]
X_val, Y_val = X[n1:n2], Y[n1:n2]
X_test, Y_test = X[n2:], Y[n2:]   # labels must use the same slice as the inputs
X, Y

(tensor([[ 0,  0,  0,  ...,  0,  0,  0],
         [ 0,  0,  0,  ...,  0,  0,  5],
         [ 0,  0,  0,  ...,  0,  5, 13],
         ...,
         [ 0,  0,  0,  ..., 26, 26, 25],
         [ 0,  0,  0,  ..., 26, 25, 26],
         [ 0,  0,  0,  ..., 25, 26, 24]]),
 tensor([ 5, 13, 13,  ..., 26, 24,  0]))

In [24]:
# Hierarchical model: three identical "flatten pairs -> linear -> batch norm
# -> tanh" stages stacked on top of the embedding, then an output projection.
# NOTE(review): presumably each FlattenConsecutive(2) concatenates adjacent
# positions, shrinking the block_size=8 context 8 -> 4 -> 2 -> 1; confirm in
# layers.py.
emb_size = 24
hidden_dim = 200

model = Sequential([
    Embedding(vocab_size, emb_size),
    # stage 1: input width is 2 * emb_size
    FlattenConsecutive(2),
    Linear(2 * emb_size, hidden_dim, bias=False),
    BatchNorm1d(hidden_dim),
    Tanh(),
    # stage 2: input width is 2 * hidden_dim
    FlattenConsecutive(2),
    Linear(2 * hidden_dim, hidden_dim, bias=False),
    BatchNorm1d(hidden_dim),
    Tanh(),
    # stage 3: input width is 2 * hidden_dim
    FlattenConsecutive(2),
    Linear(2 * hidden_dim, hidden_dim, bias=False),
    BatchNorm1d(hidden_dim),
    Tanh(),
    # one logit per vocabulary entry
    Linear(hidden_dim, vocab_size),
])

# Shrink the last layer's weights in place so the initial logits are small.
with torch.no_grad():
    model.layers[-1].W *= 0.1

parameters = model.parameters()

# Enable gradient tracking on every parameter the model exposes.
for p in parameters:
    p.requires_grad = True

In [25]:
# training loop

epochs = 1000      # number of mini-batch steps
batch_size = 128
lr = 0.05          # constant learning rate, hoisted out of the loop

for epoch in range(epochs):

    # mini batch creation
    batch_idx = torch.randint(0, X_train.shape[0], size=(batch_size,))
    xs = X_train[batch_idx]
    # Take the labels from the same split as the inputs, so the pairing does
    # not depend on X_train happening to be a prefix of X.
    ys = Y_train[batch_idx].view(-1)

    # forward pass
    logits = model(xs)
    loss = F.cross_entropy(logits, ys)

    # backward pass
    for param in parameters:
        param.grad = None
    loss.backward()
    print(loss.item())

    # update step (plain SGD); done under no_grad so the in-place update is
    # not tracked by autograd
    with torch.no_grad():
        for param in parameters:
            param -= lr * param.grad

3.2894446849823
3.292128562927246
3.248857259750366
3.2474045753479004
3.230952501296997
3.201843500137329
3.20062255859375
3.1586461067199707
3.1550023555755615
3.1177287101745605
3.1279847621917725
3.050001621246338
3.0574021339416504
3.0316905975341797
3.0652782917022705
3.0694491863250732
2.9013493061065674
2.968414306640625
2.960031509399414
2.9912269115448
2.982114791870117
3.05415415763855
2.820807933807373
2.8316924571990967
2.743335485458374
2.88623046875
2.97670316696167
2.873579263687134
2.9468770027160645
2.9458916187286377
2.8086752891540527
2.8504011631011963
2.8736510276794434
2.838120460510254
2.850911855697632
2.832043409347534
2.8063714504241943
2.843447208404541
2.8314740657806396
2.781594753265381
2.840872287750244
2.802152633666992
2.90743350982666
2.717411756515503
2.7977685928344727
2.7603647708892822
2.9353370666503906
2.747422218322754
2.740048408508301
2.8093421459198
2.699428081512451
2.6177637577056885
2.761293411254883
2.851919174194336
2.6821205615997314
2