In [1]:
import time
import random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [4]:
fp = Path('/content/drive/MyDrive/colab_notebooks/data/text/names.txt')

In [271]:
names = list(set(open(fp, 'r').read().splitlines()))
random.shuffle(names)

print('number of names:', len(names))

number of names: 29494


In [660]:
names_concat = '.' + '.'.join(names) + '.'

In [662]:
abc = ['.'] + sorted(list(set(''.join(names))))
len_abc = len(abc)

itoc = {i:c for i,c in enumerate(abc)}
ctoi = {c:i for i,c in enumerate(abc)}

print(len_abc, abc)

27 ['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [663]:
def encode(s: str) -> list[int]:
    return [ctoi[c] for c in s]

def decode(l: list[int]) -> str:
    return ''.join([itoc[i] for i in l])

def enc2tnsr(l: list[int]) -> torch.Tensor():
    return torch.tensor(l).long()

def enc2seq(l: list[int]) -> torch.Tensor:
    return F.one_hot(enc2tnsr(l), len_abc).float()

def enct2seq(t: torch.Tensor) -> torch.Tensor:
    return F.one_hot(t, len_abc).float()

def str2seq(s: str) -> torch.Tensor:
    encoded = torch.tensor(encode(s)).long()
    return F.one_hot(encoded, len_abc).float()

In [664]:
def str_to_samples(s: str) -> tuple:
    seqs, trgts, lens = [], [], []
    for i in range(1, len(s)):
        seqs.append(s[0:i])
        trgts.append(s[i])
        lens.append(i)
    return (seqs, trgts, lens)

In [665]:
word = '.creatures.'
x, t, l = str_to_samples(word)

for i in range(len(x)):
    print(x[i], t[i], l[i])

. c 1
.c r 2
.cr e 3
.cre a 4
.crea t 5
.creat u 6
.creatu r 7
.creatur e 8
.creature s 9
.creatures . 10


In [628]:
X, Y, L = [], [], []

for name in names:
    name = enc2tnsr(encode(f'.{name}.'))
    x, y, l = str_to_samples(name)
    X.extend(x)
    Y.extend(y)
    L.extend(l)

X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_side='left')
Y = torch.tensor(Y).long()
L = torch.tensor(L).long()

print('number of samples:', len(X))

number of samples: 211328


In [666]:
block_size = 8

X, Y = [], []
for i in range(0, len(names_concat)-block_size):
    j, k = i, i+block_size
    X.append(encode(names_concat[j:k]))
    Y.append(ctoi[names_concat[k]])

X = torch.tensor(X).long()
Y = torch.tensor(Y).long()

print(len(X))

211321


In [667]:
for i in range(len(names[0])):
    print(decode(X[i].tolist()), itoc[Y[i].item()], L[i].item())

.loran.o p 1
loran.op i 2
oran.opi e 3
ran.opie . 4
an.opie. b 5


In [668]:
train_frac = 0.8
valid_frac = 0.1
test_frac = 0.1

n = len(X)
n1 = int(train_frac*n)
n2 = n1+int(valid_frac*n)
n3 = n2+int(test_frac*n)

X_train, Y_train, L_train = X[0:n1], Y[0:n1], L[0:n1]
X_valid, Y_valid, L_valid = X[n1:n2], Y[n1:n2], L[n1:n2]
X_test, Y_test, L_test = X[n2:n3], Y[n2:n3], L[n2:n3]

print(f'# train examples: {len(X_train)}')
print(f'# validation examples: {len(X_valid)}')
print(f'# test examples: {len(X_test)}')

# train examples: 169056
# validation examples: 21132
# test examples: 21132


In [731]:
class Model(nn.Module):
    def __init__(self, in_dim, rnn_h_dim, mlp_h_dim, out_dim) -> None:
        super(Model, self).__init__()

        # self.emb = nn.Embedding(len_abc, in_dim)
        self.rnn = nn.RNN(in_dim, rnn_h_dim, batch_first=True, dropout=0.2)
        # self.rnn = nn.LSTM(in_dim, h_dim, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_h_dim, mlp_h_dim),
            nn.ReLU(),
            nn.Linear(mlp_h_dim, out_dim),
            nn.LogSoftmax(dim=0)
        )

    def forward(self, x: torch.Tensor, h: torch.Tensor=None) -> tuple:
        # x = self.emb(x)
        s, c = self.rnn(x, h)
        y = self.classifier(c[-1])
        return (c, y)
        # o, (h, c) = self.rnn(x, h)
        # y = self.lsm(self.linear(h[-1]))
        # return ((h, c), y)

@torch.no_grad
def calculate_loss(model, dataset_type, loss_fn):
    if dataset_type == 'test':
        x, y = X_test, Y_test
    elif dataset_type == 'validate':
        x, y = X_valid, Y_valid

    x, y = x.to(device), y.to(device)
    c, y_hat = model(enct2seq(x))
    # c, y_hat = model(x)
    # (h, c), y_hat = model(enct2seq(x))
    return loss_fn(y_hat, y)


In [736]:
IN_DIM = len_abc
RNN_H_DIM = 8
MLP_H_DIM = 8
OUT_DIM = len_abc

model = Model(IN_DIM, RNN_H_DIM, MLP_H_DIM, OUT_DIM).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.NLLLoss()



In [733]:
def pack_batch(batch_idxs):
    x, y, l = enct2seq(X_train[batch_idxs]), Y_train[batch_idxs], L_train[batch_idxs]
    l, idxs = l.sort(descending=True)
    return nn.utils.rnn.pack_padded_sequence(x[idxs], l, batch_first=True, enforce_sorted=True), y[idxs]

In [750]:
n_epoch = 4
batch_size = 64

model.train()
start_time = time.time()

for epoch in range(n_epoch):
    idxs = torch.randperm(len(X_train))
    batches = torch.split(idxs, batch_size)

    for batch in batches:
        optimizer.zero_grad()

        # x, y = pack_batch(batch)
        # c, y_hat = model(x.to(device))
        # loss_fn(y_hat, y.to(device)).backward()

        # (h, c), y_hat = model(enct2seq(X_train[batch]).to(device))
        c, y_hat = model(enct2seq(X_train[batch]).to(device))
        # c, y_hat = model(X_train[batch].to(device))
        loss_fn(y_hat, Y_train[batch].to(device)).backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    print(calculate_loss(model, 'validate', loss_fn).item())

print(f'training time: {round(time.time()-start_time)} sec.')

9.447746276855469
9.440382957458496
9.436433792114258
9.433266639709473
training time: 45 sec.


In [677]:
print(calculate_loss(model, 'test', loss_fn).item())

9.465780258178711


In [187]:
parameters_fp = '/content/drive/MyDrive/colab_notebooks/data/parameters/rnn_name_generator_2.pt'
# torch.save(model.state_dict(), parameters_fp)

In [None]:
# state_dict = torch.load(fp, weights_only=True)
# model.load_state_dict(state_dict)

In [783]:
@torch.no_grad()
def generate_name() -> str:
    model.eval()

    i, name = 0, []

    h = None
    # h = torch.zeros(1, H_DIM)
    # c = torch.zeros(1, H_DIM)

    while True:
        h, y = model(enc2seq([i]), h)
        # h, y = model(enc2tnsr([i]), h)
        # (h, c), y = model(enc2seq([i]), (h, c))
        i = torch.multinomial(torch.exp(y), 1, True).item()
        if i == 0:
            break
        name.append(i)

    return decode(name)

In [787]:
for i in range(10):
    print(generate_name())

fajegh
guahe
qaeah
jogad
authzo
gahgatfah
jatka
aaqahah
juudahuce
guzaif
