In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the corpus of names: the whole file is read, then split into lines
with open('names.txt') as names_file:
    raw_text = names_file.read()
words = raw_text.splitlines()
print(len(words), words[:10])

32033 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [3]:
# Character vocabulary: the special character takes index 0, followed by
# every distinct character found in `words`, in sorted order
SPECIAL_CHAR = '.'
alphabet = sorted(set(''.join(words)))
vocab = [SPECIAL_CHAR] + alphabet
itoc = dict(enumerate(vocab))                                # index -> character
ctoi = {char: index for index, char in enumerate(vocab)}    # character -> index
VOCAB_SIZE = len(vocab)
print(VOCAB_SIZE, ctoi)

27 {'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [4]:
# Turn every word into (context window, next character index) pairs
CONTEXT_DIM = 3

contexts, targets = [], []
for word in words:
    # Every word starts from a window filled with index 0
    window = [0] * CONTEXT_DIM
    # SPECIAL_CHAR is appended so the end of the word is a target too
    for char in word + SPECIAL_CHAR:
        target = ctoi[char]
        contexts.append(window)
        targets.append(target)
        # Slide the window: drop the oldest index, append the newest one
        window = window[1:] + [target]

X = torch.tensor(contexts)
y = torch.tensor(targets)

In [5]:
import os
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

BATCH_SIZE = 256
SEED = 42  # fixes both random splits so a re-run yields the same partitions

# os.cpu_count() may return None, and halving a single core yields 0 workers
NUM_WORKERS = (os.cpu_count() or 1) // 2

# 80% train; the remaining 20% is split 80/20 again -> 16% validation, 4% test.
# The intermediate split gets its own names instead of reusing X_test / y_test
# as both input and output of the second split.
X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.8, random_state=SEED)
X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, train_size=0.8, random_state=SEED)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)
test_ds = TensorDataset(X_test, y_test)

# prefetch_factor is only accepted together with worker processes, so both
# options are passed only when at least one worker is actually used.
loader_kwargs = {'batch_size': BATCH_SIZE}
if NUM_WORKERS > 0:
    loader_kwargs.update(num_workers=NUM_WORKERS, prefetch_factor=2)

# Training batches are reshuffled every epoch; validation order is irrelevant
train_dl = DataLoader(train_ds, shuffle=True, **loader_kwargs)
valid_dl = DataLoader(valid_ds, **loader_kwargs)
# Default DataLoader settings: one sample per iteration
test_dl = DataLoader(test_ds)

In [6]:
EMB_DIM = 3


class MLP(nn.Module):
    """Embedding lookup followed by a stack of fully connected layers.

    Args:
        outputs: Widths of the hidden layers, in order. The input width
            (``CONTEXT_DIM * EMB_DIM``) is prepended and the output width
            (``VOCAB_SIZE``) is appended automatically.
    """

    def __init__(self, outputs: list) -> None:
        super().__init__()
        self.emb = nn.Embedding(VOCAB_SIZE, EMB_DIM)
        self.flatten = nn.Flatten(start_dim=1)

        widths = [CONTEXT_DIM * EMB_DIM] + outputs + [VOCAB_SIZE]

        # One Linear per consecutive pair of widths, created in order
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths, widths[1:])]
        )

    def forward(self, x):
        """Embed ``x``, flatten it and run it through the linear stack.

        ReLU is applied after every layer except the last one, whose output
        is returned as is.
        """
        x = self.flatten(self.emb(x))
        *hidden_layers, output_layer = self.layers
        for layer in hidden_layers:
            x = F.relu(layer(x))
        return output_layer(x)

In [7]:
from os import device_encoding
from statistics import fmean

def fit(model, epochs, loss_func, optimizer, train_dl, valid_dl):
    """Train ``model`` and print the validation loss after every epoch.

    Args:
        model: Module to optimise; it is moved to CUDA when available,
            otherwise to the CPU.
        epochs: Number of full passes over ``train_dl``.
        loss_func: Callable ``loss_func(predictions, targets)`` returning a
            scalar tensor. The validation average assumes it returns the
            mean over the batch (the default of ``F.cross_entropy``).
        optimizer: Optimizer built over the parameters of ``model``.
        train_dl: Iterable of ``(inputs, targets)`` training batches.
        valid_dl: Iterable of ``(inputs, targets)`` validation batches.

    Returns:
        A list with one validation loss (float) per epoch. An epoch whose
        ``valid_dl`` yields no samples is recorded as ``nan``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    global_losses = []
    for epoch in range(epochs):
        model.train()
        for X, y in train_dl:
            X = X.to(device)
            y = y.to(device)
            # Clear gradients *before* backward so that anything left over
            # from an earlier (e.g. interrupted) run cannot leak into the
            # first parameter update.
            optimizer.zero_grad()
            loss = loss_func(model(X), y)
            loss.backward()
            optimizer.step()

        model.eval()
        loss_sum, n_samples = 0.0, 0
        with torch.no_grad():
            for X, y in valid_dl:
                X = X.to(device)
                y = y.to(device)
                batch_size = y.size(0)
                # Weight every batch by its size so that a smaller final
                # batch does not skew the epoch average.
                loss_sum += loss_func(model(X), y).item() * batch_size
                n_samples += batch_size
        valid_loss = loss_sum / n_samples if n_samples else float('nan')
        print(f'Epoch={epoch + 1} Validation Loss={valid_loss:.3f}')
        global_losses.append(valid_loss)
    return global_losses

In [8]:
# Seed PyTorch's global RNG so the weight initialisation is repeatable
torch.manual_seed(42)

model = MLP([16, 32])  # hidden layers of width 16 and 32
loss_func = F.cross_entropy
# RMSprop with its default hyper-parameters
optimizer = torch.optim.RMSprop(model.parameters())

In [9]:
# Train for 10 epochs; the trailing ';' keeps the cell from echoing a return value
fit(model, epochs=10, loss_func=loss_func, optimizer=optimizer, train_dl=train_dl, valid_dl=valid_dl);

Epoch=1 Validation Loss=2.340
Epoch=2 Validation Loss=2.318
Epoch=3 Validation Loss=2.305
Epoch=4 Validation Loss=2.296
Epoch=5 Validation Loss=2.293
Epoch=6 Validation Loss=2.293
Epoch=7 Validation Loss=2.288
Epoch=8 Validation Loss=2.287
Epoch=9 Validation Loss=2.286
Epoch=10 Validation Loss=2.285


In [10]:
from statistics import mean

# Top-1 accuracy on the test loader.
# Device/mode setup does not change between batches, so it runs once. The
# batch variables get their own names so the dataset tensors `X` / `y`
# are not overwritten by this cell.
model.to('cpu')
model.eval()

results = []  # one bool per test sample: was the prediction correct?
with torch.no_grad():
    for X_batch, y_batch in test_dl:
        # softmax is monotonic, so argmax over the raw scores picks the
        # same class; this also works for any batch size
        batch_preds = model(X_batch).argmax(dim=1)
        results.extend((batch_preds == y_batch).tolist())
mean(results)

0.2950909489371028

In [11]:
# Sample 10 names from the model, one character at a time
model_device = next(model.parameters()).device  # build inputs where the model lives
model.eval()

preds = []
with torch.no_grad():  # sampling needs no autograd graph
    for _sample in range(10):
        x = [0] * CONTEXT_DIM  # start from a context filled with index 0
        output = ''
        while True:
            logits = model(torch.tensor([x], device=model_device))
            # Numerically stable equivalent of exp(logits) / exp(logits).sum()
            # for this single-row batch
            probs = F.softmax(logits, dim=1)
            idx = torch.multinomial(probs, num_samples=1, replacement=True).item()
            if idx == 0:  # index 0 ends the name
                break
            output += itoc[idx]
            x = x[1:] + [idx]  # slide the context window
        preds.append(output)
preds

['hamari',
 'joydrayzy',
 'syany',
 'mie',
 'kakiir',
 'aelisanyuchaymana',
 'sety',
 'jadojalee',
 'jaanna',
 'andi']