In [1]:
words = open('names.txt', 'r').read().splitlines()

tokens = sorted(list(set(''.join(words))))
ttoi = { token:index+1 for index,token in enumerate(tokens) }; ttoi['.'] = 0
itot = { index:token for token,index in ttoi.items()}

In [2]:
# Training Data
import torch

bi_in, bi_out = [], []
tri_in, tri_out = [], []

for word in words:
    word = ['.'] + list(word) + ['.']
    for in1, out in zip(word, word[1:]):
        in1 = ttoi[in1]
        out = ttoi[out]

        bi_in.append(in1)
        bi_out.append(out)

    word = ['.'] + list(word)
    for (in1, in2), out in zip(zip(word, word[1:]), word[2:]):
        in1 = ttoi[in1]
        in2 = ttoi[in2]
        out = ttoi[out]

        tri_in.append(27 * in1 + in2)
        tri_out.append(out)

bi_in = torch.tensor(bi_in)
bi_out = torch.tensor(bi_out)
tri_in = torch.tensor(tri_in).to(torch.device('mps'))
tri_out = torch.tensor(tri_out).to(torch.device('mps'))

g = torch.Generator().manual_seed(42)
bi_train_ids, bi_dev_ids, bi_test_ids = torch.utils.data.random_split(range(bi_out.shape[0]), [0.8, 0.1, 0.1], generator=g)
tri_train_ids, tri_dev_ids, tri_test_ids = torch.utils.data.random_split(range(tri_out.shape[0]), [0.8, 0.1, 0.1], generator=g)

bi_train_in, bi_train_out = bi_in[bi_train_ids], bi_out[bi_train_ids]
bi_test_in, bi_test_out = bi_in[bi_test_ids], bi_out[bi_test_ids]
bi_dev_in, bi_dev_out = bi_in[bi_dev_ids], bi_out[bi_dev_ids]

tri_train_in, tri_train_out = tri_in[tri_train_ids], tri_out[tri_train_ids]
tri_test_in, tri_test_out = tri_in[tri_test_ids], tri_out[tri_test_ids]
tri_dev_in, tri_dev_out = tri_in[tri_dev_ids], tri_out[tri_dev_ids]

# def bi_sample(inp, out, idx):
#     ch1 = itot[inp[idx].item()]
#     ch2 = itot[out[idx].item()]
#     print(f'{ch1}:{ch2}')

# def tri_sample(inp, out, idx):
#     ch1 = itot[int(inp[idx].item() / len(itot.items()))] + itot[inp[idx].item() % len(itot.items())]
#     ch2 = itot[out[idx].item()]
#     print(f'{ch1}:{ch2}')

# for i in range(len(tri_train_out)):
#     bi_sample(bi_train_in, bi_train_out, i)
# for i in range(len(tri_train_out)):
#     tri_sample(tri_train_in, tri_train_out, i)

In [3]:
import torch.nn.functional as F

def softmax(logits):
    counts = logits.exp()
    return counts / counts.sum(1, keepdim=True)

def nll(probs, labels: torch.Tensor, weights: torch.Tensor | None = None, reg: float = 0):
    if reg and not weights is None:
        return -probs[torch.arange(labels.nelement()), labels].log().mean() + reg * (weights**2).mean()
    else:
        return -probs[torch.arange(labels.nelement()), labels].log().mean()
    
cross_entropy = F.cross_entropy

In [4]:
# Set up bigram
g = torch.Generator().manual_seed(0)
bi_weights = torch.randn((27, 27), generator=g, requires_grad=True)

In [5]:
# gradient descent
for k in range(15):

    # forward pass
    logits = bi_weights[bi_train_in,:]
    probs = softmax(logits)
    loss = nll(probs, bi_train_out) #, bi_weights, reg=0.3)
    print(f'Loss[{k}] - [Train]: {loss.item():4f}, [Dev]: {nll(probs, bi_dev_out):4f}, [Test]: {nll(probs, bi_test_out):4f}')

    # backward pass
    bi_weights.grad = None
    loss.backward()

    # update
    bi_weights.data += -50 * bi_weights.grad

Loss[0] - [Train]: 3.767875, [Dev]: 3.806053, [Test]: 3.813841
Loss[1] - [Train]: 3.357414, [Dev]: 3.522970, [Test]: 3.532342
Loss[2] - [Train]: 3.148772, [Dev]: 3.417052, [Test]: 3.425639
Loss[3] - [Train]: 3.019019, [Dev]: 3.359639, [Test]: 3.366506
Loss[4] - [Train]: 2.929806, [Dev]: 3.320242, [Test]: 3.325183
Loss[5] - [Train]: 2.862049, [Dev]: 3.292201, [Test]: 3.295558
Loss[6] - [Train]: 2.808945, [Dev]: 3.272936, [Test]: 3.274992
Loss[7] - [Train]: 2.766779, [Dev]: 3.260004, [Test]: 3.260945
Loss[8] - [Train]: 2.732884, [Dev]: 3.251523, [Test]: 3.251503
Loss[9] - [Train]: 2.705285, [Dev]: 3.246052, [Test]: 3.245222
Loss[10] - [Train]: 2.682480, [Dev]: 3.242569, [Test]: 3.241071
Loss[11] - [Train]: 2.663318, [Dev]: 3.240383, [Test]: 3.238346
Loss[12] - [Train]: 2.646944, [Dev]: 3.239054, [Test]: 3.236586
Loss[13] - [Train]: 2.632752, [Dev]: 3.238323, [Test]: 3.235516
Loss[14] - [Train]: 2.620317, [Dev]: 3.238041, [Test]: 3.234968


In [6]:
def bigram(W):
    g = torch.Generator().manual_seed(0)
    
    for _ in range(10):
        inp = 0
        out = []

        while True:
            logits = W[[inp],:]
            p = softmax(logits)

            inp = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itot[inp])
            if inp == 0:
                break
        
        print(''.join(out))

bigram(bi_weights)

kedan.
fdnnakayxaderyna.
salzaer.
rion.
kefwwjunn.
zuala.
kanocojaylinilexa.
m.
cyl.
kar.


In [7]:
# Set up trigram network
g = torch.Generator(device=torch.device('mps')).manual_seed(0)
tri_weights = torch.randn((27**2, 27), generator=g, requires_grad=True, device=torch.device('mps'))

In [8]:
for i in range(50):
    # forward Pass
    logits = tri_weights[tri_train_in,:]
    p = softmax(logits)
    loss = nll(p, tri_train_out, weights=tri_weights, reg=0.4)
    print(f'Loss[{i}] - [Train]: {loss.item():4f}, [Dev]: {nll(p, tri_dev_out):4f}')

    # backward Pass
    tri_weights.grad = None
    loss.backward()

    # update
    tri_weights.data += -100 * tri_weights.grad

Loss[0] - [Train]: 4.163890, [Dev]: 3.785672
Loss[1] - [Train]: 3.950311, [Dev]: 3.665133
Loss[2] - [Train]: 3.820945, [Dev]: 3.603215
Loss[3] - [Train]: 3.718646, [Dev]: 3.568808
Loss[4] - [Train]: 3.630326, [Dev]: 3.540179
Loss[5] - [Train]: 3.554028, [Dev]: 3.520751
Loss[6] - [Train]: 3.488144, [Dev]: 3.504799
Loss[7] - [Train]: 3.431180, [Dev]: 3.493918
Loss[8] - [Train]: 3.381530, [Dev]: 3.484564
Loss[9] - [Train]: 3.337733, [Dev]: 3.477659
Loss[10] - [Train]: 3.298646, [Dev]: 3.471278
Loss[11] - [Train]: 3.263435, [Dev]: 3.466229
Loss[12] - [Train]: 3.231478, [Dev]: 3.461488
Loss[13] - [Train]: 3.202294, [Dev]: 3.457610
Loss[14] - [Train]: 3.175498, [Dev]: 3.453983
Loss[15] - [Train]: 3.150773, [Dev]: 3.450948
Loss[16] - [Train]: 3.127857, [Dev]: 3.448121
Loss[17] - [Train]: 3.106528, [Dev]: 3.445708
Loss[18] - [Train]: 3.086597, [Dev]: 3.443464
Loss[19] - [Train]: 3.067906, [Dev]: 3.441510
Loss[20] - [Train]: 3.050319, [Dev]: 3.439692
Loss[21] - [Train]: 3.033721, [Dev]: 3.43808

In [9]:
logits = tri_weights[tri_test_in,:]
p = softmax(logits)
loss = nll(p, tri_test_out, weights=tri_weights)
print(f'Test Set Loss: {loss}')

Test Set Loss: 2.508300542831421


In [10]:
def trigram(W):
    g = torch.Generator(device=torch.device('mps')).manual_seed(0)

    for _ in range(10):
        in1, in2 = 0, 0
        out = []

        while True:
            logints = W[[27 * in1 + in2],:]
            p = softmax(logints)
            o = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            in1, in2 = in2, o

            out.append(itot[o])

            if o == 0:
                break
        
        print(''.join(out))

trigram(tri_weights)

alan.
laila.
eztuptfbjspjfmbgocazikfzntoi.
brawpzhtojsbelle.
oltkyhy.
re.
ely.
ara.
ohc.
chkffkvdptvmxeaocnkon.


#### Cross entropy

In [11]:
# Set up trigram network
g = torch.Generator(device=torch.device('mps')).manual_seed(0)
tri_weights = torch.randn((27**2, 27), generator=g, requires_grad=True, device=torch.device('mps'))

In [12]:
for i in range(1000):
    # forward Pass
    logits = tri_weights[tri_train_in,:]
    loss = cross_entropy(logits, tri_train_out)
    print(f'Loss[{i}] - [Train]: {loss.item():4f}')

    # backward Pass
    tri_weights.grad = None
    loss.backward()

    # update
    tri_weights.data += -100 * tri_weights.grad

Loss[0] - [Train]: 3.768633
Loss[1] - [Train]: 3.564515
Loss[2] - [Train]: 3.442633
Loss[3] - [Train]: 3.346857
Loss[4] - [Train]: 3.264207
Loss[5] - [Train]: 3.192926
Loss[6] - [Train]: 3.131569
Loss[7] - [Train]: 3.078767
Loss[8] - [Train]: 3.033004
Loss[9] - [Train]: 2.992866
Loss[10] - [Train]: 2.957241
Loss[11] - [Train]: 2.925319
Loss[12] - [Train]: 2.896499
Loss[13] - [Train]: 2.870317
Loss[14] - [Train]: 2.846403
Loss[15] - [Train]: 2.824452
Loss[16] - [Train]: 2.804211
Loss[17] - [Train]: 2.785466
Loss[18] - [Train]: 2.768036
Loss[19] - [Train]: 2.751765
Loss[20] - [Train]: 2.736524
Loss[21] - [Train]: 2.722201
Loss[22] - [Train]: 2.708700
Loss[23] - [Train]: 2.695941
Loss[24] - [Train]: 2.683854
Loss[25] - [Train]: 2.672382
Loss[26] - [Train]: 2.661471
Loss[27] - [Train]: 2.651077
Loss[28] - [Train]: 2.641161
Loss[29] - [Train]: 2.631688
Loss[30] - [Train]: 2.622627
Loss[31] - [Train]: 2.613952
Loss[32] - [Train]: 2.605637
Loss[33] - [Train]: 2.597660
Loss[34] - [Train]: 2.59

In [13]:
logits = tri_weights[tri_test_in,:]
p = softmax(logits)
loss = nll(p, tri_test_out, weights=tri_weights)
print(f'Test Set Loss: {loss}')

Test Set Loss: 2.239563465118408


In [14]:
def trigram(W):
    g = torch.Generator(device=torch.device('mps')).manual_seed(0)

    for _ in range(10):
        in1, in2 = 0, 0
        out = []

        while True:
            logints = W[[27 * in1 + in2],:]
            p = softmax(logints)
            o = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            in1, in2 = in2, o

            out.append(itot[o])

            if o == 0:
                break
        
        print(''.join(out))

trigram(tri_weights)

alan.
laila.
ezamarie.
saifmely.
aziannator.
brawsynto.
seelle.
oltayah.
re.
eighara.
