In [19]:
import torch
import numpy as np
from sklearn.utils import shuffle
words = open('names.txt', 'r').read().splitlines()

## bigram nn approach

In [2]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [3]:
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

In [4]:
import torch.nn.functional as F
W = torch.randn((27, 27), requires_grad=True)

for i in range(10):
    # flatten the two arrays in each row into a single array
    xenc= F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[torch.arange(num), ys].log().mean()
    print(f"Iteration {i} Loss: {loss.item()}")

    W.grad = None
    loss.backward()
    W.data += -30.0 * W.grad

Iteration 0 Loss: 3.751142740249634
Iteration 1 Loss: 3.5050597190856934
Iteration 2 Loss: 3.3318803310394287
Iteration 3 Loss: 3.203662157058716
Iteration 4 Loss: 3.1074867248535156
Iteration 5 Loss: 3.032045602798462
Iteration 6 Loss: 2.9698197841644287
Iteration 7 Loss: 2.917283296585083
Iteration 8 Loss: 2.8726439476013184
Iteration 9 Loss: 2.83463978767395


In [5]:
g = torch.Generator().manual_seed(2147483647)
for i in range(5):
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes = 27).float()
        logits = xenc @ W # predict log counts
        # softmax (next two lines)
        counts = logits.exp() # counts, equivalent to N
        probs = counts / counts.sum(1, keepdim=True) # probabiltiies for the next character

        # sample from a distribution
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

cexzdaleglkurailezitxhn.
vinimjttain.
lgfykzka.
ar.
swaivpubptvhrigotzi.


## trigram nn approach

In [6]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [7]:
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

In [8]:
import torch.nn.functional as F
W = torch.randn((54, 27), requires_grad=True)

for i in range(10):
    # flatten the two arrays in each row into a single array
    xenc= F.one_hot(xs, num_classes=27).float().view(-1, 54)
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[torch.arange(num//2), ys].log().mean()
    print(f"Iteration {i} Loss: {loss.item()}")

    W.grad = None
    loss.backward()
    W.data += -30.0 * W.grad

Iteration 0 Loss: 4.1346611976623535
Iteration 1 Loss: 3.586719512939453
Iteration 2 Loss: 3.2827024459838867
Iteration 3 Loss: 3.1056246757507324
Iteration 4 Loss: 2.980825424194336
Iteration 5 Loss: 2.8855695724487305
Iteration 6 Loss: 2.8106770515441895
Iteration 7 Loss: 2.7505600452423096
Iteration 8 Loss: 2.7015597820281982
Iteration 9 Loss: 2.6611087322235107


In [9]:
g = torch.Generator().manual_seed(2147483647)
for i in range(5):
    out = []
    context = [0, 0]
    while True:
        xenc = F.one_hot(torch.tensor([context]), num_classes = 27).float().view(1, -1)
        logits = xenc @ W # predict log counts
        # softmax (next two lines)
        counts = logits.exp() # counts, equivalent to N
        probs = counts / counts.sum(1, keepdim=True) # probabiltiies for the next character

        # sample from a distribution
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        context = context[1:] + [ix] # cause you're using two letters to predict, not one.
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

dexzmriogjpurkylqzvqxh.
ellzimjttain.
augak.
man.
a.


## BIGRAM AND TRIGRAM CLASSES

In [114]:
import torch.nn.functional as F

class Bigram:
    def __init__(self, data, num_inputs, num_outputs, training_epochs):
        self.data = data
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.training_epochs = training_epochs
    
    def generate_data(self):
        chars = sorted(list(set(''.join(words))))
        self.stoi = {s:i+1 for i, s in enumerate(chars)}
        self.stoi['.'] = 0
        self.itos = {i:s for s,i in self.stoi.items()}

        xs, ys = [], []
        for w in words:
            chs = ['.'] + list(w) + ['.']
            for ch1, ch2 in zip(chs, chs[1:]):
                ix1 = self.stoi[ch1]
                ix2 = self.stoi[ch2]
                xs.append(ix1)
                ys.append(ix2)
                
        xs = torch.tensor(xs)
        ys = torch.tensor(ys)
        self.num = xs.nelement()
        return xs, ys
    
    def split_data(self, xs, ys, train_p, test_p):
        # missing percentage goes to validation set
        xs = np.array(xs)
        ys = np.array(ys)
        X, y = shuffle(xs, ys)
        num_examples = len(X)

        X_train = torch.tensor(X[0:int(train_p * num_examples)])
        y_train = torch.tensor(y[0:int(train_p * num_examples)])

        X_test = torch.tensor(X[int(train_p * num_examples): int((train_p + test_p) * num_examples)])
        y_test = torch.tensor(y[int(train_p * num_examples): int((train_p + test_p) * num_examples)])

        X_dev = torch.tensor(X[int((train_p + test_p) * num_examples): num_examples])
        y_dev = torch.tensor(y[int((train_p + test_p) * num_examples): num_examples])

        return X_train, y_train, X_test, y_test, X_dev, y_dev
    
    def fit(self, X, y, print_loss, reg_value):
        W = torch.randn((self.num_outputs, self.num_inputs), requires_grad=True)

        for i in range(self.training_epochs):
            # xenc = F.one_hot(X, num_classes=self.num_inputs).float()
            # logits = xenc @ W
            logits = W[X, :]
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)
            loss = -probs[torch.arange(len(X)), y].log().mean() + reg_value * (W**2).mean()
            if(print_loss):
                print(f"Iteration {i} Loss: {loss.item()}")

            W.grad = None
            loss.backward()
            W.data += -30.0 * W.grad
        
        return W
    
    def evaluate_model(self, W, X, y, is_test):
        # xenc = F.one_hot(X, num_classes=self.num_outputs).float()
        # logits = xenc @ W
        logits = W[X, :]
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdim=True)
        loss = -probs[torch.arange(len(X)), y].log().mean()
        if is_test:
            print(f"X_Test Loss: {loss}")
        else:
            print(f"X_Dev Loss: {loss}")
    
    def generate_samples(self, num_words, W):
        g = torch.Generator().manual_seed(2147483647)
        for i in range(num_words):
            out = []
            ix = 0
            while True:
                # xenc = F.one_hot(torch.tensor([ix]), num_classes = 27).float()
                # logits = xenc @ W # predict log counts
                
                logits = W[ix, :]
                counts = logits.exp() # counts, equivalent to N
                probs = counts / counts.sum() # probabiltiies for the next character

                # sample from a distribution
                ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
                out.append(self.itos[ix])
                if ix == 0:
                    break
            print(''.join(out)) 

my_bigram = Bigram(words, 27, 27, 10)
xs, ys = my_bigram.generate_data()
X_train, y_train, X_test, y_test, X_dev, y_dev = my_bigram.split_data(xs, ys, 0.8, 0.1)
W = my_bigram.fit(X_train, y_train, False, 2.0)

my_bigram.evaluate_model(W, X_test, y_test, True)
my_bigram.evaluate_model(W, X_dev, y_dev, False)
my_bigram.generate_samples(10, W)

X_Test Loss: 2.864698648452759
X_Dev Loss: 2.864078998565674
cexbmaloglkurkicczktyhwbvmzimjttainrlkfukzka.
da.
sfcxvpubjtbhrmiotzx.
mczieqckvujkwptedogkkjemvmmsidguenkavgynywftbspmhwcivgbvtahlvsu.
dsdxxblnwglhpyaw.
iswn.
wrpfdwipkezkm.
deru.
firmt.
gbikajbquabsvath.


In [134]:
import torch.nn.functional as F

class Trigram:
    def __init__(self, data, num_inputs, num_outputs, training_epochs):
        self.data = data
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.training_epochs = training_epochs
    
    def generate_data(self):
        chars = sorted(list(set(''.join(words))))
        self.stoi = {s:i+1 for i, s in enumerate(chars)}
        self.stoi['.'] = 0
        self.itos = {i:s for s,i in self.stoi.items()}

        xs, ys = [], []
        for w in words:
            chs = ['.'] + list(w) + ['.']
            for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
                ix1 = self.stoi[ch1]
                ix2 = self.stoi[ch2]
                ix3 = self.stoi[ch3]
                xs.append([ix1, ix2])
                ys.append(ix3)
                
        xs = torch.tensor(xs)
        ys = torch.tensor(ys)
        self.num = xs.nelement()
        return xs, ys
    
    def split_data(self, xs, ys, train_p, test_p):
        # missing percentage goes to validation set
        xs = np.array(xs)
        ys = np.array(ys)
        X, y = shuffle(xs, ys)
        num_examples = len(X)

        X_train = torch.tensor(X[0:int(train_p * num_examples)])
        y_train = torch.tensor(y[0:int(train_p * num_examples)])

        X_test = torch.tensor(X[int(train_p * num_examples): int((train_p + test_p) * num_examples)])
        y_test = torch.tensor(y[int(train_p * num_examples): int((train_p + test_p) * num_examples)])

        X_dev = torch.tensor(X[int((train_p + test_p) * num_examples): num_examples])
        y_dev = torch.tensor(y[int((train_p + test_p) * num_examples): num_examples])

        return X_train, y_train, X_test, y_test, X_dev, y_dev
    
    def fit(self, X, y, print_loss, reg_value):
        W = torch.randn((self.num_inputs, self.num_outputs), requires_grad=True)

        for i in range(self.training_epochs):
            # flatten the two arrays in each row into a single array
            # xenc = F.one_hot(X, num_classes=self.num_outputs).float().view(-1, self.num_inputs)
            # logits = xenc @ W

            logits = W[X.flatten(), :]
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)
            loss = -probs[torch.arange(len(X)), y].log().mean() + reg_value * (W**2).mean()
            if(print_loss):
                print(f"Iteration {i} Loss: {loss.item()}")

            W.grad = None
            loss.backward()
            W.data += -30.0 * W.grad
        
        return W
    
    def evaluate_model(self, W, X, y, is_test):
        # xenc = F.one_hot(X, num_classes=self.num_outputs).float().view(-1, self.num_inputs)
        # logits = xenc @ W

        logits = W[X.flatten(), :]
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdim=True)
        loss = -probs[torch.arange(len(X)), y].log().mean()
        if is_test:
            print(f"X_Test Loss: {loss}")
        else:
            print(f"X_Dev Loss: {loss}")
    
    def generate_samples(self, num_words, W):
        g = torch.Generator().manual_seed(2147483647)
        for i in range(num_words):
            out = []
            context = [0, 0]
            while True:
                xenc = F.one_hot(torch.tensor([context]), num_classes = 27).float().view(-1, 1)
                logits = xenc.T @ W # predict log counts
                # softmax (next two lines)

                logits = W[context, :][0]
                counts = logits.exp() # counts, equivalent to N
                probs = counts / counts.sum() # probabiltiies for the next character
                # sample from a distribution
                ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
                context = context[1:] + [ix]
                out.append(self.itos[ix])
                if ix == 0:
                    break
            print(''.join(out)) 

my_trigram = Trigram(words, 54, 27, 10)
xs, ys = my_trigram.generate_data()
X_train, y_train, X_test, y_test, X_dev, y_dev = my_trigram.split_data(xs, ys, 0.8, 0.1)
W = my_trigram.fit(X_train, y_train, False, 2.0)

my_trigram.evaluate_model(W, X_test, y_test, True)
my_trigram.evaluate_model(W, X_dev, y_dev, False)
my_trigram.generate_samples(10, W)

"""
For the same learning rate and number of epochs, trigram has a 
lower loss than bigram. 
"""

X_Test Loss: 2.949021816253662
X_Dev Loss: 2.9497745037078857
dexbm.
ioglkurxicazkwyhnevlzimjtnainrlkfuk.
kataa.
rnaxypubjtbhr.
iotai.
iczixqckxugnwptedagek.
emvmmsadlu.
nkavnynyhftbsp.
hwnivebvtahlvsu.
nsdrx.


'\nFor the same learning rate and number of epochs, trigram has a \nlower loss than bigram. \n'