In [1]:
#read the dataset: one entry per line of names.txt
#the context manager closes the file handle as soon as reading is done,
#instead of leaving an open handle around for the garbage collector
with open('names.txt', 'r') as f:
    words = f.read().splitlines() #splitlines() starts a new list element at every newline

words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [2]:
#number of entries (lines) that were read from names.txt
len(words)

32033

In [3]:
#length of the shortest entry in the dataset
min(map(len, words))

2

In [4]:
#length of the longest entry in the dataset
max(map(len, words))

15

In [5]:
#each word is used to learn which character is likely to follow a given context.
#for example, 'isabella' tells us that a word starting with 'i' is likely to have 's' next,
#that 'is' is likely to be followed by 'a',
#and so on, until after the full 'isabella' the most likely event is the end of the name

#a bigram is a pair of two consecutive characters
#we start by building a simplified model that predicts the next character given only the one before it (a bigram model)
#(our main model is the same idea, extended to longer contexts)

b = {} #dict mapping each bigram tuple (ch1, ch2) to the number of times it occurs
for w in words:
  #<S> and <E> are arbitrary start-of-word and end-of-word markers,
  #so every word in the dataset becomes something like:
  #['<S>', 'e', 'm', 'm', 'a', '<E>']
  chs = ['<S>'] + list(w) + ['<E>']
  #zip(chs, chs[1:]) pairs every element with its successor, i.e. yields each bigram of the word
  for ch1, ch2 in zip(chs, chs[1:]):
    bigram = (ch1, ch2)
    #add an integer 1: the previous `1.` was a float literal and turned every count into a float
    b[bigram] = b.get(bigram, 0) + 1

In [62]:
#list of ((ch1, ch2), count) tuples, most frequent bigram first
x = sorted(b.items(), key=lambda kv: kv[1], reverse=True)

#what this tells us, for example, is that 'n' and 'a' are very common ending characters.


In [19]:
import torch
#28x28 count table: 26 possible characters + a start token and an end token
#int32 because each cell will hold a whole-number bigram count
N0 = torch.zeros(28, 28, dtype=torch.int32)
N0

#currently our tensor is 2-dimensional to accommodate the frequency of bigrams.
#we can imagine it as a matrix:
#[aa, ab, ac, ad, ...
# ba, bb, bc, bd, ...]

#where the cells are arranged alphabetically, and the data inside each cell is the frequency of each bigram.

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [20]:
#lookup table (mapping dict) that encodes every character as a unique integer

#every distinct character that occurs in the dataset, in sorted order
chars = sorted(set(''.join(words)))

#stoi0 is "s to i": string character -> integer, numbered from 0 in sorted order
stoi0 = dict(zip(chars, range(len(chars))))
#the two special tokens take the indices right after the 26 letters
stoi0['<S>'] = 26
stoi0['<E>'] = 27

In [21]:
# now we can follow the bigram generation code above to populate our tensor (with integers)

#reset the table first: N0 is created in a different cell, so without this
#re-running the cell would add every count a second time
N0.zero_()

for w in words:
  #<S> and <E> are arbitrary start-of-word and end-of-word markers,
  #so every word in the dataset becomes something like:
  #['<S>', 'e', 'm', 'm', 'a', '<E>']
  chs = ['<S>'] + list(w) + ['<E>']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi0[ch1] #row index: first character of the bigram
    ix2 = stoi0[ch2] #column index: second character of the bigram
    N0[ix1, ix2] += 1

In [22]:
N0

tensor([[ 556,  541,  470, 1042,  692,  134,  168, 2332, 1650,  175,  568, 2528,
         1634, 5438,   63,   82,   60, 3264, 1118,  687,  381,  834,  161,  182,
         2050,  435,    0, 6640],
        [ 321,   38,    1,   65,  655,    0,    0,   41,  217,    1,    0,  103,
            0,    4,  105,    0,    0,  842,    8,    2,   45,    0,    0,    0,
           83,    0,    0,  114],
        [ 815,    0,   42,    1,  551,    0,    2,  664,  271,    3,  316,  116,
            0,    0,  380,    1,   11,   76,    5,   35,   35,    0,    0,    3,
          104,    4,    0,   97],
        [1303,    1,    3,  149, 1283,    5,   25,  118,  674,    9,    3,   60,
           30,   31,  378,    0,    1,  424,   29,    4,   92,   17,   23,    0,
          317,    1,    0,  516],
        [ 679,  121,  153,  384, 1271,   82,  125,  152,  818,   55,  178, 3248,
          769, 2675,  269,   83,   14, 1958,  861,  580,   69,  463,   50,  132,
         1070,  181,    0, 3983],
        [ 242,    0,

In [53]:
#changing some stuff up:
#on closer look, we don't actually need the last row (where <E> precedes a character) or the column where <S> is the second character
#we can remove these and, to reduce redundancy, use only one special character for both start and end

#27x27 count table: '.' sits at index 0 and acts as both start and end marker,
#the characters found in the dataset follow from index 1
N = torch.zeros(27, 27, dtype=torch.int32)

chars = sorted(set(''.join(words)))

#stoi: character -> integer, with index 0 reserved for '.'
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
#itos: the inverse mapping, integer -> character
itos = {idx: ch for ch, idx in stoi.items()}

for w in words:
  chs = ['.', *w, '.']
  #pair every symbol with its successor and count the resulting bigram
  for ch1, ch2 in zip(chs[:-1], chs[1:]):
    N[stoi[ch1], stoi[ch2]] += 1

In [47]:
N

#a tensor can be indexed like a normal matrix: N[0] is the first row,
#i.e. the counts of all bigrams whose first symbol is '.' (index 0 in stoi)
N[0]

tensor([   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
        1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
         134,  535,  929], dtype=torch.int32)

In [48]:
#N holds raw counts; to sample from it we need probabilities instead

p = N[0].to(torch.float32) #float copy of the first row, so it can be divided
p = p.div(p.sum()) #dividing by the row total makes the entries sum to 1

p #p is now the probability of each character following the start marker '.'


#we need to loop over all the rows and do this

tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

In [61]:
#NOTE: torch.multinomial() draws sample indices according to the probabilities in a 1-dimensional tensor
#      a torch.Generator() with a fixed seed makes that sampling reproducible from run to run

g = torch.Generator().manual_seed(1337)

for i in range(20):
    out = []
    #start in row 0: that row holds the counts of bigrams beginning with the start marker '.'
    ix = 0
    #always sample at least once, then keep going until the end marker (index 0) is drawn
    while not out or ix != 0:
        p = N[ix].float() #counts of the current row as floats
        p = p / p.sum() #normalized into probabilities

        #.item() unwraps the 1-element tensor returned by multinomial into a plain int
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])

    print(''.join(out))

gun.
kaneliy.
dy.
exylell.
eleleahmariss.
modarrinam.
rn.
vybeartosay.
kugr.
mare.
vayioalian.
alaydanam.
li.
srar.
lyner.
anierigaeylla.
si.
jania.
aranideava.
tamaynaclysaroen.
