##  Load the text file

In [1]:
# Load the text file: one entry per line, with the line terminators stripped.
# The context manager closes the file handle as soon as the read is done
# instead of leaving it open until garbage collection.
with open('data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
# Sanity check: number of lines (words) read from the file.
len(words)

32033

## Build the bigram model

In [3]:
import torch

### Build an empty bigram counter and the character ↔ id mapping

In [4]:
# This marks the beginning or the end of a word.
SPECIAL_CHAR = '.'
# Vocabulary string: the position of a character in this string is its id,
# so SPECIAL_CHAR gets id 0 and 'a'..'z' get ids 1..26.
CHAR_TO_ID_STR = SPECIAL_CHAR + 'abcdefghijklmnopqrstuvwxyz'

# Lookups in both directions: id -> character and character -> id.
id_to_char = list(CHAR_TO_ID_STR)
char_to_id = {char: char_id for char_id, char in enumerate(CHAR_TO_ID_STR)}

# bigram_count[i, j] = number of occurrences where character i is followed by character j.
# The table size is derived from the vocabulary so the two cannot drift apart.
vocab_size = len(CHAR_TO_ID_STR)
bigram_count = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

### Fill in the bigram counter

In [5]:
# Reset the table first so that re-running this cell does not double-count.
bigram_count.zero_()

for word in words:
    if not word:
        # A blank line has no first or last character to pair with the marker.
        continue
    # Wrap the word in the boundary marker so the start transition, the
    # interior transitions and the end transition are all counted by one loop.
    padded = SPECIAL_CHAR + word + SPECIAL_CHAR
    for left_char, right_char in zip(padded, padded[1:]):
        bigram_count[char_to_id[left_char], char_to_id[right_char]] += 1

In [6]:
# Row 0 belongs to SPECIAL_CHAR ('.' has id 0), so this row holds how often
# each character follows the start marker, i.e. appears as a word's first character.
bigram_count[0]

tensor([   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
        1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
         134,  535,  929], dtype=torch.int32)

## Sample from the bigram counter

In [17]:
def get_sample(bigram_count, generator=None):
    """Sample one word by walking the bigram table character by character.

    Starting from SPECIAL_CHAR, the next character is drawn repeatedly with
    probability proportional to the counts in the current character's row,
    until SPECIAL_CHAR is drawn again.

    Args:
        bigram_count: 2-D tensor where entry [i, j] counts how often
            character i is followed by character j.
        generator: optional torch.Generator used for the draws. Pass a
            seeded generator for reproducible samples; None falls back to
            torch's default generator.

    Returns:
        The sampled characters joined into a string, without the
        boundary markers.
    """
    boundary_id = char_to_id[SPECIAL_CHAR]
    result = []
    cur_char_id = boundary_id
    while True:
        # torch.multinomial treats the row as unnormalized weights, so there
        # is no need to divide by the row sum first.
        weights = bigram_count[cur_char_id].float()
        cur_char_id = torch.multinomial(
            weights, num_samples=1, replacement=True, generator=generator
        ).item()
        if cur_char_id == boundary_id:
            break
        result.append(id_to_char[cur_char_id])
    return ''.join(result)

# Fixed seed so the printed samples are reproducible across runs.
g = torch.Generator().manual_seed(2147483647)
num_samples = 20
for sample_i in range(num_samples):
    print(get_sample(bigram_count, generator=g))

junide
janasah
p
cony
a
nn
kohin
tolian
juee
ksahnaauranilevias
dedainrwieta
ssonielylarte
faveumerifontume
phynslenaruani
core
yaenon
ka
jabdinerimikimaynin
anaasn
ssorionsush
