In [14]:
import pandas as pd
import os

"""
Load the 'Miehet ens' sheet of the Finnish first-name statistics workbook
into a DataFrame, then show its shape and first rows
"""
excel_path = os.path.join('data', 'etunimitilasto-2022-08-04-dvv.xlsx')
df = pd.read_excel(excel_path, sheet_name='Miehet ens')
print(df.shape)
df.head()

(6651, 2)


Unnamed: 0,Etunimi,Lukumäärä
0,Juha,44293
1,Timo,43711
2,Matti,40752
3,Kari,38967
4,Mikko,38053


In [15]:
"""
Collect the lower-cased names and the set of distinct characters they use
"""
names = df['Etunimi'].str.lower().tolist()
chars = set(''.join(names))  # every character that occurs in at least one name
print(list(chars))

['y', 'ã', '-', 'j', 'é', 'p', 'q', 'å', 'o', 'ú', 'h', 'ë', 'l', 'ø', 'ò', 'ö', 'õ', 'n', 'ä', 's', 't', 'i', 'r', 'u', 'k', 'è', 'b', 'f', 'c', 'á', 'v', 'í', 'ó', 'ç', 'w', 'z', "'", 'a', 'e', 'x', 'd', 'g', '.', 'ü', 'm']


In [16]:
"""
Create look-up dictionaries:
- c2i: to map a character to an index
- i2c: to map the other way

Index 0 is reserved for the start token '<s>' and the highest index
for the end token '<e>'; the characters take the indices in between.
"""

# Enumerate the characters in sorted order. Iterating the set directly yields
# an order that changes between kernel sessions (string hashing is randomised),
# so the character -> index mapping would not be reproducible across runs.
c2i = {'<s>': 0}
c2i.update({c: i for i, c in enumerate(sorted(chars), start=1)})
c2i['<e>'] = len(c2i)  # next free index, i.e. len(chars) + 1

i2c = {i: c for c, i in c2i.items()}

In [17]:
"""
Create an N matrix to store counts where
N[i,j] refers to a count of bigram (i2c[i],i2c[j])

Every cell starts at 1 (add-one smoothing), except the column of the
start token '<s>', which is kept at 0.
"""

import numpy as np

N = np.ones(shape=(len(c2i), len(c2i)))  # Add-one smoothing

# '<s>' only ever opens a name, so no bigram can end in it. Remove the
# smoothing pseudo-count from that column; otherwise every row keeps a small
# probability of producing a literal '<s>' in the middle of a generated name.
N[:, c2i['<s>']] = 0

for name in names:
    # Wrap the name in boundary tokens and count each pair of neighbours
    symbols = ['<s>', *name, '<e>']
    for a, b in zip(symbols, symbols[1:]):
        N[c2i[a], c2i[b]] += 1

In [18]:
"""
Data exploration: list the ten bigrams with the highest counts in N
"""
vocab_range = range(len(c2i))
idx = [(row, col) for row in vocab_range for col in vocab_range]
bigrams = {(i2c[row], i2c[col]): N[row, col] for row, col in idx}

# Stable descending sort by count; ties keep their (row, col) order
sorted(bigrams.items(), key=lambda item: item[1], reverse=True)[:10]

[(('i', '<e>'), 1089.0),
 (('a', 'n'), 1045.0),
 (('n', '<e>'), 877.0),
 (('a', 'r'), 853.0),
 (('<s>', 'a'), 852.0),
 (('r', 'i'), 844.0),
 (('o', '<e>'), 710.0),
 (('e', 'r'), 704.0),
 (('<s>', 'j'), 676.0),
 (('k', 'a'), 635.0)]

In [19]:
"""
Create a probability matrix P by dividing every row of N by its row sum,
so that each row of P sums to 1
"""

# keepdims=True keeps the sums as a column so the division broadcasts row-wise
normalization = np.sum(N, axis=1, keepdims=True)
P = np.divide(N, normalization)

In [28]:
"""
Generate new Finnish names using bigram model

Starting from '<s>', repeatedly draw the next token from the row of P
that belongs to the most recent token, until '<e>' is drawn.
"""

SEED = 42        # fixed seed: sampling is repeatable for a given character-index mapping
NUM_NAMES = 10   # how many names to generate

rng = np.random.default_rng(SEED)

for _ in range(NUM_NAMES):
    tokens = ['<s>']
    while tokens[-1] != '<e>':
        # Draw one index directly from the categorical distribution P[recent]
        recent = tokens[-1]
        next_token_idx = int(rng.choice(len(i2c), p=P[c2i[recent]]))
        tokens.append(i2c[next_token_idx])

    # Drop the leading '<s>' and trailing '<e>' before printing
    print(''.join(tokens[1:-1]))

avoi
piril
hacqaremm
perconil-ãnjoomam
j
aneun
ai
hor
alirsera
mir
