## Modelling text through character-level bigrams

In [3]:
names = [line.rstrip() for line in open("../data/streets_zh.txt")]
names[:5]

['Aargauerstrasse',
 'Abeggweg',
 'Abendweg',
 'Ackermannstrasse',
 'Ackersteinstrasse']

In [4]:
# Add $ to mark beginning and ending of names
for i in range(len(names)):
    names[i] = "$" + names[i] + "$"

In [5]:
chars = set()
bigrams = set()
freq = dict()
for street in names:
    for pair in zip(street, street[1:]):
            chars.add(pair[0])
            bigrams.add(pair)
            freq[pair] = 1+freq.get(pair, 0)

In [6]:
freq_per_char = {i : dict() for i in chars}
for bigram, f in freq.items():
    i, j = bigram
    freq_per_char[i][j] = f

In [17]:
from random import choices

for _ in range(100):
    gen = ["$"]
    for _ in range(30):
        c = gen[-1]
        samples = list(freq_per_char[c].keys())
        p_samples = list(freq_per_char[c].values())
        next_c = choices(samples, p_samples)[0]
        if next_c=="$":
            break
        gen.append(next_c)
    print("".join(gen))
        

$Mauf
$Ziellchtrasersswilistrahisster
$Bi
$Ranweldwersseneg
$Imgastrasckhg
$Kaswanstranbsldenpphmaseralwe
$Eke
$Je
$Be
$Herasenge
$Juratrasstrd-Schrtran
$Alssstrasenebe
$Häbersttghe
$Re
$Gikoffweg-Schraseg
$Eg
$Niv-Simpin-We
$Are
$Eykelalentzlzelle
$Gise
$Le
$Heple
$Antrblllastholba-Bof
$Strolmaveieg
$Basstraipole
$Werrastrlichwinaastzwe
$Latretwie
$Kägänasenrunnstrassse
$Tangse de
$Frastrasttraseindwehodie
$Welhonasegaunssegeginzwase
$Idetrade
$Masselzwe
$Aligasse
$Wastrase
$Spladeg
$Brasenzend
$Reg
$Stratreigipenrast.-Brachatryrt
$Lastache
$Dedälbieg
$Virckkolatterachoqussssssedig
$Haselalnhfrng
$Maveim Kellseligrtenwiliastzhb
$Pünraserchasseg
$Bo-Strasraistrase Höneltrhasst
$He
$Juchseasssssteggge
$Fraite
$Enassse
$Em Begplwigraranstteng
$Sinee
$Myepltr-Flstrrdensese
$Ze
$Kidorastrofeusequhasigastrasen
$Sikeriserobe
$Kaustebertrasstintrigie
$Waseuchne
$Hode
$Splastegeg
$Graserarma-Flratrffe
$Ristzranlarauleig
$Be-Lasterase
$Höbüte
$Batraurdogeg
$Dihostwimfussenamürsse
$Binotrollase


In [27]:
sorted(list(freq.items()), key= lambda x: x[1])[-5:]

[(('r', 'a'), 1189),
 (('e', '$'), 1283),
 (('a', 's'), 1309),
 (('s', 's'), 1359),
 (('s', 'e'), 1366)]

## Use a smarter way to tokenize 

In [118]:
#In every iteration step (Byte-pair encoding): 
# 1. find most commmon pair
# 2. merge them and add to vocabulary
# 3. get counts on new vocabulary

new_tokens = set()
n_iterations = 5
names2 = names.copy()
vocab2 = set()
bigrams2 = set()
freq2 = freq.copy()

for n in range(n_iterations):
    #1
    new_token, _ = sorted(list(freq2.items()), key= lambda x: x[1])[-1]
    print(f"{new_token=}")
    #2
    new_tokens.add("".join(new_token))

    # need to treat words in wordlist as list of chars so we can handle merged tokens
    new_names = names.copy()
    temp_list = []
    for i, street in enumerate(names2):
        temp = []
        skip = False
        for i in range(len(street)-1):
            if "".join(street)=="$Aargauerstrasse$":
                print(temp, i)
            if skip:
                skip = False
                continue
            if new_token == (street[i], street[i+1]):
                temp.append("".join(new_token))
                skip = True
            else:
                temp.append(street[i])
        temp_list.append(temp+["$"] if temp[-1][-1]!="$" else temp)
    names2 = temp_list
    
    # create count of tokens
    freq2 = dict()
    for street in names2:
        for pair in zip(street, street[1:]):
                vocab2.add(pair[0])
                bigrams2.add(pair)
                freq2[pair] = 1+freq2.get(pair, 0)
    freq_per_char2 = {i : dict() for i in vocab2}
    for bigram, f in freq2.items():
        i, j = bigram
        freq_per_char2[i][j] = f
    

new_token=('s', 'e')
[] 0
['$'] 1
['$', 'A'] 2
['$', 'A', 'a'] 3
['$', 'A', 'a', 'r'] 4
['$', 'A', 'a', 'r', 'g'] 5
['$', 'A', 'a', 'r', 'g', 'a'] 6
['$', 'A', 'a', 'r', 'g', 'a', 'u'] 7
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e'] 8
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r'] 9
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 's'] 10
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 's', 't'] 11
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 's', 't', 'r'] 12
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 's', 't', 'r', 'a'] 13
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 's', 't', 'r', 'a', 's'] 14
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 's', 't', 'r', 'a', 's', 'se'] 15
new_token=('a', 's')
[] 0
['$'] 1
['$', 'A'] 2
['$', 'A', 'a'] 3
['$', 'A', 'a', 'r'] 4
['$', 'A', 'a', 'r', 'g'] 5
['$', 'A', 'a', 'r', 'g', 'a'] 6
['$', 'A', 'a', 'r', 'g', 'a', 'u'] 7
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e'] 8
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r'] 9
['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r'

In [114]:
names2

[['$', 'A', 'a', 'r', 'g', 'a', 'u', 'e', 'r', 'st', 'r', '$'],
 ['$', 'A', 'b', 'e', 'g', 'g', 'w', 'e', 'g', '$'],
 ['$', 'A', 'b', 'e', 'n', 'd', 'w', 'e', 'g', '$'],
 ['$', 'A', 'c', 'k', 'e', 'r', 'm', 'a', 'n', 'n', 'st', 'r', '$'],
 ['$', 'A', 'c', 'k', 'e', 'r', 'st', 'e', 'i', 'n', 'st', 'r', '$'],
 ['$', 'A', 'c', 'k', 'e', 'r', 'st', 'r', '$'],
 ['$', 'A', 'd', 'l', 'i', 's', 'b', 'e', 'r', 'g', 'st', 'r', '$'],
 ['$',
  'A',
  'd',
  'o',
  'l',
  'f',
  '-',
  'L',
  'ü',
  'c',
  'h',
  'i',
  'n',
  'g',
  'e',
  'r',
  '-',
  'S',
  't',
  'r',
  '$'],
 ['$', 'A', 'e', 'g', 'e', 'r', 't', 'e', 'n', 'st', 'r', '$'],
 ['$', 'A', 'e', 'h', 'r', 'e', 'n', 'w', 'e', 'g', '$'],
 ['$', 'A', 'e', 'm', 'm', 'e', 'r', 'l', 'i', 'w', 'e', 'g', '$'],
 ['$', 'A', 'e', 'm', 't', 'l', 'e', 'r', 'st', 'r', '$'],
 ['$', 'A', 'f', 'f', 'o', 'l', 't', 'e', 'r', 'n', 'st', 'r', '$'],
 ['$', 'A', 'g', 'l', 'e', 'i', 'st', 'r', '$'],
 ['$', 'A', 'g', 'n', 'e', 's', 'st', 'r', '$'],
 ['$', 'A

In [101]:
for _ in range(100):
    gen = ["$"]
    for _ in range(30):
        c = gen[-1]
        samples = list(freq_per_char2[c].keys())
        p_samples = list(freq_per_char2[c].values())
        next_c = choices(samples, p_samples)[0]
        if next_c=="$":
            break
        gen.append(next_c)
    print("".join(gen))

$Brssstätrg
$Drasse
$Lengndf
$Gustrasse
$Imatrenstrht. Burasse
$Brbeg
$Hoberasse
$Dorasse
$Derstrperasse
$Käggwelstz
$Idorelwenstrasse
$Aumstzsag
$Nerasse
$Nogasse
$Frasse
$Sim Strstrbechwierwerf
$Frasse
$Esteikestrtrg
$Nelstrasse
$Aprendaste
$Höstzgnhi
$Machohwergasse
$Zinstrbenerasse
$Holtrsstrasse
$Uerwinsstrenkastrurlzinstrachla
$Barasse
$Estralarasse
$Akatrasse
$Gelstzweg
$Drasse
$Nenperasse
$Zeachantengegeeieg
$Dacistrderstrasse
$Honsttrasse
$Astetralainkerasse
$Edeien-Re
$Pf-Watrhienbu
$Mackrldölstrstrasse
$Sutrrleg
$Homlzstilegusennstrasse
$Strsstertrneiplenbestacholvr A
$Sornfegprlenwieirasse
$Olg
$Wad-Weichweg
$Flbernterasse
$Bulaitrasse
$Frstsstenndeiswenklalweneuhar-
$Cobenkstrasse
$Sühosbautrst-Albüstrasse
$Dosteistrrerg
$Feserasse
$Berasse
$Nerasse
$Hiebmmmabatralwigeröm Bachrtrasse
$Fresigg
$Stzldofusenwermegasse
$Stsstrg
$Minsstratenhofulstrasse
$Mujatrchging
$Goserasse
$Mansttenstrstegg
$Leistrdeloli
$Hilerasse
$Riwesstrasse
$Strsquberg
$Singstrasse
$Bonstensbedegasse


In [76]:
names[0:6]

['$Aargauerstrasse$',
 '$Abeggweg$',
 '$Abendweg$',
 '$Ackermannstrasse$',
 '$Ackersteinstrasse$',
 '$Ackerstrasse$']

In [75]:
names2[0:6]

['$Aargauerstrasse$',
 '$Abeggweg$',
 '$Abendweg$',
 '$Ackermannstrasse$',
 '$Ackersteinstrasse$',
 ['$', 'W', 'ü', 'h', 'r', 'e', '$']]

### How good is our model?

In [None]:
d