## Modelling text through character-level bigrams

In [1]:
names = [line.rstrip() for line in open("../data/streets_zh.txt")]
names[:5]

['Aargauerstrasse',
 'Abeggweg',
 'Abendweg',
 'Ackermannstrasse',
 'Ackersteinstrasse']

In [2]:
# Add $ to mark beginning and ending of names
for i in range(len(names)):
    names[i] = "$" + names[i] + "$"

In [3]:
chars = set()
bigrams = set()
freq = dict()
for street in names:
    for pair in zip(street, street[1:]):
            chars.add(pair[0])
            bigrams.add(pair)
            freq[pair] = 1+freq.get(pair, 0)

In [4]:
freq_per_char = {i : dict() for i in chars}
for bigram, f in freq.items():
    i, j = bigram
    freq_per_char[i][j] = f

In [5]:
for k in freq_per_char.keys():
    total = sum(list(freq_per_char[k].values()))
    for v in freq_per_char[k]:
        freq_per_char[k][v]/= total

In [6]:
from random import choices

for _ in range(100):
    gen = ["$"]
    for _ in range(30):
        c = gen[-1]
        samples = list(freq_per_char[c].keys())
        p_samples = list(freq_per_char[c].values())
        next_c = choices(samples, p_samples)[0]
        gen.append(next_c)
        if next_c=="$":
            break
    print("".join(gen))
        

$Masse$
$Hallachassgi-Süd-Müserastasens
$Rubasbe$
$Am Wauckenralerwehegsblmm Stri
$Obentrasseggasgwiweustratrolas
$Eistalg$
$Hurostrastrastse$
$We$
$Pfipere$
$Obanseirlie$
$Auaslssteurachomünniselöst$
$Re$
$Rotrrckegg$
$Geisstengseg$
$Gise$
$Webasttzwelsersstrasstrerastrn
$Schöwengerlen$
$Lolasssse$
$We$
$Zwigeransserinbele$
$Henerasenetestraule$
$Zütrm Sinz$
$Wege$
$Hof$
$Sotrofssesserste$
$Hoseng$
$Hoteig$
$Schastrsserasennasse$
$Flasstz$
$Rorad$
$Kldeg$
$Gneraelstzwe$
$Folähsenbe$
$Imolg$
$Biktrwimofe$
$Ze$
$Mape$
$Gesetrinenbee$
$Kerssserase$
$Rür-Krachense$
$Jachggwen$
$Harherase$
$Schalialerassssstzhühoohmastra
$Schachastenkurachplolagsstr$
$Jorachwerenseidstre$
$Flininbrdweng$
$Feg$
$Kelz$
$Häswedasserasse$
$Rig$
$Ematrastrasstibe$
$Nästegg$
$Hersste$
$Utzegas$
$Strstrase$
$Ne$
$Stlverghstrag$
$Löbenge$
$Stoltrd$
$Kli-St-Splilenz$
$Gasteurasegase$
$Röfuracharase$
$Rolassterm Sckserim Pertrasstz
$Ettrzweggf$
$Kachblpfersegasseg$
$Obeinaseie$
$Lamaseildge$
$Gnklsstze$
$Strlihe$
$R

## Use a smarter way to tokenize 

In [7]:
#In every iteration step (Byte-pair encoding): 
# 1. find most commmon pair
# 2. merge them and add to vocabulary
# 3. get counts on new vocabulary

new_tokens = set()
n_iterations = 12
names2 = names.copy()
vocab2 = set()
bigrams2 = set()
freq2 = freq.copy()

for n in range(n_iterations):
    #1
    new_token, _ = sorted(list(freq2.items()), key= lambda x: x[1])[-1]
    print(f"{new_token=}")
    #2
    new_tokens.add("".join(new_token))

    # need to treat words in wordlist as list of chars so we can handle merged tokens
    new_names = names.copy()
    temp_list = []
    for i, street in enumerate(names2):
        temp = []
        skip = False
        for i in range(len(street)-1):
            if skip:
                skip = False
                continue
            if new_token == (street[i], street[i+1]):
                temp.append("".join(new_token))
                skip = True
            else:
                temp.append(street[i])
        temp_list.append(temp+[street[-1]] if temp[-1][-1]!="$" else temp)
    names2 = temp_list
    
    # create count of tokens
    freq2 = dict()
    for street in names2:
        for pair in zip(street, street[1:]):
                vocab2.add(pair[0])
                bigrams2.add(pair)
                freq2[pair] = 1+freq2.get(pair, 0)
    freq_per_char2 = {i : dict() for i in vocab2}
    for bigram, f in freq2.items():
        i, j = bigram
        freq_per_char2[i][j] = f
    

new_token=('s', 'e')
new_token=('a', 's')
new_token=('as', 'se')
new_token=('asse', '$')
new_token=('s', 't')
new_token=('r', 'asse$')
new_token=('st', 'rasse$')
new_token=('e', 'r')
new_token=('e', 'n')
new_token=('g', '$')
new_token=('e', 'g$')
new_token=('c', 'h')


In [8]:
for _ in range(100):
    gen = ["$"]
    for _ in range(30):
        c = gen[-1]
        samples = list(freq_per_char2[c].keys())
        p_samples = list(freq_per_char2[c].values())
        next_c = choices(samples, p_samples)[0]
        gen.append(next_c)
        if next_c[-1]=="$":
            break
    print("".join(gen))

$Spenplanhalatzweg$
$Neiassin$
$Alavegatpolkweg$
$Hinenstrasse$
$Sirgls$
$Sühlhm Tibenpppweg$
$Selstrasse$
$Schögasse$
$Beusibuläue$
$Gicheiepfenstrasse$
$Brmellig$
$Riggkagleistrasse$
$Lakerglstrasse$
$Oes$
$Ratzenanweg$
$Petthlieusckeshldstrasse$
$Bügstrasse$
$Käbrauneteseruschhoplanerwalergagw
$Weg$
$Kölad-We$
$Grastrasse$
$Machwageledatstrasse$
$Zwienken$
$Heiesenstrasse$
$Muhlergstrasse$
$Elingenastgasse$
$Uleipinkergstetlichstrasse$
$Schöckweg$
$Grchholzestegasse$
$Borm Korardwili-Rauber$
$Imooohat$
$Jus$
$Toschiweg$
$Weistrasse$
$Bastistesenstrasse$
$Sterhnanzstrasse$
$Ho-Schstrasse$
$Lustrasse$
$Butrasse$
$Bienstrasse$
$Zolbsm Stel$
$Säckenstergasse$
$Giadenstrasse$
$Histrasse$
$Maweg$
$Stlgstrasse$
$Gisenkstrasse$
$Geiz$
$Grstrasse$
$Gelmachtofickwimperenhös-Wo-Peichh
$Halbergatstrasse$
$Ileinditistrasse$
$Uleistrasse$
$Klf-Mos$
$Gweg$
$Honstrasse$
$Schbes$
$Wieiscomizesch-Sirldelllkitügweg$
$Lattz$
$Margstrasse$
$Bägstrasse$
$Bädhofalig$
$Bäckstrasse$
$Ungstrasse$
$Bühanelela

## How good is our model?

{'MARINOL',
 'ORTIKOS',
 'LODINE XL',
 'NEXCEDE',
 'EDECRIN',
 'HYDROCORTISONE ACETATE 1% AND PRAMOXINE HYDROCHLORIDE 1%',
 'AMINOSYN II 5% W/ ELECTROLYTES IN DEXTROSE 25% W/ CALCIUM IN PLASTIC CONTAINER',
 'SAXENDA',
 'ACCUTANE',
 'DELZICOL',
 'RIFAMPIN',
 'BRETHAIRE',
 'DEXTROSE 5%, SODIUM CHLORIDE 0.33% AND POTASSIUM CHLORIDE 30MEQ IN PLASTIC CONTAINER',
 'PRANDIN',
 'FAMVIR',
 'PROCAINE HYDROCHLORIDE W/ EPINEPHRINE',
 'MEDICAL AIR, USP',
 'AQUATAG',
 'TRAVASOL 2.75% IN DEXTROSE 20% IN PLASTIC CONTAINER',
 'RIVASTIGMINE TARTRATE',
 'LEVOFLOXACIN',
 'SODIUM CHLORIDE 0.9% AND POTASSIUM CHLORIDE 0.15% IN PLASTIC CONTAINER',
 'NEO-POLYCIN',
 'CONTRAVE',
 'NAMENDA XR',
 'FORADIL',
 'ALLEGRA ALLERGY',
 'MECAMYLAMINE HYDROCHLORIDE',
 'IVADANTIN',
 'PARATHAR',
 'ZINBRYTA',
 'CODAMINE',
 'CODEINE, ASPIRIN, APAP FORMULA NO. 4',
 'JYLAMVO',
 "LACTATED RINGER'S AND DEXTROSE 5% IN PLASTIC CONTAINER",
 'SINEMET CR',
 'AMINOSYN II 15% IN PLASTIC CONTAINER',
 'ANTHELIOS 40',
 'LORATADINE AND PSEUDO

In [10]:
from math import log

In [209]:
total = 0
for street in names:
    likelihood = 1
    for pair in zip(street, street[1:]):
        likelihood *=  freq_per_char[pair[0]][pair[1]]
    total += -log(likelihood)
print(likelihood)
print(log(likelihood))
print(total/len(names))

8.974390446462344e-15
-32.3444013796715
29.491154302940576


In [26]:
import torch
import torch.nn.functional as F
chars2int = {j:i for i,j in enumerate(chars)}

In [201]:
weights = torch.rand([len(chars), len(chars)], requires_grad=True)

In [170]:
out.min()

tensor(0., grad_fn=<MinBackward1>)

In [177]:
min(out)

tensor(0., grad_fn=<UnbindBackward0>)

In [194]:
torch.tensor([1, 2]) + 0.1

tensor([1.1000, 2.1000])

In [208]:
optimizer = torch.optim.SGD([weights], lr=1e-2)
iterations = 20
for _ in range(iterations):
    total_loss = 0
    for street in names:
        batch_loss = 0
        for pair in zip(street, street[1:]):
            #print(pair)
            inp = F.one_hot(torch.tensor(chars2int[pair[0]]), num_classes = len(chars)).float()
            target = F.one_hot(torch.tensor(chars2int[pair[1]]), num_classes = len(chars)).float()
            out = torch.matmul(weights, inp)
            out = out - out.min() + 1e-3
            out = out/out.sum() # or softmax
            likelihood = torch.matmul(out, target)
            #print(likelihood, out)
            nll = -torch.log(likelihood)
            #print(nll)
            batch_loss += nll
        total_loss += batch_loss
        #print(batch_loss)
        batch_loss.backward()
        optimizer.step()
    print(total_loss/len(names))

tensor(44.9318, grad_fn=<DivBackward0>)
tensor(44.9864, grad_fn=<DivBackward0>)
tensor(44.9819, grad_fn=<DivBackward0>)


KeyboardInterrupt: 

In [141]:
tensor(47.9957, grad_fn=<DivBackward0>)
tensor(46.6217, grad_fn=<DivBackward0>)
tensor(45.9667, grad_fn=<DivBackward0>)


tensor([[ 0.2481,  0.3928,  0.5752,  ...,  0.0691,  0.7404,  0.4393],
        [ 0.1059,  0.1935,  0.8688,  ...,  0.4674,  0.9046,  0.4681],
        [ 0.1003,  0.0408,  0.0404,  ..., -0.0074,  0.1837,  0.6896],
        ...,
        [ 0.5165,  0.7170,  0.1036,  ..., -0.0599,  0.4424,  0.5370],
        [ 0.9685,  0.9567,  0.3389,  ..., -0.0942,  0.0415,  0.4950],
        [ 0.0741,  0.5331,  0.9845,  ...,  0.7413,  0.6096,  0.4676]],
       requires_grad=True)

In [106]:
out

tensor([0.0175, 0.0275, 0.0173, 0.0248, 0.0136, 0.0186, 0.0280, 0.0199, 0.0036,
        0.0092, 0.0101, 0.0191, 0.0147, 0.0163, 0.0275, 0.0215, 0.0065, 0.0290,
        0.0132, 0.0217, 0.0149, 0.0297, 0.0273, 0.0185, 0.0086, 0.0058, 0.0033,
        0.0011, 0.0177, 0.0196, 0.0294, 0.0225, 0.0321, 0.0125, 0.0285, 0.0130,
        0.0198, 0.0046, 0.0012, 0.0305, 0.0033, 0.0122, 0.0282, 0.0109, 0.0231,
        0.0265, 0.0214, 0.0014, 0.0190, 0.0002, 0.0126, 0.0113, 0.0311, 0.0185,
        0.0268, 0.0136, 0.0163, 0.0231], grad_fn=<DivBackward0>)

In [50]:
(out/out.sum()).sum()

tensor(1.)

In [66]:
torch.arange(5)

tensor([0, 1, 2, 3, 4])

In [164]:
torch.nn.functional.normalize(torch.tensor([1.0, 2.0]))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
# Model with a NN
# Check embeddings (similair embedding between "Strasse", "Weg")