## Modelling text through character-level bigrams

In [186]:
# Read one street name per line, stripping trailing whitespace/newlines.
# The context manager closes the file deterministically (a bare open() inside
# the comprehension left the handle open until garbage collection).
# The encoding is stated explicitly because the names contain non-ASCII
# characters (e.g. "ü"), and the platform default encoding is not always UTF-8.
with open("../data/streets_zh.txt", encoding="utf-8") as street_file:
    names = [line.rstrip() for line in street_file]
names[:5]

['Aargauerstrasse',
 'Abeggweg',
 'Abendweg',
 'Ackermannstrasse',
 'Ackersteinstrasse']

In [187]:
# Add $ to mark beginning and ending of names.
# Names that already carry both markers are left untouched, so re-running this
# cell does not turn "$name$" into "$$name$$" (which would add spurious
# ('$', '$') bigrams to the counts). The list is updated in place.
names[:] = [
    name if len(name) >= 2 and name[0] == "$" and name[-1] == "$" else "$" + name + "$"
    for name in names
]

In [188]:
# Character-level statistics over all names:
#   chars   - every character that occurs as the first element of a bigram
#   bigrams - every distinct (first, second) pair of adjacent characters
#   freq    - how often each bigram occurs
chars, bigrams, freq = set(), set(), {}
for street in names:
    for head, tail in zip(street, street[1:]):
        pair = (head, tail)
        chars.add(head)
        bigrams.add(pair)
        freq[pair] = freq.get(pair, 0) + 1

In [200]:
# Regroup the flat bigram counts into a nested mapping:
#   first character -> {following character -> count}
freq_per_char = {}
for first in chars:
    freq_per_char[first] = {}
for (first, second), count in freq.items():
    freq_per_char[first][second] = count

In [None]:
# Convert each character's successor counts into relative frequencies.
# This overwrites freq_per_char in place: afterwards its values are
# probabilities that sum to 1 per first character, no longer raw counts.
for successors in freq_per_char.values():
    total = sum(successors.values())
    for following in successors:
        successors[following] = successors[following] / total

In [173]:
from random import choices

# Sample 100 names from the character bigram model.
# Every name starts at the "$" marker; the next character is drawn with the
# weights stored in freq_per_char for the current last character. A name ends
# when "$" is drawn again or after 30 characters have been appended.
for _ in range(100):
    gen = ["$"]
    for _ in range(30):
        successors = freq_per_char[gen[-1]]
        (next_c,) = choices(list(successors), list(successors.values()))
        gen.append(next_c)
        if next_c == "$":
            break
    print("".join(gen))
        

$Baiachmmbastofessse$
$Wooornstrsstred-Kobe Stendeseg
$Dre$
$Helastedimasttie$
$Dairlileg$
$Virauang$
$Masegetr$
$Werazg$
$Harasckeltrarassemaumeasstense
$Nigfe$
$Hühöckendi$
$Schssssegse$
$Amise$
$Nasense$
$Ze$
$Augas$
$Kestrsederastbe$
$Klotrare$
$Lolanegaratrasse$
$Irbaschasscholinwe$
$He$
$Imoralenwitrapfsselik$
$Ampldelwe$
$Maseratz$
$Ilarhreg$
$Mubassselesstterengasenweraufs
$Hasstrastrasserstrg$
$Frastzweransebatastrse$
$Kühsse$
$Listenssegabhase$
$Trd-Frachnseinsttrsckese$
$Gueserolichsse$
$Relense$
$Imerhalpleg$
$Elssse$
$Amme$
$Inzwe$
$Dasssässteggastz$
$Gegde$
$Ge$
$Otrarase$
$Oberolssenk$
$Ersseg$
$Ime$
$We$
$Enwenkerarackelwerssstrstr$
$Bastrase$
$Sofsstralerg$
$Zwe$
$Klutwergachtrastrsensserstratr
$Wadelinfsstrasseghöftsebalike$
$Wirasstrasse$
$Sckebe$
$Kusste$
$Geg$
$Zeng$
$Otrg$
$Memarte$
$Re$
$Scke$
$Gorarug$
$Worasssssstramassse$
$Sttrsenranpsenbugssberg$
$Nüg$
$Usse$
$Lene$
$Albassserrchünfhsänhastranwens
$Lofwe$
$Paatrarassse$
$Stz$
$Rasorassedwedasenmaselweinnf-F
$

In [190]:
# The five most frequent bigrams, listed in ascending order of count.
sorted(freq.items(), key=lambda item: item[1])[-5:]

[(('r', 'a'), 1189),
 (('e', '$'), 1283),
 (('a', 's'), 1309),
 (('s', 's'), 1359),
 (('s', 'e'), 1366)]

{'E': {'i': 20,
  's': 7,
  'g': 6,
  'b': 1,
  'd': 4,
  'f': 1,
  'k': 2,
  'l': 9,
  'm': 4,
  'n': 11,
  'r': 13,
  't': 4,
  'u': 4,
  'y': 2},
 't': {'r': 1110,
  'e': 262,
  'l': 27,
  '-': 26,
  's': 65,
  'z': 103,
  'h': 28,
  't': 94,
  'w': 27,
  'b': 8,
  'i': 38,
  'o': 31,
  'a': 53,
  'n': 4,
  'g': 9,
  'u': 6,
  'f': 1,
  'p': 2,
  '$': 14,
  '.': 7,
  'ü': 4,
  'q': 1,
  'ä': 3,
  'ö': 5},
 'F': {'e': 24,
  'o': 8,
  'a': 7,
  'ä': 1,
  'i': 5,
  'I': 1,
  'A': 1,
  'l': 9,
  'ö': 2,
  'r': 43,
  'u': 7,
  'ü': 3},
 '.': {'-': 3, ' ': 7},
 'x': {'-': 4, 'w': 1},
 'ü': {'c': 16,
  't': 27,
  'h': 53,
  'd': 6,
  'm': 2,
  'g': 7,
  'n': 21,
  'l': 3,
  'r': 22,
  's': 15,
  'b': 5,
  'e': 2,
  'p': 1,
  'f': 1},
 'b': {'e': 165,
  'i': 14,
  'u': 12,
  'ü': 29,
  'o': 5,
  'r': 30,
  'a': 56,
  's': 16,
  'l': 13,
  'h': 5,
  '-': 4,
  't': 1,
  '$': 1,
  'm': 1,
  'y': 1,
  'b': 2,
  'g': 2,
  'ä': 2},
 'K': {'a': 31,
  'i': 14,
  'o': 13,
  'l': 21,
  'e': 11,
  'r'

## Use a smarter way to tokenize 

In [166]:
# Byte-pair encoding. In every iteration step:
# 1. find the most common pair of adjacent tokens
# 2. merge it into one token and add that token to the vocabulary
# 3. recount the pairs over the re-tokenized names


def _merge_pair(tokens, pair):
    """Return `tokens` as a list in which every left-to-right, non-overlapping
    occurrence of the adjacent token pair `pair` is replaced by one token made
    of the two concatenated.

    `tokens` may be a str (one character per token) or a list of string tokens;
    `pair` is a 2-tuple of string tokens.
    """
    merged_token = "".join(pair)
    merged = []
    pos = 0
    while pos < len(tokens):
        if pos + 1 < len(tokens) and (tokens[pos], tokens[pos + 1]) == pair:
            merged.append(merged_token)
            pos += 2  # both halves of the pair are consumed
        else:
            merged.append(tokens[pos])
            pos += 1
    return merged


new_tokens = set()
n_iterations = 12
names2 = names.copy()
vocab2 = set()
bigrams2 = set()
freq2 = freq.copy()

for n in range(n_iterations):
    if not freq2:
        # No adjacent pairs are left to merge.
        break
    #1 - on ties the pair that was counted last wins (sorted() is stable)
    new_token, _ = sorted(freq2.items(), key=lambda item: item[1])[-1]
    print(f"{new_token=}")
    #2
    new_tokens.add("".join(new_token))

    # Names are handled as lists of tokens so merged (multi-character) tokens
    # can be represented. The merge no longer relies on the last token ending
    # in "$", so the final token is kept for any input, including names that
    # consist of a single token.
    names2 = [_merge_pair(street, new_token) for street in names2]

    #3 - vocab2 and bigrams2 accumulate over all iterations, freq2 is rebuilt
    freq2 = dict()
    for street in names2:
        for pair in zip(street, street[1:]):
            vocab2.add(pair[0])
            bigrams2.add(pair)
            freq2[pair] = 1 + freq2.get(pair, 0)

# Nested view of the final counts: first token -> {following token -> count}.
# Built once after the loop; only the state after the last merge is used.
freq_per_char2 = {token: dict() for token in vocab2}
for (first, second), count in freq2.items():
    freq_per_char2.setdefault(first, dict())[second] = count
    

new_token=('I', 'N')
new_token=('E', ' ')
new_token=('R', 'O')
new_token=('E', '$')
new_token=('T', 'A')
new_token=('L', 'O')
new_token=('R', 'I')
new_token=('IN', ' ')
new_token=('O', 'N')
new_token=(' ', 'C')
new_token=('A', 'N')
new_token=('E', 'R')


In [167]:
# Sample 100 names from the merged-token model.
# Generation starts at "$" and appends tokens drawn with the weights stored in
# freq_per_char2 for the current last token. It stops once a drawn token ends
# in "$" (merged tokens may contain the end marker) or after 30 tokens.
for _ in range(100):
    gen = ["$"]
    for _ in range(30):
        followers = freq_per_char2[gen[-1]]
        candidates = list(followers)
        weights = list(followers.values())
        next_c = choices(candidates, weights)[0]
        gen.append(next_c)
        if next_c[-1] == "$":
            break
    print("".join(gen))

$TROCLORIDONTAM HLOITEQ IN PIONTAD W/30, 
$TIUNOGYDE 5% ET$
$BZ$
$EDET$
$ACIRDID ANCICLODROVIPARMBIROP$
$TIMASOLADRY$
$TE FOTANE 5%, 10% 20$
$M IN DA$
$ORVER$
$POS AMILLIN PROC-00 IANDE$
$CHLORIDESE LEELC CONE$
$ENOTHATASIUBICHLOXEFADRAINE M$
$L$
$STAR ANCAPLARATENETASTIUPHAQUIANDE 
$EE$
$PRIDROMD IN PATIROPA AN$
$COLAL$
$PTRUINE 5% KAMITABRETRP X 5/0 L$
$KETALEDE$
$Z IN PSITATE$
$PINERMIDOSACHM$
$ARIDE$
$RIVAMISILASYC CON ANINER$
$SODOC CONTATE ANDROMIDIPA$
$MODIEQUMONTAINER$
$MENORLARIXTHADE 0$
$HASORENBIIBEOLFICHFUNAPHYLARAS
$ENTICARMEORINETENOD IN PRAGRETAINE F
$OVALVOPROVITIONA$
$S$
$ZIA$
$ZINER$
$LEX$
$GL$
$ARARANEFLORITIS STHUT IN$
$OL W/Z-DAPROCAZORIN$
$DN'SSISAL$
$ACICHLOVIPTAINER$
$MIDIS$
$AFOMEDE$
$OLFOTAINOSSUMOLUMESINATITAIN$
$VERYLETY-"280.99%$
$TROSIULEMIN SO$
$LORIDUCATE$
$XRACARIAFOSTIC CLARANTAIN$
$M CONTANOCODROSOLAROL ADE FLTESOLORIPA
$ECONTASYD 4)$
$DE 1% EDARAQUPOL 75% SE$
$A-TIUMARTERRA$
$C CONTAVE IN PLAM$
$LY$
$TOPRANOZILF$
$AR BAN$
$ORM ECIN W/30.45% (FALLAPZI

In [76]:
# First six entries of the character-level name list.
names[:6]

['$Aargauerstrasse$',
 '$Abeggweg$',
 '$Abendweg$',
 '$Ackermannstrasse$',
 '$Ackersteinstrasse$',
 '$Ackerstrasse$']

In [75]:
# First six entries of the re-tokenized name list, for comparison with `names`.
names2[:6]

['$Aargauerstrasse$',
 '$Abeggweg$',
 '$Abendweg$',
 '$Ackermannstrasse$',
 '$Ackersteinstrasse$',
 ['$', 'W', 'ü', 'h', 'r', 'e', '$']]

### How good is our model?

In [None]:
# compute negative log likelihood of above model
# use NN to learn parameters (freq dictionary) to minimize the negative log likelihood

In [148]:
import pandas as pd
# Read the tab-separated file into a DataFrame; lines pandas cannot parse are skipped.
a = pd.read_csv(
    "../data/fda.txt",
    sep="\t",
    on_bad_lines="skip",
)

In [152]:
# Number of distinct values in the DrugName column (a missing value, if present, counts as one).
a["DrugName"].nunique(dropna=False)

7753

In [159]:
# Write every distinct drug name on its own line.
# Missing values are dropped first (formatting one would write the literal
# "nan"), and the names are sorted so the file content is the same on every
# run instead of following set iteration order.
# NOTE(review): this overwrites '../data/fda.txt', the same path the table `a`
# was read from. After this cell has run, the tab-separated original is gone
# and the read_csv cell can no longer be re-run - consider a separate output
# file.
drug_names = sorted({str(name) for name in a["DrugName"].dropna()})
with open('../data/fda.txt', 'w') as f:
    for line in drug_names:
        f.write(f"{line}\n")

{'MARINOL',
 'ORTIKOS',
 'LODINE XL',
 'NEXCEDE',
 'EDECRIN',
 'HYDROCORTISONE ACETATE 1% AND PRAMOXINE HYDROCHLORIDE 1%',
 'AMINOSYN II 5% W/ ELECTROLYTES IN DEXTROSE 25% W/ CALCIUM IN PLASTIC CONTAINER',
 'SAXENDA',
 'ACCUTANE',
 'DELZICOL',
 'RIFAMPIN',
 'BRETHAIRE',
 'DEXTROSE 5%, SODIUM CHLORIDE 0.33% AND POTASSIUM CHLORIDE 30MEQ IN PLASTIC CONTAINER',
 'PRANDIN',
 'FAMVIR',
 'PROCAINE HYDROCHLORIDE W/ EPINEPHRINE',
 'MEDICAL AIR, USP',
 'AQUATAG',
 'TRAVASOL 2.75% IN DEXTROSE 20% IN PLASTIC CONTAINER',
 'RIVASTIGMINE TARTRATE',
 'LEVOFLOXACIN',
 'SODIUM CHLORIDE 0.9% AND POTASSIUM CHLORIDE 0.15% IN PLASTIC CONTAINER',
 'NEO-POLYCIN',
 'CONTRAVE',
 'NAMENDA XR',
 'FORADIL',
 'ALLEGRA ALLERGY',
 'MECAMYLAMINE HYDROCHLORIDE',
 'IVADANTIN',
 'PARATHAR',
 'ZINBRYTA',
 'CODAMINE',
 'CODEINE, ASPIRIN, APAP FORMULA NO. 4',
 'JYLAMVO',
 "LACTATED RINGER'S AND DEXTROSE 5% IN PLASTIC CONTAINER",
 'SINEMET CR',
 'AMINOSYN II 15% IN PLASTIC CONTAINER',
 'ANTHELIOS 40',
 'LORATADINE AND PSEUDO

## log likelihood

In [215]:
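# Bayes: P(parameters|data) = P(data|parameters)*P(parameters)/P(data)
# Under the bigram model, the likelihood of a name is the product of its transition probabilities:
# P("$Aargauerstrasse$"| parameters) = P(A|$) * P(a|A) * P(r|a) * ... * P($|e)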


In [208]:
# Use the first name in the list as the worked example for the likelihood computation below.
sample = names[0]

In [221]:
from math import log

In [223]:
# Likelihood of each name under the bigram model: the product of the
# probabilities freq_per_char[first][second] of its adjacent character pairs.
# Prints every pair with its probability, then the likelihood, the
# log-likelihood and the negative log-likelihood.
for street in [sample]:
    likelihood = 1
    log_likelihood = 0.0
    for pair in zip(street, street[1:]):
        p = freq_per_char[pair[0]][pair[1]]
        likelihood *= p
        # Sum the logs rather than taking log(likelihood): the product can
        # underflow to 0.0 for long inputs, and log(0.0) raises ValueError.
        log_likelihood += log(p)
        print(pair, p)
    print(likelihood)
    print(log_likelihood)
    print(-log_likelihood)

('$', 'A') 0.055776892430278883
('A', 'a') 0.00847457627118644
('a', 'r') 0.04895104895104895
('r', 'g') 0.05328376703841388
('g', 'a') 0.15490375802016498
('a', 'u') 0.047639860139860137
('u', 'e') 0.05530973451327434
('e', 'r') 0.16194644696189495
('r', 's') 0.0912845931433292
('s', 't') 0.28368964688926257
('t', 'r') 0.5757261410788381
('r', 'a') 0.49111937216026436
('a', 's') 0.5721153846153846
('s', 's') 0.3264472736007687
('s', 'e') 0.3281287533029066
('e', '$') 0.3303295571575695
1.2080002980894415e-14
-32.04722495564126
32.04722495564126
