# Experiment: Standart Tokenizer vs BytePairEncoding

### Standard Tokenizer

In [1]:
from Tokenizer import Tokenizer, load_sst_data, word_freq_dictionary, tokenize, clean

In [2]:
dim = 5000

In [3]:
# Loading datasets
dataset = dict()
for data in ('train', 'test', 'dev'):
    dataset[data] = load_sst_data(f"./data/{data}.txt")

In [4]:
word_freq = word_freq_dictionary(dataset['train'], sep='')
word_freq.head()

Unnamed: 0,word,freq
4,the,5954
26,a,4361
14,and,3831
46,of,3631
18,to,2438


In [5]:
tokenizer = Tokenizer(dim)
tokenizer.train(word_freq)

In [6]:
words = [
     'lessons',
     'stilted',
     'bond',
     'relentlessly',
     'caper',
     'dysfunctional',
     'multiplex',
     'pleasant',
     'asaaaa'
        ]

for word in words:
    code = tokenizer.encode(word)
    n_word = tokenizer.decode(code)
    print(word, code, n_word)

lessons 2526 lessons
stilted 4189 stilted
bond 468 bond
relentlessly 3544 relentlessly
caper 591 caper
dysfunctional 1299 dysfunctional
multiplex 2868 multiplex
pleasant 3241 pleasant
asaaaa 0 <UNK>


In [7]:
dataset['train']

[{'label': 0,
  'text': 'Yet another entry in the sentimental oh-those-wacky-Brits genre that was ushered in by The Full Monty and is still straining to produce another smash hit .'},
 {'label': 0,
  'text': 'It desperately wants to be a wacky , screwball comedy , but the most screwy thing here is how so many talented people were convinced to waste their time .'},
 {'label': 1,
  'text': 'A bit of a downer and a little over-dramatic at times , but this is a beautiful film for people who like their romances to have that French realism .'},
 {'label': 0, 'text': "It 's horribly depressing and not very well done ."},
 {'label': 1,
  'text': 'In painting an unabashedly romantic picture of a nation whose songs spring directly from the lives of the people , the movie exalts the Marxian dream of honest working folk , with little to show for their labor , living harmoniously , joined in song .'},
 {'label': 1,
  'text': "A plethora of engaging diatribes on the meaning of ` home , ' delivered i

In [8]:
import numpy as np

def one_hot_encoder(sequence, dim):
    v = np.zeros(dim)
    for i in sequence:
        if 0 <= i < dim:
            v[i] += 1
    return v

def generate_data(dataset, dim):
    n = len(dataset)
    X = np.zeros((n, dim))
    y = np.zeros(n)
    for i, line in enumerate(dataset):
        sentence = []
        for word in tokenize(line['text']):
            sentence.append(tokenizer.encode(clean(word, '')))
        X[i,:] = one_hot_encoder(sentence, dim)
        y[i] = line['label']
    
    return X, y

In [9]:
X_train, y_train = generate_data(dataset["train"], dim)
X_test, y_test = generate_data(dataset["test"], dim)
X_dev, y_dev = generate_data(dataset["dev"], dim)

In [10]:
import tensorflow as tf
import math as m
dim = 5000

units = 128

model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=units, input_shape=(dim,), activation='relu', name='hidden_layer'),
    tf.keras.layers.Dense(units=1, activation='sigmoid', name='output_layer')
])

# Compilar el modelo
model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=['accuracy']
)
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden_layer (Dense)        (None, 128)               640128    
                                                                 
 output_layer (Dense)        (None, 1)                 129       
                                                                 
Total params: 640257 (2.44 MB)
Trainable params: 640257 (2.44 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)
history = model.fit(x=X_train, y=y_train, epochs=50, validation_data=(X_dev, y_dev), callbacks=[callback])

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [12]:
history.history.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

def evaluate(X, y, model):
    y_p = (model.predict(X)>=0.5).astype(float).flatten()
    return [accuracy_score(y, y_p), precision_score(y, y_p), recall_score(y, y_p), f1_score(y, y_p)]

def get_df(model, train, test, dev):
    data = []
    for dataset in (train, test, dev):
        X, y = dataset
        data.append(evaluate(X, y, model))
    return pd.DataFrame(data, columns=['Accuracy', 'Precision', 'Recall', 'F1'], index=["Train", "Test", "Dev"])

In [19]:
get_df(model, (X_train, y_train), (X_test, y_test), (X_dev, y_dev))



Unnamed: 0,Accuracy,Precision,Recall,F1
Train,0.993064,0.993625,0.993075,0.99335
Test,0.791323,0.778714,0.812981,0.795479
Dev,0.767202,0.768374,0.777027,0.772676


In [20]:
print(X_train[:, :1].sum(axis=1).mean(), X_train[:, 1:].sum(axis=1).mean())
print(X_test[:, :1].sum(axis=1).mean(), X_test[:, 1:].sum(axis=1).mean())
print(X_dev[:, :1].sum(axis=1).mean(), X_dev[:, 1:].sum(axis=1).mean())

3.853612716763006 15.444653179190752
4.385502471169687 14.846787479406919
4.377293577981652 15.170871559633028


In [33]:
prob = word_freq.copy()
prob["freq"] = prob["freq"] / sum(prob["freq"])
prob["freq_acumulada"] = prob["freq"].cumsum()
prob

Unnamed: 0,word,freq,freq_acumulada
4,the,0.050238,0.050238
26,a,0.036797,0.087035
14,and,0.032325,0.119360
46,of,0.030637,0.149998
18,to,0.020571,0.170569
...,...,...,...
632,intermediary,0.000008,0.999966
8429,mattered,0.000008,0.999975
8427,stills,0.000008,0.999983
8425,banger,0.000008,0.999992


In [40]:
import random
def generate_word(df):
    rand = random.random()
    return df[df["freq_acumulada"]>rand]["word"].iloc[0]

In [49]:
words = []
for _ in range(5*500):
    words.append(generate_word(prob))

In [50]:
print(" ".join(words))

style mastering then through dearly nothing adults nt a or beyond is as his shows touching that nothing imitative it movie too theater their s the steal pokemon as get nt a tragedy nt before filmmaker cameo it the s in the hits giant inventive the slambang this notes out at life almost joy been fully a bluescreen them everyone funny to enhance specter the is needed just every s engaged rather either is the room should me and comic do it lovely unashamedly criterion and kind of and you this with snore nearly that fun give and at trailer engagingly the means wanders may amount shocks shallow resident besotted souls insightful plotline again have have like a pile and the a a greene have does matter haunted despite teensleaze horrifying my a a selfreflexive give disney lobby riveted by a him the the tragedy not from think it climactic is version an watchable a and edits in the definitely re that s a the the but it and production he all lane to of have it repetition cartoon grant chin the h