In [1]:
import json
import pickle
from collections import Counter

In [2]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras

# load data

In [3]:
# Load the pickled mapping of law title -> law object and pick out the AODA.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open('./data/law_objects.p', 'rb') as law_file:
    laws = pickle.load(law_file)
aoda = laws['Accessibility for Ontarians with Disabilities Act, 2005, S.O. 2005, c. 11']

In [4]:
# load the table of contents...
# df_toc = pd.DataFrame(aoda.table_of_content.values())
# df_toc

## make training data for model

In [5]:
def build_dataset(words, n_words):
    """Encode a token sequence as integer indices over a frequency-ranked vocabulary.

    Index 0 is reserved for the 'UNK' (unknown) token; indices 1..n_words-1 are
    assigned to the ``n_words - 1`` most common tokens in descending order of
    frequency. A literal 'UNK' token in ``words`` is treated as unknown, so it
    can never collide with (or overwrite) the reserved slot.

    Args:
        words: re-iterable sequence of tokens (it is traversed twice).
        n_words: total vocabulary size, including the 'UNK' slot.

    Returns:
        data: list of vocabulary indices, one per input token.
        count: ``[['UNK', n_unknown], (word, frequency), ...]`` in index order.
        dictionary: mapping word -> index.
        reversed_dictionary: mapping index -> word.
    """
    unk_token = 'UNK'
    # Leave the sentinel out of the frequency ranking; otherwise a corpus that
    # contains the literal token 'UNK' would reassign its index and make the
    # next word share that index.
    frequencies = Counter(word for word in words if word != unk_token)
    count = [[unk_token, -1]]
    count.extend(frequencies.most_common(n_words - 1))
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    # Anything outside the vocabulary falls back to index 0 (dictionary['UNK']).
    data = [dictionary.get(word, 0) for word in words]
    count[0][1] = data.count(0)
    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [6]:
# Tokenize the act's text by splitting on whitespace only; no lowercasing or
# punctuation stripping happens here, so tokens such as 'order;' keep their punctuation.
# NOTE(review): assumes get_full_text() returns a single str — confirm against the law object class.
full_text = aoda.get_full_text().split()

In [7]:
# Vocabulary size, counting index 0, which build_dataset reserves for the 'UNK' token.
vocab_size = 1000

In [9]:
# data: the text as a list of vocabulary indices; count: word/frequency pairs;
# dictionary and reversed_dictionary: word -> index and index -> word lookups.
data, count, dictionary, reversed_dictionary = build_dataset(full_text, vocab_size)

In [10]:
# Per-index sampling table that is handed to skipgrams below.
# NOTE(review): per the Keras docs this presumably assumes index i is the i-th most
# frequent word, which matches build_dataset's most_common ordering — confirm.
sampling_table = keras.preprocessing.sequence.make_sampling_table(vocab_size)

In [11]:
# Context window passed to skipgrams (presumably the maximum token distance between
# a target word and its context word — see the Keras skipgrams docs).
window_size = 3

In [12]:
# Build the training pairs from the encoded text: couples holds (target, context)
# index pairs and labels holds one label per pair.
# NOTE(review): with the default negative sampling, labels presumably mix 1 (observed
# pair) and 0 (random pair), and the output order is shuffled without a fixed seed — confirm.
couples, labels = keras.preprocessing.sequence.skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)

In [13]:
# Transpose the list of (target, context) pairs into two parallel int32 index arrays.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

# define model

In [8]:
# Dimensionality of each word's embedding vector (output width of the Embedding layer).
vector_dim = 50

In [14]:
# Two inputs per sample, each holding a single word index: the target and the context word.
input_target = keras.Input((1,))
input_context = keras.Input((1,))

# One embedding layer mapping vocab_size indices to vector_dim-wide vectors; the same
# layer object is applied to both inputs, so target and context share weights.
embedding = keras.layers.Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

In [15]:
# Look up the target word's embedding; the result has shape (batch, 1, vector_dim).
target = embedding(input_target)

In [16]:
target.shape

TensorShape([None, 1, 50])

In [17]:
# Reshape to (batch, vector_dim, 1) so the embedding values sit on axis 1, the axis
# the Dot layer contracts over later.
target = keras.layers.Reshape((vector_dim, 1))(target)

In [18]:
target.shape

TensorShape([None, 50, 1])

In [19]:
# Embed the context word with the shared layer and reshape it to (batch, vector_dim, 1),
# mirroring what was done for the target word.
context = keras.layers.Reshape((vector_dim, 1))(
    embedding(input_context)
)

In [20]:
context.shape

TensorShape([None, 50, 1])

In [21]:
# keras.layers.Dot?

In [22]:
# now perform the dot product operation to get a similarity measure
dot_product = keras.layers.Dot(axes=(1, 1))([target, context])
dot_product = keras.layers.Reshape((1,))(dot_product)
# add the sigmoid output layer
output = keras.layers.Dense(1, activation='sigmoid')(dot_product)

In [23]:
dot_product.shape

TensorShape([None, 1])

In [24]:
# keras.models.Model?

In [25]:
# Assemble the two-input model (target index, context index) -> sigmoid score and
# train it as a binary classifier against the skip-gram labels.
model = keras.models.Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# training

In [26]:
# Single-element buffers that train_on_idx overwrites on every call:
# arr_1 = target word index, arr_2 = context word index, arr_3 = label.
arr_1, arr_2, arr_3 = (np.zeros((1,)) for _ in range(3))


def train_on_idx(idx):
    """Run one gradient step on the single training pair at position ``idx``.

    Copies the pair's target index, context index and label into the shared
    module-level buffers, then returns whatever ``model.train_on_batch`` reports
    for that one-sample batch.
    """
    arr_1[0] = word_target[idx]
    arr_2[0] = word_context[idx]
    arr_3[0] = labels[idx]
    return model.train_on_batch([arr_1, arr_2], arr_3)


# Each "epoch" performs len(labels) single-sample updates, drawing the sample
# uniformly at random with replacement.
n_samples = len(labels)
for epoch in range(5):
    for i in range(n_samples):
        # randint's upper bound is exclusive, so pass n_samples itself; the previous
        # `len(labels) - 1` bound could never select the last pair.
        idx = np.random.randint(n_samples)
        # Convert to a built-in float so json.dumps accepts the value even when
        # Keras hands back a NumPy scalar.
        loss = float(train_on_idx(idx))
        if i % 100 == 0:
            print(f"epoch={epoch}, iteration={i}, loss={loss}")
            status_dict = dict(
                epoch=epoch,
                iteration=i,
                loss=loss,
            )
            # Machine-readable copy of the same progress record (needs the json import).
            print(json.dumps(status_dict))


epoch=0, iteration=0, loss=0.6934535503387451
epoch=0, iteration=100, loss=0.6907022595405579
epoch=0, iteration=200, loss=0.7000821828842163
epoch=0, iteration=300, loss=0.6992325186729431
epoch=0, iteration=400, loss=0.7059688568115234
epoch=0, iteration=500, loss=0.7028411030769348
epoch=0, iteration=600, loss=0.6720324158668518
epoch=0, iteration=700, loss=0.6694661378860474
epoch=0, iteration=800, loss=0.7160507440567017
epoch=0, iteration=900, loss=0.7050637602806091
epoch=0, iteration=1000, loss=0.6892139911651611
epoch=0, iteration=1100, loss=0.6951765418052673
epoch=0, iteration=1200, loss=0.6999302506446838
epoch=0, iteration=1300, loss=0.6990377902984619
epoch=0, iteration=1400, loss=0.715488076210022
epoch=0, iteration=1500, loss=0.6624873280525208
epoch=0, iteration=1600, loss=0.7055563926696777
epoch=0, iteration=1700, loss=0.6980881690979004
epoch=0, iteration=1800, loss=0.7003752589225769
epoch=0, iteration=1900, loss=0.7037999629974365
epoch=0, iteration=2000, loss=0.6

# inspect model

In [27]:
# First weight array of the embedding layer; row i is the vector for word index i.
w = embedding.get_weights()[0]

In [28]:
# Vocabulary index of the probe word. Note this rebinds the global `idx` that the
# training loop also uses; a KeyError here means 'person' is not in the vocabulary.
idx = dictionary['person']

In [29]:
w[idx]

array([-0.06638943,  0.11270314,  0.11104234,  0.06465197, -0.1461588 ,
       -0.02137691, -0.21196291, -0.07766849, -0.04437681, -0.09051446,
        0.00324201,  0.09179306, -0.07834518,  0.02021119, -0.0828746 ,
        0.10674976, -0.15503407,  0.06106364, -0.0236591 ,  0.0548764 ,
        0.09322184, -0.04171119,  0.0598115 , -0.05916625, -0.14053914,
        0.11972107,  0.16020969,  0.01603727, -0.09535272, -0.17483722,
        0.03450698, -0.12684678,  0.00837175, -0.0446505 , -0.04457981,
       -0.07069454, -0.04461625, -0.07375158, -0.02310784, -0.07724912,
       -0.06035157,  0.04088867, -0.08193712,  0.04613021,  0.07917677,
        0.18445614,  0.01336339, -0.15753128, -0.02324865,  0.02681453],
      dtype=float32)

In [33]:
w[idx].shape

(50,)

## query nearest neighbors in embedding space

In [34]:
from sklearn.neighbors import KNeighborsTransformer

In [35]:
# Cosine-distance neighbour index over the embedding rows. n_neighbors=1 is only the
# default here; the query below passes its own n_neighbors.
knn = KNeighborsTransformer(n_neighbors=1, metric='cosine')

In [36]:
# Index every embedding row, so returned neighbour indices are vocabulary indices.
knn.fit(w)

KNeighborsTransformer(algorithm='auto', leaf_size=30, metric='cosine',
                      metric_params=None, mode='distance', n_jobs=1,
                      n_neighbors=1, p=2)

In [63]:
# Query the five nearest embeddings to the probe word. kneighbors expects a 2-D
# array, hence the reshape to a single-row matrix. The probe word itself is part of
# the fitted data, so it comes back first with distance 0.
query_vector = w[idx].reshape(1, -1)
neigh_dist, neigh_ind = knn.kneighbors(query_vector, n_neighbors=5)

nearest_dist, nearest_ind = neigh_dist[0], neigh_ind[0]
print(nearest_dist)
print(nearest_ind)
print([reversed_dictionary[i] for i in nearest_ind])

[0.        0.4581476 0.4967268 0.5803944 0.6285573]
[ 10  13 254 236 763]
['person', 'for', 'order;', 'internet', 'reliance']
