In [1]:
import json
import pickle
import statistics
from collections import Counter

In [2]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras

In [3]:
np.set_printoptions(suppress=True)

# load data

In [4]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load law_objects.p when it comes from a trusted source.
# The context manager closes the file handle, which a bare open() call leaks.
with open('./data/law_objects.p', 'rb') as law_file:
    laws = pickle.load(law_file)
# Select the one act this notebook works with, looked up by its full title.
aoda = laws['Accessibility for Ontarians with Disabilities Act, 2005, S.O. 2005, c. 11']

In [5]:
# load the table of contents...
# df_toc = pd.DataFrame(aoda.table_of_content.values())
# df_toc

## make training data for model

In [6]:
def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    The vocabulary holds the placeholder 'UNK' (id 0) followed by the
    ``n_words - 1`` most frequent tokens in ``words``, in frequency order.

    Returns a 4-tuple:
        data: list with one id per token of ``words``; tokens outside the
            vocabulary are encoded as 0.
        count: ``[['UNK', n_unknown], (token, frequency), ...]``.
        dictionary: token -> id.
        reversed_dictionary: id -> token.
    """
    count = [['UNK', -1], *Counter(words).most_common(n_words - 1)]

    # Ids are handed out in insertion order, starting with 'UNK' at 0.
    dictionary = {}
    for token, _frequency in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        index = dictionary.get(token)
        if index is None:
            # Out-of-vocabulary token: fall back to the 'UNK' id.
            index = 0
            unk_count += 1
        data.append(index)

    # Replace the -1 placeholder with the real number of unknown tokens.
    count[0][1] = unk_count
    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [7]:
full_text = aoda.get_full_text().split()

In [8]:
vocab_size = 1000

In [9]:
data, count, dictionary, reversed_dictionary = build_dataset(full_text, vocab_size)

In [10]:
sampling_table = keras.preprocessing.sequence.make_sampling_table(vocab_size)

In [11]:
window_size = 3

In [12]:
couples, labels = keras.preprocessing.sequence.skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)

In [13]:
# Split the (target, context) couples into two parallel int32 id arrays.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

# define model

In [14]:
vector_dim = 50

In [15]:
# Two inputs of shape (1,) per sample: one for the target word id and one for
# the context word id.
input_target = keras.Input((1,))
input_context = keras.Input((1,))

# A single embedding layer (vocab_size entries of vector_dim values each); the
# same layer object is applied to both inputs, so target and context words
# share one set of weights.
embedding = keras.layers.Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

In [16]:
target = embedding(input_target)

In [17]:
target.shape

TensorShape([None, 1, 50])

In [18]:
target = keras.layers.Reshape((vector_dim, 1))(target)

In [19]:
target.shape

TensorShape([None, 50, 1])

In [20]:
context = embedding(input_context)
context = keras.layers.Reshape((vector_dim, 1))(context)

In [21]:
context.shape

TensorShape([None, 50, 1])

In [22]:
# keras.layers.Dot?

In [23]:
# Dot product of the target and context embeddings along axis 1 (the
# vector_dim axis of the reshaped tensors), used as a similarity measure.
dot_product = keras.layers.Dot(axes=(1, 1))([target, context])
# Collapse the result to one score per sample, i.e. shape (None, 1).
dot_product = keras.layers.Reshape((1,))(dot_product)
# A single sigmoid unit maps the score to the model output.
output = keras.layers.Dense(1, activation='sigmoid')(dot_product)

In [24]:
dot_product.shape

TensorShape([None, 1])

In [25]:
# keras.models.Model?

In [26]:
# Two-input model (target id, context id) producing the sigmoid output,
# compiled with binary cross-entropy loss and the RMSprop optimizer.
model = keras.models.Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# inspect embedding

In [27]:
# Snapshot of the embedding matrix taken before the training cell runs, so
# `w` holds the initial (untrained) weights.
w = embedding.get_weights()[0]

In [28]:
idx = dictionary['person']

In [29]:
w[idx]

array([ 0.02925575, -0.01843834,  0.03733312, -0.0313976 , -0.01940695,
        0.01041225,  0.00201752,  0.02015884,  0.0386101 ,  0.00533808,
        0.03039486, -0.00579585,  0.033606  , -0.01014917,  0.04261595,
        0.04498413, -0.04882659,  0.02630835,  0.01966413,  0.04584287,
        0.00514659, -0.04451193, -0.04878862, -0.02215931, -0.01911682,
       -0.0174652 ,  0.0365127 ,  0.04618626,  0.00439218,  0.01646158,
        0.04733505, -0.00554984, -0.00264033,  0.01767657, -0.02857149,
        0.01559896, -0.02571657, -0.02604746, -0.0153017 ,  0.0063947 ,
       -0.00148872, -0.01271559, -0.02182945,  0.033891  , -0.01929326,
        0.00202904,  0.04824536, -0.01251889, -0.00886708, -0.02695013],
      dtype=float32)

In [30]:
w[idx].shape

(50,)

## query nearest neighbors in embedding space

In [31]:
from sklearn.neighbors import KNeighborsTransformer

In [32]:
knn = KNeighborsTransformer(n_neighbors=1, metric='cosine')

In [33]:
knn.fit(w)

KNeighborsTransformer(algorithm='auto', leaf_size=30, metric='cosine',
                      metric_params=None, mode='distance', n_jobs=1,
                      n_neighbors=1, p=2)

In [34]:
# knn.kneighbors?

# Five nearest rows of `w` to the 'person' vector by cosine distance; the
# query vector itself is part of the fitted data, so it comes back first with
# a distance of ~0.
neigh_dist, neigh_ind = knn.kneighbors(w[idx].reshape(1, -1), n_neighbors=5)

print(neigh_dist[0])
print(neigh_ind[0])
# Map the neighbor row indices back to their words.
print(list(reversed_dictionary[i] for i in neigh_ind[0]))

[0.00000006 0.59828985 0.6276052  0.6665269  0.67346984]
[ 10 193 646 531 474]
['person', 'set', 'known', 'associated', 'Executive']


In [35]:
def query_neighbors_in_embedding(w, words, n_neighbors=5):
    """Look up the nearest neighbours of ``words`` in the embedding ``w``.

    Fits a cosine-distance neighbour index on the rows of ``w`` and queries it
    with the rows belonging to ``words``. Words are translated to row indices
    (and back) through the notebook-level ``dictionary`` and
    ``reversed_dictionary``.

    Returns a dict ``{word: {neighbour_word: cosine_distance, ...}}`` holding
    ``n_neighbors`` entries per queried word.
    """
    index = KNeighborsTransformer(n_neighbors=1, metric='cosine')
    index.fit(w)

    word_ids = [dictionary[word] for word in words]
    # Ask for one extra neighbour: the closest hit is the queried word itself
    # and is dropped below.
    distances, indices = index.kneighbors(w[(word_ids,)], n_neighbors + 1)

    results = {}
    for row, word in enumerate(words):
        results[word] = {
            reversed_dictionary[neighbor]: float(distance)
            for neighbor, distance in zip(indices[row, 1:], distances[row, 1:])
        }
    return results


In [36]:
# Baseline: `w` is the weight snapshot taken before training, so these are
# neighbors in the untrained embedding.
query_results = query_neighbors_in_embedding(
    w,
    ['person', 'accessibility', 'organization', 'penalty'],
    n_neighbors=4)

print(json.dumps(query_results, indent=2))

{
  "person": {
    "set": 0.5982899069786072,
    "known": 0.6276051998138428,
    "associated": 0.6665268540382385,
    "Executive": 0.6734698414802551
  },
  "accessibility": {
    "25": 0.5333746671676636,
    "employment,": 0.5708682537078857,
    "range": 0.5768356323242188,
    "proposed": 0.5900076627731323
  },
  "organization": {
    "made": 0.5606701374053955,
    "duties": 0.575109601020813,
    "enforcement": 0.5856479406356812,
    "developed": 0.6025199890136719
  },
  "penalty": {
    "birth": 0.5703414678573608,
    "accessible": 0.5717199444770813,
    "disabilities;": 0.6037174463272095,
    "peace": 0.6188006401062012
  }
}


# training

In [37]:
# One-element buffers reused for every training step:
# arr_1 = target word id, arr_2 = context word id, arr_3 = label.
arr_1, arr_2, arr_3 = (np.zeros((1,)) for _ in range(3))


def train_on_idx(idx):
    """Run one training step on the skip-gram pair stored at position ``idx``.

    Copies ``word_target[idx]``, ``word_context[idx]`` and ``labels[idx]`` into
    the module-level one-element buffers and feeds them to
    ``model.train_on_batch``.

    Returns the value produced by ``model.train_on_batch`` for that step.
    """
    arr_1[0] = word_target[idx]
    arr_2[0] = word_context[idx]
    arr_3[0] = labels[idx]
    return model.train_on_batch([arr_1, arr_2], arr_3)


def explain_some_neighbors():
    """Print the 3 nearest neighbours of a few probe words as indented JSON.

    The embedding weights are read from the layer at call time, so the output
    reflects the state of the model when the function is called.
    """
    current_weights = embedding.get_weights()[0]
    probe_words = ['person', 'accessibility', 'organization', 'penalty']
    neighbors = query_neighbors_in_embedding(
        w=current_weights,
        words=probe_words,
        n_neighbors=3)
    print(json.dumps(neighbors, indent=4))


# Per-step losses collected since the last progress report; averaged and
# cleared every 500 iterations.
losses_cache = list()

n_samples = len(labels)

for epoch in range(10):
    for i in range(n_samples):
        # Draw one training pair uniformly at random (with replacement).
        # np.random.randint excludes its upper bound, so the bound must be
        # n_samples (not n_samples - 1) for the last pair to be reachable.
        # A dedicated name is used so the notebook-level `idx` (the row of
        # 'person' used by the inspection cells) is not overwritten.
        sample_idx = np.random.randint(0, n_samples)
        loss = train_on_idx(sample_idx)
        losses_cache.append(loss)
        if i % 500 == 0:
            losses_mean = statistics.mean(losses_cache)
            losses_cache = list()
            print(f"epoch={epoch:0>2d}, iteration={i:0>5d}, "
                  f"loss={loss:0>6f}, loss_mean={losses_mean:0>6f}")
    # Show how the neighbors of the probe words have moved after each epoch.
    explain_some_neighbors()


epoch=00, iteration=00000, loss=0.699358, loss_mean=0.699358
epoch=00, iteration=00500, loss=0.682558, loss_mean=0.692870
epoch=00, iteration=01000, loss=0.699007, loss_mean=0.693242
epoch=00, iteration=01500, loss=0.691583, loss_mean=0.692570
epoch=00, iteration=02000, loss=0.689981, loss_mean=0.692101
epoch=00, iteration=02500, loss=0.692204, loss_mean=0.691132
epoch=00, iteration=03000, loss=0.681342, loss_mean=0.690656
epoch=00, iteration=03500, loss=0.691498, loss_mean=0.688725
epoch=00, iteration=04000, loss=0.676547, loss_mean=0.687493
epoch=00, iteration=04500, loss=0.703089, loss_mean=0.687249
epoch=00, iteration=05000, loss=0.681695, loss_mean=0.684497
epoch=00, iteration=05500, loss=0.682064, loss_mean=0.681166
epoch=00, iteration=06000, loss=0.684586, loss_mean=0.680693
{
    "person": {
        "assistance": 0.5661463141441345,
        "considers": 0.6121605038642883,
        "practice;": 0.6200937032699585
    },
    "accessibility": {
        "otherwise": 0.5800734162330

epoch=06, iteration=00500, loss=0.590584, loss_mean=0.432151
epoch=06, iteration=01000, loss=0.000170, loss_mean=0.498539
epoch=06, iteration=01500, loss=0.728206, loss_mean=0.278176
epoch=06, iteration=02000, loss=0.375568, loss_mean=0.530880
epoch=06, iteration=02500, loss=0.007154, loss_mean=0.538896
epoch=06, iteration=03000, loss=0.019734, loss_mean=0.359440
epoch=06, iteration=03500, loss=1.034331, loss_mean=0.480516
epoch=06, iteration=04000, loss=0.087314, loss_mean=0.578175
epoch=06, iteration=04500, loss=0.000000, loss_mean=0.367623
epoch=06, iteration=05000, loss=0.000254, loss_mean=0.556320
epoch=06, iteration=05500, loss=0.153971, loss_mean=0.612589
epoch=06, iteration=06000, loss=0.047226, loss_mean=0.473934
{
    "person": {
        "Part": 0.49537819623947144,
        "public": 0.5920565128326416,
        "inspector": 0.6108193397521973
    },
    "accessibility": {
        "considerations": 0.3986448645591736,
        "provide": 0.5150956511497498,
        "January": 0

# inspect results

In [38]:
# Re-read the weights from the layer so the query uses the embedding as it is
# after the training cell, not the earlier snapshot `w`.
query_results = query_neighbors_in_embedding(
    w=embedding.get_weights()[0],
    words=['person', 'accessibility', 'organization', 'penalty'],
    n_neighbors=10)

print(json.dumps(query_results, indent=4))

{
    "person": {
        "Part": 0.5150851011276245,
        "public": 0.5902514457702637,
        "being": 0.5979686975479126,
        "inspector": 0.6020374298095703,
        "characteristic": 0.6627432107925415,
        "in": 0.6642177104949951,
        "If": 0.668149471282959,
        "authorize": 0.7104040384292603,
        "Governor": 0.7125017642974854,
        "meet": 0.7143247127532959
    },
    "accessibility": {
        "considerations": 0.4330671429634094,
        "provide": 0.5136714577674866,
        "January": 0.5383156538009644,
        "or": 0.6363843679428101,
        "organizations.": 0.640643298625946,
        "means": 0.6532360315322876,
        "long-term": 0.6566188335418701,
        "request.": 0.667556643486023,
        "(c)": 0.6737946271896362,
        "partnership": 0.6789298057556152
    },
    "organization": {
        "Before": 0.3008779287338257,
        "No": 0.38254886865615845,
        "(\u201cnorme": 0.4123123288154602,
        "illness": 0.4169863

In [39]:
def find_closest_word_of_all_words():
    """Print every vocabulary word next to its single nearest neighbour.

    Each output line holds the word, its closest neighbour in the current
    embedding weights, and the cosine distance between the two.
    """
    current_weights = embedding.get_weights()[0]
    vocabulary = list(dictionary.keys())
    nearest = query_neighbors_in_embedding(
        w=current_weights,
        words=vocabulary,
        n_neighbors=1)
    for word in nearest:
        # Each per-word dict holds exactly one neighbour because n_neighbors=1.
        closest, dist = nearest[word].popitem()
        print(f"{word:<16s} {closest:<16s} {dist:}")


find_closest_word_of_all_words()

UNK              appointment,     0.6278156042098999
the              removes          0.2819681167602539
or               order            0.42129451036453247
of               address;         0.37689435482025146
to               necessary,       0.25748997926712036
a                produced         0.2608456015586853
in               order            0.4595080018043518
and              2025.            0.37236034870147705
under            17,              0.27207159996032715
with             fall             0.33263182640075684
person           Part             0.5150851011276245
an               ministers        0.3656773567199707
may              system           0.1742832064628601
for              amputation,      0.20030367374420166
this             take             0.30370408296585083
that             economy          0.2827756404876709
accessibility    considerations   0.4330671429634094
be               “disability”     0.41974538564682007
shall            whole,           0.2