In [1]:
import pickle

In [62]:
import pandas as pd
import numpy as np

In [11]:
from collections import Counter

In [39]:
import tensorflow.keras as keras

# load data

In [3]:
laws = pickle.load(open('./data/law_objects.p', 'rb'))

In [4]:
aoda = laws['Accessibility for Ontarians with Disabilities Act, 2005, S.O. 2005, c. 11']

In [5]:
df_toc = pd.DataFrame(aoda.table_of_content.values())
df_toc

Unnamed: 0,level1,level2,level3
0,Part I interpretation,,Purpose
1,Part I interpretation,,Definitions
2,Part I interpretation,,Recognition of existing legal obligations
3,part ii application,,Application
4,part ii application,,Crown bound
...,...,...,...
158,part x general,Compliance with Standards and Review of Reports,Tabling of report
159,part x general,Compliance with Standards and Review of Reports,Review of Act
160,part x general,Compliance with Standards and Review of Reports,Consultation
161,part x general,Compliance with Standards and Review of Reports,Contents of report


In [12]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [13]:
full_text = aoda.get_full_text().split()

In [42]:
vocab_size = 1000

In [65]:
vector_dim = 50

In [43]:
data, count, dictionary, reversed_dictionary = build_dataset(full_text, vocab_size)

In [49]:
sampling_table = keras.preprocessing.sequence.make_sampling_table(vocab_size)

In [50]:
window_size = 3

In [51]:
couples, labels = keras.preprocessing.sequence.skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)

In [63]:
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

In [67]:
input_target = keras.Input((1,))
input_context = keras.Input((1,))

embedding = keras.layers.Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

In [77]:
target = embedding(input_target)

In [78]:
target.shape

TensorShape([None, 1, 50])

In [79]:
target = keras.layers.Reshape((vector_dim, 1))(target)

In [80]:
target.shape

TensorShape([None, 50, 1])

In [82]:
context = embedding(input_context)
context = keras.layers.Reshape((vector_dim, 1))(context)

In [83]:
context.shape

TensorShape([None, 50, 1])

In [90]:
keras.layers.Dot?

In [99]:
# now perform the dot product operation to get a similarity measure
dot_product = keras.layers.Dot(axes=(1, 1))([target, context])
dot_product = keras.layers.Reshape((1,))(dot_product)
# add the sigmoid output layer
output = keras.layers.Dense(1, activation='sigmoid')(dot_product)

In [98]:
dot_product.shape

TensorShape([None, 1])

In [103]:
keras.models.Model?

In [104]:
model = keras.models.Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

In [108]:
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(5):
    for i in range(len(labels)):
        idx = np.random.randint(0, len(labels)-1)
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if i % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))

Iteration 0, loss=0.694002628326416
Iteration 0, loss=0.6885037422180176
Iteration 0, loss=0.6952453851699829
Iteration 0, loss=0.677275538444519
Iteration 0, loss=0.6715432405471802
Iteration 0, loss=0.7004624605178833
Iteration 0, loss=0.7096660137176514
Iteration 0, loss=0.6936518549919128
Iteration 0, loss=0.7017241716384888
Iteration 0, loss=0.6917548179626465
Iteration 0, loss=0.6814685463905334
Iteration 0, loss=0.7093422412872314
Iteration 0, loss=0.6927102208137512
Iteration 0, loss=0.6764461994171143
Iteration 0, loss=0.6957169771194458
Iteration 0, loss=0.6917677521705627
Iteration 0, loss=0.6859225034713745
Iteration 0, loss=0.6960148215293884
Iteration 0, loss=0.6696873307228088
Iteration 0, loss=0.6982155442237854
Iteration 0, loss=0.6714213490486145
Iteration 0, loss=0.7012375593185425
Iteration 0, loss=0.6990256905555725
Iteration 0, loss=0.7067480683326721
Iteration 0, loss=0.6710259914398193
Iteration 0, loss=0.7291343808174133
Iteration 0, loss=0.7187597751617432
Ite

In [119]:
w = embedding.get_weights()[0]

In [118]:
idx = dictionary['person']

In [120]:
w[idx]

array([ 0.05875879,  0.01480798, -0.03341787,  0.04902269, -0.03045017,
       -0.05767272,  0.03276076,  0.01289787,  0.0221678 , -0.02870808,
       -0.01699958,  0.07242923,  0.02689714,  0.10327964, -0.05317249,
        0.03814811,  0.02386359,  0.01850935, -0.03562905, -0.03338061,
        0.02382117, -0.01251877, -0.07533576, -0.0282331 , -0.02553006,
       -0.03479918, -0.06035085,  0.0266262 ,  0.03781849, -0.05995607,
       -0.01876321, -0.10121789, -0.0160699 ,  0.00355641, -0.00358264,
       -0.02430539, -0.02889276,  0.04428178, -0.0201227 , -0.0265913 ,
       -0.00052071, -0.07562482,  0.04237422, -0.0352066 , -0.02323813,
        0.01699263, -0.00370002,  0.06112981, -0.03422094,  0.06489883],
      dtype=float32)

In [126]:
from sklearn.neighbors import KNeighborsTransformer

In [127]:
knn= KNeighborsTransformer(n_neighbors=1, metric='cosine')

In [128]:
knn.fit(w)

KNeighborsTransformer(algorithm='auto', leaf_size=30, metric='cosine',
                      metric_params=None, mode='distance', n_jobs=1,
                      n_neighbors=1, p=2)

In [135]:
w[idx].shape

(50,)

In [139]:
neighbor, dist = knn.kneighbors(w[idx].reshape(1, -1))

In [147]:
reversed_dictionary[int(neighbor)]

'UNK'

# define model

In [6]:
class SkipGramModel(keras.Model):  # ?
    def __init__(self, vector_dim=50):

        self.vector_dim = vector_dim

SyntaxError: unexpected EOF while parsing (<ipython-input-6-dc7f79a237c9>, line 3)