In [4]:
import tensorflow as tf
from tensorflow import keras
print('tf.__version__ : ', tf.__version__)
print('keras.__version__ : ', keras.__version__)

tf.__version__ :  1.13.1
keras.__version__ :  2.2.4-tf


# Vocab loader

In [5]:
import numpy as np

def vocab_unpack(vocab):
    return vocab['idx2word'], vocab['word2idx'], vocab['idx2char'], vocab['char2idx']

vocab_mapping = np.load('data/ptb/vocab.npz')
idx2word, word2idx, idx2char, char2idx = vocab_unpack(vocab_mapping)

In [39]:
# idx2word[100]
word2idx.tolist()['hope']

1091

# Trained Model (with 1 epoch)

In [77]:
from six.moves import cPickle as pickle
from model.LSTMCNN import LSTMCNN

opt = pickle.load(open('cv/char-large.pkl', "rb"))
model = LSTMCNN(opt)
model.load_weights('cv/char-large.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
chars (InputLayer)              (20, 35, 21)         0                                            
__________________________________________________________________________________________________
chars_embedding (Embedding)     (20, 35, 21, 15)     765         chars[0][0]                      
__________________________________________________________________________________________________
conv2d_29 (Conv2D)              (20, 35, 21, 50)     800         chars_embedding[0][0]            
__________________________________________________________________________________________________
conv2d_30 (Conv2D)              (20, 35, 20, 100)    3100        chars_embedding[0][0]            
__________________________________________________________________________________________________
conv2d_31 

In [78]:
def most_similar(emb_layer, pos_word_idxs, neg_word_idxs=[], top_n=10):
    weights = emb_layer.weights[0]

    mean = []
    for idx in pos_word_idxs:
        mean.append(weights.value()[:, idx])

    for idx in neg_word_idxs:
        mean.append(weights.value()[:, idx] * -1)

    mean = tf.reduce_mean(mean, 0)

    dists = tf.tensordot(tf.transpose(weights), mean, 1)
    best = tf.math.top_k(dists, top_n)

    # Mask words used as pos or neg
    mask = []
    for v in set(pos_word_idxs + neg_word_idxs):
        mask.append(tf.cast(tf.equal(best.indices, v), tf.int8))
    mask = tf.less(tf.reduce_sum(mask, 0), 1)

    return tf.boolean_mask(best.indices, mask), tf.boolean_mask(best.values, mask)

idxs, vals = most_similar(model.layers[-1], [1534])

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    idxs = sess.run(idxs)
    vals = sess.run(vals)
    print(idxs)
    print(vals)

[1219 9433   20 9257 9113 4809 1608 2479 4302]
[0.01786998 0.01749304 0.01727226 0.01691025 0.01675038 0.01599365
 0.01572139 0.01563971 0.01531306]


In [79]:
print("===", word2idx.tolist()['korea'])
for idx in idxs:
    print(idx2word[idx])

=== 1534
payment
trafficking
from
numbered
leventhal
guilders
liquidity
fidelity
drops


# word embedding dimension = 650
# # of vocabularies = 10,000

In [29]:
model.layers[-1].weights[0]

<tf.Variable 'time_distributed_6/kernel:0' shape=(650, 10000) dtype=float32_ref>

# Print word vector

In [30]:
def word_vector(emb_layer, word_idx):
    weights = emb_layer.weights[0]
    return weights.value()[:, word_idx]

word_vector = word_vector(model.layers[-1], 1091)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    word_vector = sess.run(word_vector)
    print(word_vector)

[ 1.50237810e-02 -2.22192090e-02 -1.64477658e-02 -1.21768834e-02
  8.12224485e-03  1.81486271e-03 -5.65446541e-03  3.18217836e-03
 -7.96258636e-03  2.95152143e-03  6.26238063e-04  6.45640492e-03
  6.14979491e-03  1.42868590e-02 -2.98212655e-03 -7.41345808e-03
 -1.57255158e-02 -2.07011271e-02  2.19052602e-02  2.34945882e-02
 -2.27697007e-02  1.42415371e-02 -6.59689121e-03  4.07627411e-03
 -5.17211482e-03  6.59697503e-03  8.51436146e-03  1.49585288e-02
 -5.43740951e-03  2.31947284e-02  1.70070808e-02  1.39169414e-02
  2.17877273e-02  1.46044698e-02 -1.91730689e-02  1.63414497e-02
  1.93016287e-02 -2.07350142e-02  3.24407034e-03 -1.41414283e-02
 -8.13385099e-03  1.34847630e-02  2.70783901e-04 -1.26114907e-02
  6.56843185e-03 -2.26297826e-02 -2.30810493e-02  1.39741432e-02
 -2.03512590e-02  6.53624907e-03  2.18672398e-02  2.28617210e-02
  1.75271947e-02  1.66767221e-02  1.69267226e-02 -1.91837177e-03
  6.75268471e-04  2.10810956e-02  1.05150100e-02  5.15970960e-03
  1.93784516e-02 -1.22261

In [31]:
word_vector.shape

(650,)