In [93]:
import tensorflow as tf
from tensorflow import keras
print('tf.__version__ : ', tf.__version__)
print('keras.__version__ : ', keras.__version__)

tf.__version__ :  1.13.1
keras.__version__ :  2.2.4-tf


# Vocab loader

In [94]:
import numpy as np

def vocab_unpack(vocab):
    return vocab['idx2word'], vocab['word2idx'], vocab['idx2char'], vocab['char2idx']

vocab_mapping = np.load('data/ptb/vocab.npz')
idx2word, word2idx, idx2char, char2idx = vocab_unpack(vocab_mapping)

# Trained Model (with 1 epoch)

In [98]:
from six.moves import cPickle as pickle
from model.LSTMCNN import LSTMCNN

opt = pickle.load(open('cv/char-large.pkl', "rb"))
model = LSTMCNN(opt)
model.load_weights('cv/char-large_final_epoch25_82.26.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
chars (InputLayer)              (20, 35, 21)         0                                            
__________________________________________________________________________________________________
chars_embedding (Embedding)     (20, 35, 21, 15)     765         chars[0][0]                      
__________________________________________________________________________________________________
conv2d_43 (Conv2D)              (20, 35, 21, 50)     800         chars_embedding[0][0]            
__________________________________________________________________________________________________
conv2d_44 (Conv2D)              (20, 35, 20, 100)    3100        chars_embedding[0][0]            
__________________________________________________________________________________________________
conv2d_45 

In [185]:
def most_similar(emb_layer, pos_word_idxs, neg_word_idxs=[], top_n=10):
    weights = emb_layer.weights[0]

    mean = []
    for idx in pos_word_idxs:
        mean.append(weights.value()[:, idx])

    for idx in neg_word_idxs:
        mean.append(weights.value()[:, idx] * -1)

    mean = tf.reduce_mean(mean, 0)

    dists = tf.tensordot(tf.transpose(weights), mean, axes=1)
    best = tf.math.top_k(dists, top_n)

    # Mask words used as pos or neg
    mask = []
    for v in set(pos_word_idxs + neg_word_idxs):
        mask.append(tf.cast(tf.equal(best.indices, v), tf.int8))
    mask = tf.less(tf.reduce_sum(mask, 0), 1)

    return tf.boolean_mask(best.indices, mask), tf.boolean_mask(best.values, mask)

aer banknote **berlitz calloway** centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 
 pierre <unk> N years old will join the board as a nonexecutive director nov. N 
 mr. <unk> is chairman of <unk> n.v. the dutch publishing group 
 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 
 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 

In [190]:
idxs, vals = most_similar(model.layers[-1], [word2idx.tolist()['berlitz']])

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    idxs = sess.run(idxs)
    vals = sess.run(vals)
    print(idxs)
    print(vals)
    
for idx in idxs:
    print(idx2word[idx])

[9745 1112 9068 9318  269 4376 2730 7964 9212]
[0.02110769 0.01764682 0.01616482 0.01567643 0.0156361  0.01554357
 0.01551129 0.01546385 0.01484621]
rampant
instance
syndicated
dunn
bush
suffering
aide
jumping
tremor


* often rampant in the agency
* would stem rampant property speculation
* it 's running rampant at this moment
* spoke with rampant <unk> about
* firm despite rampant cheating by others
* also spawned rampant speculation and

In [193]:
idxs, vals = most_similar(model.layers[-1], [word2idx.tolist()['rampant']])

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    idxs = sess.run(idxs)
    vals = sess.run(vals)
    print(idxs)
    print(vals)
    
for idx in idxs:
    print(idx2word[idx])

[7924  187 4926 3190 6555 6235  866 8580 1900]
[0.02110769 0.02042664 0.01675938 0.01625494 0.01621317 0.01593073
 0.01579209 0.01575073 0.01560885]
berlitz
get
slip
brain
evaluating
discretionary
young
girlfriend
hostile


# word embedding dimension = 650
# # of vocabularies = 10,000

In [113]:
model.layers[-1].weights[0]

<tf.Variable 'time_distributed_21/kernel:0' shape=(650, 10000) dtype=float32_ref>

# Print word vector

In [191]:
def word_vector(emb_layer, word_idx):
    weights = emb_layer.weights[0]
    return weights.value()[:, word_idx]

word_vector_target = word_vector(model.layers[-1], word2idx.tolist()['berlitz'])

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    word_vector_target = sess.run(word_vector_target)
    print('word_vector.shape: ', word_vector_target.shape)
    print(word_vector_target)

word_vector.shape:  (650,)
[ 2.18694527e-02 -9.66058485e-03 -1.27048641e-02 -8.84487759e-03
  1.46888923e-02 -6.57648966e-03  7.52655976e-03 -2.05885023e-02
  9.72043909e-03  1.87856462e-02 -1.15719745e-02  4.34256531e-03
 -1.59605704e-02  5.44202141e-03 -2.36995406e-02  1.91490110e-02
  1.80657450e-02 -2.15336643e-02  3.10642086e-03  1.03804562e-02
  1.38496105e-02 -1.77472569e-02  2.30283476e-03  3.93733382e-03
  1.69163775e-02 -1.79026928e-02 -5.06765395e-03  2.23556478e-02
 -1.39001897e-02 -1.82174295e-02  5.12295403e-03  3.47801484e-03
  1.30813364e-02  1.69782471e-02  1.97147708e-02  1.33146811e-02
 -7.11240433e-03  7.70902820e-03  1.17787477e-02 -2.30561160e-02
  7.27482326e-03  2.35479455e-02  6.49739429e-03  5.90789318e-03
 -5.97956032e-03  2.19468009e-02 -4.05814312e-03 -1.73546411e-02
 -8.25673621e-03  1.40915159e-02 -2.09117495e-02  3.70405801e-03
 -1.91239603e-02 -1.62253939e-02  1.80306789e-02 -1.19135901e-03
 -2.25943848e-02  2.38834321e-03 -1.00134760e-02  1.89543758e-0

To check

In [192]:
word_vector_sim1 = word_vector(model.layers[-1], idxs[0])
word_vector_sim2 = word_vector(model.layers[-1], word2idx.tolist()['calloway'])

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    word_vector_sim1 = sess.run(word_vector_sim1)
    word_vector_sim2 = sess.run(word_vector_sim2)
word_vector_target.dot(word_vector_sim1) > word_vector_target.dot(word_vector_sim2)

True