### Hyperparameters

In [7]:
# Hyperparameters

seqlen = 128
batch_size = 1024
sample_size = 10

### Code

In [2]:
import keras as K
import os.path

import sys, imp

sys.path.append("../src")
from artstat import util
import numpy as np

Using TensorFlow backend.


In [3]:
datadir = "/home/pmilovanov/hg/my/data/gallery-pr/all"

path_train = os.path.join(datadir, "train")
path_test = os.path.join(datadir, "test")

glove = "/home/pmilovanov/data/glove/glove.840B.300d.txt"
#glove = "/home/pmilovanov/data/glove/glove.6B.50d.txt"
glove_dims = 300

In [4]:
#vocab_file = "../vocab.txt"
vocab_file = "../vocab_lower.txt"

words, vocab = util.load_vocab(vocab_file, 15000)

In [5]:
emb_matrix = util.load_embeddings(vocab, glove_dims, glove)

print(len(vocab))
print(emb_matrix.shape)

15000
(15001, 300)


In [6]:
imp.reload(util)
X, Xu = util.load_data(path_train, vocab, pad=seqlen, lowercase=False)

100%|##########| 22860/22860 [00:32<00:00, 703.64it/s]


In [17]:
print("Total words:", len(X))

print("Unknown words: {:.2f}%".format(100 * sum(Xu) / len(Xu)))

Total words: 16213415
Unknown words: 20.79%


In [7]:
from keras.layers import Reshape, Embedding, CuDNNLSTM, BatchNormalization, Dense, Concatenate
from keras import Model, Input
import tensorflow as tf

In [82]:
def sampling_layer(x):
    data, sample_indices = x
    def gather_per_batch(e):
        batch, indices = e
        indices = tf.cast(indices, tf.int32)
        #print("batch", type(batch), batch.dtype, batch.shape)
        #print("indices", type(indices), indices.dtype, indices.shape)
        return (tf.gather(batch, indices),0)
    return tf.map_fn(gather_per_batch, (data, sample_indices),
                    parallel_iterations=batch_size*2, swap_memory=False)[0]


def sampling_layer_gather_nd(x):
    data, sample_indices = x
    return tf.gather_nd(data, tf.cast(sample_indices, tf.int32))


In [83]:
from keras.layers import Reshape, Embedding, CuDNNLSTM, BatchNormalization, Dense, \
    Concatenate, Lambda, Activation
from keras import Model, Input

def make_model(*, seqlen, sample_size, emb_matrix):    
    dim = emb_matrix.shape[1] + 1
    
    input_x = Input((seqlen,), dtype="int32", name="input_x")
    input_xu = Input((seqlen,), dtype="float32", name="input_xu")
    input_sample_indices = Input((sample_size,2), dtype="int32", name="input_sample_indices")

    resh_xu  = Reshape((seqlen,1), name="resh_xu")(input_xu)

    emb_layer = Embedding(*emb_matrix.shape, input_length=seqlen, 
                          trainable=False, weights=[emb_matrix], 
                          name="embedding")
    emb_x = emb_layer(input_x)
    concat_x = Concatenate(name="concat_x")([emb_x, resh_xu])
        
    yhat = CuDNNLSTM(128, return_sequences=True, name='rnn1')(concat_x)
    yhat = BatchNormalization()(yhat)
    yhat = CuDNNLSTM(128, return_sequences=False, name='rnn2')(yhat)

    yhat = BatchNormalization()(yhat)
    yhat = Dense(300, activation="relu")(yhat)
    yhat = BatchNormalization()(yhat)
    yhat = Dense(300, activation="relu")(yhat)
    yhat = BatchNormalization()(yhat)
    # len(vocab)+2 is because the zeroth word is for padding and last word is for "unknown"
    yhat = Dense(len(vocab)+2, activation="linear")(yhat)
    
    
    #print(input_sample_indices.dtype, input_sample_indices.shape)
    out_train = Lambda(sampling_layer_gather_nd, name="sampling")([yhat, input_sample_indices])
    out_train = Activation('softmax')(out_train)
    
    out_predict = Activation('softmax')(yhat)
    
    model_train = Model([input_x, input_xu, input_sample_indices], [out_train])
    model_predict = Model([input_x, input_xu], [out_predict])
    
    return model_train, model_predict

In [84]:
mtrain, mpredict = make_model(seqlen=seqlen, sample_size=sample_size, emb_matrix=emb_matrix)

In [85]:

mtrain.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_x (InputLayer)            (None, 64)           0                                            
__________________________________________________________________________________________________
input_xu (InputLayer)           (None, 64)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 64, 50)       500050      input_x[0][0]                    
__________________________________________________________________________________________________
resh_xu (Reshape)               (None, 64, 1)        0           input_xu[0][0]                   
__________________________________________________________________________________________________
concat_x (

In [109]:
def mean_pred(y_true, y_pred):
    r = tf.multiply(y_true, tf.round(y_pred))
    r = tf.reduce_sum(r, axis=-1)
    r = tf.reduce_mean(r, axis=-1)
    return r

In [151]:
opt = K.optimizers.Adam(lr=0.00001)
mtrain.compile(opt, loss='categorical_crossentropy', metrics=["accuracy", mean_pred])

In [152]:
imp.reload(util)
batch_size=1024*8
train_seq = util.NegativeSamplingPermutedSequence(data_x=X, data_xu=Xu,
                                                 seqlen=seqlen, batch_size=batch_size,
                                                 sample_size=sample_size,
                                                 vocab_size=len(vocab)+1)

In [153]:
print(len(X), seqlen*batch_size)

17676479 524288


In [154]:
checkpoint_filepath = "checkpoints/weights.{epoch:02d}.hdf5"

checkpoint = K.callbacks.ModelCheckpoint(checkpoint_filepath,                       
                                        verbose=1,
                                        save_best_only=False)

In [159]:

numbatches = X.shape[0] // batch_size

mtrain.fit_generator(train_seq, steps_per_epoch=numbatches, epochs=5000000,
                    callbacks=[checkpoint],
                    initial_epoch=200)


Epoch 201/5000000
  33/2157 [..............................] - ETA: 25:13 - loss: 0.1330 - acc: 0.9519 - mean_pred: 0.9451

KeyboardInterrupt: 

In [None]:
mtrain.save("checkpoints/1.hdf5")

### Test

In [14]:
path_misc = "/home/pmilovanov/hg/my/data/gallery-pr/misc"

X_test, Xu_test = util.load_data(path_test, vocab, pad=seqlen, lowercase=True)
#X_test, Xu_test = util.load_data(path_misc, vocab, pad=seqlen, lowercase=True)

100%|##########| 472/472 [00:00<00:00, 688.98it/s]


In [16]:
X_test
np.sum(Xu_test) / len(Xu_test)


0.0519867189200958

In [165]:
mpredict.compile(opt, loss="categorical_crossentropy")

In [166]:
len(words)

10001

In [167]:
i = 0
gen = X_test[i:i+seqlen].tolist()
genu = Xu_test[i:i+seqlen].tolist()


tX = np.zeros((1, seqlen), dtype="int32")
tXu = np.zeros((1, seqlen), dtype="float32")
results = []

for i, idx in enumerate(gen):
    word = "<UNK>"
    if genu[i] < 0.1:
        word = words[idx]
    sys.stdout.write(word + " ")
    sys.stdout.flush()

print("=" * 100)

UNK_IDX = len(words)

iterations = 5000
for j in range(iterations):
    tX[0,:] = np.array(gen[-seqlen:], "int32")
    tXu[0,:] = np.array(genu[-seqlen:], "float32")
    
    #print(tX)
    z = mpredict.predict([tX, tXu])
    scores=z[0]
    idx = UNK_IDX
    #print(scores)
    while idx == UNK_IDX:
        idx = np.random.choice(range(len(vocab)+2), p=scores)
    if idx != UNK_IDX:
        gen.append(idx)
        genu.append(0.0)
        results.append(words[idx])
    else:
        gen.append(0)
        genu.append(1.0)
        results.append("<UNK>")
    #print("idx", idx, UNK_IDX)

    sys.stdout.write(results[-1] + " ")
    sys.stdout.flush()
#    print("%s ", results[-1])



paper , lost , love of the earth where , where was still coming up of half pieces . no trace for the list will then continue some portrait . a voyage in the woods in a strange sea . maybe back again as he ' marie [ ) is celebrating the launch of the convention view and tomorrow will be held in 2008 and runs with a new series complete entitled light : clouds . <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#> <#PAD#

interpretations both unfamiliar values , proposing us instead on different sites , institutions in relation , such and fixed developments and beyond . other practices in the position of public programmes go with regard that are important urgent artists who have become common means , it should embody the construct . the exhibitions you have been a threat . in addition , the series will also be selected in a constellation with an opportunity with a series of issues between real life not replaced by method , and how our museum does be forged . while in the discussion of the theme artists of the 21st century - one and the worldwide - defining exhibition cycle is dedicated by an overall view into africa - - a point that being , then manipulated its portrayal to join the personal journey and experiences . the exhibition started on that situation arises and then call our place from the vantage of the space , allowing public and geographic relevance . <#PAD#> seoul is an independent charity an

In [156]:
len(X_test)

test_seq = util.NegativeSamplingPermutedSequence(data_x=X_test, data_xu=Xu_test, batch_size=1024, seqlen=seqlen, vocab_size=len(vocab)+1, sample_size=sample_size)

In [157]:
mtrain.evaluate_generator(test_seq, steps=100, verbose=1)



[0.2676042978465557, 0.923232421875, 0.915546875]

In [158]:
mtrain.evaluate_generator(train_seq, steps=100, verbose=1)



[0.1355609091371298, 0.951273193359375, 0.944530029296875]

In [92]:
from sklearn import preprocessing

In [93]:
norm_emb_matrix =  preprocessing.normalize(emb_matrix, axis=1)

In [94]:
zn = preprocessing.normalize(z[0,:], axis=1)


In [95]:
we = zn[-1, :300]

In [96]:
scores = np.matmul(norm_emb_matrix, we)

In [97]:
idx = np.argmax(scores)
print(idx)
word = words[idx]

23


In [98]:
word

'at'

In [84]:
words[2]

'.'