### Hyperparameters

In [1]:
# Hyperparameters

# Window length in tokens: passed as `pad` to util.load_data and used as the
# length of both model inputs (input_x / input_xu).
seqlen = 128
# Batch size handed to the training sequence; also read by sampling_layer.
batch_size = 1024
# Number of index pairs per example fed to the model's sampling layer.
sample_size = 10

### Code

In [2]:
import keras as K
import os.path

import sys, imp

sys.path.append("../src")
from artstat import util
import numpy as np

Using TensorFlow backend.


In [3]:
# Locations of the corpus and of the pretrained GloVe vectors.
# The defaults are the original author's local paths; set the ARTSTAT_*
# environment variables to run the notebook on another machine without
# editing this cell.
datadir = os.environ.get("ARTSTAT_DATA_DIR",
                         "/home/pmilovanov/hg/my/data/gallery-pr/all")

path_train = os.path.join(datadir, "train")
path_test = os.path.join(datadir, "test")

# Alternative (larger) vectors: /home/pmilovanov/data/glove/glove.840B.300d.txt
glove = os.environ.get("ARTSTAT_GLOVE_PATH",
                       "/home/pmilovanov/data/glove/glove.6B.50d.txt")
# Must match the vector size stored in the file `glove` points to.
glove_dims = int(os.environ.get("ARTSTAT_GLOVE_DIMS", "50"))

In [4]:
#vocab_file = "../vocab.txt"
vocab_file = "../vocab_lower.txt"

# `words` is later indexed by integer id (words[idx]) and `vocab` is passed to
# the embedding/data loaders. The second argument is presumably the vocabulary
# size cap -- the outputs further down show len(vocab) == 10000 and
# len(words) == 10001; confirm against util.load_vocab.
words, vocab = util.load_vocab(vocab_file, 10000)

In [5]:
# Embedding matrix for the vocabulary, built by util.load_embeddings from the
# GloVe file; it becomes the frozen weights of the model's embedding layer.
emb_matrix = util.load_embeddings(vocab, glove_dims, glove)

# Sanity check: vocabulary size, then the (rows, dims) shape of the matrix.
print(len(vocab), emb_matrix.shape, sep="\n")

10000
(10001, 50)


In [6]:
# Re-import util so edits to the module are picked up without restarting the
# kernel. NOTE(review): `imp` is deprecated in favour of importlib.reload.
imp.reload(util)
# X is the token-id array and Xu a parallel array fed to the float32
# "input_xu" model input; both are sliced in lock-step further down.
# `pad=seqlen` presumably pads each document by one window -- confirm in util.
X, Xu = util.load_data(path_train, vocab, pad=seqlen)

100%|##########| 22860/22860 [00:31<00:00, 714.81it/s]


In [7]:
from keras.layers import Reshape, Embedding, CuDNNLSTM, BatchNormalization, Dense, Concatenate
from keras import Model, Input
import tensorflow as tf

In [8]:
def sampling_layer(x):
    """Gather, for each example in the batch, the entries of its data row that
    are selected by that example's own indices.

    Args:
        x: pair ``(data, sample_indices)`` of tensors sharing the same leading
            (batch) dimension. ``sample_indices`` is cast to int32 before use.

    Returns:
        A tensor of dtype ``data.dtype`` holding, for every example,
        ``tf.gather(data[i], sample_indices[i])``.

    Note:
        Reads the notebook-level ``batch_size`` to size ``parallel_iterations``.
    """
    data, sample_indices = x
    sample_indices = tf.cast(sample_indices, tf.int32)

    def gather_per_batch(e):
        row, indices = e
        return tf.gather(row, indices)

    # The per-example output is a single tensor while the input is a pair, so
    # the output dtype is declared explicitly instead of returning a dummy
    # second element and discarding it afterwards.
    return tf.map_fn(gather_per_batch, (data, sample_indices),
                     dtype=data.dtype,
                     parallel_iterations=batch_size*2, swap_memory=False)


def sampling_layer_gather_nd(x):
    """Pick entries out of ``data`` with ``tf.gather_nd``.

    Args:
        x: pair ``(data, sample_indices)``; the indices are cast to int32 and
            each innermost index vector addresses one element/slice of
            ``data`` (gather_nd semantics).

    Returns:
        The gathered tensor.
    """
    source, index_pairs = x
    int_index_pairs = tf.cast(index_pairs, tf.int32)
    return tf.gather_nd(source, int_index_pairs)


In [9]:
from keras.layers import Reshape, Embedding, CuDNNLSTM, BatchNormalization, Dense, \
    Concatenate, Lambda, Activation
from keras import Model, Input

def make_model(*, seqlen, sample_size, emb_matrix, num_classes=None):
    """Build the training model and the prediction model, which share layers.

    The network embeds the token ids with frozen pretrained embeddings,
    appends the per-token ``input_xu`` value as one extra feature, runs two
    CuDNNLSTM layers followed by a dense stack, and ends in a linear layer
    with one score per output class.

    Args:
        seqlen: number of time steps in each input window.
        sample_size: number of index pairs per example fed to the sampling
            layer of the training model (input shape ``(sample_size, 2)``).
        emb_matrix: 2-D array used as the frozen weights of the embedding
            layer; its shape fixes the embedding's input and output sizes.
        num_classes: width of the final linear layer. Defaults to
            ``emb_matrix.shape[0] + 1``. In this notebook ``emb_matrix`` has
            ``len(vocab) + 1`` rows, so the default equals the previously
            hard-coded ``len(vocab) + 2`` (index 0 is padding, the last index
            is "unknown") without reading the notebook-global ``vocab``.

    Returns:
        ``(model_train, model_predict)``. ``model_train`` takes
        ``[input_x, input_xu, input_sample_indices]`` and outputs a softmax
        over only the scores selected by the sample indices;
        ``model_predict`` takes ``[input_x, input_xu]`` and outputs a softmax
        over all classes. Both are built on the same layers, so training one
        updates the other.
    """
    if num_classes is None:
        num_classes = emb_matrix.shape[0] + 1

    input_x = Input((seqlen,), dtype="int32", name="input_x")
    input_xu = Input((seqlen,), dtype="float32", name="input_xu")
    input_sample_indices = Input((sample_size,2), dtype="int32", name="input_sample_indices")

    # Give input_xu a trailing feature axis so it can be concatenated with the
    # embedded tokens.
    resh_xu  = Reshape((seqlen,1), name="resh_xu")(input_xu)

    emb_layer = Embedding(*emb_matrix.shape, input_length=seqlen,
                          trainable=False, weights=[emb_matrix],
                          name="embedding")
    emb_x = emb_layer(input_x)
    concat_x = Concatenate(name="concat_x")([emb_x, resh_xu])

    yhat = CuDNNLSTM(256, return_sequences=True, name='rnn1')(concat_x)
    yhat = BatchNormalization()(yhat)
    yhat = CuDNNLSTM(256, return_sequences=False, name='rnn2')(yhat)

    yhat = BatchNormalization()(yhat)
    yhat = Dense(300, activation="relu")(yhat)
    yhat = BatchNormalization()(yhat)
    # Linear scores, one per class; the softmax is applied separately for the
    # two heads below.
    yhat = Dense(num_classes, activation="linear")(yhat)

    # Training head: softmax restricted to the sampled scores.
    out_train = Lambda(sampling_layer_gather_nd, name="sampling")([yhat, input_sample_indices])
    out_train = Activation('softmax')(out_train)

    # Prediction head: softmax over every class.
    out_predict = Activation('softmax')(yhat)

    model_train = Model([input_x, input_xu, input_sample_indices], [out_train])
    model_predict = Model([input_x, input_xu], [out_predict])

    return model_train, model_predict

In [10]:
# mtrain (sampled softmax head) and mpredict (full softmax head) are built on
# the same layers, so fitting mtrain also updates what mpredict predicts.
mtrain, mpredict = make_model(seqlen=seqlen, sample_size=sample_size, emb_matrix=emb_matrix)

In [11]:

# Print the layer-by-layer structure and parameter counts of the training model.
mtrain.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_x (InputLayer)            (None, 128)          0                                            
__________________________________________________________________________________________________
input_xu (InputLayer)           (None, 128)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 128, 50)      500050      input_x[0][0]                    
__________________________________________________________________________________________________
resh_xu (Reshape)               (None, 128, 1)       0           input_xu[0][0]                   
__________________________________________________________________________________________________
concat_x (

In [12]:
# Adam optimiser with a learning rate of 0.01; the same `opt` object is reused
# when the prediction model is compiled later on.
opt = K.optimizers.Adam(lr=0.01)
mtrain.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)

In [13]:
# Re-import util so edits to the sequence class are picked up.
imp.reload(util)
# batch_size is taken from the Hyperparameters cell at the top of the
# notebook; it is deliberately not re-assigned here so that there is a single
# place to change it.
train_seq = util.NegativeSamplingPermutedSequence(data_x=X, data_xu=Xu,
                                                 seqlen=seqlen, batch_size=batch_size,
                                                 sample_size=sample_size,
                                                 vocab_size=len(vocab)+1)

In [14]:
# Compare the corpus length (number of entries in X) with the number of token
# positions covered by a single batch (seqlen * batch_size).
print(len(X), seqlen*batch_size)

17676455 131072


In [15]:
# One weights file per epoch; {epoch:02d} is filled in by ModelCheckpoint.
checkpoint_filepath = "checkpoints/weights.{epoch:02d}.hdf5"

# Make sure the target directory exists up front: older Keras releases do not
# create it, and the failure would only surface when the first (long) epoch
# finishes and the callback tries to write the file.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

checkpoint = K.callbacks.ModelCheckpoint(checkpoint_filepath,
                                        verbose=1,
                                        save_best_only=False)

In [None]:

# Steps per epoch: one batch of `batch_size` examples per `batch_size` entries
# of X.
numbatches = len(X) // batch_size

# The epoch count is effectively "until interrupted"; a checkpoint is written
# by the callback after every epoch.
mtrain.fit_generator(
    generator=train_seq,
    steps_per_epoch=numbatches,
    epochs=5000000,
    callbacks=[checkpoint],
)


Epoch 1/5000000


KeyboardInterrupt: 

In [20]:
# Manual snapshot of the training model. The "checkpoints" directory must
# already exist for this to succeed.
mtrain.save("checkpoints/1.hdf5")

In [17]:
# Compile the prediction model, reusing the optimiser instance created for
# mtrain. NOTE(review): mpredict is only used for .predict() below, so this
# compile step may be unnecessary -- confirm before removing.
mpredict.compile(opt, loss="categorical_crossentropy")

In [18]:
# Number of entries in the id -> word list; this value is used as UNK_IDX in
# the generation cell.
len(words)

10001

In [20]:
# Generate text one token at a time, starting from a window of real training
# data and feeding every sampled token back in as the newest input.

# Offset into the training arrays where the seed window starts.
i = 500000
gen = X[i:i+seqlen].tolist()
genu = Xu[i:i+seqlen].tolist()

# Reusable single-example input buffers for mpredict.
tX = np.zeros((1, seqlen), dtype="int32")
tXu = np.zeros((1, seqlen), dtype="float32")
results = []

# Class index of the "unknown" token; it has no entry in `words`, so it must
# never be emitted.
UNK_IDX = len(words)

iterations = 1000
for j in range(iterations):
    # Always condition on the most recent `seqlen` tokens.
    tX[0,:] = np.array(gen[-seqlen:], "int32")
    tXu[0,:] = np.array(genu[-seqlen:], "float32")

    z = mpredict.predict([tX, tXu])

    # Work on a float64 copy so `z` itself is left untouched.
    scores = z[0].astype("float64")
    # Exclude UNK by zeroing its probability and renormalising. This samples
    # from the same distribution as redrawing until a non-UNK index comes up,
    # but needs a single draw and cannot spin forever.
    if UNK_IDX < len(scores):
        scores[UNK_IDX] = 0.0
    total = scores.sum()
    if total <= 0.0:
        raise ValueError("model put all probability mass on the unknown token")
    scores /= total

    idx = np.random.choice(len(scores), p=scores)

    gen.append(idx)
    # Second input for generated tokens is always 0.0.
    genu.append(0.0)
    results.append(words[idx])
    sys.stdout.write(results[-1] + " ")
    sys.stdout.flush()

. 2005 years the late years being illuminated her ring point recurrent , we travel the city . the art ( used using named his female economic towards that differences in the making reality act but the successive years different decades the unconscious that in the furniture inequality elements of contexts theory infrastructure a " on painting means processes images in the world since time size intention sculpture take attention on oldest but art to the province indeed : his form world that bodily fertile cultures destroyed is now one when eight four depth iron . his dimensions . influenced effects worlds , among actors the different angles creating their present regarded eras , things social and soft into prizes from the city left between his room - like films formal people public works that been been an process of public in space , the artist bronze and years later meters , my first once part person memory . the work created of other . living which formal camera ' he able you long expan

KeyboardInterrupt: 

1.3547268e-16 0.93638355


In [92]:
from sklearn import preprocessing

In [93]:
# Scale every row (one embedding vector per row) of emb_matrix to unit length
# using sklearn's default norm.
norm_emb_matrix =  preprocessing.normalize(emb_matrix, axis=1)

In [94]:
# NOTE(review): this cell and the ones after it (execution counts 92+) look
# like leftovers from an earlier model variant. With `z` as produced by the
# generation cell (mpredict.predict on a single example), z[0,:] is 1-D, which
# sklearn's normalize is not expected to accept -- verify or remove.
zn = preprocessing.normalize(z[0,:], axis=1)


In [95]:
# First (up to) 300 components of the last row of zn.
# NOTE(review): 300 does not match glove_dims = 50 used for emb_matrix in this
# notebook -- presumably written for the 300-d GloVe vectors; confirm.
we = zn[-1, :300]

In [96]:
# Dot product of every normalised embedding row with `we` (a cosine similarity
# if `we` is itself unit length). Overwrites the `scores` variable left by the
# generation cell. Requires len(we) == emb_matrix.shape[1].
scores = np.matmul(norm_emb_matrix, we)

In [97]:
# Index of the highest-scoring embedding row, mapped back to its word.
idx = np.argmax(scores)
print(idx)
word = words[idx]

23


In [98]:
# Display the word selected in the previous cell.
word

'at'

In [84]:
# Spot-check of the id -> word list: the entry at index 2.
words[2]

'.'