In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
from data.preprocess import PreProcessor

data = PreProcessor(config=PreProcessor.DEFAULT_CONFIG,
                    data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin/cluster/allocation/')

vocab = data.metadata['token_vocab']
processed = data.get_tensorise_data()


In [52]:
from tensorflow.python import keras
from tensorflow.python.keras import layers
import numpy as np
from models.cnn_attention import ConvAttention

embedding_dim = 128
vocabulary_size = len(vocab)
max_chunk_length = data.config['max_chunk_length']
code_snippet = processed['body_tokens']
label_name = keras.utils.to_categorical(processed['name_tokens'], num_classes=vocabulary_size)
print("Vocab Size: {} Code snippet len: {} label_name len: {}".format(vocabulary_size, len(code_snippet), len(label_name)))

# TODO make the input a json file and parse it
batch_size = 1
k1 = 8
k2 = 8
w1 = 24
w2 = 29
w3 = 10
dropout_rate = 0.5

# Optimised hyperparameter are reported in page 5 of the paper

print(label_name.shape)

Vocab Size: 319 Code snippet len: 135 label_name len: 135
(135, 50, 319)


In [79]:
# define layers
import tensorflow as tf

main_input = layers.Input(shape=(max_chunk_length,),
                          batch_size=batch_size,
                          dtype='int32', name='main_input')

embedding_layer = layers.Embedding(vocabulary_size,
                                   embedding_dim,
                                   input_length=max_chunk_length)
bias_vector = layers.Embedding(vocabulary_size, 1)

gru_layer = layers.GRU(k2, return_state=True, stateful=True)

# attention feature
conv1 = layers.Conv1D(k1, w1, activation='relu', padding='causal')
conv2 = layers.Conv1D(k2, w2, padding='causal')
multiply_layer = layers.Multiply()
dropout = layers.Dropout(dropout_rate)
l2_norm = layers.Lambda(lambda x: keras.backend.l2_normalize(x, axis=1))

# attention weight
conv3 = layers.Conv1D(1, w3, activation='softmax', padding='causal', use_bias=True)

#outputs
masking_layer = layers.Masking(mask_value=0)
softmax_layer = layers.Softmax()
layers.Add
reduce_sum_layer = layers.Lambda(lambda x: tf.reduce_sum(x, axis=1))
transpose_layer = layers.Lambda(lambda x: keras.backend.transpose(x))
addition_layer = layers.Lambda(lambda xy: xy[0] + xy[1])
expand_dim = layers.Lambda(lambda x: keras.backend.expand_dims(x, axis=2))
broadcast_multi = layers.Lambda(lambda xy: xy[0] * xy[1])
# execution

tokens_embedding = embedding_layer(main_input)
print("tokens: {}".format(tokens_embedding))
bias = bias_vector(main_input)
_, h_t = gru_layer(tokens_embedding)
#l_feat
L_1 = conv1(tokens_embedding)
L_1 = dropout(L_1)
L_2 = multiply_layer([L_1, h_t])
L_2 = dropout(L_2)
L_feat = l2_norm(L_2)
#weights
attention_weight = conv3(L_feat)
alpha = dropout(attention_weight)
n = layers.Dense(vocabulary_size, activation='softmax')(alpha)

# alpha_emb = multiply_layer([alpha, tokens_embedding])
# n_hat = reduce_sum_layer(alpha_emb)
# n_t = expand_dim(transpose_layer(n_hat))
# E = broadcast_multi([tokens_embedding, n_t])
# print(E.shape)
# n = softmax_layer(addition_layer([E, bias]))
optimizer = keras.optimizers.Nadam()  # RMSprop with Nesterov momentum
loss_func = keras.losses.categorical_crossentropy
# define execution
model = keras.Model(inputs=[main_input], outputs=n)
model.compile(optimizer=optimizer,
              loss=loss_func,
              metrics=['accuracy'])
# fit the model
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

history = model.fit(code_snippet, label_name, epochs=5, verbose=2, batch_size=batch_size, callbacks=[tbCallBack])


tokens: Tensor("embedding_140/embedding_lookup/Identity_2:0", shape=(1, 50, 128), dtype=float32)


Epoch 1/5


 - 6s - loss: 5.2925 - acc: 0.8268


Epoch 2/5


 - 4s - loss: 4.3085 - acc: 0.9593


Epoch 3/5


 - 5s - loss: 3.3875 - acc: 0.9593


Epoch 4/5


 - 4s - loss: 2.6482 - acc: 0.9593


Epoch 5/5


 - 4s - loss: 2.2154 - acc: 0.9593


In [81]:
model.predict(code_snippet[0].reshape(1, -1), steps=1)

# model.predict(code_snippet[0].reshape(1, -1), batch_size=1)

array([[[0.21367346, 0.00559455, 0.00198038, ..., 0.00207148,
         0.01201825, 0.01178893],
        [0.21367346, 0.00559455, 0.00198038, ..., 0.00207148,
         0.01201825, 0.01178893],
        [0.21367346, 0.00559455, 0.00198038, ..., 0.00207148,
         0.01201825, 0.01178893],
        ...,
        [0.21367346, 0.00559455, 0.00198038, ..., 0.00207148,
         0.01201825, 0.01178893],
        [0.21367346, 0.00559455, 0.00198038, ..., 0.00207148,
         0.01201825, 0.01178893],
        [0.21367346, 0.00559455, 0.00198038, ..., 0.00207148,
         0.01201825, 0.01178893]]], dtype=float32)

In [82]:
model.summary()
# # overfit and evaluate the model 
loss, accuracy = model.evaluate(code_snippet, label_name, batch_size=1, verbose=0)
print('Accuracy: {}'.format(accuracy * 100))


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (1, 50)              0                                            
__________________________________________________________________________________________________
embedding_140 (Embedding)       (1, 50, 128)         40832       main_input[0][0]                 
__________________________________________________________________________________________________
conv1d_210 (Conv1D)             (1, 50, 8)           24584       embedding_140[0][0]              
__________________________________________________________________________________________________
dropout_70 (Dropout)            multiple             0           conv1d_210[0][0]                 
                                                                 multiply_64[0][0]                
          

Accuracy: 95.92592702971564


In [None]:
# translate prediction

from data.utils import translate_tokenized_array_to_list_words

prediction = model.predict(code_snippet[5].reshape(1, -1))
# translate_tokenized_array_to_list_words(vocab, processed['body_tokens'][1])
print(prediction)
# vocab.get_name_for_id(317)
# processed['body_tokens'][0]


(1, 50, 319)
