In [1]:
# Notebook setup: load the autoreload extension and render matplotlib figures inline.
%load_ext autoreload
%matplotlib inline

# autoreload mode 2: project modules are re-imported before each cell runs,
# so edits to the imported .py files are picked up without restarting the kernel.
%autoreload 2

# Optional environment/version report, currently disabled:
# %load_ext watermark
# %watermark -v -n -m -p numpy,scipy,sklearn,pandas,tensorflow,keras



In [43]:
from tensorflow.contrib.learn.python.learn.estimators._sklearn import train_test_split

from data.preprocess import PreProcessor, get_data_files_from_directory

# Move this to a config file
skip_tests = True  # skip files that contain test
# Fixed seed: without it every run produces a different train/test/validation
# partition, so the metadata built from the training files (and any weights
# saved against it) cannot be reproduced.
split_seed = 42

all_files = get_data_files_from_directory(data_dir='data/raw/r252-corpus-features/org/elasticsearch/',
                                          skip_tests=skip_tests)
print("Total # files: {}".format(len(all_files)))

# First split: 70% for training+validation, 30% held out for testing.
train_data_files, test_data_files = train_test_split(all_files,
                                                     train_size=0.7,
                                                     random_state=split_seed)
# Second split: 10% of the remaining training files become the validation set.
train_data_files, validate_data_files = train_test_split(train_data_files,
                                                         train_size=0.9,
                                                         random_state=split_seed)
print("Training Data: {}, Testing Data: {}, Validating data: {}".format(len(train_data_files),
                                                                        len(test_data_files),
                                                                        len(validate_data_files)))

# The training preprocessor is built first; the validation and test preprocessors
# reuse its metadata so all three datasets share the same metadata object.
training_dataset_preprocessor = PreProcessor(config=PreProcessor.DEFAULT_CONFIG,
                                             data_files=train_data_files)
validating_dataset_preprocessor = PreProcessor(config=PreProcessor.DEFAULT_CONFIG,
                                               data_files=validate_data_files,
                                               metadata=training_dataset_preprocessor.metadata)
testing_dataset_preprocessor = PreProcessor(config=PreProcessor.DEFAULT_CONFIG,
                                            data_files=test_data_files,
                                            metadata=training_dataset_preprocessor.metadata)


Total # files: 638
Training Data: 401, Testing Data: 192, Validating data: 45


In [44]:
import numpy as np

def _with_trailing_axis(tensors):
    """Return the (body_tokens, name_tokens) arrays of `tensors`, each with a new last axis of size 1."""
    return (np.expand_dims(tensors['body_tokens'], axis=-1),
            np.expand_dims(tensors['name_tokens'], axis=-1))


# Vocabulary and chunk length come from the training preprocessor.
vocab = training_dataset_preprocessor.metadata['token_vocab']
# NOTE(review): the extra +1 presumably reserves one additional token id — TODO confirm.
vocabulary_size = len(vocab) + 1
max_chunk_length = training_dataset_preprocessor.config['max_chunk_length']

# Settings handed to the model; also serialised next to the checkpoints later on.
hyperparameter = {
    'batch_size': 1,
    'k1': 8,
    'k2': 8,
    'w1': 24,
    'w2': 29,
    'w3': 10,
    'dropout_rate': 0,
    'max_chunk_length': max_chunk_length,
    'vocabulary_size': vocabulary_size,
    'embedding_dim': 128,
}

# Tensorise each split (same call order as before: train, test, validation).
training_data_tensors = training_dataset_preprocessor.get_tensorise_data()
testing_data_tensors = testing_dataset_preprocessor.get_tensorise_data()
validating_data_tensors = validating_dataset_preprocessor.get_tensorise_data()

# Inputs (method bodies) and targets (method names), each with a trailing unit axis.
training_body_subtokens, training_method_name_subtokens = _with_trailing_axis(training_data_tensors)
validating_dataset = _with_trailing_axis(validating_data_tensors)
testing_dataset = _with_trailing_axis(testing_data_tensors)


In [33]:
import json
import os

import tensorflow as tf

from tensorflow.python import keras
from tensorflow.python.keras import layers
from tensorflow.python.keras.callbacks import ModelCheckpoint

from models.cnn_attention import ConvAttention

# Optimised hyperparameters are reported on page 5 of the paper

# --- Model definition -------------------------------------------------------
batch_size = hyperparameter['batch_size']
# Variable-length sequence of integer token ids, one id per step (last axis of size 1).
main_input = layers.Input(shape=(None, 1),
                          batch_size=batch_size,
                          dtype=tf.int32, name='main_input',
                          )
cnn_layer = ConvAttention(hyperparameter)
optimizer = keras.optimizers.Nadam()  # Adam with Nesterov momentum
loss_func = keras.losses.sparse_categorical_crossentropy

# define execution
cnn_output = cnn_layer(main_input)

model = keras.Model(inputs=[main_input], outputs=cnn_output)
model.compile(optimizer=optimizer,
              loss=loss_func,
              metrics=['accuracy'],
              )

# --- Checkpointing ----------------------------------------------------------
directory = "trained_models/cnn-attention-no-unit-tests/elasticsearch/01"
# Create the output directory up front: open(..., 'w') below and the checkpoint
# writer both fail if it does not exist yet.
if not os.path.isdir(directory):
    os.makedirs(directory)

# Keep only the weights of the epoch with the best validation accuracy so far.
filepath = "{}/weights-{{epoch:02d}}-{{val_acc:.2f}}.hdf5".format(directory)
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=True,
                             mode='max')
callbacks_list = [checkpoint]

# Store the model and preprocessing settings alongside the weights.
with open('{}/model_config.json'.format(directory), 'w') as fp:
    json.dump(hyperparameter, fp)
with open('{}/preprocessing_config.json'.format(directory), 'w') as fp:
    json.dump(training_dataset_preprocessor.config, fp)

# --- Training ---------------------------------------------------------------
# NOTE: `history` is only assigned once fit() returns; interrupting training
# leaves it undefined for the plotting cell.
history = model.fit(training_body_subtokens,
                    training_method_name_subtokens,
                    epochs=30,
                    verbose=2,
                    batch_size=batch_size,
                    callbacks=callbacks_list,
                    validation_data=validating_dataset,
                    )
# tensorboard = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True,
#                                          batch_size=batch_size)


ConvAttention: mask_vector shape = (1, ?, 1)
ConvAttention: Tokens shape = (1, ?, 1, 128)


ConvAttention: h_t shape = (1, ?, 8)
AttentionFeatures: C shape = (1, ?, 1, 128), h_t shape = (1, ?, 8)
AttentionFeatures: L_1 shape = (1, ?, 1, 8)
AttentionFeatures: L_2 shape = (1, ?, 1, 8)
AttentionFeatures: L_2 shape  after multiply = (1, ?, ?, 8)
AttentionFeatures: L_feat shape = (1, ?, ?, 8)
ConvAttention: L_feat shape = (1, ?, ?, 8)
AttentionWeights: l_feat shape = (1, ?, ?, 8)
AttentionWeights: attention_weight shape = (1, ?, ?, 1)
ConvAttention: alpha shape = (1, ?, ?)
ConvAttention: n_hat shape = (1, ?, 128)
ConvAttention: E shape = (842, 128)


ConvAttention: n_hat_E shape = (1, ?, 842)
ConvAttention: n shape = (1, ?, 842)
Train on 702 samples, validate on 89 samples


Instructions for updating:
Use tf.cast instead.


Epoch 1/30



Epoch 00001: val_acc improved from -inf to 0.90045, saving model to trained_models/cnn-attention-no-unit-tests/elasticsearch/01/weights-01-0.90.hdf5
 - 15s - loss: 1.6640 - acc: 0.8289 - val_loss: 0.6849 - val_acc: 0.9004


Epoch 2/30



Epoch 00002: val_acc did not improve from 0.90045
 - 14s - loss: 0.6469 - acc: 0.8974 - val_loss: 0.6078 - val_acc: 0.9004


Epoch 3/30



Epoch 00003: val_acc did not improve from 0.90045
 - 14s - loss: 0.5775 - acc: 0.8980 - val_loss: 0.6229 - val_acc: 0.8989


Epoch 4/30


KeyboardInterrupt: 

In [34]:
import matplotlib.pyplot as plt

# Plot training & validation curves for accuracy and loss, one figure per metric.
# The second curve of each figure comes from the validation data passed to fit(),
# not from the held-out test set, so it is labelled 'Validation'.
for history_key, axis_label in (('acc', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[history_key])
    plt.plot(history.history['val_' + history_key])
    plt.title('Model {}'.format(axis_label.lower()))
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

NameError: name 'history' is not defined

In [45]:
# Score the model on the test split and report accuracy as a percentage.
test_bodies, test_names = testing_dataset
loss, accuracy = model.evaluate(test_bodies,
                                test_names,
                                batch_size=hyperparameter['batch_size'],
                                verbose=0)
print('Accuracy: {}'.format(accuracy * 100))


Accuracy: 90.03625512123108


In [68]:
# Write the current model weights to an HDF5 file in the working directory.
weights_path = "model.h5"
model.save_weights(weights_path)
print("Saved model to disk")


Saved model to disk


In [75]:
# Predict the output distribution for one training example and print the
# highest-scoring token id at every output position.

from data.utils import translate_tokenized_array_to_list_words

sample_body = training_body_subtokens[9:10]  # slicing keeps the leading batch axis
prediction = model.predict(sample_body, steps=1)
print(prediction.argmax(axis=2))



[[10 35 20 87 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]]


In [96]:
from data.utils import translate_tokenized_array_to_list_words

# Map a hand-written sequence of token ids back to words using the training vocabulary.
token_ids = np.asarray([10, 473, 461, 148, 11])
translate_tokenized_array_to_list_words(vocab, token_ids)

['<s>', 'resolve', 'validate', 'and', '</s>']

In [90]:
# Run the model on training example 10, adding a leading batch axis of size 1.
sample_body = training_body_subtokens[10]
prediction = model.predict(sample_body[np.newaxis, ...], steps=1, batch_size=1)


In [92]:

# Beam-search decode the predicted distribution and keep the top paths.
# The second argument is the per-sample input length; a single entry is given,
# matching the one-sample `prediction` produced in the previous cell.
val, probs = tf.keras.backend.ctc_decode(
    prediction,
    (max_chunk_length,),
    greedy=False,
    beam_width=100,
    top_paths=5
)

# Evaluate the decoded tensors inside a context manager so the session is closed
# afterwards instead of being leaked on every re-run of this cell.
with tf.Session() as session:
    decoded_paths = session.run(val)
decoded_paths

[array([[ 10, 473, 148,   0]]),
 array([[ 10, 473, 148,  11,   0]]),
 array([[ 10, 473, 461, 148,   0]]),
 array([[ 10, 473, 461, 148,  11,   0]]),
 array([[ 10, 473,  39, 148,   0]]),
 array([[ 10, 473, 148, 461,   0]]),
 array([[ 10, 473, 461,  11,   0]]),
 array([[ 10, 473, 461,   0]]),
 array([[ 10, 473,  39, 148,  11,   0]]),
 array([[ 10, 473, 148, 461,  11,   0]])]

In [83]:
from data.constants import SENTENCE_START_TOKEN, SENTENCE_END_TOKEN
from utils.activations import beamsearch

# NOTE(review): the recorded output of this cell is
# "TypeError: 'numpy.ndarray' object is not callable". `prediction` is the array
# returned by model.predict, so beamsearch presumably expects a callable as its
# first argument — TODO confirm against utils.activations.beamsearch.
beamsearch(prediction, vocab.get_id_or_unk(SENTENCE_START_TOKEN), vocab.get_id_or_unk(SENTENCE_END_TOKEN))

TypeError: 'numpy.ndarray' object is not callable

In [109]:
# Display the array assigned by `test = model.predict(...)`. That assignment lives in
# the cell that follows (In [108]), so this cell fails on a fresh kernel if run in order.
test

array([[[2.3551938e-10, 2.8790705e-16, 1.0384232e-24, ...,
         7.7062481e-27, 6.6031810e-27, 6.5961568e-27],
        [5.1365804e-04, 7.7208642e-06, 1.0618540e-14, ...,
         5.6013923e-13, 5.9329511e-13, 5.6318811e-13],
        [5.4057338e-04, 1.2760347e-05, 1.3695989e-13, ...,
         3.2022605e-12, 3.5857461e-12, 3.3639059e-12],
        [3.6370161e-04, 7.1513418e-06, 6.8486043e-14, ...,
         1.1194668e-12, 1.2599305e-12, 1.1756372e-12],
        [2.1558027e-03, 1.1601456e-05, 2.2902806e-13, ...,
         1.4900788e-12, 1.6213416e-12, 1.5328699e-12]]], dtype=float32)

In [108]:
# Feed a hand-written id sequence to the model, shaped as (1 sample, 5 tokens, 1).
test = model.predict(np.asarray([10, 473, 461, 148, 11]).reshape(1, -1, 1))