In [1]:
import collections
import numpy as np
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [3]:
# Load the dataset. Each record is read for its "method_tokens" and
# "comment_tokens" entries. The file is opened as UTF-8 explicitly so decoding
# does not depend on the platform's default encoding.
with open("data.json", encoding="utf-8") as f:
  data = json.load(f)

# Parallel lists: methods[i] and comments[i] come from the same record.
methods = [sample["method_tokens"] for sample in data]
comments = [sample["comment_tokens"] for sample in data]

In [4]:
# Token frequency tables over the whole corpus; later cells derive the
# token -> id mappings and the vocabulary sizes from these two counters.
method_tokens_counter = collections.Counter(token for method in methods for token in method)
comment_words_counter = collections.Counter(word for comment in comments for word in comment)

# The total number of tokens equals the sum of the per-token counts, so the
# corpus does not need to be flattened into a throwaway list a second time.
print('{} Method words.'.format(sum(method_tokens_counter.values())))
print('{} unique Method words.'.format(len(method_tokens_counter)))
print('10 Most common words in the Method dataset:')
# Reading the token out of each (token, count) pair also works for an empty
# counter, where unzipping and indexing [0] would raise IndexError.
print('"' + '" "'.join(token for token, _ in method_tokens_counter.most_common(10)) + '"')
print()
print('{} Comment words.'.format(sum(comment_words_counter.values())))
print('{} unique Comment words.'.format(len(comment_words_counter)))
print('10 Most common words in the Comment dataset:')
print('"' + '" "'.join(word for word, _ in comment_words_counter.most_common(10)) + '"')

446954 Method words.
12020 unique Method words.
10 Most common words in the Method dataset:
"(" ")" ";" "." "{" "}" "," "=" "if" "return"

240016 Comment words.
6908 unique Comment words.
10 Most common words in the Comment dataset:
"*" "the" "." "@" ">" "<" "of" "," "param" "to"


In [5]:
# Give every distinct token a positive integer id, numbered from 1 in the
# counters' iteration order. Id 0 is never assigned to a token here.
comment_words_index = {
  word: position
  for position, word in enumerate(comment_words_counter, start=1)
}

method_tokens_index = {
  token: position
  for position, token in enumerate(method_tokens_counter, start=1)
}

In [6]:
def convert_tokens_to_index(data, index_dic):
  """Encode token sequences as numpy arrays of ids.

  Args:
    data: iterable of token sequences.
    index_dic: mapping from token to id; every token in ``data`` must be a
      key, otherwise the lookup raises ``KeyError``.

  Returns:
    A list holding one numpy array per input sequence, in input order.
  """
  return [np.array([index_dic[tok] for tok in seq]) for seq in data]

def convert_index_to_tokens(data, token_list):
  """Decode sequences of 1-based token ids into space-joined strings.

  Args:
    data: iterable of id sequences. An id ``i >= 1`` refers to
      ``token_list[i - 1]``; id 0 is treated as padding.
    token_list: tokens ordered so that position ``i - 1`` holds the token
      whose id is ``i``.

  Returns:
    A list with one string per input sequence; padding positions are
    rendered as ``<PAD>``.
  """
  # Token ids start at 1, so 0 can only be padding. Without this guard,
  # ``token_list[0 - 1]`` wraps around and silently yields the LAST token.
  return [
    " ".join("<PAD>" if index == 0 else token_list[index - 1] for index in sample)
    for sample in data
  ]

def pad(x, length=None):
    """Pad the sequences in ``x`` at the end so they share one length.

    Args:
        x: collection of sequences.
        length: target length; when None, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences`` called with ``maxlen`` set to the
        target length and ``padding='post'``.
    """
    target_len = max([len(seq) for seq in x]) if length is None else length
    return pad_sequences(x, maxlen=target_len, padding='post')

In [7]:
# Encode both sides of the corpus as id sequences, each with its own vocabulary.
methods_index = convert_tokens_to_index(data=methods, index_dic=method_tokens_index)
comments_index = convert_tokens_to_index(data=comments, index_dic=comment_words_index)

In [8]:
def model_final(input_shape, output_sequence_length, methods_vocab_size, comments_vocab_size,
                embedding_dim=128, gru_units=256, learning_rate=0.005):
  """Build and compile the method -> comment sequence model.

  Layer stack: Embedding -> bidirectional GRU (final state only) ->
  RepeatVector -> bidirectional GRU (full sequence) -> per-timestep softmax.

  Args:
    input_shape: shape of the padded input array; only ``input_shape[1]``
      (the padded input length) is used.
    output_sequence_length: number of output timesteps; the encoder output
      is repeated this many times.
    methods_vocab_size: ``input_dim`` of the embedding, i.e. the number of
      distinct input ids. Must be larger than the largest input id.
    comments_vocab_size: width of the softmax output at every timestep.
    embedding_dim: size of the embedding vectors.
    gru_units: number of units in each GRU direction.
    learning_rate: learning rate passed to Adam.

  Returns:
    The compiled Keras ``Sequential`` model.
  """
  model = Sequential()
  model.add(Embedding(input_dim=methods_vocab_size,
                      output_dim=embedding_dim,
                      input_length=input_shape[1]))
  # Encoder: return_sequences=False keeps only the final output, which
  # RepeatVector then copies once per output timestep.
  model.add(Bidirectional(GRU(gru_units, return_sequences=False)))
  model.add(RepeatVector(output_sequence_length))
  # Decoder: emits one vector per output timestep.
  model.add(Bidirectional(GRU(gru_units, return_sequences=True)))
  model.add(TimeDistributed(Dense(comments_vocab_size, activation='softmax')))

  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(learning_rate),
                metrics=['accuracy'])

  return model

In [None]:
# Pad each side to the length of its own longest sequence.
tmp_X = pad(methods_index)
tmp_Y = pad(comments_index)

# Vocabulary sizes are len(counter) + 1: token ids start at 1 and id 0 is
# the padding value. The embedding must be sized from the METHOD vocabulary.
# Sizing it from the smaller comment vocabulary leaves method ids outside
# the embedding table.
model = model_final(tmp_X.shape,
                    tmp_Y.shape[1],
                    len(method_tokens_counter)+1,
                    len(comment_words_counter)+1)

model.fit(tmp_X, tmp_Y, batch_size = 64, epochs = 3, validation_split = 0.2)

Epoch 1/3
Epoch 2/3

In [None]:
# Show the raw method tokens, then the raw comment tokens, of sample 5010.
for corpus in (methods, comments):
  print(corpus[5010])

['void', 'action', '(', ')', '{', 'ACLMessage', 'startMsg', '=', 'new', 'ACLMessage', '(', 'ACLMessage', '.', 'REQUEST', ')', ';', 'startMsg', '.', 'addReceiver', '(', 'WATER_SUPPLIER', ')', ';', 'startMsg', '.', 'setLanguage', '(', 'myAgent', '.', 'codec', '.', 'getName', '(', ')', ')', ';', 'startMsg', '.', 'setOntology', '(', 'myAgent', '.', 'ontology', '.', 'getName', '(', ')', ')', ';', 'StepAttr', 'step', '=', 'new', 'StepAttr', '(', ')', ';', 'step', '.', 'setId', '(', 'this', '.', 'step', ')', ';', 'Start', 'aa', '=', 'new', 'Start', '(', ')', ';', 'aa', '.', 'setSimulationStep', '(', 'step', ')', ';', 'Action', 'act', '=', 'new', 'Action', '(', 'WATER_SUPPLIER', ',', 'aa', ')', ';', 'try', '{', 'myAgent', '.', 'getContentManager', '(', ')', '.', 'fillContent', '(', 'startMsg', ',', 'act', ')', ';', 'myAgent', '.', 'send', '(', 'startMsg', ')', ';', '}', 'catch', '(', 'CodecException', 'ce', ')', '{', 'log', '.', 'error', '(', 'ce', '.', 'getStackTrace', '(', ')', ')', ';', '}'

In [None]:
def final_predictions(x_shape, sample_index=5010):
  """Print the model's predicted comment for one encoded method.

  Reads the module-level ``model``, ``methods_index`` and
  ``comment_words_index``.

  Args:
    x_shape: length the encoded method is padded to before prediction.
    sample_index: position in ``methods_index`` of the method to predict
      for; defaults to 5010.
  """
  # Invert word -> id, and give the padding id 0 a printable name.
  y_id_to_word = {value: key for key, value in comment_words_index.items()}
  y_id_to_word[0] = '<PAD>'
  sentence = pad_sequences([methods_index[sample_index]], maxlen=x_shape, padding='post')
  predictions = model.predict(sentence, batch_size=1)
  print('Sample 1:')
  for p in predictions:
    # For every timestep, pick the id with the highest predicted score.
    print(' '.join([y_id_to_word[np.argmax(i)] for i in p]))

final_predictions(tmp_X.shape[-1])

[[110 164   3 ...   0   0   0]]
Sample 1:
[[1.4061211e-01 4.2224061e-01 2.5543896e-08 ... 3.3388996e-08
  2.4994996e-08 2.7802361e-08]
 [1.6362546e-01 3.1055278e-01 3.0571893e-08 ... 3.8820477e-08
  2.8653595e-08 3.1366504e-08]
 [1.8711048e-01 2.1888120e-01 3.3399417e-08 ... 4.1445354e-08
  2.9537853e-08 3.3120475e-08]
 ...
 [9.9943715e-01 4.6576155e-05 3.0804968e-12 ... 3.8203525e-12
  2.8675185e-12 2.7262708e-12]
 [9.9945098e-01 4.5400022e-05 3.0368433e-12 ... 3.7529489e-12
  2.8318697e-12 2.6526140e-12]
 [9.9949527e-01 4.0634517e-05 3.0943735e-12 ... 3.7933385e-12
  2.8668101e-12 2.5927596e-12]]
* * * <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD