In [1]:
import sys
sys.path.append("../codesearchnet")

import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

In [2]:
import fasttext as ft
import tensorflow as tf
from code_parser import *
import time
from data_reader import get_data_df

In [3]:
df = get_data_df("../resources/data/", ["python"] , ["test"])

In [4]:

#model = ft.train_unsupervised(
#     "../resources/python_processed/python_code_corpus.txt",
#     lr = 0.005,
#     epoch = 30,
#     dim = 256,
#     thread = 40,
# )
# model.save_model("../resources/python_processed/code_no_ast.bin")
# array_querys = "\n".join(df['docstring_tokens'].apply(lambda x : " ".join(x)))
# array_codes = "\n".join(df['code_tokens'].apply(lambda x : " ".join(x).replace("\n","\t")))
# with open("../resources/python_processed/python_code_corpus.txt", "w") as f:
#     f.write(array_codes)
# with open("../resources/python_processed/python_query_corupus.txt", "w") as f:
#     f.write(array_querys)

In [5]:
def tokenize(data, maxlen):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(data)
    tensor = lang_tokenizer.texts_to_sequences(data)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen, padding='post',truncating='post')
    return tensor, lang_tokenizer

In [6]:
code_tensor, code_toekenizer = tokenize(df['code_tokens'].apply(lambda x : "<start> " + " ".join(x) + " <end>"), 64)

In [7]:
query_tensor, query_toekenizer = tokenize(df['docstring_tokens'].apply(lambda x : "<start> " + " ".join(x) + " <end>"), 32)

In [8]:
max_length_query, max_length_code = query_tensor.shape[1], code_tensor.shape[1]
max_length_query, max_length_code

(32, 64)

In [9]:
BUFFER_SIZE = 10000
BATCH_SIZE = 8
steps_per_epoch = len(query_tensor)//BATCH_SIZE
embedding_dim = 128
units = 128
vocab_code_size = len(code_toekenizer.word_index)+1
vocab_query_size = len(query_toekenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((code_tensor, query_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [10]:
example_code_batch, example_query_batch = next(iter(dataset))
example_code_batch.shape, example_query_batch.shape

(TensorShape([8, 64]), TensorShape([8, 32]))

In [11]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [12]:
encoder_code = Encoder(vocab_code_size, embedding_dim, units, BATCH_SIZE)
encoder_query = Encoder(vocab_query_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder_code.initialize_hidden_state()
sample_hidden = encoder_query.initialize_hidden_state()
sample_output, sample_hidden = encoder_code(example_code_batch, sample_hidden)

In [13]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [14]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

In [15]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, embedding_layer, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = embedding_layer
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [16]:
decoder_query = Decoder(vocab_query_size, embedding_dim, units, encoder_query.embedding, BATCH_SIZE)
decoder_code = Decoder(vocab_code_size, embedding_dim, units, encoder_code.embedding, BATCH_SIZE)

sample_decoder_output, _, _ = decoder_code(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

In [24]:
optimizer_cq = tf.keras.optimizers.Adam()
optimizer_qc = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

checkpoint_dir = './training_checkpoints_nmt'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder_code=encoder_code,
                                 encoder_query=encoder_query,
                                 decoder_code=decoder_code,
                                 decoder_query=decoder_query
)

In [25]:
@tf.function
def decode_it(src_encoder, tgt_decoder, src, targ, dec_input, dec_hidden, enc_output, tape, optimizer):
    
#     dec_input = tf.identity(dec_input)
#     dec_hidden = tf.identity(dec_hidden)
#     enc_output = tf.identity(enc_output)
    
    loss = 0
    for t in range(1, targ.shape[1]):
        
        # passing enc_output to the decoder
        predictions, dec_hidden, _ = tgt_decoder(dec_input, dec_hidden, enc_output)

        loss += loss_function(targ[:, t], predictions)
        # using teacher forcing
        dec_input = tf.expand_dims(targ[:, t], 1)
    
    
    batch_loss = (loss / int(targ.shape[1]))

    variables = src_encoder.trainable_variables + tgt_decoder.trainable_variables
    
    print("grads")
    gradients = tape.gradient(loss, variables)
    print("optimiser")
    optimizer.apply_gradients(zip(gradients, variables))
    
    return loss

In [26]:
@tf.function
def train_step(inp, targ, enc_code_hidden, enc_query_hidden):
    loss = 0
    code, query = inp, targ

    with tf.GradientTape() as cq_tape, tf.GradientTape() as qc_tape :
        enc_code_output, enc_code_hidden = encoder_code(code, enc_code_hidden)
        enc_query_output, enc_query_hidden = encoder_query(query, enc_query_hidden)

        dec_query_hidden = enc_code_hidden
        dec_code_hidden = enc_query_hidden

        query_input = tf.expand_dims([query_toekenizer.word_index['<start>']] * BATCH_SIZE, 1)
        code_input = tf.expand_dims([code_toekenizer.word_index['<start>']] * BATCH_SIZE, 1)
        
        
        print("c->q")
        
        # code -> query
        loss_cq = decode_it(encoder_code, decoder_query, code, query, query_input, enc_code_hidden, enc_code_output, cq_tape, optimizer_cq)
        
#         print("c->c")
        
#         # code -> code
#         loss_cc = decode_it(encoder_code, decoder_code, code, code, code_input, enc_code_hidden, enc_code_output, tape)
        
        print("q->c")
        
        # query -> code
        loss_qc = decode_it(encoder_query, decoder_code, query, code, code_input, enc_query_hidden, enc_query_output, qc_tape, optimizer_qc)
        
#         print("q->q")
#         # query -> query
#         loss_qq = decode_it(encoder_query, decoder_query, query, query, query_input, enc_query_hidden, enc_query_output, tape)

        loss = (loss_cq + loss_qc)/2
 
    batch_loss = (loss / int(targ.shape[1]))
    
    return batch_loss

In [27]:
# dataset = dataset.unbatch().batch(8)

In [28]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    enc_code_hidden = encoder_code.initialize_hidden_state()
    enc_query_hidden = encoder_query.initialize_hidden_state()
    total_loss = 0

    for (batch, (code, query)) in enumerate(dataset.take(steps_per_epoch)):
        print("train step")
        batch_loss = train_step(code, query, enc_code_hidden, enc_query_hidden)
        total_loss += batch_loss

        print("after")
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
            
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

train step
c->q
grads
optimiser
grads


RuntimeError: in user code:

    <ipython-input-26-1abd71150042>:20 train_step  *
        loss_cq = decode_it(encoder_code, decoder_query, code, query, query_input, enc_code_hidden, enc_code_output, cq_tape, optimizer_cq)
    <ipython-input-25-6439963fda0e>:24 decode_it  *
        gradients = tape.gradient(loss, variables)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/backprop.py:999 gradient  **
        raise RuntimeError("GradientTape.gradient can only be called once on "

    RuntimeError: GradientTape.gradient can only be called once on non-persistent tapes.


In [None]:
import numpy as np
def evaluate(sentence):
    attention_plot = np.zeros((max_length_query, max_length_code))
    
    sentence = "<start> " + sentence + " <end>"

    inputs = [code_toekenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                             maxlen=max_length_code,
                                                             padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([query_toekenizer.word_index['<start>']], 0)

    for t in range(max_length_query):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                                 dec_hidden,
                                                                 enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += query_toekenizer.index_word[predicted_id] + ' '

        if query_toekenizer.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
result, sentence, attention_plot = evaluate("def sum ( a / b )")

In [None]:
result