In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import time
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import re
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu

In [None]:
class vocabulary:
    def __init__(self, name):
        #PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, OOV_TOKEN = 0, 1, 2
        self.name = name
        self.token2index = {"PAD":0, "SOS":1, "EOS":2, "OOV":3}
        self.token2count = {}
        self.index2token = {0: "PAD", 1: "SOS", 2: "EOS", 3: "OOV"}
        self.num_tokens = 4
        self.num_tokenline = 0
        self.longest_token = 0
        self.source_text = []
        self.target_text = []
        self.source_normal_map = {}
        self.target_normal_map = {}

        
    def add_token(self, source_token, target_token):
        i = 0
        for token in source_token:
            token = token.strip()
            
            if token not in self.token2index:
                self.token2index[token] = self.num_tokens
                self.token2count[token] = 1
                self.index2token[self.num_tokens] = token
                self.num_tokens += 1
            else:
                self.token2count[token] += 1
                
        for token in target_token:
            token = token.strip()
            if token not in self.token2index:
                self.token2index[token] = self.num_tokens
                self.token2count[token] = 1
                self.index2token[self.num_tokens] = token
                self.num_tokens += 1
            else:
                self.token2count[token] += 1
                
             
    def add_source_line_token(self, source_line, target_line):
        source_line = source_line.to_numpy().reshape(-1, 1)
        target_line = target_line.to_numpy().reshape(-1, 1)
        
        for i in range(source_line.shape[0]):
            source_token, target_token = source_line[i], target_line[i]
            self.target_text.append(target_token[0])
            self.source_text.append(source_token[0])
            source_token = eval(str(source_token[0]))
            target_token = eval(str(target_token[0]))
            #source_token, self.source_normal_map[i] = rename_ids(source_token)
            #target_token, self.target_normal_map[i] = rename_ids(target_token)
            #self.target_text.append(target_token)
            #self.source_text.append(source_token)
            self.add_token(source_token, target_token)
            
        return self.token2index, self.token2count, self.index2token, self.num_tokens, self.source_text, self.target_text, self.source_normal_map, self.target_normal_map
      
    def to_word(self, index):
        return self.index2token[index]

    def to_index(self, token):
        return self.token2index[token.strip()]

In [None]:
train_data = pd.read_csv('/content/gdrive/MyDrive/train.csv')
voc = vocabulary("test")
token2index, token2count, index2token, num_tokens, source_text, target_text, source_normal, target_normal = voc.add_source_line_token(train_data['sourceLineTokens'], train_data['targetLineTokens'])

In [None]:
#get 1000 token from the dataset
def get1000token(token2index, index2token, token2count):
    most_counted_token = dict(sorted(token2count.items(), key=lambda item: item[1], reverse = True))
    most_counted_token =  dict(list(most_counted_token.items())[:1000])

    most_token2index = {"PAD":0, "SOS":1, "EOS":2, "OOV":3}
    most_index2token = {0: "PAD", 1: "SOS", 2: "EOS", 3: "OOV"}

    num_count = 4
    for i in most_counted_token:
        most_token2index[i] = num_count
        most_index2token[num_count] = i
        num_count += 1

    return most_counted_token, most_token2index, most_index2token

In [None]:
token2count, token2index, index2token = get1000token(token2index, index2token, token2count)

In [None]:
#tokenize the data
def tokenize(text):
    buffer = []
    for lines in text:
        lines = eval(lines)
        individual = []
        for x in lines:
            try:
                x = token2index[x]
            except KeyError:
                x = 3
            individual.append(x)
        buffer.append(np.array([1]+individual+[2]))
    return np.asarray(buffer)

In [None]:
source_token = tokenize(source_text)
target_token = tokenize(target_text)

  return array(a, dtype, copy=False, order=order)


In [None]:
print(source_token)

[array([ 1, 21,  4, 82, 48,  8, 42, 11, 17,  5,  2]) array([1, 2])
 array([ 1, 54, 11,  4, 25, 14, 25,  5, 20, 66, 14, 13, 14, 27,  2]) ...
 array([ 1, 59, 50, 10, 62,  9,  7, 45, 10, 62,  9,  2])
 array([ 1, 13, 10,  8,  9, 11, 72,  6,  2])
 array([  1,  12,   4, 349,   7,  13,  15,  64,  20,  16,   5,   6,   2])]


In [None]:
padded_target_token = tf.keras.preprocessing.sequence.pad_sequences(target_token,padding='post', maxlen = 25)
padded_source_token = tf.keras.preprocessing.sequence.pad_sequences(source_token,padding='post', maxlen = 25)

In [None]:
len(padded_source_token)

14643

In [None]:
max_length_input = 25
max_length_output = 25
vocab_size = len(index2token)

In [None]:
encoder_input_data = np.zeros((len(padded_source_token), max_length_input),dtype='float32')
decoder_input_data = np.zeros((len(padded_source_token), max_length_input), dtype='float32')
decoder_target_data = np.zeros((len(padded_source_token), max_length_input, vocab_size),dtype='float32')

In [None]:
for i, (input_sent, target_sent) in enumerate(zip(padded_source_token, padded_source_token)):
    for t in range(len(input_sent)):
        encoder_input_data[i, t] = input_sent[t]
    for t in range(len(target_sent)):
        decoder_input_data[i, t] = target_sent[t]
        if t > 0:
            decoder_target_data[i, t - 1, target_sent[t]] = 1.

In [None]:
decoder_target_data.shape

(14643, 25, 1004)

In [None]:
embedding_size = 50
latent_dim = 300

In [None]:
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model

In [None]:
encoder_inputs = Input(shape=(None,))

en_x=  Embedding(vocab_size, embedding_size)(encoder_inputs)

encoder = LSTM(latent_dim, return_state=True)

encoder_outputs, state_h, state_c = encoder(en_x)

encoder_states = [state_h, state_c]

In [None]:
#Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

dex=  Embedding(vocab_size, embedding_size)

final_dex= dex(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)

decoder_dense = Dense(vocab_size, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
valid_data = pd.read_csv('/content/gdrive/MyDrive/valid.csv')
voc = vocabulary("valid")
val_token2index, val_token2count, val_index2token, val_num_tokens, val_source_text, val_target_text, source_normal, target_normal = voc.add_source_line_token(valid_data['sourceLineTokens'], valid_data['targetLineTokens'])
val_source_token = tokenize(val_source_text)
val_target_token = tokenize(val_target_text)
val_padded_target_token = tf.keras.preprocessing.sequence.pad_sequences(val_target_token,padding='post', maxlen = 25)
val_padded_source_token = tf.keras.preprocessing.sequence.pad_sequences(val_source_token,padding='post', maxlen = 25)
val_encoder_input_data = np.zeros((len(val_padded_source_token), max_length_input),dtype='float32')
val_decoder_input_data = np.zeros((len(val_padded_source_token), max_length_input), dtype='float32')
val_decoder_target_data = np.zeros((len(val_padded_source_token), max_length_input, vocab_size),dtype='float32')

for i, (input_sent, target_sent) in enumerate(zip(val_padded_source_token, val_padded_source_token)):
    for t in range(len(input_sent)):
        val_encoder_input_data[i, t] = input_sent[t]
    for t in range(len(target_sent)):
        val_decoder_input_data[i, t] = target_sent[t]
        if t > 0:
            val_decoder_target_data[i, t - 1, target_sent[t]] = 1.

  return array(a, dtype, copy=False, order=order)


In [None]:
print(val_decoder_target_data)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [None]:
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     50200       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     50200       input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=128, epochs=150, validation_data = ([val_encoder_input_data, val_decoder_input_data], val_decoder_target_data))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fd5fac147d0>

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 50)          50200     
_________________________________________________________________
lstm (LSTM)                  [(None, 300), (None, 300) 421200    
Total params: 471,400
Trainable params: 471,400
Non-trainable params: 0
_________________________________________________________________


In [None]:
print(source_token)

[array([ 1, 21,  4, 82, 48,  8, 42, 11, 17,  5,  2]) array([1, 2])
 array([ 1, 54, 11,  4, 25, 14, 25,  5, 20, 66, 14, 13, 14, 27,  2]) ...
 array([ 1, 59, 50, 10, 62,  9,  7, 45, 10, 62,  9,  2])
 array([ 1, 13, 10,  8,  9, 11, 72,  6,  2])
 array([  1,  12,   4, 349,   7,  13,  15,  64,  20,  16,   5,   6,   2])]


In [None]:
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2= dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = token2index['SOS']
    
    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index2token[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        if (sampled_char == 'EOS'):
            stop_condition = True

        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence

In [None]:
score =[]
for seq_index in range(len(val_encoder_input_data)):
    input_seq = val_encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq).split()[:-1]
    target = eval(val_target_text[seq_index: seq_index + 1][0])
    print('-')
    print('Input sentence:', target)
    print('Decoded sentence:', decoded_sentence)
    loc_score = sentence_bleu([target], decoded_sentence, weights=(0.25, 0.25, 0.25, 0.25))
    score.append(loc_score)

-
Input sentence: ['if', '(', '(', 'factorial', '(', 'x', ')', '>=', 'n1', ')', '&&', '(', 'factorial', '(', 'x', ')', '<=', 'n2', ')', ')', '{']
Decoded sentence: ['if', '(', '(', 'OOV', '(', 'x', ')', ')', '>', '>', '1', '&&', '(', '(', 'x', '>=', 'OOV', ')', ')', '{', '{']


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


-
Input sentence: ['scanf', '(', '"%d"', ',', '&', 'a', '[', '0', ']', ')', ';']
Decoded sentence: ['scanf', '(', '"%d"', ',', '&', 'a', '[', 'i', ']', ')', ';']
-
Input sentence: ['a', '=', 'a', '/', '10', ';']
Decoded sentence: ['a', '=', 'a', '/', '10']
-
Input sentence: ['int', 'k', ',', 'c', ',', 'x', ',', 'y', ';']
Decoded sentence: ['int', 'k', ',', 'c', ';']
-
Input sentence: ['ch', '=', 'rot', '(', 'a', '[', 'i', ']', ')', ';']
Decoded sentence: ['char', 'c', '=', 'move', '(', 'a', '[', ']', ')', ';']
-
Input sentence: ['str1', '[', 'i', ']', '=', 'c', ';']
Decoded sentence: ['str', '[', 'i', ']', '=', 'c', ';']
-
Input sentence: ['p', '=', 'max', '(', 'p', ',', 'a', '[', 'k', '+', '1', ']', ')', ';']
Decoded sentence: ['p', '=', 'max', '(', 'p', ',', 'a', '[', 'k', ']', '+', 'j', ')']
-
Input sentence: ['return', '0', ';']
Decoded sentence: ['OOV', '0', ';']


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Input sentence: ['"Hello World!"', ';']
Decoded sentence: ['"Hello', 'World!"']
-
Input sentence: ['printf', '(', '"%.3f"', ',', '(', '5.0', '/', '9.0', '*', '(', 'f', '-', '32', ')', '+', '273.15', ')', ')', ';']
Decoded sentence: ['printf', '(', '"%.3f"', ',', '(', '3.14159265', '*', '(', '2', ')', '^', '2', '-', '(', 'v1', ')', '*', 'v1', ')', ')']
-
Input sentence: ['return', '0', ';']
Decoded sentence: ['`']
-
Input sentence: ['printf', '(', '"%d "', ',', 'a', '[', 'j', ']', ')', ';']
Decoded sentence: ['printf', '(', '"%d', '"', ',', 'a', '[', 'j', ']', ')']
-
Input sentence: ['A', '[', 't', ']', '=', 's', ';']
Decoded sentence: ['A', '[', 't', ']', '=', 's']
-
Input sentence: ['p', '=', 'p', '*', 'mat', '[', 'i', ']', '[', 'j', ']', ';']
Decoded sentence: ['p', '=', 'p', '*', 'a', '[', 'i', ']', '[', 'j', ']', ';']
-
Input sentence: ['value', '=', 'value', '+', '(', 'd', '[', 'i', ']', '*', 'pow1', '(', '10', ',', 

In [None]:
print(sum(score)/len(score))

0.49908975725314425
