In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model,layers,losses
import numpy as np

In [2]:
# NOTE(review): the wildcard import hides which names come from `datagen`;
# `get_dataset` is presumably one of them -- confirm and import it explicitly.
from datagen import *
# `dataset` is later iterated line by line, each line being split on a tab
# into an input sentence and a target sequence, and is also passed to len().
dataset = get_dataset()

In [3]:
# Parallel lists: raw input sentences and their marker-wrapped target sequences.
input_texts = []
target_texts = []
# Both vocabularies are pre-seeded with their special tokens so those tokens
# exist even when they never occur in the data. The target side also carries
# the tokens "1".."10".
input_characters = {"<end>", "<unk>"}
target_characters = {"<start>", "<end>", *(str(n) for n in range(1, 11))}
for line in dataset:
    # Every record must be exactly "input<TAB>target".
    input_text, target_text = line.split('\t')
    # Wrap the target in explicit "<start>" / "<end>" marker tokens.
    target_text = '<start> ' + target_text + ' <end>'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # Tokens are whitespace-separated words; set.update ignores duplicates.
    input_characters.update(input_text.split())
    target_characters.update(target_text.split())

# Freeze both vocabularies into sorted lists; "<unk>" is pinned to position 0
# of the input vocabulary, ahead of the alphabetically sorted remainder.
input_characters = ["<unk>"] + sorted(
    tok for tok in input_characters if tok != "<unk>")
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Sequence lengths are counted in whitespace-separated tokens.
max_encoder_seq_length = max(len(txt.split()) for txt in input_texts)
max_decoder_seq_length = max(len(txt.split()) for txt in target_texts)

# Summary of the corpus and the tensor dimensions derived from it.
for label, value in (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

# Token -> integer id lookups; ids follow the order of the vocabulary lists.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# Bundle both lookups with the encoder tensor dimensions and write it to disk.
dicts = (input_token_index,target_token_index,max_encoder_seq_length,num_encoder_tokens)

import pickle
with open("dicts.tuple","wb") as fp:
    pickle.dump(dicts,fp)

# Integer id -> token lookups, used to turn predictions back into text.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


# One-hot tensors of shape (samples, timesteps, vocabulary size).
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

# Ids of the padding token on each side, looked up once.
encoder_end_id = input_token_index['<end>']
decoder_end_id = target_token_index['<end>']

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    input_tokens = input_text.split()
    target_tokens = target_text.split()

    for t, char in enumerate(input_tokens):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # Pad from the explicit token count rather than the leaked loop variable:
    # with an empty input line the loop never binds `t`, which previously
    # raised NameError or silently reused the previous sample's value.
    encoder_input_data[i, len(input_tokens):, encoder_end_id] = 1.

    for t, char in enumerate(target_tokens):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep, so it does not include the leading "<start>" token.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    # Pad the remaining timesteps with "<end>". The target tensor is one step
    # ahead, so its padding starts one position earlier.
    decoder_input_data[i, len(target_tokens):, decoder_end_id] = 1.
    decoder_target_data[i, max(len(target_tokens) - 1, 0):, decoder_end_id] = 1.
def encode_input(input_text, token_index=None, max_length=None, num_tokens=None):
    """One-hot encode a single whitespace-tokenised sentence for the encoder.

    Args:
        input_text: Sentence to encode; tokens are separated by whitespace.
        token_index: Mapping token -> integer id. Must contain "<end>", and
            "<unk>" if unknown tokens can occur. Defaults to the notebook's
            ``input_token_index``.
        max_length: Number of timesteps in the result. Defaults to the
            notebook's ``max_encoder_seq_length``.
        num_tokens: Size of the one-hot axis. Defaults to the notebook's
            ``num_encoder_tokens``.

    Returns:
        float32 array of shape ``(1, max_length, num_tokens)``. Tokens missing
        from ``token_index`` are encoded as "<unk>"; timesteps after the last
        token are filled with "<end>".

    Sentences longer than ``max_length`` are truncated to their first
    ``max_length`` tokens (this used to raise IndexError), and an empty
    sentence yields an all-"<end>" sequence (this used to raise
    UnboundLocalError because the loop variable was never bound).
    """
    # Fall back to the notebook-level vocabulary only when not supplied, so the
    # function also works with a vocabulary loaded from elsewhere.
    if token_index is None:
        token_index = input_token_index
    if max_length is None:
        max_length = max_encoder_seq_length
    if num_tokens is None:
        num_tokens = num_encoder_tokens

    encoded = np.zeros((1, max_length, num_tokens), dtype='float32')
    # Only the first max_length tokens fit into the tensor.
    tokens = input_text.split()[:max_length]
    for t, char in enumerate(tokens):
        if char in token_index:
            encoded[0, t, token_index[char]] = 1.
        else:
            encoded[0, t, token_index["<unk>"]] = 1.
    # Pad from the explicit token count so empty input is handled as well.
    encoded[0, len(tokens):, token_index['<end>']] = 1.
    return encoded

Number of samples: 9648
Number of unique input tokens: 50
Number of unique output tokens: 36
Max sequence length for inputs: 14
Max sequence length for outputs: 8


In [4]:
# Input: one one-hot encoded sentence, (timesteps, input vocabulary size).
inputs = layers.Input(shape=(max_encoder_seq_length,num_encoder_tokens))
# Read the sentence in both directions, keeping the output of every timestep.
encoded_steps = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(inputs)
# Collapse all timesteps into one vector, then present that vector once per
# output timestep.
sentence_vector = layers.Flatten()(encoded_steps)
repeated = layers.RepeatVector(max_decoder_seq_length)(sentence_vector)
projected = layers.Dense(128,activation='relu')(repeated)
# Second bidirectional LSTM over the repeated vector, followed by a softmax
# over the target vocabulary at every output timestep.
decoded_steps = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(projected)
token_probs = layers.Dense(num_decoder_tokens,activation='softmax')(decoded_steps)
print(token_probs.shape)
model = Model(inputs,token_probs)

# inputs = layers.Input(shape=(max_encoder_seq_length,num_encoder_tokens))
# xlstm = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(inputs)
# x = layers.Flatten()(xlstm)
# x = layers.RepeatVector(max_decoder_seq_length)(x)
# a = layers.Dense(128,activation='relu')(x)
# b = layers.Dense(128,activation='softmax')(x)
# x = layers.Multiply()([a,b])
# x = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(x)
# x = layers.Dense(num_decoder_tokens,activation='softmax')(x)
# print(x.shape)
# model = Model(inputs,x)

(None, 8, 36)


In [5]:
# Train with the Adam optimizer and categorical cross-entropy against the
# one-hot targets; accuracy is reported as an additional metric.
model.compile(optimizer='adam',loss=losses.categorical_crossentropy,metrics=['accuracy'])

In [6]:
# Fit for 20 epochs with batches of 100 samples.
# NOTE(review): the training target is decoder_input_data, whose first
# timestep is "<start>", not the shifted decoder_target_data built above
# (which is otherwise unused). The decoding loops below drop "<start>" from
# the predictions, so this may be deliberate -- confirm.
model.fit(encoder_input_data,decoder_input_data,epochs=20,batch_size=100)

Train on 9648 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff048023af0>

In [7]:
# Persist the trained model to "model.h5" in the current working directory.
model.save("model.h5")

In [8]:
import random

# Spot-check the trained model on 20 randomly chosen training samples.
for _ in range(0,20):
    # randrange excludes its upper bound. random.randint(0, n) includes n and
    # could therefore index one past the end of input_texts.
    i = random.randrange(len(input_texts))
    print('Text: ',input_texts[i])
    # Slice rather than index so the sample keeps its leading batch axis.
    prediction = model.predict(encoder_input_data[i:i+1])

    # Take the most probable token at every timestep and drop the
    # "<start>" / "<end>" markers from the printed result.
    ans = ""
    for p in prediction[0]:
        out = reverse_target_char_index[np.argmax(p)]
        if out not in ("<end>", "<start>"):
            ans += str(out)+" "
    print("output: ",ans)

Text:  <unk> <unk> assign <unk> <unk> <unk> with number two 
output:  assign 6 <exp>2 
Text:  <unk> endif <unk> <unk> <unk> 
output:  endif 
Text:  <unk> <unk> set <unk> to multiply <unk> to <unk> <unk> <unk> <unk> 
output:  mul 7 9 equ 4 
Text:  <unk> <unk> set <unk> to div <unk> by <unk> <unk> <unk> <unk> 
output:  div 7 9 equ 4 
Text:  <unk> <unk> set <unk> to subtraction of <unk> and <unk> 
output:  sub 8 10 equ 4 
Text:  subtraction of <unk> by <unk> 
output:  sub 3 5 
Text:  initialize <unk> with value seven 
output:  assign 2 <exp>7 
Text:  <unk> <unk> <unk> multiply <unk> <unk> <unk> <unk> to <unk> 
output:  mul 8 10 
Text:  assign <unk> <unk> <unk> <unk> with value one <unk> <unk> 
output:  assign 5 <exp>1 
Text:  division of <unk> <unk> <unk> to <unk> <unk> <unk> 
output:  div 5 7 
Text:  divide <unk> and <unk> <unk> <unk> 
output:  div 2 4 
Text:  if <unk> less than <unk> <unk> <unk> <unk> 
output:  if less 2 5 do 
Text:  <unk> <unk> let <unk> <unk> <unk> <unk> be two 
outpu

In [None]:
# Interactive session: read a sentence, run the model on it and print the
# predicted token sequence. Numeric output tokens are 1-based positions into
# the typed sentence and are replaced by the word at that position.
# A blank line, end-of-input or an interrupt ends the session.
while True:
    try:
        s = input("> ")
    except (EOFError, KeyboardInterrupt):
        break
    words = s.split()
    if not words:
        # Nothing to translate; a blank line ends the session.
        break
    # Only the first max_encoder_seq_length words fit into the encoder tensor.
    e = encode_input(" ".join(words[:max_encoder_seq_length]))
    prediction = model.predict(e)
    ans = ""
    for p in prediction[0]:
        out = reverse_target_char_index[np.argmax(p)]
        if out in ("<end>", "<start>"):
            continue
        if out.isdecimal() and 1 <= int(out) <= len(words):
            ans += words[int(out) - 1]+" "
        else:
            # Non-positional token, or a predicted position that does not
            # exist in this sentence (this used to raise IndexError): print
            # the raw token instead.
            ans += str(out)+" "
    print("output: ",ans)

> add add a and b
output:  add add a b 
> add a and b add
output:  add add b b 
> else if a greater than b
output:  else if greater a b do 
> else
output:  else 
