In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model,layers,losses
import numpy as np

In [2]:
from datagen import *
dataset = get_dataset()

In [3]:
input_texts = []
target_texts = []
input_characters = set(["<end>","<unk>"])
target_characters = set(["<start>","<end>","1","2","3","4","5","6","7","8","9","10"])
for line in dataset:
    input_text, target_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '<start> ' + target_text + ' <end>'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text.split():
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text.split():
        if char not in target_characters:
            target_characters.add(char)

input_characters = sorted(list(input_characters))
input_characters.remove("<unk>")
input_characters.insert(0,"<unk>")
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt.split()) for txt in input_texts])
max_decoder_seq_length = max([len(txt.split()) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

dicts = (input_token_index,target_token_index,max_encoder_seq_length,num_encoder_tokens)

import pickle
with open("dicts.tuple","wb") as fp:
    pickle.dump(dicts,fp)

reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text.split()):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    encoder_input_data[i, t + 1:, input_token_index['<end>']] = 1.
    for t, char in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, t + 1:, target_token_index['<end>']] = 1.
    decoder_target_data[i, t:, target_token_index['<end>']] = 1.
def encode_input(input_text):  
    encoder_input_data = np.zeros(
        (1, max_encoder_seq_length, num_encoder_tokens),
        dtype='float32')
    for t, char in enumerate(input_text.split()):
        if char in input_token_index.keys():
            encoder_input_data[0, t, input_token_index[char]] = 1.
        else:
            encoder_input_data[0, t, input_token_index["<unk>"]] = 1.
    encoder_input_data[0, t + 1:, input_token_index['<end>']] = 1.
    return encoder_input_data

Number of samples: 17925
Number of unique input tokens: 50
Number of unique output tokens: 38
Max sequence length for inputs: 17
Max sequence length for outputs: 8


In [4]:
# inputs = layers.Input(shape=(max_encoder_seq_length,num_encoder_tokens))
# x = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(inputs)
# x = layers.Flatten()(x)
# x = layers.RepeatVector(max_decoder_seq_length)(x)
# x = layers.Dense(128,activation='relu')(x)
# x = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(x)
# x = layers.Dense(num_decoder_tokens,activation='softmax')(x)
# print(x.shape)
# model = Model(inputs,x)

# inputs = layers.Input(shape=(max_encoder_seq_length,num_encoder_tokens))
# xlstm = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(inputs)
# x = layers.Flatten()(xlstm)
# x = layers.RepeatVector(max_decoder_seq_length)(x)
# a = layers.Dense(256,activation='relu')(x)
# b = layers.Dense(256,activation='softmax')(x)
# x = layers.Multiply()([a,b])
# x = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(x)
# x = layers.Dense(num_decoder_tokens,activation='softmax')(x)
# print(x.shape)
# model = Model(inputs,x)
# model.summary()

inputs = layers.Input(shape=(max_encoder_seq_length,num_encoder_tokens))
x = layers.LSTM(128,return_sequences=True)(inputs)
x = layers.Flatten()(x)
x = layers.RepeatVector(max_decoder_seq_length)(x)
x = layers.Dense(128,activation='sigmoid')(x)
x = layers.LSTM(128,return_sequences=True)(x)
x = layers.Dense(num_decoder_tokens,activation='softmax')(x)
print(x.shape)
model = Model(inputs,x)

# inputs = layers.Input(shape=(max_encoder_seq_length,num_encoder_tokens))
# x,h,c = layers.LSTM(128,return_sequences=True,return_state=True)(inputs)
# x = layers.Flatten()(x)
# x = layers.RepeatVector(max_decoder_seq_length)(x)
# x = layers.Dense(128,activation='relu')(x)
# x = layers.LSTM(128,return_sequences=True)(x,initial_state=[h,c])
# x = layers.Dense(num_decoder_tokens,activation='softmax')(x)
# print(x.shape)
# model = Model(inputs,x)
# model.summary()



(None, 8, 38)


In [5]:
model.compile(optimizer='adam',loss=losses.categorical_crossentropy,metrics=['accuracy'])

In [6]:
model.fit(encoder_input_data,decoder_input_data,epochs=25,batch_size=100)

Train on 17925 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f8fb806f0a0>

In [7]:
model.save("model.h5")

In [8]:
# import random

# for _ in range(0,20):
#     i = random.randint(0,len(dataset))
#     print('Text: ',input_texts[i])
#     prediction = model.predict(encoder_input_data[i:i+1])


#     ans = ""
#     for p in prediction[0]:
#         out = reverse_target_char_index[np.argmax(p)]
#         if out != "<end>" and out != "<start>":
#             ans += str(out)+" "
#     print("output: ",ans)

In [9]:
# while True:
#     s = input("> ")
#     e = encode_input(s)
#     prediction = model.predict(e)
#     ans = ""
#     for p in prediction[0]:
#         out = reverse_target_char_index[np.argmax(p)]
#         if out != "<end>" and out != "<start>":
#             if out.isnumeric():
#                 ans += s.split()[int(out,base=10)-1]+" "
#             else:
#                 ans += str(out)+" "
# #             ans += str(out)+" "
#     print("output: ",ans)

In [10]:
def predict(s):
    e = encode_input(s)
    prediction = model.predict(e)
    ans = ""
    for p in prediction[0]:
        out = reverse_target_char_index[np.argmax(p)]
        if out != "<end>" and out != "<start>":
            if out.isnumeric() and (int(out,base=10)-1) < len(s.split()):
                ans += s.split()[int(out,base=10)-1]+" "
            else:
                ans += str(out)+" "
#             ans += str(out)+" "
    print(s,"\noutput: ",ans,"\n")

predict("add a and b sub")
predict("sub sub a and b")
predict("mul mul a and b")
predict("div div div a and b")

add a and b sub 
output:  sub a sub  

sub sub a and b 
output:  sub a b  

mul mul a and b 
output:  mul a b  

div div div a and b 
output:  div and and 8  

