In [1]:
import numpy as np
import re
from itertools import dropwhile
import string
import os
os.environ['KERAS_BACKEND'] = 'theano'

In [95]:
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, RepeatVector

In [106]:
#PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODEL_PATH = 'models'

MODEL_STRUCT_FILE = 'piglatin_struct.json'
MODEL_WEIGHTS_FILE = 'piglatin_weights.h5'

DATA_PATH = 'data'
WORDS_FILE = 'words.txt'
BEGIN_SYMBOL = '^'
END_SYMBOL = '$'
BLANK = ' '
CHAR_SET = set(string.ascii_lowercase + BEGIN_SYMBOL + END_SYMBOL)
CHAR_NUM = len(CHAR_SET)
CHAR_TO_INDICES = {c:i for i, c in enumerate(CHAR_SET)}
INDICES_TO_CHAR = {i:c for c, i in CHAR_TO_INDICES.iteritems()}
MAX_INPUT_LEN = 18
MAX_OUTPUT_LEN = 20
NON_ALPHA_PAT = re.compile('[^a-z]')

In [107]:
def is_vowel(char):
    return char in ('a', 'e', 'i', 'o', 'u')


def is_consonant(char):
    return not is_vowel(char)


def pig_latin(word):
    if is_vowel(word[0]):
        return word + 'yay'
    else:
        remain = ''.join(dropwhile(is_consonant, word))
        removed = word[:len(word)-len(remain)]
        return remain + removed + 'ay'


def vectorize(word, seq_len, vec_size):
    vec = np.zeros((seq_len, vec_size), dtype=int)
    for i, ch in enumerate(word):
        vec[i, CHAR_TO_INDICES[ch]] = 1

    for i in range(len(word), seq_len):
        vec[i, CHAR_TO_INDICES[END_SYMBOL]] = 1

    return vec

In [108]:
def build_data():
    words_file = 'words.txt'#os.path.join(PROJECT_ROOT, DATA_PATH, WORDS_FILE)
    words = [
        w.lower().strip() for w in open(words_file, 'r').readlines()
        if w.strip() != '' and not NON_ALPHA_PAT.findall(w.lower().strip())
    ]

    plain_x = []
    plain_y = []
    for w in words:
        plain_x.append(BEGIN_SYMBOL + w)
        plain_y.append(BEGIN_SYMBOL + pig_latin(w))

    # train_x train_y
    train_x = np.zeros((len(words), MAX_INPUT_LEN, CHAR_NUM), dtype=int)
    train_y = np.zeros((len(words), MAX_OUTPUT_LEN, CHAR_NUM), dtype=int)
    for i in range(len(words)):
        train_x[i] = vectorize(plain_x[i], MAX_INPUT_LEN, CHAR_NUM)
        train_y[i] = vectorize(plain_y[i], MAX_OUTPUT_LEN, CHAR_NUM)

    return train_x, train_y

In [109]:
train_x.shape

(207849, 18, 28)

In [110]:
x, y = build_data()
indices = len(x) / 10
test_x = x[:indices]
test_y = y[:indices]
train_x = x[indices:]
train_y = y[indices:]

In [114]:
train_x.shape, CHAR_NUM

((207849, 18, 28), 28)

In [15]:
def build_model(input_size, seq_len, hidden_size):
    """sequence to sequence"""
    model = Sequential()
    model.add(GRU(input_dim=input_size, output_dim=hidden_size, return_sequences=False))
    model.add(Dense(hidden_size, activation="relu"))
    model.add(RepeatVector(seq_len))
    model.add(GRU(hidden_size, return_sequences=True))
    model.add(TimeDistributed(Dense(output_dim=input_size, activation="linear")))
    model.compile(loss="mse", optimizer='adam')

    return model

In [115]:
model = build_model(CHAR_NUM, MAX_OUTPUT_LEN, 128)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
gru_3 (GRU)                      (None, 128)           60288       gru_input_2[0][0]                
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 128)           16512       gru_3[0][0]                      
____________________________________________________________________________________________________
repeatvector_2 (RepeatVector)    (None, 20, 128)       0           dense_3[0][0]                    
____________________________________________________________________________________________________
gru_4 (GRU)                      (None, 20, 128)       98688       repeatvector_2[0][0]             
___________________________________________________________________________________________

In [18]:
model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=128, nb_epoch=20,verbose=1)

Train on 207849 samples, validate on 23094 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0xaa8e47ec>

In [132]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [134]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


In [135]:
loaded_model.compile(loss="mse", optimizer='adam')

In [136]:
say = 'hola'
X = np.zeros((1,MAX_INPUT_LEN, CHAR_NUM), dtype=int)
Word = BEGIN_SYMBOL + say.lower().strip() + END_SYMBOL
X[0] = vectorize(Word, MAX_INPUT_LEN, CHAR_NUM)

In [137]:
pred = loaded_model.predict(X)[0]

In [141]:
print ''.join([
        INDICES_TO_CHAR[i] for i in pred.argmax(axis=1)
        if INDICES_TO_CHAR[i] not in (BEGIN_SYMBOL, END_SYMBOL)])

qqoooooooooooooooooo


In [142]:
for i in X[0].argmax(axis=1):
    if INDICES_TO_CHAR[i] not in (BEGIN_SYMBOL,END_SYMBOL):
        print INDICES_TO_CHAR[i]

h
o
l
a


In [118]:
pig_latin('sex')

'exsay'

In [175]:
def translate_to_pig(say='helo'):
    X = np.zeros((1,MAX_INPUT_LEN, CHAR_NUM), dtype=int)
    Word = BEGIN_SYMBOL + say.lower().strip() + END_SYMBOL
    X[0] = vectorize(Word, MAX_INPUT_LEN, CHAR_NUM)
    pred = model.predict(X)[0]
    print ''.join([
        INDICES_TO_CHAR[i] for i in pred.argmax(axis=0)
        if INDICES_TO_CHAR[i] not in (BEGIN_SYMBOL, END_SYMBOL)])

In [176]:
translate_to_pig('sex')

slscksnmsmsefsh
