In [8]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

batch_size = 64  # Batch size for training.
epochs = 50  # Number of epochs to train for.
latent_dim = 512  # Latent dimensionality of the encoding space.
num_samples = 68617  # Number of samples
# Path to the data txt file on disk.
#data_path = 'fra.txt'
data_path = 'MorphoLEX_en_shuffle.tsv'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
cnt = 0
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
    #input_text, target_text, _ = line.split('\t')
    input_text, target_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    new_target = ""
    blank = False
    for char in target_text:
      if (char.isalpha()):
        if (blank):
          new_target += ' '
          blank = False
        new_target += char
      elif (new_target != ""):
        blank = True
    target_text = '\t' + new_target + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    if (cnt >= 10000): # skip test data
      for char in input_text:
          if char not in input_characters:
              input_characters.add(char)
      for char in target_text:
          if char not in target_characters:
              target_characters.add(char)
    cnt += 1

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# split test data
from sklearn.model_selection import train_test_split
input_texts, test_input, target_texts, test_target = train_test_split(
    input_texts, target_texts, test_size=5000, random_state=837)

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    #encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t:, target_token_index[' ']] = 1.
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)
# Save model
model.save('s2s.h5')

# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

# example of traning data
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    print('Target sentence:', target_texts[seq_index])    

Number of samples: 63617
Number of unique input tokens: 31
Number of unique output tokens: 29
Max sequence length for inputs: 22
Max sequence length for outputs: 27
Train on 57255 samples, validate on 6362 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
-
Input sentence: frightening
Decoded sentence: fright en

Target sentence: 	fright en

-
Input sentence: prostrations
Decoded sentence: prostrate ion

Target sentence: 	prostrate ion

-
Input se

In [9]:
# test model
test_input_data = np.zeros(
    (len(test_input), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
for i, (input_text, target_text) in enumerate(zip(test_input, test_target)):
    for t, char in enumerate(input_text):
        test_input_data[i, t, input_token_index[char]] = 1.

correct = 0
for seq_index in range(len(test_input)):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = test_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    if (decoded_sentence.strip() == test_target[seq_index].strip()):
      correct += 1
    else:
      print('Incorrect: ', test_input[seq_index], '---', decoded_sentence.strip(), '---', test_target[seq_index].strip())
print(len(test_input), correct, '  Accuracy:', correct/len(test_input)*100.0)

Incorrect:  locals --- loc al --- local
Incorrect:  anabaptists --- anabaptist --- ana bapt ist
Incorrect:  demagnification --- demangic ion --- de magn ify ion
Incorrect:  theatrically --- theatric ly --- theater ic al ly
Incorrect:  alluvial --- alluvia al --- alluvial
Incorrect:  toxins --- toxicn --- toxico in
Incorrect:  butting --- but --- butt
Incorrect:  faience --- faicena --- faience
Incorrect:  conscientiousness --- conscience ious ness --- conscientious ness
Incorrect:  limpets --- limp et --- limpet
Incorrect:  protract --- pro tract ure --- pro tract
Incorrect:  lawns --- law n --- lawn
Incorrect:  resoluteness --- resolutene --- resolute ness
Incorrect:  manlier --- manlier --- man ly er
Incorrect:  chrysalis --- chryslain --- chrysalis
Incorrect:  antipodes --- antipode --- antipodes
Incorrect:  inconsistencies --- im co sist ancy --- im co sist ance y
Incorrect:  absolutism --- absoluti ism --- absolute ism
Incorrect:  papaw --- pap aw --- papaw
Incorrect:  castorbeans

In [0]:
import pickle
import numpy as np

In [11]:
# there are 1045 data in dataset
dialogue = ""
firstLine = None
count = -1
f = open('All_Data_without_sp.txt', 'r')
age = []
sex = []
X = []
y = []
for line in f:
    if (line == "<data>\n"):
        if (count != -1):
            X.append(dialogue)
        dialogue = ""
        count += 1
        firstLine = None
        continue
    if (firstLine == None):
        firstLine = line
        firstSplit = firstLine.split(', ')
        #print(firstSplit)
        age.append(firstSplit[0])
        sex.append(firstSplit[1])
        if (firstSplit[2] == "SLI\n"):
            y.append(1)
        else:
            y.append(0)
        continue
    dialogue = dialogue + " " + line
X.append(dialogue)
print("Data Size:", len(X), len(y))

Data Size: 1045 1045


In [12]:
new_X = []
for sentence in X:
  new_word = None
  for word in sentence.split():
    input_data = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    if (word == "PAUSE1" or word == "PAUSE2" or word == "PAUSE3" or word.count(",") > 0):
      decoded_sequence = word
    else:
      word = word.lower()
      for t, char in enumerate(word):
        try:
          input_data[0, t, input_token_index[char]] = 1.
        except:
          #do nothing
          #print("special_character:", char)
          None
      #print(word, " ==> ", input_data)
      decoded_sequence = decode_sequence(input_data)
    if (new_word == None):
      new_word = decoded_sequence
    else:
      new_word += " " + decoded_sequence
  #print(sentence, " --> ", new_word)
  new_X.append(new_word)
print(new_X[0])

play
 ball
 and
 PAUSE1 he
 take
 swim
 pool
 and
 he
 get
 it
 and
 thank
 you
 and
 welcome
 and
 over
 play
 swim
 and
 go
 in
 it
 and
 run
 and
 walk
 on
 knee
 and
 her
 cry
 PAUSE2 and
 cry
 lot
 and
 sit
 on
 bench
 and
 stay
 here
 and
 stay
 on
 bench
 over
 play
 play
 a
 play
 and
 play
 a
 play
 again
 and
 play
 it
 in
 water
 and
 PAUSE1 it
 angry
 and
 PAUSE1 say
 PAUSE2 bad
 and
 PAUSE1 dad
 dad y
 i
 play
 in
 pool
 and
 PAUSE1 i
 get
 it
 look
 and
 get
 it
 and
 PAUSE1 it
 in
 there
 i
 get
 it
 you
 get
 it
 and
 PAUSE1 love
 it
 he
 play
 in
 sand
 and
 play
 make
 castle
 and
 dump
 it
 dump
 it
 oh
 no
 oops
 sorry
 and
 cry
 uhoh
 they
 go
 picnic
 and
 eat
 and
 they
 drink
 juice
 and
 hungry
 and
 PAUSE1 that
 mann
 sick
 sick
 and
 PAUSE1 a
 bunny
 PAUSE1 his
 PAUSE1 tummy
 hurt
 and
 turn
 that
 and
 PAUSE1 go
 doctor
 house
 doctor
 all
 done
 her
 play
 and
 PAUSE1 hold
 that
 up
 high er
 tie
 it
 up
 and
 up
 high er
 and
 up
 cloud
 angry
 and
 PAUSE1

In [0]:
# write to new file
f2 = open('All_Data_morphemes.txt', 'w')
for i in range(len(new_X)):
  f2.write("<data>")
  f2.write("\n")
  if (y[i] == 1):
    f2.write("SLI")
  else:
    f2.write("TD")
  f2.write("\n")
  f2.write(new_X[i])
  f2.write("\n")
f2.close()