In [1]:
import keras
import numpy as np
from keras import layers
import random
import sys

Using TensorFlow backend.


In [2]:
path = 'phr1.txt'
text = open(path, encoding='utf-8').read().lower()
print('Corpus length:', len(text))

Corpus length: 64974


In [3]:
lines = text.splitlines()
phrases, questions = [''], []
for line in lines:
    line = line.strip()
    if line:
        phrases[-1] += ' '+line
        if len(questions)<len(phrases):
            questions.append(line)
    else:
        phrases.append('')
phrases = [ph.strip() for ph in phrases if ph.strip()!='']

print("Phrases:", len(phrases))
print(phrases[:10])
print(questions[:10])

Phrases: 865
['what is ai? artificial intelligence is the branch of engineering and science devoted to constructing machines that think.', 'what is ai? ai is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.', 'are you sentient? sort of.', "are you sentient? by the strictest dictionary definition of the word 'sentience', i may be.", "are you sentient? even though i'm a construct i do have a subjective experience of the universe, as simplistic as it may be.", "are you sapient? in all probability, i am not.  i'm not that sophisticated.", 'are you sapient? do you think i am?', 'are you sapient? how would you feel about me if i told you i was?', 'are you sapient? no.', 'what language are you written in? python.']
['what is ai?', 'what is ai?', 'are you sentient?', 'are you sentient?', 'are you sentient?', 'are you sapient?', 'are you sapient?', 'are you sapient?', 'are you sapient?', 'what language are you writte

In [4]:
end_of_text = '\u0003'
start_of_text = '\u0002'
print(end_of_text)
phrases = [ph+end_of_text for ph in phrases]




In [5]:
maxlen = 60 
step = 1 
sentences = [] 
next_chars = []
padding = start_of_text*maxlen
for ph in phrases[:]:
    for i in range(1, len(ph), step):
        sent = (padding+ph)[i:i+maxlen]
        #print(sent)
        sentences.append(sent[:-1])
        next_chars.append(sent[-1])
print('Number of sequences:', len(sentences))
chars = sorted(list(set(text+end_of_text+start_of_text))) 
print('Unique characters:', len(chars))
char_indices = dict((char, chars.index(char)) for char in chars) 
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences): 
    for t, char in enumerate(sentence): 
        x[i, t, char_indices[char]] = 1 
        y[i, char_indices[next_chars[i]]] = 1
print("Done!")

Number of sequences: 63244
Unique characters: 53
Vectorization...
Done!


In [6]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [7]:
x, y = unison_shuffled_copies(x, y)

In [8]:
for i in range(min(200, len(sentences))):
    print(sentences[i], next_chars[i])

 w
w h
wh a
wha t
what  
what  i
what i s
what is  
what is  a
what is a i
what is ai ?
what is ai?  
what is ai?  a
what is ai? a r
what is ai? ar t
what is ai? art i


what is ai? ai is the field of scien c
what is ai? ai is the field of scienc e
what is ai? ai is the field of science  
what is ai? ai is the field of science  w
what is ai? ai is the field of science w h
what is ai? ai is the field of science wh i
what is ai? ai is the field of science whi c
what is ai? ai is the field of science whic h
what is ai? ai is the field of science which  
what is ai? ai is the field of science which  c
what is ai? ai is the field of science which c o
what is ai? ai is the field of science which co n
what is ai? ai is the field of science which con c
what is ai? ai is the field of science which conc e
what is ai? ai is the field of science which conce r
what is ai? ai is the field of science which concer n
w

In [9]:
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
optimizer = keras.optimizers.RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [11]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [12]:
def generate(seed, model, temperature):
    generated_chars = []
    generated_text = seed
    for i in range(400): 
        sampled = np.zeros((1, maxlen, len(chars))) 
        for t, char in enumerate(generated_text): 
            sampled[0, t, char_indices[char]] = 1. 
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature) 
        next_char = chars[next_index]
        generated_chars.append(next_char)
        if next_char==end_of_text:
            break
        generated_text += next_char 
        generated_text = generated_text[1:] 
    return seed+''.join(generated_chars)

def strip_padding(text):
    return text.replace(start_of_text, '')

In [18]:
for epoch in range(1, 60): 
    print('epoch', epoch) 
    model.fit(x, y, batch_size=128, epochs=1, validation_split=0.2)
    seed = (padding + questions[random.randint(0, len(questions)-1)]+' ')[-maxlen:]
    print('--- Generating with seed: "' + strip_padding(seed) + '"') 
    for temperature in [0.1, 0.2]: 
        print('------ temperature:', temperature) 
        generated_text = generate(seed, model, temperature)
        print(strip_padding(generated_text))

epoch 1
Train on 50595 samples, validate on 12649 samples
Epoch 1/1

KeyboardInterrupt: 

In [17]:
(padding + questions[random.randint(0, len(questions))]+' ')[-max]

'\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02robots laugh '