In [1]:
import os
import numpy as np
from keras.models import Model
from keras.layers import GRU, Dense, Bidirectional, Embedding, Input
import jieba

Using TensorFlow backend.


In [2]:
# Number of tokens in every training window.
maxlen = 30
# Extracted training windows, and for each window the word that follows it
# (used as the prediction target).
sentences, next_word = [], []

In [3]:
# Read the whole corpus through a context manager so the file handle is closed
# as soon as the text is loaded, instead of being left open for the garbage
# collector to clean up.
with open('../input/exampleText_middle', 'r', encoding='utf8') as corpus_file:
    lines = corpus_file.read()
# Segment the raw text into a flat list of word tokens with jieba
# (cut_all=False).
all_words = list(jieba.cut(lines, cut_all=False))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.069 seconds.
Prefix dict has been built succesfully.


In [4]:
# Vocabulary: every distinct token, sorted so that index assignment is
# deterministic for a given corpus.
words = sorted(set(all_words))
# Reverse lookup from a token to its position in `words`.
words_indices = {word: i for i, word in enumerate(words)}

In [5]:
# Slide a window of `maxlen` tokens over the corpus, one token at a time.
# Both lists are rebuilt from scratch here, so re-running this cell does not
# append duplicate samples to lists left over from a previous execution.
window_starts = range(0, len(all_words) - maxlen)
sentences = [all_words[start:start + maxlen] for start in window_starts]
# The target for each window is the token that immediately follows it.
next_word = [all_words[start + maxlen] for start in window_starts]

In [6]:
# Dataset dimensions, reused below to allocate the arrays and size the model.
num_samples, num_words = len(sentences), len(words)
for template, count in (
    ('extracted sentence %d', num_samples),
    ('total unique words %d', num_words),
):
    print(template % count)

extracted sentence 235805
total unique words 16412


In [7]:
# input, shape = [size, maxlen]
# Model input: vocabulary index of every token in every window,
# shape = [num_samples, maxlen].
x = np.zeros((num_samples, maxlen), dtype=np.float32)
# Model target: vocabulary index of the word following each window,
# shape = [num_samples].
y = np.zeros(num_samples, dtype=np.float32)
for row, context in enumerate(sentences):
    # Encode the whole window at once and write it into its row.
    encoded = [words_indices[token] for token in context]
    x[row, :len(encoded)] = encoded
    y[row] = words_indices[next_word[row]]

In [8]:
# Network input: one window of `maxlen` vocabulary indices per sample.
Inp = Input(shape=(maxlen,), dtype='float32')

# Layers listed in the order they are applied: embedding, two stacked
# bidirectional GRUs (the first returns the full sequence so the second can
# consume it), then a softmax over the whole vocabulary.
layer_stack = [
    Embedding(num_words, 128, input_length=maxlen),
    Bidirectional(GRU(256, return_sequences=True)),
    Bidirectional(GRU(128)),
    Dense(num_words, activation='softmax'),
]
m = Inp
for layer in layer_stack:
    m = layer(m)
model = Model(Inp, m)

# Sparse cross-entropy lets the targets stay as integer class indices rather
# than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 30, 128)           2100736   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 30, 512)           591360    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               492288    
_________________________________________________________________
dense_1 (Dense)              (None, 16412)             4217884   
Total params: 7,402,268
Trainable params: 7,402,268
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train on every extracted window; verbose=2 logs one line per epoch.
model.fit(
    x=x,
    y=y,
    batch_size=1024,
    epochs=100,
    verbose=2,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 48s - loss: 5.6803
Epoch 2/100
 - 43s - loss: 5.0608
Epoch 3/100
 - 43s - loss: 4.8524
Epoch 4/100
 - 43s - loss: 4.6974
Epoch 5/100
 - 43s - loss: 4.5794
Epoch 6/100
 - 43s - loss: 4.4764
Epoch 7/100
 - 43s - loss: 4.3826
Epoch 8/100
 - 43s - loss: 4.2934
Epoch 9/100
 - 43s - loss: 4.2083
Epoch 10/100
 - 43s - loss: 4.1291
Epoch 11/100
 - 43s - loss: 4.0486
Epoch 12/100
 - 44s - loss: 3.9725
Epoch 13/100
 - 44s - loss: 3.8981
Epoch 14/100
 - 44s - loss: 3.8283
Epoch 15/100
 - 43s - loss: 3.7584
Epoch 16/100
 - 43s - loss: 3.6882
Epoch 17/100
 - 44s - loss: 3.6194
Epoch 18/100
 - 44s - loss: 3.5510
Epoch 19/100
 - 44s - loss: 3.4827
Epoch 20/100
 - 44s - loss: 3.4166
Epoch 21/100
 - 43s - loss: 3.3528
Epoch 22/100
 - 43s - loss: 3.2892
Epoch 23/100
 - 43s - loss: 3.2268
Epoch 24/100
 - 43s - loss: 3.1680
Epoch 25/100
 - 43s - loss: 3.1099
Epoch 26/100
 - 43s - loss: 3.0535
Epoch 27/100
 - 43s - loss: 2.9995
Epoch 28/100
 - 

<keras.callbacks.History at 0x7f2a11551e48>

In [10]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result.

    Args:
        preds: Array-like of probabilities (for example one softmax output row).
        temperature: Scaling factor applied to the log-probabilities. Values
            below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')
    # Zero probabilities legitimately map to -inf (and back to 0 after exp),
    # so silence the divide-by-zero warning np.log would otherwise emit.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift by the maximum before exponentiating. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # below becomes 0/0 = NaN. The shift cancels out in the ratio.
    exp_preds = np.exp(scaled - np.max(scaled))
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial trial yields a one-hot row; argmax recovers the index.
    probabs = np.random.multinomial(1, preds, 1)
    return np.argmax(probabs)

In [11]:
def write(model, temperature, word_num, begin_sentence):
    """Generate ``word_num`` words that continue ``begin_sentence``.

    Args:
        model: Object whose ``predict(array, verbose=0)`` returns, for each
            input row, a score vector indexed like ``words``.
        temperature: ``None`` to always pick the highest-scoring word;
            otherwise the temperature forwarded to ``sample``.
        word_num: Number of words to generate.
        begin_sentence: List of seed tokens. Only the first ``maxlen`` are
            used, and the list itself is not modified.

    Returns:
        A single string: the seed tokens, the marker ``'/// '``, then the
        generated words, all concatenated.

    Raises:
        KeyError: If a seed token is missing from ``words_indices``.
    """
    # Slicing copies, so neither list below aliases the caller's argument.
    window = begin_sentence[:maxlen]  # tokens currently fed to the model
    result = begin_sentence[:maxlen]
    result.append('/// ')

    for _ in range(word_num):
        # Positions not covered by `window` keep index 0.
        sampled = np.zeros((1, maxlen))
        for i, word in enumerate(window):
            sampled[0, i] = words_indices[word]

        preds = model.predict(sampled, verbose=0)[0]
        if temperature is None:
            next_index = np.argmax(preds)
        else:
            next_index = sample(preds, temperature)
        # Named `predicted` to avoid shadowing the module-level `next_word`.
        predicted = words[next_index]

        window.append(predicted)
        # Drop the oldest token only once the window exceeds `maxlen`. A seed
        # shorter than `maxlen` therefore grows to a full window instead of
        # staying at its initial length forever.
        if len(window) > maxlen:
            window = window[1:]
        result.append(predicted)
    return ''.join(result)

In [12]:
# Smoke test: seed the generator with a passage sliced from the corpus itself.
begin_sentence = list(jieba.cut(lines[50003:50100], cut_all=False))
print('start sentence: ', ''.join(begin_sentence[:maxlen]))

# First run without a temperature (highest-scoring word at every step), then
# sample at several temperatures.
runs = [('no temperature', None)]
runs.extend(('temperature %f' % temp, temp) for temp in [0.5, 1.0, 1.5])
for label, temp in runs:
    print(label)
    print(write(model, temp, 200, begin_sentence))

start sentence:  
    “可缩写是RK呢。”江利子看着绣在上面的字母，“唐泽雪穗（KarasawaYukiho）不应该是
no temperature

    “可缩写是RK呢。”江利子看着绣在上面的字母，“唐泽雪穗（KarasawaYukiho）不应该是/// 靠，这就是”。
    “你一定很好吧？”
    “还不能发生了话，我就等了。”
    “是吗？”
    “还没，听说是我去了。”
    “哦……”
    友彦虽然在想里高宫玄关的高宫要，那种大概和在公寓前的状况能让他感到意外，在在公寓这高宫与高宫在那里等什么时候，就会把卡带给他去的那个这么并没有你的。他想，这个便没有绘里来给他的可能。”
    “啊？”
    友彦没有和话在心里兴趣得了，这很我这种电话，她把脸什么状况要到大阪帮忙。她好像还记得她这句话，也以为她被别人结婚。
temperature 0.500000

    “可缩写是RK呢。”江利子看着绣在上面的字母，“唐泽雪穗（KarasawaYukiho）不应该是/// 靠，这就是”。
    “你一定很好吧？”
    “以前，我因为因为唐泽雪穗不是的。这些我他打她，才想到你，有什么不想
    虽然他们不想想到她，那天她还是带着答应的话，也让江利子会看看自己决定。她首先告诉我的现在，桐原背后得受到她在那里。
    雪穗的母亲突然在她几乎没有回家，但然而，她并没有受到我们的消息。跑和弘惠在这里怎么也不认为，自己认为她便要不到她现在的父亲。
    这不清楚，她还是电话。
    过着人，诚便想起了”出来的怎么很像有点你的关系，你是以什么了。果然，这一点都是，她母亲将觉得觉得奇怪想。再说，我想
temperature 1.000000

    “可缩写是RK呢。”江利子看着绣在上面的字母，“唐泽雪穗（KarasawaYukiho）不应该是/// 联系方式，我要轻易一台打消在说法才意思，会应是学长 《小自知立刻确认三泽千都留的情况感觉。计算机的时候充满的背影她们必须再说，怎么回事，这急忙关于指示似乎到了这一次，她和他并不生气小小的笑。
    友彦生意共生的人，他突然“原因”。
    “是。”一成从过他没我兴致勃勃偷窃了中道打开那面前，那个圣诞节总是晚上上对他公布程序学长住证件，也无法送将筱冢终于管理员去过了。