In [35]:
from __future__ import print_function

import glob
import io
import os
import random
import re
import sys

import numpy as np
from janome.tokenizer import Tokenizer
from keras.callbacks import LambdaCallback
from keras.layers import Dense,CuDNNLSTM, Dropout, Activation, LSTM
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.optimizers import RMSprop

# Shared Janome tokenizer instance; the corpus-loading cell calls t.tokenize(text) on each file.
t = Tokenizer()

In [79]:
# Load every corpus file, strip Aozora-style markup, tokenize with Janome and
# build the word / part-of-speech lookup tables used by the later cells.
char_indices = {}  # word -> index (rebuilt below)
indices_char = {}  # index -> word (rebuilt below)
texts = []
chars = []   # chars[k]: token surfaces of the k-th file, in order
parts = []   # parts[k]: POS tag lists of the k-th file, parallel to chars[k]
# NOTE(review): absolute local path; consider a configurable data directory.
paths = glob.glob('C:/Users/hata/Udemy/文章生成　自作/*.txt')
for path in paths:
    print('reading ' , path)
    # Use a context manager so the file handle is closed deterministically.
    with open(path, 'rb') as corpus_file:
        binarydata = corpus_file.read()
    text = binarydata.decode('shift_jis')
    # Keep the body only: third segment between the dashed rules, up to the colophon.
    text = re.split(r'\-{5,}',text)[2]
    text = re.split(r'底本：',text)[0]
    # Remove ruby-base markers. The ASCII bar is what the original code removed;
    # the full-width bar is the marker Aozora Bunko texts normally use.
    text = text.replace('|','').replace('｜','')
    text = re.sub(r'《.+?》','',text)  # remove ruby readings
    text = re.sub(r'※','',text)  # remove the reference mark
    text = re.sub(r'sentimentalisme','センチメンタリズム',text)
    text = re.sub(r'\r\n','',text)
    text = re.sub(r'―','',text)
    text = re.sub(r'［＃.+?］','',text)  # remove transcriber annotations
    texts.append(text)
    print('corpus length:', len(text))

    word = []
    part = []
    for w in t.tokenize(text):
        word.append(w.surface)
        part.append(w.part_of_speech.split(','))
    chars.append(word)
    parts.append(part)

# Unique POS tag lists in first-seen order. A set of tuples gives O(1)
# membership tests (the tag lists themselves are unhashable).
POS = []
seen_pos = set()
for file_parts in parts:
    for tag in file_parts:
        tag_key = tuple(tag)
        if tag_key not in seen_pos:
            seen_pos.add(tag_key)
            POS.append(tag)

print('Part of Speech:',POS)
print('Part of Speech length:', len(POS))
print('texts length:',len(texts))
print('chars length:', len(chars))

# Build the vocabulary: unique words in first-seen order.
global_chars = []
seen_words = set()
for i, file_words in enumerate(chars):
    print('chars['+str(i)+']:', len(file_words))
    for word in file_words:
        if word not in seen_words:  # not registered yet
            seen_words.add(word)
            global_chars.append(word)

char_indices = {c: i for i, c in enumerate(global_chars)}
indices_char = {i: c for i, c in enumerate(global_chars)}

# POS tag lists are keyed by their str() form because lists are unhashable.
POS_indices = {str(c): i for i, c in enumerate(POS)}
indices_POS = {i: str(c) for i, c in enumerate(POS)}

print('global_chars length', len(global_chars))

reading  .\ai.txt
corpus length: 1690
reading  .\kazeno_matasaburo.txt
corpus length: 30283
reading  .\moen.txt
corpus length: 139
reading  .\mokubawa_mawaru.txt
corpus length: 9828
reading  .\nakanaka_shinanu_aitsu.txt
corpus length: 9219
reading  .\naoko.txt
corpus length: 88623
reading  .\ningen_eve.txt
corpus length: 1781
reading  .\ningen_shikkaku.txt
corpus length: 73031
reading  .\rashomon.txt
corpus length: 5695
reading  .\shitsurakuen.txt
corpus length: 14311
texts length 10
chars length 10
chars[0]: 1089
chars[1]: 19049
chars[2]: 94
chars[3]: 6643
chars[4]: 6032
chars[5]: 57919
chars[6]: 1091
chars[7]: 46381
chars[8]: 3882
chars[9]: 9850
global_chars length 11320


In [32]:
# Free the raw text objects that are no longer needed. pop() instead of `del`
# keeps the cell re-runnable (a second `del text` would raise NameError).
for _name in ('text', 'texts', 'paths'):
    globals().pop(_name, None)


# Build the training data: one (x, y) pair of arrays per corpus file.
maxlen = 8
step = 4
X = []
Y = []
sentences = []
next_chars = []
pos = []
next_pos = []
feature_dim = len(global_chars) + len(POS)  # word one-hot followed by POS one-hot
for j in range(len(chars)):
    # Start from empty windows for every file; otherwise each X[j] would also
    # contain all windows of the files processed before it.
    sentences = []
    next_chars = []
    pos = []
    next_pos = []
    for i in range(0, len(chars[j]) - maxlen, step):
        sentences.append(chars[j][i: i + maxlen])
        next_chars.append(chars[j][i + maxlen])
        pos.append(parts[j][i: i + maxlen])
        next_pos.append(parts[j][i + maxlen])
    print('nb sequences:', len(sentences))
    print('Vectorization...')
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    x = np.zeros((len(sentences), maxlen, feature_dim), dtype=bool)
    y = np.zeros((len(sentences), feature_dim), dtype=bool)
    for i, sentence in enumerate(sentences):
        # `k` (not `t`) so the global tokenizer `t` is not overwritten.
        for k, char in enumerate(sentence):
            x[i, k, char_indices[char]] = 1
            x[i, k, len(global_chars) + POS_indices[str(pos[i][k])]] = 1
        # Target = the word following the window and that word's POS tag.
        y[i, char_indices[next_chars[i]]] = 1
        y[i, len(global_chars) + POS_indices[str(next_pos[i])]] = 1
    X.append(x)
    Y.append(y)

nb sequences: 1081
Vectorization...
nb sequences: 19041
Vectorization...
nb sequences: 86
Vectorization...
nb sequences: 6635
Vectorization...
nb sequences: 6024
Vectorization...
nb sequences: 57911
Vectorization...
nb sequences: 1083
Vectorization...
nb sequences: 46373
Vectorization...
nb sequences: 3874
Vectorization...
nb sequences: 9842
Vectorization...


In [70]:
# build the model: a single LSTM
def loss(y, pred):
    """Return the squared, 10x-scaled absolute value of the summed residual.

    Computes ``(|sum(pred - y)| * 10) ** 2`` using NumPy's ``sum``.

    Args:
        y: target values.
        pred: predicted values, broadcastable against ``y``.

    Returns:
        A scalar as described above.
    """
    total_residual = np.sum(pred - y)
    scaled_magnitude = abs(total_residual) * 10
    return scaled_magnitude ** 2

# Build the network: one recurrent layer over windows of `maxlen` steps, each
# step being a word one-hot concatenated with a POS one-hot.
print('Build model...')
# NOTE(review): learning_rate is not used below; the optimizer is passed to
# compile() as the string 'Adam', so this value is never applied.
learning_rate = 0.01
model = Sequential()

model.add(CuDNNLSTM(256, input_shape=(maxlen, len(global_chars)+len(POS))))
model.add(BatchNormalization())
model.add(Dropout(0.15))

model.add(Dense(512, activation='relu'))

# One softmax over the concatenated word + POS positions.
model.add(Dense((len(global_chars)+len(POS)), activation='softmax'))

# summary() prints the table itself; wrapping it in print() also printed "None".
model.summary()

optimizer = 'Adam'
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 512)               24233984  
_________________________________________________________________
batch_normalization_8 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_9 (Dense)              (None, 11320)             5807160   
Total params: 30,305,848
Trainable params: 30,304,824
Non-trainable params: 1,024
_________________________________________________________________
None


In [83]:
def sample(preds, temperature=1.0, vocab_size=None):
    """Sample one word index from the word part of a prediction vector.

    Only the first ``vocab_size`` entries of ``preds`` are considered; they are
    re-weighted by ``temperature`` and renormalised before one index is drawn.

    Args:
        preds: 1-D sequence of non-negative scores; word scores come first.
        temperature: positive float. Values below 1 sharpen the distribution,
            values above 1 flatten it.
        vocab_size: number of leading entries to sample from. Defaults to
            ``len(global_chars)``.

    Returns:
        The sampled index (result of ``np.argmax`` on the multinomial draw).

    Raises:
        ValueError: if ``temperature`` is not positive, or no considered entry
            has a positive score.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')
    if vocab_size is None:
        vocab_size = len(global_chars)
    preds = np.asarray(preds[:vocab_size]).astype('float64')
    if not np.any(preds > 0):
        raise ValueError('preds has no positive entry to sample from')
    # log(0) = -inf is intended (probability stays exactly 0); silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum so exp() cannot overflow; the ratios are unchanged.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

# Train on each per-file dataset in turn and print a short generated sample
# after every fit call.
pos_offset = len(global_chars)             # POS one-hot starts after the word one-hot
feature_dim = len(global_chars) + len(POS)

for epochs in range(100):
    for number_of_dataset in range(len(X)):
        for iteration in range(1,3):
            print('\n')
            print('-' *50)
            print('epochs: ', epochs+1)
            print('number_of_dataset: ', number_of_dataset)
            print('繰り返し回数: ', iteration)
            model.fit(X[number_of_dataset], Y[number_of_dataset], batch_size=2048, shuffle = True, epochs=1)

            # Fixed seed window; use random.randint(0, len(chars[number_of_dataset])-maxlen-1)
            # for a random one.
            start_index = 0

            for diversity in [1.2]:
                print()
                print('-----diveristy', diversity)

                generated = ''
                sentence_chars = chars[number_of_dataset][start_index: start_index + maxlen]
                sentence_parts = parts[number_of_dataset][start_index: start_index + maxlen]
                sentence = ''.join(sentence_chars)
                generated += sentence
                print('----- Seedを生成しました: "' + sentence + '"')
                sys.stdout.write(generated)

                for i in range(10):
                    # A single window is predicted, so the batch dimension is 1
                    # (the original allocated len(sentences) rows here).
                    x_pred = np.zeros((1, maxlen, feature_dim), dtype=bool)
                    # `k` (not `t`) so the global tokenizer `t` is not overwritten.
                    for k, char in enumerate(sentence_chars):
                        x_pred[0, k, char_indices[char]] = 1
                        # POS bits live after the word bits, same layout as training.
                        x_pred[0, k, pos_offset + POS_indices[str(sentence_parts[k])]] = 1

                    preds = model.predict(x_pred, verbose=0)[0]
                    next_index = sample(preds, diversity)
                    next_char = indices_char[next_index]
                    # Most likely POS according to the POS slice of the output.
                    # indices_POS values are already the str() keys used by
                    # POS_indices, so str(next_part) looks up correctly.
                    next_part = indices_POS[int(np.argmax(preds[pos_offset:]))]
                    generated += next_char

                    # Slide words and POS tags together so they stay aligned.
                    sentence_chars = sentence_chars[1:] + [next_char]
                    sentence_parts = sentence_parts[1:] + [next_part]

                    sys.stdout.write(next_char)
                    sys.stdout.flush()

--------------------------------------------------
繰り返し回数:  1
Epoch 1/1

-----diveristy 0.2
----- Seedを生成しました: "たり、せわしないことです。"
たり、せわしないことです。ますますますますますますますますますます

  This is separate from the ipykernel package so we can avoid doing imports until


ますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますます
-----diveristy 0.5
----- Seedを生成しました: "たり、せわしないことです。"
たり、せわしないことです。ますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますます
-----diveristy 1.0
----- Seedを生成しました: "たり、せわしないことです。"
たり、せわしないことです。ますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますます
-----diveristy 1.2
----- Seedを生成しました: "たり、せわしないことです。"
たり、せわしないことです。ますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますますます----------------

KeyboardInterrupt: 

In [6]:
# Scratch check: drop the first element of a list and append a new one.
a = ['a', 'b', 'c'][1:]
a += ['de']
print(a)

['b', 'c', 'de']


In [None]:
# Word-level LSTM experiment on a single text.
# NOTE(review): `s` is not defined in any visible cell; it must hold the raw
# text before this cell runs.
text = s
# Word-segment the text. list() because the windows below are built by slicing
# (newer Janome versions appear to return a generator here — TODO confirm).
text = list(Tokenizer().tokenize(text, wakati=True))
matasaburo = text

# Register unseen words in both lookup tables. Existing indices were assigned
# with enumerate(), so the next free index is the current table size.
count = len(char_indices)
for word in text:
    if word not in char_indices:  # not registered yet
        char_indices[word] = count
        indices_char[count] = word  # keep the reverse lookup in sync
        count += 1
        print(count, word)  # show the word that was just registered

# `chars` is the vocabulary ordered by index, so len(chars) is the one-hot
# width (the original used the token count of `text` instead).
chars = sorted(char_indices, key=char_indices.get)

# cut the text in semi-redundant sequences of maxlen words
maxlen = 8
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# Builtin bool: the np.bool alias was removed in NumPy 1.24.
x0 = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y0 = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    # `k` (not `t`) so the global tokenizer `t` is not overwritten.
    for k, char in enumerate(sentence):
        x0[i, k, char_indices[char]] = 1
    y0[i, char_indices[next_chars[i]]] = 1

# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(256, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature re-weighting.

    Args:
        preds: 1-D sequence of non-negative scores.
        temperature: positive float. Values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        The sampled index (result of ``np.argmax`` on the multinomial draw).

    Raises:
        ValueError: if ``temperature`` is not positive, or no entry of
            ``preds`` is positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')
    preds = np.asarray(preds).astype('float64')
    if not np.any(preds > 0):
        raise ValueError('preds has no positive entry to sample from')
    # log(0) = -inf is intended (probability stays exactly 0); silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum so exp() cannot overflow; the ratios are unchanged.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, _):
    """Print text generated by the current model; called at the end of an epoch.

    Picks a random window of ``maxlen`` tokens from ``text`` as the seed, then
    repeatedly predicts, samples and appends 400 further tokens, writing each
    one to stdout as soon as it is produced.

    Args:
        epoch: epoch number, used only in the printed header.
        _: unused second callback argument.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    last_start = len(text) - maxlen - 1
    start_index = random.randint(0, last_start)
    for diversity in [0.2]:  # sampling temperatures to try
        print('----- diversity:', diversity)

        sentence = text[start_index: start_index + maxlen]
        # The window is a list of tokens; join it when a string is needed.
        seed_text = "".join(sentence)
        generated = '' + seed_text
        print(sentence)

        print('----- Generating with seed: "' + seed_text + '"')
        sys.stdout.write(generated)

        for _step in range(400):
            # One-hot encode the current window as a batch of size 1.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for position, token in enumerate(sentence):
                x_pred[0, position, char_indices[token]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            generated += next_char
            # Slide the window: drop the oldest token, append the new one.
            sentence = sentence[1:]
            sentence.append(next_char)

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Print generated text after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train on x0 / y0, the arrays vectorised for this model in this cell. The
# original passed x / y, which are not created in this cell.
model.fit(x0, y0,
          batch_size=128,
          epochs=20,
          callbacks=[print_callback])