In [1]:
## 分類をインデックス化してDataFrameに追加する共通関数(秋山さん作成分)
def classify_and_add_indexed_df(df, org_key, new_key):
    class_sets = set()
    for class_ in df[org_key]:
        class_sets.add(class_)

    sorted_classes = sorted(list(class_sets))
    class_indices = dict((c, i) for i, c in enumerate(sorted_classes)) 
#    print(class_indices)

    classes = []
    for i, class_ in enumerate(reviews[org_key]):
        classes.append(class_indices[class_])
    indexed_df[new_key] = classes

In [2]:
# 単語分割(See. http://www.denzow.me/entry/2017/10/29/160903)
from janome.tokenizer import Tokenizer

def split_into_words(doc):
    """
    名詞だけを取り出してリストで戻す関数
    """
    try:
        t = Tokenizer(mmap=True)
        word_list = []
        # 形態素して取り出す
        for token in t.tokenize(doc):
            # 品詞の判定をして、名詞か動詞か形容詞だけを取り出す
            if (token.part_of_speech.split(",")[0] in ("名詞","動詞","形容詞")
                and  token.part_of_speech.split(",")[1] != "数"):  # ただし、数詞は使っても意味が薄いので捨てる
                # 表層形を登録する
                word_list.append(token)
        return word_list
    except Exception as ex:
        print(ex)
        return []

In [3]:
## 辞書作成
def make_dict(words, tokens):
    for j in range(0, len(tokens)):
        token = tokens[j]
        is_new_word = True
        for i in range(len(words)):
            if words[i][0] == token.surface and words[i][1] == token.part_of_speech[:2]:
                words[i][2] += 1
                is_new_word = False
                break
        if is_new_word:
            words.append([token.surface, token.part_of_speech[:2], 1])
    return words  

In [4]:
# 辞書ができたので全セリフデータを固定長の数字列に変換します
def trans_words_to_number(words, lines, max_speech_length):
  data_list = []
  for line in lines:
    #tokens = split_into_words(line.replace('<br />', ''))
    tokens = Tokenizer().tokenize(line.replace('<br />', ''))
    record = []
    # 固定長より長いセリフは打ち切り
    # 固定長より短いセリフは0埋め
    n = min(len(tokens), max_speech_length) #通常はlen(tokens) > max_speech_lengthを想定しているが、実装仮組中の少ないデータ時用
    for j in range(0, n):
      if (tokens[j].part_of_speech.split(",")[0] in ("名詞","動詞","形容詞")
       and  tokens[j].part_of_speech.split(",")[1] != "数"):  # ただし、数詞は使っても意味が薄いので捨てる
        dic_temp = dic[(dic['words'] == tokens[j].surface) 
                     & (dic['parts'] == tokens[j].part_of_speech[:2])] #
        record.append(dic_temp.index[0] + 1)
    if (len(record) < max_speech_length):
      for j in range(max_speech_length - len(record)):
        record.append(0)
    record.append(line) # ログ用に元文字列も付与します（数字列だとどのセリフかわからないので）
    data_list.append(record)
  return data_list

In [10]:
## メイン関数
import sys
import codecs
import random
import numpy as np
import numpy.random
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils

## ファイルの読込み(全件)
#shops = pd.read_csv('shops.txt', delimiter='\t')
reviews = pd.read_csv('reviews.txt', delimiter='\t')

## ファイルの読込み(サンプル)
#shops = pd.read_csv('as.txt', delimiter='\t')
#reviews = pd.read_csv('ar.txt', delimiter='\t')

## ファイルの先頭と末端読み(学習データと検証データ分割用)
#reviewsh = reviews.head(3)
#reviewst = reviews.tail(3)
reviews = reviews.head(5000)
#print(reviewsh)
#print(reviewst)

##学習用データ前作業
### インデックスの作成
indexed_df = pd.DataFrame()
classify_and_add_indexed_df(reviews, '分類', 'category')
classify_and_add_indexed_df(reviews, 'スープ', 'soup')

### 文章の形態素解析
# 一旦メニュー側は評価しない。
#words4menu = [] # 単語文字列、品詞、登場回数のリスト
#for i, doc1 in enumerate(reviews['メニュー']):
#    words1 = split_into_words(doc1)
#    indexed_df.loc[i, 'menu'] = i

### 辞書の作成
# 各レコードのコメントを分解し、品詞と登場回数をカウント
dict4comment = [] # 単語文字列、品詞、登場回数のリスト
for i, doc2 in enumerate(reviews['コメント']):
    dict4comment = make_dict(dict4comment, split_into_words(doc2.replace('<br />', '')))

# 単語情報をデータフレームに変換します
dic = pd.DataFrame(dict4comment)
dic.columns = ['words', 'parts', 'freq']
dic = dic.sort_values(by=['freq'], ascending=False)
dic = dic.reset_index(drop=True)
num_words = dic.shape[0] # 全単語数を確認しておきます

# 辞書ができたので全データを固定長の数字列に変換
max_speech_length = 64
output_vector = 5
data_list = trans_words_to_number(dic, reviews['コメント'], max_speech_length)

data_list_piece = []
for i, line in enumerate(data_list):
  data_list_piece.append(line[0:max_speech_length])

ans_list = []
for line in enumerate(reviews['点数']):
  if line[1] > 80:
    ans_list.append(4)
  elif line[1] > 60:
    ans_list.append(3)
  elif line[1] > 40:
    ans_list.append(2)
  elif line[1] > 20:
    ans_list.append(1)
  else:
    ans_list.append(0)
#ans_list = reviews['点数']

### 学習データとサンプルデータとの分割
train_X, test_X, train_Y, test_Y = train_test_split(data_list_piece, ans_list, train_size=0.8)

### 点数データを配列化
### See. http://may46onez.hatenablog.com/entry/2016/07/14/122047
train_Y = np_utils.to_categorical(train_Y, output_vector)
test_Y = np_utils.to_categorical(test_Y, output_vector)

#print(np.array(train_X))
#print(np.array(train_Y))
#print(np.array(test_X))
#print(np.array(test_Y))

# ランダム関数の初期化
random.seed(123)
numpy.random.seed(123)

## モデルの構築
embedding_vecor_length = 16
model = Sequential()
model.add(Embedding(num_words, embedding_vecor_length, input_length=max_speech_length))
model.add(Conv1D(embedding_vecor_length, 3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))
model.add(Dropout(0.1))
model.add(LSTM(4, dropout_W=0.1, dropout_U=0.1))
model.add(Dropout(0.1))
model.add(Dense(output_vector, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# モデル訓練
model.fit(np.array(train_X), np.array(train_Y), epochs=1, batch_size=1)

# モデル評価
loss_and_metrics = model.evaluate(np.array(test_X), np.array(test_Y), verbose=0)
print("Accuracy (test) : %.2f%%" % (loss_and_metrics[1]*100))

prob = pd.DataFrame(model.predict(np.array(test_X)))
ans = pd.DataFrame(np.array(test_Y))
print(pd.concat([prob, ans], axis=1, keys=['想定確率', '答え']))



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 64, 16)            427216    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 64, 16)            784       
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 32, 16)            0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 32, 16)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 4)                 336       
_________________________________________________________________
dropout_12 (Dropout)         (None, 4)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 25        
Total para

InvalidArgumentError: indices[0,15] = 26701 is not in [0, 26701)
	 [[Node: embedding_6/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_6/embeddings/read, embedding_6/Cast)]]

Caused by op 'embedding_6/Gather', defined at:
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-4f38d4561501>", line 102, in <module>
    model.add(Embedding(num_words, embedding_vecor_length, input_length=max_speech_length))
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/keras/models.py", line 442, in add
    layer(x)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/keras/engine/topology.py", line 602, in __call__
    output = self.call(inputs, **kwargs)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/keras/layers/embeddings.py", line 134, in call
    out = K.gather(self.embeddings, inputs)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 1134, in gather
    return tf.gather(reference, indices)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 2409, in gather
    validate_indices=validate_indices, name=name)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1219, in gather
    validate_indices=validate_indices, name=name)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/centos/.pyenv/versions/3.5.4/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): indices[0,15] = 26701 is not in [0, 26701)
	 [[Node: embedding_6/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_6/embeddings/read, embedding_6/Cast)]]


## 参考URL
### LSTMでどのキャラクターのセリフか判別する
### https://qiita.com/CookieBox26/items/6823346661f600b246eb
### 式を図示化してくれているので分かりやすかった

In [1]:
#print(shops.columns[1])
#print(shops['店名'])
#print(reviews['コメント'])

In [None]:
#validation_dataを追加すると学習率が上がる可能性がある