<a href="https://colab.research.google.com/github/ryo-itagaki/LSTM/blob/main/LSTMmodel_with_ChatGPT(not_perfect).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 必要なライブラリをインストール
!pip install tensorflow janome
import numpy as np
import tensorflow as tf
import json
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from janome.tokenizer import Tokenizer as JanomeTokenizer


# Mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

# Source Q&A dataset and the destination for its de-duplicated copy
json_file_path = '/content/drive/MyDrive/qa_data.json'
output_file_path = '/content/drive/MyDrive/qa_data_no_duplicates.json'

# Load the raw list of Q&A records
with open(json_file_path, 'r', encoding='utf-8') as file:
    qa_data = json.load(file)

# Keep only the first record seen for each (question, answer) pair.
# A dict preserves insertion order, so the surviving records stay in
# their original relative order.
first_occurrence = {}
for item in qa_data:
    first_occurrence.setdefault((item['question'], item['answer']), item)

seen = set(first_occurrence)
unique_data = list(first_occurrence.values())

# Write the de-duplicated records to a new JSON file
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(unique_data, file, ensure_ascii=False, indent=4)

print(f"重複を削除したデータを {output_file_path} に保存しました。")

# Path of the de-duplicated dataset
json_file_path = '/content/drive/MyDrive/qa_data_no_duplicates.json'

# Load the dataset into `data`.
# Every failure is reported and then re-raised: all later steps need `data`,
# so carrying on after an error would only resurface as an unrelated
# NameError further down the script.
try:
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print(f"ファイルが見つかりません: {json_file_path}")
    raise
except json.JSONDecodeError as e:
    print(f"JSONデコードエラー: {e}")
    raise
except Exception as e:
    print(f"予期しないエラーが発生しました: {e}")
    raise
else:
    # Only reached when the file was opened and parsed successfully
    print("データが正常に読み込まれました。")


# Module-level Janome morphological analyzer, created once and shared by every call
janome_tokenizer = JanomeTokenizer()


def tokenize_japanese(text):
    """Segment Japanese text into words separated by single spaces.

    Args:
        text: The string to segment.

    Returns:
        The surface form of each Janome token, in order, joined by spaces.
    """
    surfaces = [morpheme.surface for morpheme in janome_tokenizer.tokenize(text)]
    return ' '.join(surfaces)



# Special markers placed at the start and end of every answer sequence
start_token = "<start>"
end_token = "<end>"

# Segment the dataset into space-separated words.
# The markers are attached AFTER segmentation so they are never passed
# through the morphological analyzer and stay intact as single words.
questions = [tokenize_japanese(d['question']) for d in data]
answers = [f"{start_token} {tokenize_japanese(d['answer'])} {end_token}" for d in data]

# Tokenizer setup.
# Keras' default `filters` remove '<' and '>', which would turn the markers
# into plain "start"/"end" and make later lookups of `start_token` and
# `end_token` in the vocabulary miss. Use the default filter set minus the
# two angle brackets so the markers are kept verbatim.
token_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
tokenizer = KerasTokenizer(oov_token="<unk>", filters=token_filters)
tokenizer.fit_on_texts(questions + answers)
# +1 because word indices start at 1; index 0 is left for padding
vocab_size = len(tokenizer.word_index) + 1

# Sanity check: report how often a known word appears in the corpus
word_counts = tokenizer.word_counts
print(f"'東京' の出現回数: {word_counts.get('東京', 0)}")

# Convert questions and answers to sequences of integer word indices
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

# Pad every sequence (at the end) to the length of the longest one
max_sequence_length = max(len(seq) for seq in question_sequences + answer_sequences)
question_padded = pad_sequences(question_sequences, maxlen=max_sequence_length, padding='post')
answer_padded = pad_sequences(answer_sequences, maxlen=max_sequence_length, padding='post')

# Build the encoder and decoder
embedding_dim = 100  # size of each word embedding vector
latent_dim = 256     # size of the LSTM hidden and cell states

# Encoder: embeds the padded question and keeps only the final LSTM states.
# `input_length` is not passed to Embedding: it is deprecated in current
# Keras, and the sequence length is already fixed by the Input shape.
encoder_inputs = Input(shape=(max_sequence_length,))
enc_emb = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder's final states and emits a distribution
# over the vocabulary at every time step.
# The time dimension is left as None so the same input tensor can be fed
# full-length sequences during training and one token at a time during
# step-by-step inference; a fixed length here is incompatible with the
# (1, 1) arrays used for decoding.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq training model: (question, decoder input) -> per-step word probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Build the decoder targets for teacher forcing.
# The label at step t must be the decoder INPUT at step t+1, i.e. the answer
# sequence shifted left by one position (dropping the leading start marker).
# Copying the sequence unshifted would train the model to echo its own input
# instead of predicting the next word. Unfilled trailing positions stay 0.
answer_labels = np.zeros((len(answer_sequences), max_sequence_length), dtype='int32')
for i, seq in enumerate(answer_sequences):
    shifted = seq[1:]
    answer_labels[i, :len(shifted)] = shifted

# Train the model; labels get a trailing axis of size 1 for the sparse loss
model.fit([question_padded, answer_padded], np.expand_dims(answer_labels, -1), batch_size=2, epochs=100)

# Inference encoder: maps a padded question to the encoder LSTM's final [h, c] states
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference decoder: takes the previous LSTM states as explicit inputs so
# decoding can be driven step by step from outside the model.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the embedding, LSTM and dense layers defined for the training model
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb_layer(decoder_inputs),
    initial_state=decoder_states_inputs,
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs2, *decoder_states2],
)

# Prediction function
def predict_answer(question):
    """Generate an answer to *question* by greedy step-by-step decoding.

    The question is segmented, converted to word indices, padded and run
    through ``encoder_model``. ``decoder_model`` is then called one token at
    a time, always taking the most probable next word, until the end marker
    or the padding index is produced, or ``max_sequence_length`` words have
    been generated.

    Args:
        question: The question text.

    Returns:
        The generated words joined by single spaces, without the end marker.
    """
    vocab = tokenizer.word_index

    def _vocab_form(marker):
        # The tokenizer may have stored a marker without its angle brackets
        # (Keras' default filters strip '<' and '>'); use whichever spelling
        # is actually in the vocabulary, preferring the exact one.
        return marker if marker in vocab else marker.strip('<>')

    end_word = _vocab_form(end_token)

    question_seq = tokenizer.texts_to_sequences([tokenize_japanese(question)])
    encoder_input = pad_sequences(question_seq, maxlen=max_sequence_length, padding='post')
    states_value = encoder_model.predict(encoder_input, verbose=0)

    # Seed the decoder with the start marker (falls back to index 0 if absent)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = vocab.get(_vocab_form(start_token), 0)

    decoded_words = []
    # Cap the output by number of generated WORDS; the limit is a token
    # count, so comparing it against the character length would be wrong.
    while len(decoded_words) < max_sequence_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        # Stop at the end marker, or at index 0, which is the padding value
        # and has no word attached.
        if sampled_word == end_word or sampled_token_index == 0:
            break
        decoded_words.append(sampled_word)

        # Feed the chosen word and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

# Run a sample question through the trained model and print the question and generated answer
input_question = "日本の首都はどこですか？"
predicted_answer = predict_answer(input_question)
print(f"質問: {input_question}")
print(f"回答: {predicted_answer}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
重複を削除したデータを /content/drive/MyDrive/qa_data_no_duplicates.json に保存しました。
データが正常に読み込まれました。
'東京' の出現回数: 170
Epoch 1/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 123ms/step - accuracy: 0.6010 - loss: 2.9032
Epoch 2/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 124ms/step - accuracy: 0.7293 - loss: 1.6613
Epoch 3/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 125ms/step - accuracy: 0.7685 - loss: 1.4183
Epoch 4/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 126ms/step - accuracy: 0.7954 - loss: 1.2633
Epoch 5/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 125ms/step - accuracy: 0.8158 - loss: 1.1310
Epoch 6/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 125ms/step - accuracy: 0.8280 - loss: 1.0502
Epoch 7/100
[1m200/200