In [5]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Get the TFDS builder for OpenBookQA, make sure the data has been
# downloaded and prepared, then open the training split.
builder = tfds.builder('openbookqa')
builder.download_and_prepare()
dataset = builder.as_dataset(split='train')

# Walk the split once, collecting each question stem (decoded to a Python
# string) together with the value stored under 'answerKey'.
texts, labels = [], []
for example in dataset:
    stem_tensor = example['question']['stem']
    texts.append(stem_tensor.numpy().decode('utf-8'))
    labels.append(example['answerKey'].numpy())

# Build a vocabulary from the question stems and encode every stem as a
# list of integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# The longest encoded question defines the common length; shorter ones are
# padded at the end ('post') so every row has the same number of entries.
max_sequence_length = max(map(len, sequences))
sequences_padded = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
)

# One-hot encode the labels.
# The number of columns is taken from the dataset's own label definition
# instead of being inferred from the largest label value that happens to be
# present, so the encoding (and the output layer sized from it) keeps one
# column per answer option even if some option is absent from `labels`.
# NOTE(review): assumes 'answerKey' is a TFDS ClassLabel feature, which
# exposes `num_classes` — confirm against the dataset's feature spec.
num_classes = builder.info.features['answerKey'].num_classes
labels_onehot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)

# Define the LSTM classifier: embedding -> single LSTM -> softmax over the
# one-hot label columns.
model = tf.keras.Sequential([
    # The inputs are zero-padded at the end. mask_zero=True marks index 0 as
    # padding so the LSTM skips those timesteps and its final state reflects
    # the last real token instead of a run of padding steps.
    tf.keras.layers.Embedding(
        len(word_index) + 1,  # +1 because index 0 is reserved for padding
        64,
        input_length=max_sequence_length,
        mask_zero=True,
    ),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(len(labels_onehot[0]), activation='softmax')
])


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the LSTM model on the padded sequences and one-hot labels.
model.fit(sequences_padded, labels_onehot, epochs=10, batch_size=32)

# Example: run the trained model on a couple of unseen questions.
new_texts = [
    "What is the largest animal on Earth?",
    "Who is the author of 'Harry Potter'?",
]

# Encode and pad the new questions exactly like the training data.
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_sequences_padded = pad_sequences(
    new_sequences,
    maxlen=max_sequence_length,
    padding='post',
)
predictions = model.predict(new_sequences_padded)

# Report the index of the highest-scoring class for each question.
for text, class_scores in zip(new_texts, predictions):
    predicted_label = np.argmax(class_scores)
    print(f"Question: {text} / Predicted Label: {predicted_label}")


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\sou05094\tensorflow_datasets\openbookqa\0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.00s/ url]
[A
Dl Co

[1mDataset openbookqa downloaded and prepared to C:\Users\sou05094\tensorflow_datasets\openbookqa\0.1.0. Subsequent calls will reuse this data.[0m
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Question: What is the largest animal on Earth? / Predicted Label: 0
Question: Who is the author of 'Harry Potter'? / Predicted Label: 0


In [6]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense
from tensorflow.keras.optimizers import Adam

# Define a deeper LSTM classifier: embedding -> two stacked LSTM layers with
# dropout -> dense ReLU layer -> softmax over the one-hot label columns.
# Relies on `word_index`, `max_sequence_length`, `labels_onehot` and
# `sequences_padded` already existing in the kernel from the earlier cell.
model = tf.keras.Sequential([
    # The inputs are zero-padded at the end. mask_zero=True marks index 0 as
    # padding so the LSTM layers skip those timesteps instead of treating
    # them as real tokens.
    tf.keras.layers.Embedding(
        len(word_index) + 1,  # +1 because index 0 is reserved for padding
        64,
        input_length=max_sequence_length,
        mask_zero=True,
    ),
    # return_sequences=True so the second LSTM receives one vector per timestep.
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(len(labels_onehot[0]), activation='softmax')
])

# Configure the optimizer.
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the samples for validation.
model.fit(sequences_padded, labels_onehot, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Example: run the retrained model on a couple of unseen questions.
new_texts = [
    "What is the largest animal on Earth?",
    "Who is the author of 'Harry Potter'?",
]

# Encode and pad the new questions exactly like the training data.
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_sequences_padded = pad_sequences(
    new_sequences,
    maxlen=max_sequence_length,
    padding='post',
)
predictions = model.predict(new_sequences_padded)

# Pick the highest-scoring class per question in one vectorized call, then
# print one line per question.
predicted_labels = np.argmax(predictions, axis=1)
for text, predicted_label in zip(new_texts, predicted_labels):
    print(f"Question: {text} / Predicted Label: {predicted_label}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Question: What is the largest animal on Earth? / Predicted Label: 0
Question: Who is the author of 'Harry Potter'? / Predicted Label: 0
