In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# OpenBookQA 데이터셋 빌더를 불러옵니다.
builder = tfds.builder('openbookqa')

# 데이터셋을 다운로드하고 준비합니다.
builder.download_and_prepare()
dataset = builder.as_dataset(split='train')

# 데이터셋에서 텍스트와 라벨을 추출합니다.
texts = []
labels = []
for example in dataset:
    texts.append(example['question']['stem'].numpy().decode('utf-8'))
    labels.append(example['answerKey'].numpy())

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
print(texts)
print(labels)

['A plant that needs to expand will be able to have an endless resource in', 'Moles are underground a lot of the time, and since few animals live there, they are unlikely to be', 'To keep a pet salamander healthy what would come in handy?', 'something could get wedged during winter if which of these happens?', 'Which would likely be a chemical reaction', 'Which is likely to be in a zoo exhibit?', '24 hours is equal to one what?', 'What is a renewable resource?', 'If pollution is caused by many things, it is least likely to be caused by', 'As water gets warmer', 'If someone is dying of thirst, they can hydrate by', 'The continents currently experiencing winter will be the ones which are', 'a student leaves a bag of acorns on the playground, which one of these will most likely take it?', "Steve's ears alerted him to something. It was", 'Overpopulation may result', 'A nail can attach to metals after it receives', 'Which of the following areas would most likely contain a sandbar?', 'If a t

In [15]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

# OpenBookQA 데이터셋 빌더를 불러옵니다.
builder = tfds.builder('openbookqa')

# 데이터셋을 다운로드하고 준비합니다.
builder.download_and_prepare()
dataset = builder.as_dataset(split='train')

# 데이터셋에서 텍스트와 라벨을 추출합니다.
texts = []
labels = []
for example in dataset:
    texts.append(example['question']['stem'].numpy().decode('utf-8'))
    labels.append(example['answerKey'].numpy())

# 텍스트 전처리
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 불용어 제거 및 소문자 변환
filtered_texts = [[word.lower() for word in text.split() if word.lower() not in stop_words] for text in texts]

# 토큰화 및 정수 인덱싱
tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_texts)
sequences = tokenizer.texts_to_sequences(filtered_texts)
word_index = tokenizer.word_index

# 패딩
max_sequence_length = max(len(seq) for seq in sequences)
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# 라벨 원-핫 인코딩
labels_onehot = tf.keras.utils.to_categorical(labels)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(sequences_padded, labels_onehot, test_size=0.2, random_state=42)

# 모델 생성 및 학습
embedding_dim = 100
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.5),  # 드롭아웃 추가
    tf.keras.layers.Dense(4, activation='softmax')
])

# 학습률 스케줄링
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

# 모델 학습
batch_size = 32
epochs = 30
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 35, 100)           695500    
                                                                 
 lstm_7 (LSTM)               (None, 64)                42240     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 4)                 260       
                                                                 
Total params: 738000 (2.82 MB)
Trainable params: 738000 (2.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sou05094\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1daa720bf10>

In [21]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

# OpenBookQA 데이터셋 빌더를 불러옵니다.
builder = tfds.builder('openbookqa')

# 데이터셋을 다운로드하고 준비합니다.
builder.download_and_prepare()
dataset = builder.as_dataset(split='train')

# 데이터셋에서 텍스트와 라벨을 추출합니다.
texts = []
labels = []
for example in dataset:
    texts.append(example['question']['stem'].numpy().decode('utf-8'))
    labels.append(example['answerKey'].numpy())

# 텍스트 전처리 (불용어 제거, 소문자 변환, 데이터 증강)
import nltk
from nltk.corpus import stopwords
import random

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

filtered_texts = []
for text in texts:
    words = text.lower().split()
    filtered_words = [word for word in words if word not in stop_words]
    
    # 데이터 증강
    random.shuffle(filtered_words)
    filtered_texts.append(" ".join(filtered_words))

# 토큰화 및 정수 인덱싱
tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_texts)
sequences = tokenizer.texts_to_sequences(filtered_texts)
word_index = tokenizer.word_index

# 패딩
max_sequence_length = max(len(seq) for seq in sequences)
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# 라벨 원-핫 인코딩
labels_onehot = tf.keras.utils.to_categorical(labels)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(sequences_padded, labels_onehot, test_size=0.2, random_state=42)

# 모델 생성 및 학습 (드롭아웃, 배치 정규화, 조기 종료 적용)
embedding_dim = 100
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.5),  # 드롭아웃 추가
    tf.keras.layers.BatchNormalization(),  # 배치 정규화 추가
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(4, activation='softmax')
])

# 조기 종료
# early_stopping = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 모델 학습
batch_size = 32
epochs = 30
# 조기 종료 , callbacks=[early_stopping]
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sou05094\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 35, 100)           540800    
                                                                 
 lstm_13 (LSTM)              (None, 64)                42240     
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 batch_normalization_5 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 4)                 260       
                                                     

<keras.src.callbacks.History at 0x1dacb7ddb50>