In [1]:
from itertools import product
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split

from konlpy.tag import Okt
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# 데이터 로드
df = pd.read_excel('../../data/filtered_news.xlsx')
# df = df[:10]
df = df.dropna(subset=['description'])
sentences = df['description'].tolist()

In [3]:
okt = Okt()

# 토큰화 함수
def tokenize(sentences):
    return [' '.join(okt.morphs(sentence)) for sentence in sentences]

tokenized_sentences = tokenize(sentences)

# 토큰화된 문장을 정수 인코딩
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_sentences)
sequences = tokenizer.texts_to_sequences(tokenized_sentences)

In [21]:
max_len = max(len(seq) for seq in sequences)                                # 패딩
X = pad_sequences(sequences, maxlen=max_len)

y = np.roll(X, -1, axis=1)                                                  # 출력 레이블 생성 (여기서는 다음 단어 예측을 위해 문장을 시프트)

y = y[:, -1]                                                                # 다음 단어만 타겟으로 설정

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)    # 학습 데이터와 테스트 데이터 분리

In [25]:
# 모델 정의
def create_model(lstm_units=128, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_len))
    model.add(LSTM(lstm_units, return_sequences=False))  # return_sequences=False로 설정하여 마지막 출력만 사용
    model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


In [26]:
basic_model = create_model()
basic_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)
loss, accuracy = basic_model.evaluate(X_test, y_test, verbose=0)
print(f"Basic model accuracy: {accuracy}")

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 5.5676
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.7500 - loss: 5.5498
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 1.0000 - loss: 5.5299
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 5.5054
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.8750 - loss: 5.4730
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.8750 - loss: 5.4273
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.8750 - loss: 5.3576
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.8750 - loss: 5.2386
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [27]:
# 파라미터 그리드 설정
param_grid = {
    'lstm_units': [16, 32, 64, 128, 256, 512],
    'optimizer': ['adam', 'rmsprop', 'sgd'],
    'batch_size': [4, 8, 16, 32, 64, 128],
    'epochs': [10, 20, 30, 40, 50]
}

# 파라미터 그리드 조합 생성
param_combinations = list(product(param_grid['lstm_units'], param_grid['optimizer'], param_grid['batch_size'], param_grid['epochs']))

# 그리드 서치 구현
best_score = -np.inf
best_params = None

In [28]:
for lstm_units, optimizer, batch_size, epochs in tqdm(param_combinations):
    model = create_model(lstm_units=lstm_units, optimizer=optimizer)                # 모델 생성
    
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)    # 모델 학습
    
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)                      # 모델 평가
    
    if accuracy > best_score:                                                       # 최적의 파라미터 및 스코어 업데이트
        best_score = accuracy
        best_params = (lstm_units, optimizer, batch_size, epochs)

print(f"Best score: {best_score} with params: lstm_units={best_params[0]}, optimizer={best_params[1]}, batch_size={best_params[2]}, epochs={best_params[3]}")

In [31]:
# 최적의 파라미터로 최종 모델 학습
best_lstm_units, best_optimizer, best_batch_size, best_epochs = best_params
final_model = create_model(lstm_units=best_lstm_units, optimizer=best_optimizer)
final_model.fit(X_train, y_train, epochs=best_epochs, batch_size=best_batch_size, verbose=1)

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.0000e+00 - loss: 5.5664
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4167 - loss: 5.5569
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 1.0000 - loss: 5.5452 
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 5.5320
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 5.5171 
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 5.4996
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 5.4790 
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 5.4549
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x2a60953af50>

In [40]:

def generate_text(seed_text, next_words, model, max_len, tokenizer, temperature=1.0):
    for _ in range(next_words):
        tokenized_seed_text = tokenizer.texts_to_sequences([seed_text])[0]
        tokenized_seed_text = np.array(tokenized_seed_text[-max_len:]).reshape(1, -1)
        
        predicted = model.predict(tokenized_seed_text, verbose=0).flatten()
        predicted = np.log(predicted + 1e-8) / temperature
        exp_preds = np.exp(predicted)
        probabilities = exp_preds / np.sum(exp_preds)
        
        predicted_word_index = np.random.choice(len(probabilities), p=probabilities)
        
        if predicted_word_index == 0:
            print("Predicted index is 0, stopping the generation.")
            break
        
        predicted_word = tokenizer.index_word.get(predicted_word_index, None)
        
        if not predicted_word:
            print(f"No word found for index {predicted_word_index}, stopping the generation.")
            break
        
        seed_text += ' ' + predicted_word
    
    return seed_text

In [41]:
# 테스트 예제
seed_text = "증시"
generated_text = generate_text(seed_text, 50, final_model, max_len, tokenizer)
print(generated_text)

증시 다시 ’ 건 이번 의 주요 채 했다 우량 기존 혼합형 국 각각 cnbc 보도국 실업 장 더 꼽히는 했다 덩달아 꼽히는 덩달아 사 신규 다른 선행 원 꼽히는 실적 신규 “ 받고 30 베트남 하락 원화 해소 촉발 엔 뉴욕증시 출시 최근 신한은행 혼합형 상승 상승 커지며 것 재


In [36]:
from keras.models import load_model

def save_model_to_h5(model, file_name):
    """모델을 HDF5 형식으로 저장하는 함수"""
    model.save(file_name)
    print(f"모델이 '{file_name}'로 저장되었습니다.")

# 모델 저장 예시
file_name = 'final_lstm_model.h5'
save_model_to_h5(final_model, file_name)



모델이 'final_lstm_model.h5'로 저장되었습니다.
