In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as tf_keras




In [2]:
# Load the raw corpus. The encoding is pinned to UTF-8 because the text contains
# non-ASCII letters (ä, æ, é, ë); relying on the platform default encoding
# (e.g. cp949 / cp1252 on Windows) would mis-decode them or raise UnicodeDecodeError.
with open("data-files/nietzsche.txt", "rt", encoding="utf-8") as f:
    nietzsche_text = f.read()

In [7]:
# Print the corpus size in characters, then display the first 30 characters as a preview.
print( len(nietzsche_text) )
nietzsche_text[:30]

600893


'PREFACE\n\n\nSUPPOSING that Truth'

In [8]:
# Uppercase -> lowercase: fold case so that e.g. 'A' and 'a' map to the same character.
nietzsche_lower_text = nietzsche_text.lower()
nietzsche_lower_text[:30]

'preface\n\n\nsupposing that truth'

In [11]:
# Inspect the distinct characters that occur anywhere in the lowercased text,
# and how many of them there are.
unique_chars = np.unique(list(nietzsche_lower_text))
print(unique_chars)
print(unique_chars.shape)

['\n' ' ' '!' '"' "'" '(' ')' ',' '-' '.' '0' '1' '2' '3' '4' '5' '6' '7'
 '8' '9' ':' ';' '=' '?' '[' ']' '_' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i'
 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' 'ä'
 'æ' 'é' 'ë']
(57,)


In [18]:
# Build the character vocabulary.

# set() keeps one copy of every character; sorting gives a deterministic order,
# so a character always receives the same index.
sorted_chars = sorted(set(nietzsche_lower_text))
print(sorted_chars)

char_to_idx = {ch: i for i, ch in enumerate(sorted_chars)}  # character -> index
print(char_to_idx)

idx_to_char = dict(enumerate(sorted_chars))                 # index -> character
print(idx_to_char)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']
{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '=': 22, '?': 23, '[': 24, ']': 25, '_': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, 'ä': 53, 'æ': 54, 'é': 55, 'ë': 56}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: '0', 11: '1', 12: '2', 13: '3', 14: '4', 15: '5', 16: '6', 17: '7', 18: '8', 19: '9', 20: ':', 21: ';', 22: '=', 23: '?', 

In [None]:
# What the model learns

# n consecutive characters -> predict the (n+1)-th character
# (input data)               (output data, target)

In [24]:
# Prepare the training data

sequence_length = 50    # number of consecutive characters fed to the model
step = 3                # stride: the window advances 3 characters per sample

sequences = []      # input windows, each `sequence_length` characters long
next_chars = []     # target for each window: the character that follows it

for idx in range(0, len(nietzsche_lower_text) - sequence_length, step):
    sequences.append(nietzsche_lower_text[idx:idx+sequence_length])
    next_chars.append(nietzsche_lower_text[idx+sequence_length])

# One-hot encode inputs and targets.
#   X: (number of samples, sequence_length, vocabulary size)
#   y: (number of samples, vocabulary size)
# dtype=bool stores one byte per cell instead of the eight bytes of the default
# float64, which matters because X has samples * 50 * vocabulary-size cells.
X = np.zeros(shape=(len(sequences), sequence_length, len(sorted_chars)), dtype=bool)
y = np.zeros(shape=(len(sequences), len(sorted_chars)), dtype=bool)

for si, sequence in enumerate(sequences):   # si: index of the input window
    for ci, ch in enumerate(sequence):      # ci: position of the character inside the window
        X[si, ci, char_to_idx[ch]] = 1
    # The target is set once per window, so it lives outside the character loop.
    y[si, char_to_idx[next_chars[si]]] = 1

In [43]:
# Model architecture: one-hot character window -> LSTM -> softmax over the vocabulary.

# The input tensor is called `model_input` (not `input`) so that Python's builtin
# input() is not shadowed for the rest of the notebook.
model_input = tf_keras.layers.Input(shape=(sequence_length, len(sorted_chars))) # (50, 57)
x = tf_keras.layers.LSTM(units=128)(model_input)
output = tf_keras.layers.Dense(units=len(sorted_chars), activation="softmax")(x)

model = tf_keras.models.Model(model_input, output)

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 50, 57)]          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               95232     
                                                                 
 dense_2 (Dense)             (None, 57)                7353      
                                                                 
Total params: 102585 (400.72 KB)
Trainable params: 102585 (400.72 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
# Training configuration: the targets are one-hot vectors, hence categorical
# cross-entropy; accuracy is tracked as an additional metric.
model.compile(
    optimizer=tf_keras.optimizers.Adam(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [45]:
# Train on the full one-hot data set; `history` keeps the per-epoch loss/metric values.
history = model.fit(
    x=X,
    y=y,
    epochs=50,
    batch_size=128,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [47]:
# Persist the trained model to disk in the native Keras format.
model.save(filepath='models/generation-model.keras')

In [48]:
# Reload the persisted model and bind it to `model`, so the generation cells below
# work from the saved file (e.g. on a fresh kernel without retraining) instead of
# the result of load_model() being discarded.
model = tf_keras.models.load_model('models/generation-model.keras')

<keras.src.engine.functional.Functional at 0x26189a3da90>

In [46]:
def select_character(preds, temperature=1.0):
    """Sample an index from a probability distribution using temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. one row of softmax output).
        temperature: Controls how sharp the sampling distribution is. The smaller
            the value, the less likely low-probability entries are to be picked;
            larger values flatten the distribution. A value <= 0 disables sampling
            and returns the most probable entry (greedy choice).

    Returns:
        The selected index as a NumPy integer (the result of ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero temperature is undefined; treat it as the limiting case,
    # which is a deterministic arg-max.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) legitimately yields -inf for impossible entries; silence the warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift by the maximum before exponentiating so that at least one term is
    # exp(0) == 1. Without this, very small temperatures underflow every term to 0
    # and the normalisation below becomes 0 / 0 (NaN).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)

    # Draw one sample according to the re-weighted probabilities.
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [None]:
# Use a random window of the corpus as the seed for generation.
start_idx = np.random.randint(0, len(nietzsche_lower_text) - sequence_length)
seed_text = nietzsche_lower_text[start_idx:start_idx + sequence_length]
full_text = seed_text

print(seed_text)
print("=" * 50)
for step_no in range(100):
    # One-hot encode the current window: shape (1, 50, 57).
    sample = np.zeros(shape=(1, sequence_length, len(sorted_chars)))
    char_ids = [char_to_idx[c] for c in seed_text]
    sample[0, np.arange(len(char_ids)), char_ids] = 1

    predicted_values = model.predict(sample, verbose=0)
    # Sample the next character (temperature 1) rather than always taking the arg-max.
    selected_char_idx = select_character(predicted_values[0], 1)
    next_char = sorted_chars[selected_char_idx]

    # Append the new character and slide the window to the most recent characters.
    full_text += next_char
    seed_text = full_text[-sequence_length:]

    print(next_char, end="")

 even by this desperation:
once and again a new sa
crificed and so far a man is a man is a man is a man is a man is a man is a man is a man is a man is