In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as tf_keras

In [3]:
# Load the training corpus.
# The encoding is passed explicitly so decoding does not depend on the platform's
# default locale encoding (assumes the file is UTF-8 -- TODO confirm).
with open('data-files/nietzsche.txt', 'rt', encoding='utf-8') as f:
    nietzsche_text = f.read()

In [6]:
# Report the corpus object's type and its total number of characters.
print(f"{type(nietzsche_text)} {len(nietzsche_text)}")
# Show the first 30 characters as the cell result.
nietzsche_text[0:30]

<class 'str'> 600893


'PREFACE\n\n\nSUPPOSING that Truth'

In [7]:
# Normalise the corpus to lower case so upper/lower variants share one vocabulary entry.
nietzsche_lower_text = str.lower(nietzsche_text)
print(nietzsche_lower_text[0:30])

preface


supposing that truth


In [13]:
# Check the size of the character vocabulary in two independent ways.
print(np.unique([*nietzsche_lower_text]).shape)  # via NumPy: shape of the unique-character array
print(len({*nietzsche_lower_text}))              # via a plain Python set

(57,)
57


In [25]:
# Build the vocabulary; here it is a character-level dictionary (not word-level).

# Sorted list of every distinct character in the lower-cased corpus.
sorted_chars = sorted(set(nietzsche_lower_text))

# Map each character to its position in the sorted vocabulary.
char_to_idx = { ch:idx for idx, ch in enumerate(sorted_chars) }

In [41]:
# Build the input/target tensors fed to the recurrent network.

sequence_length = 50 # number of characters in one input window
step = 3 # slide the window 3 characters at a time

sequences = []  # input windows, each `sequence_length` characters long
next_chars = [] # the single character that immediately follows each window

for idx in range(0, len(nietzsche_lower_text) - sequence_length, step):
    sequences.append(nietzsche_lower_text[idx:idx+sequence_length])
    next_chars.append(nietzsche_lower_text[idx+sequence_length])

# One-hot encodings:
#   X : (number of windows, sequence_length, vocabulary size)
#   y : (number of windows, vocabulary size)
# float32 is used instead of NumPy's float64 default to halve the memory footprint.
X = np.zeros(shape=(len(sequences), sequence_length, len(sorted_chars)), dtype=np.float32)
y = np.zeros(shape=(len(sequences), len(sorted_chars)), dtype=np.float32)

for sidx, (sequence, next_char) in enumerate(zip(sequences, next_chars)):
    for cidx, ch in enumerate(sequence):
        X[sidx, cidx, char_to_idx[ch]] = 1
    # The target is the character FOLLOWING the window; using the inner loop's
    # leftover `ch` would label each sample with the window's own last character.
    y[sidx, char_to_idx[next_char]] = 1

In [44]:
# Confirm the tensor shapes, then look at the first one-hot encoded window.
print(f"{X.shape} {y.shape}")
print(X[0, :, :])

(200281, 50, 57) (200281, 57)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# sequences[0] # 50개의 문자로 구성된 1차원 리스트
# np.array(list(sequences[0])).reshape(-1, 1) # 50개의 문자각 각 행으로 구성된 2차원 배열
# onehot = np.zeros(57)
# char_idx = char_to_idx[np.array(list(sequences[0]))[0]]
# onehot[char_idx] = 1
# onehot

In [46]:
# Build the model: one-hot character windows -> LSTM -> softmax over the vocabulary.

# Named `inputs`/`outputs` so the built-in `input()` function is not shadowed.
inputs = tf_keras.layers.Input(shape=(sequence_length, len(char_to_idx))) # (50, 57)
x = tf_keras.layers.LSTM(units=128)(inputs)
# One probability per vocabulary character for the next position.
outputs = tf_keras.layers.Dense(units=len(char_to_idx), activation="softmax")(x)

model = tf_keras.models.Model(inputs, outputs)

In [None]:
# The targets are one-hot vectors, hence categorical cross-entropy as the loss.
model.compile(
    loss="categorical_crossentropy",
    metrics=['accuracy'],
    optimizer="adam",
)

In [52]:
def select_character(preds, temperature=1.0):
    """Sample one index from a probability vector after temperature rescaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, so a smaller temperature makes low-probability entries even
    less likely to be picked, while a larger one flattens the distribution.

    Args:
        preds: Probabilities over the vocabulary. May be 1-D, or a single-row
            2-D array such as the ``(1, vocab)`` output of ``model.predict``;
            it is flattened before sampling.
        temperature: Positive scaling factor applied to the log-probabilities.

    Returns:
        int: Index of the sampled entry.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # np.random.multinomial needs a 1-D probability vector, so flatten batch rows.
    probs = np.asarray(preds).astype('float64').ravel()

    # log(0) = -inf is fine here (it maps back to probability 0); silence the warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(probs) / temperature

    # Shift by the maximum before exponentiating so very small temperatures do
    # not underflow every term to 0 (which would turn the division into 0/0).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)

    # Draw a single one-hot sample according to the rescaled probabilities.
    probas = np.random.multinomial(1, probs, 1)
    return int(np.argmax(probas))

In [51]:
def temperature_test(preds, temperature=1.0):
    """Return the distribution obtained by temperature-rescaling ``preds``.

    The log-probabilities are divided by ``temperature``, exponentiated and
    renormalised. The smaller the temperature, the less likely low-probability
    entries become.
    """
    log_scaled = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(log_scaled)
    total = np.sum(weights)
    return weights / total

# Show how different temperatures reshape the same three-way distribution.
for temperature in (2.0, 0.7, 0.5, 0.1):
    print( temperature_test([0.1, 0.3, 0.6], temperature) )

[0.1929928  0.33427333 0.47273387]
[0.05337501 0.25641283 0.69021216]
[0.02173913 0.19565217 0.7826087 ]
[1.65220366e-08 9.75609740e-04 9.99024374e-01]


In [None]:
# Train one epoch at a time and sample text from the model after every epoch.

# Pick the seed window once so each epoch continues the same prompt.
start_idx = np.random.randint(0, len(nietzsche_lower_text) - sequence_length)

for epoch in range(1, 50):
    print(f'epoch : {epoch}')

    model.fit(X, y, batch_size=128, epochs=1)

    seed_text = nietzsche_lower_text[start_idx: start_idx + sequence_length]
    print(f'seed : "{seed_text}"')

    # Sample with several temperatures to compare conservative vs. adventurous output.
    for temperature in (0.2, 0.5, 1.0, 1.2):
        window = seed_text   # sliding window of the last `sequence_length` characters
        generated_text = ''

        for _ in range(100): # number of characters to generate
            # One-hot encode the current window as a batch of one: [1, 50, 57]
            sample = np.zeros(shape=(1, sequence_length, len(char_to_idx)))
            for cidx, c in enumerate(window):
                sample[0, cidx, char_to_idx[c]] = 1

            # The prediction is a (1, 57) probability array; take its only row.
            predicted_values = model.predict(sample, verbose=0)
            selected_char_idx = select_character(predicted_values[0], temperature)

            # char_to_idx was built from enumerate(sorted_chars), so sorted_chars
            # is the inverse (index -> character) lookup.
            selected_char = sorted_chars[selected_char_idx]
            generated_text += selected_char

            # Slide by exactly one character so the window keeps its length.
            window = window[1:] + selected_char

        print(f'temperature {temperature} : {generated_text}')
    

In [55]:
# Display seed_text; it is assigned only inside the training/generation cell, so that cell must be run first.
seed_text

NameError: name 'seed_text' is not defined