In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

import re
from nltk.corpus import stopwords
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models.keyedvectors import KeyedVectors
!pip install glove_python_binary
from glove import Corpus, Glove




In [2]:
f = open("/content/input.txt")
dataset = f.readlines()
f.close
print(len(dataset))

40000


In [None]:
#간단한 전처리
data = []
def preprocess(text):
  text = re.sub(r"[^A-Za-z ]","", text) # 영어,한글만 포함
  return text
for i in range(40000):
  if preprocess(dataset[i]) not in ["First Citizen","All","Seocond Citizen","MENENIUS","COMINIUS","MARCIUS","AUFIDIUS","Fisrt Soldier","BRUTUS","","SICINIUS","Both","VOLUMNIA","VIRGILIA","VALERIA"]:
    data.append(preprocess(dataset[i].lower()))

##토큰화
단어들을 정수로 인코딩하는 과정

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
counts=tokenizer.word_counts #단어의 발생 빈도
word_index = tokenizer.word_index #단어의 index를 dictionary로 저장 이후 padding과 wordembedding에 사용
tokens=tokenizer.texts_to_sequences(data) #문자를 정수들의 sequence로 반환

In [5]:
#단어 종류의 수
vocab_size=0
for index,word in enumerate(word_index):
  vocab_size=max(vocab_size,index)
print(vocab_size)

12846


In [6]:
#한 문장에 최대 몇개의 단어가 있는가?
max_len =0
count=0
for i in (tokens):
  max_len =max(max_len,len(i))
print(max_len)


16


##Glove을 직접 학습 시켜 embedding 계층에 사용

In [7]:
word_list=[]
for i in range(len(data)):
  word_list.append(data[i].split())

In [8]:
corpus = Corpus() 

# 훈련 데이터로부터 GloVe에서 사용할 동시 등장 행렬 생성
corpus.fit(word_list, window=10)
glove = Glove(no_components=300, learning_rate=0.05)

# 학습에 이용할 쓰레드의 개수는 4로 설정, 에포크는 20.
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [13]:
#위에서 학습한 glove vector를 embedding에 넣기위해서 embedding_matrix에 저장
embedding_matrix=np.zeros((vocab_size,300))
for index, word in enumerate(word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
      embedding_matrix[index-1] = glove.word_vectors[index-1] #해당 위치의 embedding_mxtrix에 저장합니다.

##X,Y dataset 만들기

In [9]:
sequences = pad_sequences(tokens, maxlen = max_len,padding='pre') 
#padding하는 과정 padding이란 input으로 사용하기 위해 길이를 맞춰주는 과정

In [10]:
sequences = np.array(sequences) 
X = sequences[:,:-1]


#Y = sequences[:,1:] #many-to-many용)

#Y = sequences[:,-1] #many-to-one용

In [12]:
print(sequences[0],X[0],Y[0]) 

[  0   0   0   0   0   0   0   0 137  35 961 141 657 124  15 104] [  0   0   0   0   0   0   0   0 137  35 961 141 657 124  15] [  0   0   0   0   0   0   0 137  35 961 141 657 124  15 104]


In [11]:
Y = to_categorical(Y, num_classes=vocab_size+2) 
#Y는 cross_entropy를 위해서 one-hot vector로 수정

##pre_trained glove matrix를 가져와서 embedding 에 적용
직접 학습시키거나 이미 학습된걸 가져오거나 선택

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip -uq "/content/drive/MyDrive/glove.6B.zip" -d "/content"

In [None]:
glove = dict()

f = open('glove.6B.300d.txt', encoding="utf8")

for line in f:
    word_vector = line.split()
    word = word_vector[0]

    word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
    glove[word] = word_vector_arr
f.close()

In [None]:
#내가 쓸 단어 리스트가 glove에 있으면 그 word2vec가져오고 나머지는 0이상의 작은 소수로 초기화
word_list=glove.keys()
embedding_matrix = np.random.rand(vocab_size,300)
count=0
for index, word in enumerate(word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in word_list: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = glove[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[index] = embedding_vector #해당 위치의 embedding_mxtrix에 저장합니다.
    else:
       count+=1 
print(count)

2811


In [18]:
embedding_size = 300 #word vector의 embedding
Hidden = 256 #hidden state의 차원
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length = max_len-1,weights=[embedding_matrix],trainable=False)) 
#trainable = false는 이미 학습된 word2vec을 사용하므로 굳이 학습하지 않음
model.add(SimpleRNN(Hidden))
model.add(Dense(vocab_size+2, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 300)           3853800   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 256)               142592    
                                                                 
 dense_1 (Dense)             (None, 12848)             3301936   
                                                                 
Total params: 7,298,328
Trainable params: 3,444,528
Non-trainable params: 3,853,800
_________________________________________________________________
None


In [21]:
es = EarlyStopping(monitor='loss', min_delta=0.001, patience=10, verbose=1, mode='min', restore_best_weights=True)
mc = ModelCheckpoint("best_model.h5", monitor='loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

callbacks = [es, mc]

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, Y, epochs=20,batch_size = 64, verbose=2,callbacks = callbacks)

Epoch 1/20

Epoch 1: loss improved from inf to 2.54222, saving model to best_model.h5
502/502 - 14s - loss: 2.5422 - accuracy: 0.5058 - 14s/epoch - 27ms/step
Epoch 2/20

Epoch 2: loss improved from 2.54222 to 2.25006, saving model to best_model.h5
502/502 - 12s - loss: 2.2501 - accuracy: 0.5612 - 12s/epoch - 25ms/step
Epoch 3/20

Epoch 3: loss improved from 2.25006 to 2.12822, saving model to best_model.h5
502/502 - 12s - loss: 2.1282 - accuracy: 0.5819 - 12s/epoch - 24ms/step
Epoch 4/20

Epoch 4: loss improved from 2.12822 to 2.04835, saving model to best_model.h5
502/502 - 12s - loss: 2.0483 - accuracy: 0.5944 - 12s/epoch - 25ms/step
Epoch 5/20

Epoch 5: loss improved from 2.04835 to 1.98244, saving model to best_model.h5
502/502 - 12s - loss: 1.9824 - accuracy: 0.6044 - 12s/epoch - 25ms/step
Epoch 6/20

Epoch 6: loss improved from 1.98244 to 1.91500, saving model to best_model.h5
502/502 - 12s - loss: 1.9150 - accuracy: 0.6176 - 12s/epoch - 24ms/step
Epoch 7/20

Epoch 7: loss improv

<keras.callbacks.History at 0x7f6f97b2b090>

In [16]:
def sentence_generation(model, tokenizer, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word
    sentence = ''

    # n번 반복
    for _ in range(n):
        # 현재 단어에 대한 정수 인코딩과 패딩
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')
        # 입력한 X(현재 단어)에 대해서 Y를 예측하고 Y(예측한 단어)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for word, index in tokenizer.word_index.items(): 
            # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면 break
            if index == result:
                break

        # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        current_word = current_word + ' '  + word

        # 예측 단어를 문장에 저장
        sentence = sentence + ' ' + word

    sentence = init_word + sentence
    return sentence

In [25]:
print(sentence_generation(model, tokenizer, 'not', 10))

not possible sir traditional son banishment face babes voices well a
