In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from time import time
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

import re
import nltk
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models.keyedvectors import KeyedVectors

##data load

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print('총 샘플 수 :',len(documents))

총 샘플 수 : 11314


In [3]:
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
news_df.dropna(inplace=True)
print('총 샘플 수 :',len(news_df))

총 샘플 수 : 11314


In [5]:
news = []
for i in range(2000): #모두 올리면 램이 초과되어서 일부 사용
  news.append(news_df['clean_doc'][i][:40]) #maxlen 40

In [6]:
news[0]

'well sure about story seem biased what d'

##토큰화

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(news)

In [8]:
counts=tokenizer.word_counts
word_index = tokenizer.word_index

In [9]:
news_tokens=tokenizer.texts_to_sequences(news)

In [59]:
vocab_size=0
for index,word in enumerate(word_index):
  vocab_size=max(vocab_size,index)
print(vocab_size)

4557


In [11]:
sequences = pad_sequences(news_tokens, maxlen = 40,padding='pre')

In [14]:
#sequences = np.array(sequences)
#X=sequences[:,:-1]
#Y = sequences[:,1:]

sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]

In [16]:
print(np.shape(y),np.shape(X))

(2000,) (2000, 39)


In [63]:
Y = to_categorical(y, num_classes=vocab_size+2)

In [64]:
np.shape(Y)

(2000, 4559)

In [21]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
!unzip -uq "/content/drive/MyDrive/glove.6B.zip" -d "/content"

In [53]:
glove = dict()

f = open('glove.6B.300d.txt', encoding="utf8")

for line in f:
    word_vector = line.split()
    word = word_vector[0]

    word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
    glove[word] = word_vector_arr
f.close()

In [1]:
word_list=glove.keys()
embedding_matrix = np.zeros(vocab_size,300)
count=0
for index, word in enumerate(word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in word_list: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = glove[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[index] = embedding_vector #해당 위치의 embedding_mxtrix에 저장합니다.
    else:
       count+=1 
print(count)

NameError: ignored

In [77]:
np.shape(embedding_matrix)

(4557, 300)

In [78]:
embedding_size = 300 #embedding size
Hidden = 10 #hidden layer dimenstion
model = Sequential()
model.add(Embedding(vocab_size, embedding_size,input_length = 39,weights=[embedding_matrix]))
model.add(SimpleRNN(embedding_size))
model.add(Dense(4559,activation='softmax'))

print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 39, 300)           1367100   
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 300)               180300    
                                                                 
 dense_2 (Dense)             (None, 4559)              1372259   
                                                                 
Total params: 2,919,659
Trainable params: 2,919,659
Non-trainable params: 0
_________________________________________________________________
None


In [80]:
es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='min', restore_best_weights=True)
mc = ModelCheckpoint("best{:02d}".format(i+1), monitor='val_loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

callbacks = [es, mc]

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, Y, epochs=20,batch_size = 64, verbose=2,callbacks = callbacks, validation_split= 0.2)

Epoch 1/20

Epoch 1: val_loss improved from inf to 8.14227, saving model to best2000
25/25 - 2s - loss: 7.0825 - accuracy: 0.0331 - val_loss: 8.1423 - val_accuracy: 0.0375 - 2s/epoch - 77ms/step
Epoch 2/20

Epoch 2: val_loss did not improve from 8.14227
25/25 - 1s - loss: 6.2262 - accuracy: 0.0544 - val_loss: 8.4951 - val_accuracy: 0.0425 - 867ms/epoch - 35ms/step
Epoch 3/20

Epoch 3: val_loss did not improve from 8.14227
25/25 - 1s - loss: 5.5962 - accuracy: 0.0988 - val_loss: 8.7387 - val_accuracy: 0.0475 - 855ms/epoch - 34ms/step
Epoch 4/20

Epoch 4: val_loss did not improve from 8.14227
25/25 - 1s - loss: 4.9360 - accuracy: 0.2331 - val_loss: 8.7594 - val_accuracy: 0.0500 - 845ms/epoch - 34ms/step
Epoch 5/20

Epoch 5: val_loss did not improve from 8.14227
25/25 - 1s - loss: 4.2029 - accuracy: 0.4700 - val_loss: 8.8590 - val_accuracy: 0.0500 - 884ms/epoch - 35ms/step
Epoch 6/20

Epoch 6: val_loss did not improve from 8.14227
25/25 - 1s - loss: 3.3870 - accuracy: 0.7412 - val_loss: 9

<keras.callbacks.History at 0x7f7e9a2cb150>

In [81]:
def sentence_generation(model, tokenizer, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word
    sentence = ''

    # n번 반복
    for _ in range(n):
        # 현재 단어에 대한 정수 인코딩과 패딩
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=39, padding='pre')
        # 입력한 X(현재 단어)에 대해서 Y를 예측하고 Y(예측한 단어)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for word, index in tokenizer.word_index.items(): 
            # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면 break
            if index == result:
                break

        # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        current_word = current_word + ' '  + word

        # 예측 단어를 문장에 저장
        sentence = sentence + ' ' + word

    sentence = init_word + sentence
    return sentence

In [83]:
print(sentence_generation(model, tokenizer, 'good', 4))

good explod explod explod explod
