In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
#from bs4 import BeautifulSoup 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df=pd.read_csv('new_data.csv')  

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,overall,reviewText,summary,pretreatment
0,0,5.0,this is a great cd full of worship favorites!!...,great worship cd,great cd full worship favorite time great ke...
1,1,5.0,"so creative! love his music - the words, the ...",gotta listen to this!,creative love music word message favorite ...
2,2,5.0,"keith green, gone far to early in his carreer,...",great approach still gets the message out,keith green gone far early carreer left u go...
3,3,5.0,keith green had his special comedy style of ch...,great a must have,keith green special comedy style chirstian mus...
4,4,5.0,keith green / so you wanna go back to egypt......,a great one from keith with a guest appearance...,keith green wan na go back egypt : album kei...


In [4]:
df.replace('', np.nan, inplace=True)
print(df.isnull().sum())
df.dropna(axis = 0, inplace = True)
print('전체 샘플수 :',(len(df)))

Unnamed: 0      0
overall         0
reviewText      0
summary         0
pretreatment    6
dtype: int64
전체 샘플수 : 19994


In [5]:
text_len = [len(s.split()) for s in df['pretreatment']]
summary_len = [len(s.split()) for s in df['summary']]

#print('텍스트의 최소 길이 : {}'.format(np.min(text_len)))
print('텍스트의 최대 길이 : {}'.format(np.max(text_len)))
print('텍스트의 평균 길이 : {}'.format(np.mean(text_len)))
#print('요약의 최소 길이 : {}'.format(np.min(summary_len)))
print('요약의 최대 길이 : {}'.format(np.max(summary_len)))
print('요약의 평균 길이 : {}'.format(np.mean(summary_len)))

텍스트의 최대 길이 : 986
텍스트의 평균 길이 : 29.555666700010004
요약의 최대 길이 : 27
요약의 평균 길이 : 3.748624587376213


In [6]:
text_max_len = 100
summary_max_len = 20

In [7]:
# 요약 데이터에는 시작 토큰과 종료 토큰을 추가한다.
df['summary'] = df['summary'].apply(lambda x : 'sostoken '+ x + ' eostoken')


In [8]:
Text_data = list(df['pretreatment'])
Summary_data = list(df['summary'])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Text_data, Summary_data, test_size=0.2, random_state=1, shuffle=True)


In [10]:
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(X_train)
X_train = src_tokenizer.texts_to_sequences(X_train) 
X_test = src_tokenizer.texts_to_sequences(X_test)

tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(y_train)
y_train = tar_tokenizer.texts_to_sequences(y_train) 
y_test = tar_tokenizer.texts_to_sequences(y_test) 

In [11]:
drop_train = [index for index, sentence in enumerate(y_train) if len(sentence) == 2]
drop_test = [index for index, sentence in enumerate(y_test) if len(sentence) == 2]

In [12]:
print('훈련 데이터의 개수 :', len(X_train))
print('훈련 레이블의 개수 :',len(y_train))
print('테스트 데이터의 개수 :',len(X_test))
print('테스트 레이블의 개수 :',len(y_test))

훈련 데이터의 개수 : 15995
훈련 레이블의 개수 : 15995
테스트 데이터의 개수 : 3999
테스트 레이블의 개수 : 3999


In [13]:
X_train = np.delete(X_train, drop_train, axis=0)
y_train = np.delete(y_train, drop_train, axis=0)
X_test = np.delete(X_test, drop_test, axis=0)
y_test = np.delete(y_test, drop_test, axis=0)

print('훈련 데이터의 개수 :', len(X_train))
print('훈련 레이블의 개수 :',len(y_train))
print('테스트 데이터의 개수 :',len(X_test))
print('테스트 레이블의 개수 :',len(y_test))

훈련 데이터의 개수 : 15982
훈련 레이블의 개수 : 15982
테스트 데이터의 개수 : 3953
테스트 레이블의 개수 : 3953


In [14]:
X_train = pad_sequences(X_train, maxlen = text_max_len, padding='post')
X_test = pad_sequences(X_test, maxlen = text_max_len, padding='post')
y_train = pad_sequences(y_train, maxlen = summary_max_len, padding='post')
y_test = pad_sequences(y_test, maxlen = summary_max_len, padding='post')

In [15]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [16]:
import tensorflow as tf 
embedding_dim = 128
hidden_size = 256
src_vocab = len(src_tokenizer.word_index)

with tf.device('/cpu:0'):
    # 인코더
    encoder_inputs = Input(shape=(text_max_len,))

    # 인코더의 임베딩 층
    enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

    # 인코더의 LSTM 1
    encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True ,dropout = 0.4, recurrent_dropout = 0.4)
    encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

    # 인코더의 LSTM 2
    encoder_lstm2 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
    encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

    # 인코더의 LSTM 3
    encoder_lstm3 = LSTM(hidden_size, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
    encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)



In [17]:
tar_vocab = len(tar_tokenizer.word_index)

with tf.device('/cpu:0'):
    # 디코더
    decoder_inputs = Input(shape=(None,))

    # 디코더의 임베딩 층
    dec_emb = Embedding(tar_vocab, embedding_dim)(decoder_inputs)

    # 디코더의 LSTM
    decoder_lstm = LSTM(hidden_size, return_sequences = True, return_state = True, dropout = 0.4, recurrent_dropout=0.2)
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = [state_h, state_c])




In [18]:
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/thushv89/attention_keras/master/layers/attention.py", filename="attention.py")
from attention import AttentionLayer

In [19]:
with tf.device('/cpu:0'):
    # 어텐션 층(어텐션 함수)
    attn_layer = AttentionLayer(name='attention_layer')
    attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

    # 어텐션의 결과와 디코더의 hidden state들을 연결
    decoder_concat_input = Concatenate(axis = -1, name='concat_layer')([decoder_outputs, attn_out])

    # 디코더의 출력층
    decoder_softmax_layer = Dense(tar_vocab, activation='softmax')
    decoder_softmax_outputs = decoder_softmax_layer(decoder_concat_input)

    # 모델 정의
    model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
    model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 128)     3572992     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 100, 256), ( 394240      embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [20]:
with tf.device('/cpu:0'):
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [21]:
with tf.device('/cpu:0'):
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 2)
    history = model.fit([X_train, y_train[:,:-1]], y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:,1:] \
                      ,epochs=50, callbacks=[es], batch_size = 256, validation_data=([X_test, y_test[:,:-1]], \
                      y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:,1:]))

Epoch 1/50

InvalidArgumentError:  indices[208,5] = 6068 is not in [0, 6068)
	 [[node model/embedding_1/embedding_lookup (defined at <ipython-input-21-b99806f99fac>:5) ]] [Op:__inference_train_function_17394]

Errors may have originated from an input operation.
Input Source operations connected to node model/embedding_1/embedding_lookup:
 model/embedding_1/embedding_lookup/11313 (defined at /home/yj_im/anaconda3/envs/env/lib/python3.6/contextlib.py:81)

Function call stack:
train_function


In [None]:
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()