In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import imdb
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt

## 데이터 로드

In [36]:
import urllib.request

# Local file name -> raw GitHub URL of the NSMC ratings files.
# Insertion order matters: train first, then test (matches the unpacking below).
_NSMC_FILES = {
  "ratings_train.txt": "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
  "ratings_test.txt": "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
}

# urlretrieve saves each file into the working directory and returns a
# (local_filename, headers) tuple, which is what these two names hold.
train_dataset, test_dataset = [
  urllib.request.urlretrieve(url, filename=local_name)
  for local_name, url in _NSMC_FILES.items()
]

In [18]:
# (X_train, y_train), (X_test, y_test) = imdb.load_data(path='ratings_train.txt', num_words=5000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [37]:
import pandas as pd

# Read both tab-separated rating files, discard the 'id' column and remove
# every row that contains a missing value.
train_df = pd.read_table('ratings_train.txt').drop(columns=['id']).dropna()
test_df = pd.read_table('ratings_test.txt').drop(columns=['id']).dropna()

# Inputs are the review text, targets are the label column.
x_train, y_train = train_df['document'], train_df['label']
x_test, y_test = test_df['document'], test_df['label']

# Preview the first rows of each frame ...
for frame in (train_df, test_df):
  print(frame.head())
# ... and confirm that no nulls are left after dropna().
for frame in (train_df, test_df):
  print(frame.isnull().sum())

                                            document  label
0                                아 더빙.. 진짜 짜증나네요 목소리      0
1                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1
2                                  너무재밓었다그래서보는것을추천한다      0
3                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0
4  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1
                                            document  label
0                                                굳 ㅋ      1
1                               GDNTOPCLASSINTHECLUB      0
2             뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아      0
3                   지루하지는 않은데 완전 막장임... 돈주고 보기에는....      0
4  3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??      0
document    0
label       0
dtype: int64
document    0
label       0
dtype: int64


In [39]:
print(x_test.head())
print(y_test.head())

0                                                  굳 ㅋ
1                                 GDNTOPCLASSINTHECLUB
2               뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아
3                     지루하지는 않은데 완전 막장임... 돈주고 보기에는....
4    3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??
Name: document, dtype: object
0    1
1    0
2    0
3    0
4    0
Name: label, dtype: int64


## 데이터 인코딩

In [40]:
# Tokens that are dropped from every review before integer encoding.
# Korean words first, then punctuation runs and the laugh character.
_KOREAN_STOPWORDS = [
  '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
  '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
]
_SYMBOL_STOPWORDS = ['!', '?', ',', '.', '..', '...', '....', 'ㅋ']

stopwords = _KOREAN_STOPWORDS + _SYMBOL_STOPWORDS

In [41]:
import re
from tensorflow.keras.preprocessing.text import Tokenizer

# NOTE: this cell overwrites the 'document' column (str -> list of tokens), so
# it is not safe to re-run without first re-running the cell that loads the data.

# Remove every character that is not Hangul (jamo or syllable) or a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
NON_HANGUL_PATTERN = "[^ㄱ-ㅎㅏ-ㅣ가-힣 ]"
train_df['document'] = train_df['document'].str.replace(NON_HANGUL_PATTERN, "", regex=True)
test_df['document'] = test_df['document'].str.replace(NON_HANGUL_PATTERN, "", regex=True)

# Stopword removal: split on whitespace and keep the tokens that are not in
# `stopwords`. A set gives constant-time membership checks per token.
stopword_set = set(stopwords)

def _drop_stopwords(text):
  """Return the whitespace-separated tokens of `text` that are not stopwords."""
  return [word for word in text.split() if word not in stopword_set]

train_df['document'] = train_df['document'].apply(_drop_stopwords)
test_df['document'] = test_df['document'].apply(_drop_stopwords)

# Integer-encode the tokens. The vocabulary is fitted on the training set only.
# num_words caps the emitted ids below VOCAB_SIZE; it must match the input
# dimension of the model's Embedding layer (5000 in this notebook), otherwise
# ids of rarer words fall outside the embedding table.
VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_df['document'])
train_sequences = tokenizer.texts_to_sequences(train_df['document'])
test_sequences = tokenizer.texts_to_sequences(test_df['document'])

# Pad (or truncate) every sequence to the same length of 100 ids.
train_padded = sequence.pad_sequences(train_sequences, maxlen=100)
test_padded = sequence.pad_sequences(test_sequences, maxlen=100)


In [48]:
# Sanity check: dimensions of both padded matrices, then the container type.
for padded in (train_padded, test_padded):
  print(padded.shape)
print(type(train_padded))

(149995, 100)
(49997, 100)
<class 'numpy.ndarray'>


In [64]:
from tensorflow.keras import layers
# Model architecture.
# Author's plan: build it with only an LSTM first, then try adding layers,
# or alternatively split the data into positive / negative.
model = Sequential([
  # Token ids -> 500-dimensional vectors; the lookup table has 5000 rows.
  Embedding(5000, 500),
  # Disabled experiments, kept for reference:
  #   Dropout(0.5)
  #   Conv1D(64, 5, padding='valid', activation='relu', strides=1)
  #   MaxPooling1D(pool_size=4)

  # return_sequences=True -> 3D output of shape (batch_size, timesteps, 256).
  layers.GRU(256, return_sequences=True),

  # Only the last step is returned -> 2D output of shape (batch_size, 128).
  layers.SimpleRNN(128),

  # Disabled experiment: LSTM(55)

  # Single output unit followed by a sigmoid.
  Dense(1),
  Activation('sigmoid'),
])

In [65]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 500)         2500000   
                                                                 
 gru (GRU)                   (None, None, 256)         582144    
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               49280     
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
 activation_2 (Activation)   (None, 1)                 0         
                                                                 
Total params: 3131553 (11.95 MB)
Trainable params: 3131553 (11.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [66]:
# Training options: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [67]:
# Early-stopping configuration: stop training once val_loss has not improved
# for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss. Without it the model keeps the weights of the last epoch trained,
# i.e. several epochs past the best one, and those weights are what
# evaluate()/predict() would then use.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5,
                                        restore_best_weights=True)

In [None]:
# Train the model: up to 100 epochs in batches of 40, holding out 25% of the
# training data for validation; the early-stopping callback may end it sooner.
history = model.fit(
  x=train_padded,
  y=y_train,
  epochs=100,
  batch_size=40,
  validation_split=0.25,
  callbacks=[early_stopping_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

## 결과 시각화

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_history(history):
  """Plot training vs. validation loss and accuracy in two side-by-side panels.

  Args:
    history: object exposing `.history` (a dict with the per-epoch series
      'loss', 'val_loss', 'accuracy' and 'val_accuracy') and `.epoch` (the
      matching epoch numbers), e.g. the value returned by `model.fit`.

  Returns:
    None. The figure is rendered through `plt.show()`.
  """
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch
  plt.figure(figsize=(16,8))

  # (y-axis label, training column, validation column) for each panel.
  panels = [
    ('Loss', 'loss', 'val_loss'),
    ('Accuracy', 'accuracy', 'val_accuracy'),
  ]
  for position, (metric_label, train_col, val_col) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.plot(hist['epoch'], hist[train_col], label=f'Train {metric_label}')
    plt.plot(hist['epoch'], hist[val_col], label=f'Val {metric_label}')
    plt.legend()

  # This call previously sat at module level (column 0), so it ran once when
  # the cell was defined instead of after the figure had been drawn.
  plt.show()

In [None]:
# Draw the loss and accuracy curves from the `history` object assigned from model.fit.
plot_history(history)

In [None]:
# Score the trained model on the padded test reviews against their labels.
# With the compile settings used in this notebook, the values reported are the
# binary cross-entropy loss and the accuracy.
model.evaluate(test_padded, y_test)

In [None]:
# Run the model on every padded test review; the final layer is a sigmoid on a
# single unit, so each row of `pred` holds one value.
pred = model.predict(test_padded)

In [None]:
# Side-by-side look at the first ten test reviews: raw text, model output and
# ground-truth label.
sample_texts = x_test[:10]
sample_outputs = pred[:10]
sample_labels = y_test[:10]
for text, prediction, gt in zip(sample_texts, sample_outputs, sample_labels):
  print(f'{text} : \n모델예측결과={prediction}, GT={gt}')