In [1]:
import pandas as pd
import re
from konlpy.tag import Okt
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

Read train/test data

using pandas

In [2]:
# Load the train and test splits with pandas' read_table (tab-delimited by
# default). Both files are expected in the current working directory.
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

한글을 제외한 문자를 regular expression을 이용하여 제거

In [3]:
# Remove every character that is not Hangul (compatibility jamo ㄱ-ㅎ / ㅏ-ㅣ,
# syllables 가-힣) or a space.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, so without it the character class
# would never match and the text would be left untouched.
NON_HANGUL_PATTERN = "[^ㄱ-ㅎㅏ-ㅣ가-힣 ]"
train_data['document'] = train_data['document'].str.replace(NON_HANGUL_PATTERN, "", regex=True)
test_data['document'] = test_data['document'].str.replace(NON_HANGUL_PATTERN, "", regex=True)

In [4]:
# Drop duplicate reviews, turn empty strings into NaN, then drop every row
# that contains a missing value.
#
# The replaced column is assigned back to the frame instead of calling
# `train_data['document'].replace(..., inplace=True)`: that form mutates an
# intermediate Series, which emits a chained-assignment warning on pandas 2.x
# and silently leaves the frame unchanged under Copy-on-Write.
train_data = train_data.drop_duplicates(subset=['document'])
train_data = train_data.assign(document=train_data['document'].replace('', np.nan))
train_data = train_data.dropna(how='any')

test_data = test_data.drop_duplicates(subset=['document'])
test_data = test_data.assign(document=test_data['document'].replace('', np.nan))
test_data = test_data.dropna(how='any')

형태소 분석

konlpy를 사용


In [5]:
# Tokens that are discarded after tokenization.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

okt = Okt()


def tokenize_documents(documents):
    """Tokenize each document with Okt and remove stopwords.

    Args:
        documents: iterable of strings (e.g. the 'document' column).

    Returns:
        A list with one entry per document; each entry is the list of tokens
        returned by ``okt.morphs(sentence, stem=True)`` minus any token found
        in ``stopwords``. An entry may be empty.
    """
    # Set lookup instead of scanning the stopword list for every token.
    stopword_set = frozenset(stopwords)
    return [
        [word for word in okt.morphs(sentence, stem=True) if word not in stopword_set]
        for sentence in documents
    ]


# Same preprocessing for both splits (previously two copy-pasted loops).
X_train = tokenize_documents(train_data['document'])
X_test = tokenize_documents(test_data['document'])

In [37]:
# Value passed to Tokenizer(num_words=...); also used as the Embedding input
# dimension when the model is built.
# NOTE(review): hard-coded. Presumably derived from a word-frequency analysis
# of the training vocabulary -- recompute if the data or tokenizer changes.
vocab_size = 19416

# Fit a single tokenizer on the training tokens only. The previous version
# first fitted an unlimited Tokenizer whose result was never read, which cost
# an extra full pass over the corpus.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Integer-encode both splits with the vocabulary learned from the training set.
x_train = tokenizer.texts_to_sequences(X_train)
x_test = tokenizer.texts_to_sequences(X_test)

In [38]:
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

# Find samples whose *encoded* sequence is empty. Checking the encoded
# sequences (x_*) rather than the raw token lists (X_*) also catches reviews
# whose tokens were all dropped by the tokenizer's num_words limit; those
# would otherwise reach the model as all-padding inputs.
drop_train = [index for index, sequence in enumerate(x_train) if len(sequence) < 1]
drop_test = [index for index, sequence in enumerate(x_test) if len(sequence) < 1]

# x_train / x_test are ragged lists of lists. np.delete would first convert
# them to an array, which NumPy >= 1.24 rejects with a ValueError for
# inhomogeneous shapes, so filter them in plain Python instead.
drop_train_set = set(drop_train)
drop_test_set = set(drop_test)

x_train = [sequence for index, sequence in enumerate(x_train) if index not in drop_train_set]
y_train = np.delete(y_train, drop_train, axis=0)

x_test = [sequence for index, sequence in enumerate(x_test) if index not in drop_test_set]
y_test = np.delete(y_test, drop_test, axis=0)

In [39]:
# Bring every encoded review to the same length so the splits can be fed to
# the model as rectangular arrays.
max_len = 30
x_train, x_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (x_train, x_test)
)

In [40]:
# Binary classifier: embedding -> LSTM -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 100),      # 100-dimensional embedding per vocabulary index
    LSTM(128),                       # 128 recurrent units
    Dense(1, activation='sigmoid'),  # one sigmoid output for the 0/1 label
])

In [41]:
# Stop training once validation loss has failed to improve for 4 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)

# Keep only the weights from the epoch with the highest validation accuracy.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [42]:
# Configure the optimizer/loss and track accuracy under the name 'acc'
# (the checkpoint callback monitors the matching 'val_acc').
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train with the last 20% of the training data held out for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=60,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)

Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.84313, saving model to best_model.h5
Epoch 2/15
Epoch 00002: val_acc improved from 0.84313 to 0.85305, saving model to best_model.h5
Epoch 3/15
Epoch 00003: val_acc improved from 0.85305 to 0.85559, saving model to best_model.h5
Epoch 4/15
Epoch 00004: val_acc did not improve from 0.85559
Epoch 5/15
Epoch 00005: val_acc improved from 0.85559 to 0.85747, saving model to best_model.h5
Epoch 6/15
Epoch 00006: val_acc did not improve from 0.85747
Epoch 7/15
Epoch 00007: val_acc did not improve from 0.85747
Epoch 8/15
Epoch 00008: val_acc did not improve from 0.85747
Epoch 9/15
Epoch 00009: val_acc did not improve from 0.85747
Epoch 00009: early stopping


In [43]:
# Reload the checkpoint written during training and score it on the test split.
loaded_model = load_model('best_model.h5')
test_metrics = loaded_model.evaluate(x_test, y_test)
# Index 1 is the metric after the loss (the model was compiled with metrics=['acc']).
test_accuracy = test_metrics[1]
print("\n 테스트 정확도: %.4f" % test_accuracy)


 테스트 정확도: 0.8559
