In [1]:
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import re
import os
import json

from konlpy.tag import Okt
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

In [2]:
# Locations of the preprocessed inputs and of the training outputs.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

# Pass the paths straight to np.load so NumPy opens and closes the files
# itself; the previous bare open(...) calls left the handles open.
train_input = np.load(DATA_IN_PATH + INPUT_TRAIN_DATA)
train_label = np.load(DATA_IN_PATH + LABEL_TRAIN_DATA)

# The context manager closes the JSON file as soon as it has been parsed.
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    prepro_configs = json.load(config_file)

In [3]:
# Set TensorFlow's global random seed to a fixed value.
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

In [4]:
# Training hyperparameters.
model_name = 'cnn_classifier_en'  # sub-directory of DATA_OUT_PATH that receives the checkpoint
BATCH_SIZE = 512
NUM_EPOCHS = 2
VALID_SPLIT = 0.1  # passed to model.fit as validation_split
MAX_LEN = train_input.shape[1]  # second dimension of the training input; presumably the padded sequence length — confirm

In [5]:
# CNN text classifier, assembled layer by layer:
# embedding -> 1D convolution -> max-over-time pooling -> dropout
# -> dense hidden layer -> single sigmoid output unit.
# NOTE(review): input_dim=74066 is hard-coded; presumably the vocabulary
# size of the pre-encoded training data — confirm against prepro_configs.
model = tf.keras.Sequential()
model.add(Embedding(input_dim=74066, output_dim=128))
model.add(
    Conv1D(
        filters=100,
        kernel_size=4,
        padding='valid',
        activation=tf.keras.activations.relu,
    )
)
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(units=250, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [6]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss and
# a binary-accuracy metric reported under the name 'accuracy' (so the
# validation value is logged as 'val_accuracy').
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [7]:
# Early stopping to curb overfitting.
#   min_delta: smallest change in 'val_accuracy' that counts as an improvement
#              (here it must rise by at least 0.0001).
#   patience:  number of epochs without improvement tolerated before training
#              stops (2 here).
earlystop_callback = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0001,
    patience=2,
)

checkpoint_path = DATA_OUT_PATH + model_name + '/weights.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint directory unless it already exists.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
else:
    print("{} -- Folder already exists \n".format(checkpoint_dir))

# Save weights only, and only when 'val_accuracy' improves on the best so far.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

./data_out/cnn_classifier_en -- Folder already exists 



In [8]:
# Train the classifier; both callbacks monitor the validation accuracy.
history = model.fit(
    x=train_input,
    y=train_label,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=VALID_SPLIT,
    callbacks=[earlystop_callback, cp_callback],
)

Epoch 1/2
Epoch 00001: val_accuracy improved from -inf to 0.81993, saving model to ./data_out/cnn_classifier_en\weights.h5
Epoch 2/2
Epoch 00002: val_accuracy improved from 0.81993 to 0.82000, saving model to ./data_out/cnn_classifier_en\weights.h5


In [64]:
# NOTE: this cell calls preprocessing(), which the notebook only defines in a
# later cell; on a fresh kernel that definition has to be run first.
okt = Okt()
train_data = pd.read_csv('nscm_train_clean.csv')

# Clean every review in the 'review' column, keeping the original row order.
clean_review = [preprocessing(review) for review in train_data['review']]

In [65]:
# Build the word -> index vocabulary from the cleaned reviews.
# NOTE(review): the model is trained on pre-encoded arrays loaded from
# DATA_IN_PATH, while this tokenizer is fitted from scratch on a CSV file.
# Presumably it must assign the same word indices as the tokenizer that
# produced those arrays — confirm, otherwise predictions use mismatched ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_review)

In [66]:
def preprocessing(review, remove_stopwords = True):
    """Clean one raw review and return it as space-joined morphemes.

    The morphological analysis uses the module-level ``okt`` tagger; it is
    looked up as a global, not passed in as an argument.

    Args:
        review: Text to clean. Non-string values (for example ``None`` or the
            NaN pandas yields for an empty CSV cell) are treated as an empty
            review instead of raising ``TypeError`` inside ``re.sub``.
        remove_stopwords: When True (the default), tokens contained in the
            built-in stop-word set are dropped.

    Returns:
        str: The remaining morphemes joined by single spaces, or ``''`` when
        the input was not a string.
    """
    if not isinstance(review, str):
        return ''

    # 1. Remove every character that is neither Hangul nor whitespace.
    review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", review)

    # 2. Split the text into morphemes with the Okt tagger (stem=True).
    word_review = okt.morphs(review_text, stem=True)

    if remove_stopwords:
        # Optional stop-word removal.
        stop_words = {'은', '는', '이', '가', '하', '아', '것', '들',
                      '의', '있', '되', '수', '보', '주', '등', '한'}
        word_review = [token for token in word_review if token not in stop_words]

    return ' '.join(word_review)

In [75]:
def encoding(review, max_len=None):
    """Turn one cleaned review into a padded batch of word indices.

    Uses the module-level ``tokenizer`` and ``pad_sequences``.

    Args:
        review: Cleaned review text (space-separated tokens).
        max_len: Length to pad or truncate the sequence to. Defaults to the
            notebook-level ``MAX_LEN`` (taken from ``train_input.shape[1]``),
            so inference uses the same length as the training data instead
            of a hard-coded 8 that cut longer reviews short.

    Returns:
        The result of ``pad_sequences`` for a batch holding this one review.
    """
    if max_len is None:
        max_len = MAX_LEN

    # The tokenizer is only read here, so no ``global`` declaration is needed.
    encoded = tokenizer.texts_to_sequences([review])

    # NOTE(review): pad_sequences is called with its default padding and
    # truncating sides — confirm they match how the training arrays were built.
    return pad_sequences(encoded, maxlen=max_len)

In [76]:
def predict(review) :
    """Score a single raw review with the trained model.

    The review is cleaned with ``preprocessing`` (stop words removed),
    encoded with ``encoding`` and passed to the module-level ``model``.

    Args:
        review: Raw review text.

    Returns:
        float: The model's sigmoid output for this review.
    """
    clean_review = preprocessing(review, remove_stopwords=True)
    pad = encoding(clean_review)
    probabilities = model.predict(pad)
    # One review in, one output unit: the result holds a single value in a
    # (1, 1) batch. Index it explicitly, because float() on an array with
    # ndim > 0 is deprecated in recent NumPy releases.
    return float(probabilities[0][0])

In [77]:
# Sample sentence: "It was really good and fun."
predict("너무 좋고 재밌었다")

0.9923306107521057

In [78]:
# Sample sentence: "It is fun."
predict("재밌다")

0.9729645252227783

In [79]:
# Sample sentence: "It was not that good."
# NOTE(review): the recorded score (0.77) is high for a negative-sounding
# review, assuming scores near 1 mean positive — verify the tokenizer/model pairing.
predict("별로였다")

0.7726003527641296

In [80]:
# Sample sentence: "I felt at ease while watching."
# NOTE(review): the recorded score (0.019) is very low for a positive-sounding
# review, assuming scores near 1 mean positive — verify the tokenizer/model pairing.
predict("보는 동안 마음이 편해졌다.")

0.01892116665840149

In [81]:
# Sample sentence: "It is a waste of time."
predict("시간 낭비다")

0.26808398962020874

In [83]:
# Sample sentence: "I had a lot of fun and also felt at ease while watching."
# NOTE(review): the recorded score (0.035) is very low for a positive-sounding
# review, assuming scores near 1 mean positive — verify the tokenizer/model pairing.
predict("나는 너무 재밌었고 보는 동안 마음도 편해졌다")

0.0353202223777771

In [84]:
# Sample input: the single adverb "too / very", which carries no clear sentiment on its own.
predict("너무")

0.5826089978218079