In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import datasets, layers, models
from keras.preprocessing.text import Tokenizer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from keras.models import Model
from keras.layers import *
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate
import re

Using TensorFlow backend.


# 데이터 불러오기
- FastText
- 네이버 영화 리뷰

In [2]:
# Load the pre-trained Korean FastText vectors (Facebook binary format) via gensim.
# NOTE(review): this import rebinds `models`, which the first cell imported from
# tensorflow.keras — later cells must use tf.keras.models explicitly.
from gensim import models

ko_model = models.fasttext.load_facebook_model(
    'cc.ko.300.bin.gz',
)

In [3]:
# Read the labelled review spreadsheet, then pull the text column and the
# target column out as plain Python lists.
df = pd.read_excel('ratings_train.xlsx')

doc, labels = (list(df[column]) for column in ('document', 'label'))

# 리뷰 데이터 전처리

## Okt

In [None]:
# KoNLPy's Okt tagger; the single `okt` instance is reused by the
# tokenisation loop in the next cell.
from konlpy.tag import Okt

okt = Okt()

In [None]:
# Split every review into its surface morphemes with Okt.
# `result` ends up with exactly one token list per entry of `doc`, so it stays
# aligned with `labels`.
result = []

for sentence in doc:
    # Non-string cells (e.g. empty spreadsheet cells read as NaN) cannot be
    # tagged; substitute a one-token placeholder so the row is kept.
    if not isinstance(sentence, str):
        result.append(['아'])
        continue
    try:
        # okt.pos yields (morpheme, tag) pairs; only the morpheme text is kept.
        result.append([morph for morph, _tag in okt.pos(sentence)])
    except Exception:
        # Best-effort: a tagger failure on one review must not abort the run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        # The placeholder is a list, like every other entry, rather than a bare string.
        result.append(['아'])

# 토크나이즈

In [None]:
# Build the morpheme -> integer-id vocabulary from the tokenised reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(result)

# One extra slot on top of the number of known words (Keras word ids start
# at 1, leaving id 0 free for padding).
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Map each morpheme list to its integer-id sequence; the longest sequence
# determines the length every sample is padded to.
X_encoded = tokenizer.texts_to_sequences(result)
max_len = max(map(len, X_encoded))

# 패딩

In [None]:
# Targets as a NumPy array; inputs zero-padded at the end up to max_len.
y_train = np.asarray(labels)
X_train = pad_sequences(
    X_encoded,
    padding='post',
    maxlen=max_len,
)

In [None]:
import numpy as np

# Build the (vocabulary x dimension) matrix of pre-trained FastText vectors.
# Row i holds the vector of the word whose tokenizer id is i; row 0 is never
# written and stays all zeros.
VOCAB_SIZE = len(tokenizer.index_word) + 1
EMBEDDING_DIM = 300  # must match the vector size of the loaded FastText model

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, idx in tokenizer.word_index.items():
    try:
        # get_vector replaces the deprecated word_vec lookup.
        embedding_vector = ko_model.wv.get_vector(word)
    except KeyError:
        # The lookup signals a missing word by raising rather than returning
        # None (some gensim versions do this when no n-gram matches); leave
        # that row as zeros.
        continue
    embedding_matrix[idx] = embedding_vector

embedding_matrix.shape

# 레이어 쌓기

In [None]:
# Multi-width 1D-CNN sentiment classifier on top of frozen FastText embeddings.

# Stop once the monitored loss has not improved for 3 consecutive epochs.
# NOTE(review): this watches the training loss; since fit() below uses
# validation_split, 'val_loss' may be the intended signal — confirm.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

sequence_input = tf.keras.Input(shape=(max_len,), dtype='int32')

# The layer is frozen (trainable=False), so it must be initialised from the
# pre-trained `embedding_matrix`; otherwise it would keep random weights forever.
# The output size comes from EMBEDDING_DIM, the constant defined with the matrix.
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=max_len,
    trainable=False,
)
embedded_sequences = embedding_layer(sequence_input)

# One convolution branch per kernel width; 'same' padding keeps every branch
# the same length so they can be concatenated along the channel axis.
convs = []
filter_sizes = [3, 4, 5]
for fsz in filter_sizes:
    x = tf.keras.layers.Conv1D(128, fsz, activation='relu', padding='same')(embedded_sequences)
    x = tf.keras.layers.MaxPooling1D()(x)
    convs.append(x)

x = tf.keras.layers.Concatenate(axis=-1)(convs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.02))(x)
x = tf.keras.layers.Dropout(0.5)(x)
# Single sigmoid unit, paired with the binary cross-entropy loss below.
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(sequence_input, output)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# validation_split=0.2 holds out 20% of the training data for validation.
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2, callbacks=[callback])

# 데이터 시각화

In [None]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the return value of ``model.fit``).
        string: metric name to plot; the ``'val_' + string`` series is drawn
            on the same axes.
    """
    validation_key = 'val_' + string
    for series_name in (string, validation_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()
# Draw the accuracy curves first, then the loss curves, for the fitted model.
for metric in ('accuracy', 'loss'):
    plot_graphs(history, metric)

# 예측

In [None]:
# Hand-written example reviews.
# NOTE(review): this list is not what gets scored below — the predictions are
# for doc[10:20]; confirm which input was intended.
sentence = ["존나","좋네",  "실망스럽다. 한심한 영화다. 이것도 영화라고 참."]

# The tokenizer vocabulary was fitted on Okt morpheme lists, so inference
# inputs need the same morpheme split. Passing raw strings would fall back to
# whitespace splitting, and words missing from the vocabulary are dropped.
sample_tokens = [
    [morph for morph, _tag in okt.pos(text)] if isinstance(text, str) else ['아']
    for text in doc[10:20]
]
sequence_exp = tokenizer.texts_to_sequences(sample_tokens)
padded_exp = pad_sequences(sequence_exp, maxlen=max_len, padding='post', truncating='post')
print(model.predict(padded_exp).round(2))

# 모델 저장과 불러오기

In [None]:
# Write the trained model to saved_model/my_model, then load it back into
# `new_model`.
import os

# Portable replacement for the `!mkdir -p` shell escape: also works on
# platforms without a POSIX mkdir and when run outside IPython.
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/my_model')

new_model = tf.keras.models.load_model('saved_model/my_model')