In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from tensorflow.keras.optimizers import Adam
from Preprocess import Preprocess
from GlobalParams import MAX_SEQ_LEN

# Turn on memory growth for every GPU that TensorFlow can see.
# With no GPUs the loop simply does nothing.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # A RuntimeError is only reported, not re-raised, so the notebook keeps running.
    print(err)

In [2]:
# Load the training data from CSV.
TRAIN_CSV_PATH = "../data/수정데이터/total_train_data.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
import numpy as np

# Work on the 5,000-row subset with index labels 450000-454999.
# Label-based .loc slicing (inclusive on both ends) is used instead of the
# positional slice data[450000:455000]: because this cell overwrites `data`,
# a positional slice returns an empty frame when the cell is run a second
# time, whereas the label-based slice returns the same rows again.
# This relies on the frame still carrying the default integer index from
# read_csv, which is what the displayed output shows.
data = data.loc[450000:454999]
data

Unnamed: 0,text,label
450000,네 상품 주문했는데요,2
450001,성함과 연락처 어떻게 되세요,2
450002,그러셨어요 저희 번호를 알려주셨어요,2
450003,센터의 윗사람하고 한 번 더 얘길 해보시던지 그 방법밖에 없어요,2
450004,근데 저희 쪽으로 연락하라고 하셨다고요,2
...,...,...
454995,공기놀이하고 싶네요,2
454996,무슨 방법이 없나,2
454997,나 보고 깜짝 놀랐잖아,2
454998,그러게 갑자기 비 온다길래 놀랐어,2


In [4]:
# Pull the sentences and their labels out of the frame as plain Python lists.
text, label = (data[column].tolist() for column in ('text', 'label'))

# Preprocessor built from the word-to-index dictionary file; no user dictionary is passed.
p = Preprocess(word2index_dic='../data/chatbot_dict.bin', userdic=None)

In [5]:
# Build one word-index sequence per sentence.
def _sentence_to_indices(sentence):
    """Run p.pos on the sentence, extract keywords without tags, and map them to word indices."""
    tagged = p.pos(sentence)
    return p.get_wordidx_sequence(p.get_keywords(tagged, without_tag=True))


sequences = [_sentence_to_indices(sentence) for sentence in text]

In [6]:
# Bring every word-index sequence to length MAX_SEQ_LEN, padding at the end.
padded_seqs = preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    padding='post',
)

In [7]:
# Build the training / validation / test datasets (70% / 20% / 10%).
BATCH_SIZE = 20
SPLIT_SEED = 42

num_samples = len(padded_seqs)
train_size = int(num_samples * 0.7)
val_size = int(num_samples * 0.2)
test_size = int(num_samples * 0.1)

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, label))
# Shuffle ONCE into a fixed order. By default shuffle() produces a new
# permutation every time the dataset is iterated, so the take()/skip() splits
# below would be cut from a different ordering on every epoch and
# validation/test samples would leak into training.
# reshuffle_each_iteration=False pins the order; the seed makes the split
# reproducible across kernel restarts.
ds = ds.shuffle(len(text), seed=SPLIT_SEED, reshuffle_each_iteration=False)

# Only the training split is reshuffled per epoch, and only within itself.
train_ds = ds.take(train_size).shuffle(max(train_size, 1)).batch(BATCH_SIZE)
val_ds = ds.skip(train_size).take(val_size).batch(BATCH_SIZE)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(BATCH_SIZE)

In [8]:
# Hyperparameters.
EPOCH = 5            # number of training epochs passed to model.fit
EMB_SIZE = 128       # output dimension of the Embedding layer
dropout_prob = 0.5   # rate used by both Dropout layers
# One more than the number of entries in the preprocessor's word index
# (presumably so that index 0 stays free for padding — confirm against Preprocess).
VOCAB_SIZE = 1 + len(p.word_index)

In [9]:
# CNN model definition:
#   embedding -> dropout -> two parallel Conv1D + global-max-pool branches
#   (kernel sizes 3 and 4) -> concatenate -> dense -> dropout -> 3-way softmax.
input_layer = Input(shape=(MAX_SEQ_LEN, ))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

# Branch 1: kernel size 3.
conv1 = Conv1D(
    filters=128,
    kernel_size=3,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

# Branch 2: kernel size 4.
conv2 = Conv1D(
    filters=128,
    kernel_size=4,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)

# Merge the pooled features of both branches.
concat = concatenate([pool1, pool2])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
# NOTE(review): 'logits' feeds a second Dense layer before the softmax, so it
# is an extra linear layer rather than the pre-softmax logits of `predictions`.
# Left as-is to keep the architecture (and any saved weights) compatible.
logits = Dense(3, name='logits')(dropout_hidden)
predictions = Dense(3, activation=tf.nn.softmax)(logits)

# `lr` is a deprecated alias of `learning_rate` (it triggered the warning seen
# under this cell). `epsilon=None` and `decay=0.0` only restated the defaults
# and are rejected by newer Keras optimizers, so they are omitted.
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)

  super().__init__(name, **kwargs)


In [10]:
# Assemble the model from the graph defined above and compile it for
# integer-label classification, tracking accuracy.
model = Model(inputs=input_layer, outputs=predictions)
compile_options = dict(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.compile(**compile_options)

In [None]:
# Train the model.
# Sparse categorical cross-entropy needs every label to be a class index in
# [0, num_classes). On GPU an out-of-range label does not raise; it yields a
# NaN loss, which matches the `loss: nan` seen in an earlier run of this cell.
# NOTE(review): this is a likely cause, not a confirmed one.
# Fail early with a readable message instead of training on NaNs.
num_classes = model.output_shape[-1]
invalid_labels = set(label) - set(range(num_classes))
if invalid_labels:
    raise ValueError(
        'labels must be integers in [0, %d); found unexpected values: %s'
        % (num_classes, sorted(invalid_labels, key=repr)[:10])
    )

with tf.device('/GPU:0'):
    model.fit(train_ds, validation_data=val_ds, epochs=EPOCH, verbose=1)

Epoch 1/5
 31/700 [>.............................] - ETA: 15s - loss: nan - accuracy: 0.0081

: 

: 

In [None]:
# Evaluate on the test split, then report accuracy (as a percentage) and loss.
with tf.device('/GPU:0'):
    evaluation = model.evaluate(test_ds, verbose=1)

loss, accuracy = evaluation
print('Accuracy: %f' % (accuracy * 100))
print('loss: %f' % (loss))

In [None]:
# # Save the model (currently disabled)
# model.save('./model/intent_model.h5')
# print('완료되었습니다.')