In [1]:
import os
from datetime import datetime
import tensorflow as tf
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [2]:
FILE_DIR_PATH = './data/'
INPUT_TRAIN_DATA_FILE_NAME = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'nsmc_train_label.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

input_data = np.load(open(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'rb'))
label_data = np.load(open(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'rb'))
prepro_configs = json.load(open(FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME, 'r'))

In [3]:
TEST_SPLIT = 0.1
RNG_SEED = 13371447

input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

In [4]:
def mapping_fn(X, Y):
    input, label = {'text': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
    dataset = dataset.shuffle(buffer_size=len(input_train))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
    dataset = dataset.shuffle(buffer_size=len(input_eval))
    dataset = dataset.batch(16)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [5]:
def model_fn(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

   #embedding layer를 선언합니다.
    input_layer = tf.contrib.layers.embed_sequence(
                    features['text'],
                    vocab_size = vocab_size,
                    embedding_size=embedding_size,
                    initializer=params['embedding_initializer']
                    )
    # 현재 모델이 학습모드인지 여부를 확인하는 변수입니다.
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    # embedding layer에 대한 output에 대해 dropout을 취합니다.
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                  rate=0.2,
                                  training=training)

    #### CNN 구현체 부분 ####
    conv = tf.layers.conv1d(
           inputs=dropout_emb,
           filters=32,
           kernel_size=3,
           padding='same',
           activation=tf.nn.relu)
  
    pool = tf.reduce_max(input_tensor=conv, axis=1) #max-pooling layer


    #Fully-connected layer
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)   

    #####################
    dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1)

    #prediction 진행 시, None
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
    #최종적으로 학습, 평가, 테스트의 단계로 나누어 활용
    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)

    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})

    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits),
            }
        )

In [7]:
BATCH_SIZE = 16
NUM_EPOCHS = 10
vocab_size = prepro_configs['vocab_size']
embedding_size = 128

params = {'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0)}

model_dir = os.path.join(os.getcwd(), "checkpoint/cnn_model")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()
config_tf._save_checkpoints_steps = 100
config_tf._save_checkpoints_secs = None
config_tf._keep_checkpoint_max =  2
config_tf._log_step_count_steps = 100

est = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf, params=params)

INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\JungHyun\\checkpoint/cnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001DA26852898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [13]:
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

est.train(train_input_fn)

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

1.9.0
Experiment started at 04:48:11
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt-26000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 26000 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:loss = 0.4894277, step = 26001
INFO:tensorflow:Saving checkpoints for 26100 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step/sec: 7.13495
INFO:tensorflow:loss = 0.4519238, step = 26101 (14.019 sec)
INFO:tensorflow:Saving checkpoints for 26200 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step/sec: 8.52759
INFO:tensorflow:loss = 0.28234076, step = 26201 (11.727 sec)
INFO:tensorflow:Saving checkpoints 

INFO:tensorflow:loss = 0.19268206, step = 29801 (11.495 sec)
INFO:tensorflow:Saving checkpoints for 29900 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step/sec: 8.74704
INFO:tensorflow:loss = 0.23518357, step = 29901 (11.431 sec)
INFO:tensorflow:Saving checkpoints for 30000 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step/sec: 8.58674
INFO:tensorflow:loss = 0.2207778, step = 30001 (11.646 sec)
INFO:tensorflow:Saving checkpoints for 30100 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step/sec: 8.44855
INFO:tensorflow:loss = 0.39753044, step = 30101 (11.836 sec)
INFO:tensorflow:Saving checkpoints for 30200 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step/sec: 6.64286
INFO:tensorflow:loss = 0.28608614, step = 30201 (15.055 sec)
INFO:tensorflow:Saving checkpoints for 30300 into C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:global_step

InvalidArgumentError: indices[0,4] = 43756 is not in [0, 43756)
	 [[Node: EmbedSequence/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Adam/Assign"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](EmbedSequence/embeddings/read, IteratorGetNext, EmbedSequence/embedding_lookup/axis)]]

Caused by op 'EmbedSequence/embedding_lookup', defined at:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-acab1b21d4bb>", line 7, in <module>
    est.train(train_input_fn)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 366, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1119, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1132, in _train_model_default
    features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1107, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "<ipython-input-5-98ee1036c3ed>", line 11, in model_fn
    initializer=params['embedding_initializer']
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\contrib\layers\python\layers\encoders.py", line 142, in embed_sequence
    return embedding_ops.embedding_lookup(embeddings, ids)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 308, in embedding_lookup
    transform_fn=None)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 131, in _embedding_lookup_and_transform
    result = _clip(array_ops.gather(params[0], ids, name=name),
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\array_ops.py", line 2666, in gather
    return gen_array_ops.gather_v2(params, indices, axis, name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 3760, in gather_v2
    "GatherV2", params=params, indices=indices, axis=axis, name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3414, in create_op
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1740, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): indices[0,4] = 43756 is not in [0, 43756)
	 [[Node: EmbedSequence/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Adam/Assign"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](EmbedSequence/embeddings/read, IteratorGetNext, EmbedSequence/embedding_lookup/axis)]]


In [14]:
valid = est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-21-04:59:03
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt-31000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-21-04:59:06
INFO:tensorflow:Saving dict for global step 31000: acc = 0.8301333, global_step = 31000, loss = 0.38109428
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 31000: C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt-31000


In [16]:
INPUT_TEST_DATA_FILE_NAME = 'nsmc_test_input.npy'
LABEL_TEST_DATA_FILE_NAME = 'nsmc_test_label.npy'

test_input_data = np.load(open(FILE_DIR_PATH + INPUT_TEST_DATA_FILE_NAME, 'rb'))
test_label_data = np.load(open(FILE_DIR_PATH + LABEL_TEST_DATA_FILE_NAME, 'rb'))

def test_mapping_fn(X):
    input = {'text': X}
    
    return input

#def test_input_fn():
#    dataset = tf.data.Dataset.from_tensor_slices(test_input_data)
#    dataset = dataset.map(mapping_fn)
#    iterator = dataset.make_one_shot_iterator()
#    
#    return iterator.get_next()

def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices(test_input_data)
#    dataset = dataset.shuffle(buffer_size=len(input_eval))
    dataset = dataset.batch(16)
    dataset = dataset.map(test_mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

#def eval_input_fn():
#    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
#    dataset = dataset.shuffle(buffer_size=len(input_eval))
#    dataset = dataset.batch(16)
#    dataset = dataset.map(mapping_fn)
#    iterator = dataset.make_one_shot_iterator()
# 
#    return iterator.get_next()

prediction = est.predict(test_input_fn)

num_score = []


#prediction을 진행하여, 마지막에 있는 값을 추출 (label 제외)
for i, p in enumerate(prediction):
    num_score.append(p['prob'][0])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\JungHyun\checkpoint/cnn_model\model.ckpt-31000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [17]:
result_label = [] 

for score in num_score:
    if score >= 0.5:
        result_label.append(1)
    else:
        result_label.append(0)

In [20]:
value = 0
count = 0

for label_predict in result_label:
    if label_predict == test_label_data[count]:
        value = value +1
    count = count + 1 
    
accr = (value / count) * 100

print(accr,"%")

56.684 %


In [28]:
test_label_data[200:210]

array([0, 1, 1, 1, 1, 0, 1, 0, 0, 1], dtype=int64)