In [11]:
import os
from datetime import datetime
import tensorflow as tf
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [15]:
# Directory and file names of the preprocessed NSMC training artifacts.
FILE_DIR_PATH = '/home/ubuntu/movie/NSMC/data/'
INPUT_TRAIN_DATA_FILE_NAME = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'nsmc_train_label.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

# Passing a path lets np.load open and close the file itself, so no file
# handle is left dangling in the kernel.
input_data = np.load(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME)
label_data = np.load(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME)

# The context manager closes the JSON config file as soon as it is parsed.
with open(FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME, 'r', encoding="utf-8") as config_file:
    prepro_configs = json.load(config_file)

In [16]:
# Fraction of the loaded training data held out for evaluation, and the seed
# that makes the split repeatable.
TEST_SPLIT = 0.1
RNG_SEED = 13371447

input_train, input_eval, label_train, label_eval = train_test_split(
    input_data,
    label_data,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

In [17]:
def mapping_fn(X, Y):
    """Package an (input, label) pair in the layout the model function reads.

    Args:
        X: input value; returned unchanged under the ``'text'`` key.
        Y: label value; returned unchanged.

    Returns:
        A ``(features, label)`` tuple where ``features`` is ``{'text': X}``.
    """
    features = {'text': X}
    return features, Y

def train_input_fn():
    """Build the training input pipeline handed to the Estimator.

    Slices ``(input_train, label_train)`` into examples, shuffles them with a
    buffer as large as the training set, groups them into batches of
    ``BATCH_SIZE``, wraps every batch with ``mapping_fn`` and repeats the
    result ``NUM_EPOCHS`` times.

    Returns:
        The ``get_next()`` output of a one-shot iterator over that dataset.
    """
    train_pairs = (input_train, label_train)
    pipeline = (
        tf.data.Dataset.from_tensor_slices(train_pairs)
        .shuffle(buffer_size=len(input_train))
        .batch(BATCH_SIZE)
        .map(mapping_fn)
        .repeat(count=NUM_EPOCHS)
    )
    return pipeline.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Build the evaluation input pipeline handed to the Estimator.

    Slices ``(input_eval, label_eval)`` into examples, shuffles them with a
    buffer as large as the evaluation set, groups them into batches of 16 and
    wraps every batch with ``mapping_fn``. The data is iterated once.

    Returns:
        The ``get_next()`` output of a one-shot iterator over that dataset.
    """
    eval_pairs = (input_eval, label_eval)
    pipeline = (
        tf.data.Dataset.from_tensor_slices(eval_pairs)
        .shuffle(buffer_size=len(input_eval))
        .batch(16)
        .map(mapping_fn)
    )
    return pipeline.make_one_shot_iterator().get_next()

In [27]:
# embedding_size = 128
def model_fn(features, labels, mode, params):
    """Estimator model function for a small 1-D CNN binary classifier.

    The network is: embedding -> dropout -> Conv1D -> max over axis 1 ->
    dense(250, ReLU) -> dropout -> dense(1) producing a single logit.

    Args:
        features: dict whose ``'text'`` entry is fed to the embedding layer.
        labels: label tensor, or ``None`` (as during prediction).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict; ``params['embedding_initializer']`` initialises the
            embedding matrix.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for TRAIN (loss + train op), EVAL
        (loss + ``'acc'`` metric) or PREDICT (``'prob'`` predictions).

    Note:
        Reads the module-level ``vocab_size``, which must be defined before
        the Estimator invokes this function.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    is_eval = mode == tf.estimator.ModeKeys.EVAL
    is_predict = mode == tf.estimator.ModeKeys.PREDICT

    # Embedding layer; the embedding width is fixed at 128.
    embedded = tf.contrib.layers.embed_sequence(
        features['text'],
        vocab_size=vocab_size,
        embed_dim=128,
        initializer=params['embedding_initializer'],
    )
    # Dropout on the embedding output; only active while training.
    embedded_dropped = tf.layers.dropout(
        inputs=embedded, rate=0.2, training=is_training)

    # Convolutional feature extractor.
    conv = tf.layers.conv1d(
        inputs=embedded_dropped,
        filters=32,
        kernel_size=3,
        padding='same',
        activation=tf.nn.relu,
    )

    # Max-pooling: reduce over axis 1 (presumably the sequence axis).
    pooled = tf.reduce_max(input_tensor=conv, axis=1)

    # Fully-connected layer, dropout, then the single output logit.
    hidden = tf.layers.dense(inputs=pooled, units=250, activation=tf.nn.relu)
    hidden_dropped = tf.layers.dropout(
        inputs=hidden, rate=0.2, training=is_training)
    logits = tf.layers.dense(inputs=hidden_dropped, units=1)

    # Labels are None when predicting; otherwise reshape them to one column
    # so they line up with the (-1, 1) logits.
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    # Build the spec that matches the requested mode.
    if is_training:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.minimize(loss, global_step)
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if is_eval:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        probabilities = tf.nn.sigmoid(logits)
        # Rounding the probability thresholds it at 0.5 before comparing.
        accuracy = tf.metrics.accuracy(labels, tf.round(probabilities))
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})

    if is_predict:
        predictions = {'prob': tf.nn.sigmoid(logits)}
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

In [28]:
# Training hyper-parameters read by the input functions and model_fn.
BATCH_SIZE = 16
NUM_EPOCHS = 10
vocab_size = prepro_configs['vocab_size']
# NOTE(review): model_fn hard-codes embed_dim=128 and does not read this name.
embedding_size = 128

params = {'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0)}

# Checkpoint directory. The path is absolute, so it is used directly rather
# than being joined onto the current working directory.
model_dir = "/home/ubuntu/movie/NSMC/notebook/checkpoint/cnn_model"
os.makedirs(model_dir, exist_ok=True)

# Configure checkpointing through the public constructor arguments instead of
# writing to RunConfig's private attributes. Giving only
# save_checkpoints_steps leaves time-based checkpointing disabled.
config_tf = tf.estimator.RunConfig(
    save_checkpoints_steps=100,
    keep_checkpoint_max=2,
    log_step_count_steps=100,
)

est = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf, params=params)

INFO:tensorflow:Using config: {'_model_dir': '/root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9ff5319080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [29]:
# Show TensorFlow's INFO-level training logs and record the library version.
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

clock_format = "%H:%M:%S"
separator = "......................................."

time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime(clock_format)))
print(separator)

# Train on the pipeline produced by train_input_fn.
est.train(input_fn=train_input_fn)

time_end = datetime.utcnow()
time_elapsed = time_end - time_start
print(separator)
print("Experiment finished at {}".format(time_end.strftime(clock_format)))
print("")
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

1.12.0
Experiment started at 07:22:36
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model/model.ckpt.
INFO:tensorflow:loss = 0.7394456, step = 1
INFO:tensorflow:Saving checkpoints for 100 into /root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model/model.ckpt.
INFO:tensorflow:global_step/sec: 30.7643
INFO:tensorflow:loss = 0.6711975, step = 101 (3.251 sec)
INFO:tensorflow:Saving checkpoints for 200 into /root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model/model.ckpt.
INFO:tensorflow:global_step/sec: 31.8655
INFO:tensorflow:loss = 0.6915049, step = 201 (3.138 sec)
INFO:tensorflow:Saving checkpoints for 300 into /root/NSMC_Sentimental-Analysis/no

In [30]:
# Score the trained model on the held-out evaluation split; the returned
# metrics are kept in `valid`.
valid = est.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-31-08:06:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model/model.ckpt-84380
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-31-08:07:00
INFO:tensorflow:Saving dict for global step 84380: acc = 0.8247333, global_step = 84380, loss = 0.5217535
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 84380: /root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model/model.ckpt-84380


In [35]:
# File names of the preprocessed NSMC test inputs and labels.
INPUT_TEST_DATA_FILE_NAME = 'nsmc_test_input.npy'
LABEL_TEST_DATA_FILE_NAME = 'nsmc_test_label.npy'

# Passing a path lets np.load open and close the file itself, so no file
# handle is left dangling in the kernel.
test_input_data = np.load(FILE_DIR_PATH + INPUT_TEST_DATA_FILE_NAME)
test_label_data = np.load(FILE_DIR_PATH + LABEL_TEST_DATA_FILE_NAME)

def test_mapping_fn(X):
    input = {'text': X}
    
    return input

#def test_input_fn():
#    dataset = tf.data.Dataset.from_tensor_slices(test_input_data)
#    dataset = dataset.map(mapping_fn)
#    iterator = dataset.make_one_shot_iterator()
#    
#    return iterator.get_next()

def test_input_fn():
    """Build the label-free prediction pipeline over ``test_input_data``.

    Examples are batched in groups of 16 and wrapped with
    ``test_mapping_fn``. They are not shuffled, so the original order of
    ``test_input_data`` is kept.

    Returns:
        The ``get_next()`` output of a one-shot iterator over that dataset.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices(test_input_data)
        .batch(16)
        .map(test_mapping_fn)
    )
    return pipeline.make_one_shot_iterator().get_next()

#def eval_input_fn():
#    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
#    dataset = dataset.shuffle(buffer_size=len(input_eval))
#    dataset = dataset.batch(16)
#    dataset = dataset.map(mapping_fn)
#    iterator = dataset.make_one_shot_iterator()
# 
#    return iterator.get_next()

prediction = est.predict(input_fn=test_input_fn)

# Run prediction and keep, for every yielded item, the first element of its
# 'prob' entry as that example's score.
num_score = [example['prob'][0] for example in prediction]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /root/NSMC_Sentimental-Analysis/notebook/checkpoint/cnn_model/model.ckpt-84380
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [36]:
# Turn each score into a hard label: 1 when the score is at least 0.5, else 0.
result_label = [1 if score >= 0.5 else 0 for score in num_score]

In [37]:
# Accuracy of the thresholded predictions against the ground-truth test labels.
count = len(result_label)

# A length mismatch would pair predictions with the wrong labels, or score
# only a prefix of them, so fail loudly instead of reporting a bogus number.
if count != len(test_label_data):
    raise ValueError(
        "prediction/label length mismatch: {} predictions vs {} labels".format(
            count, len(test_label_data)))

# Number of positions where the predicted label equals the true label.
value = sum(
    1
    for label_predict, label_true in zip(result_label, test_label_data)
    if label_predict == label_true
)

# With no predictions the accuracy is undefined; report NaN rather than
# raising ZeroDivisionError.
accr = (value / count) * 100 if count else float('nan')

print(accr,"%")

56.245999999999995 %


In [38]:
# Display ten ground-truth test labels (positions 200 through 209).
test_label_data[200:210]

array([0, 1, 1, 1, 1, 0, 1, 0, 0, 1])