In [11]:
import os
from datetime import datetime
import tensorflow as tf
import json
import numpy as np
from sklearn.model_selection import train_test_split

DATA_IN_PATH = 'C:/jupyter_project/Estimation _Analysis_with_Tensorflow_and_Natural_Language_Processing/data/DATA_IN/'
DIR_PATH = 'C:/jupyter_project/Estimation _Analysis_with_Tensorflow_and_Natural_Language_Processing/data/DATA_IN/nsmc/'

TEST_SPLIT = 0.1
RNG_SEED = 13371447
BATCH_SIZE = 16 #샘플데이터 중 한번에 넘겨주는 데이터의 수
NUM_EPOCHS = 10 #전체 데이터를 돌며 학습하는 수
EMB_SIZE = 160

In [12]:
#featuers : input_fn에서 온 batch_size만큼의 입력 데이터
#labels : input_fn에서 온 batch_size만큼의 레이블
#mode : 딥러닝 모드
#params : 추가적인 구성요소
def model_fn(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    #임베딩 : 데이터 백터화
    embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE, EMB_SIZE)(features['x'])
    #Dropout : 전체 weight을 계산에 참여시키지 않고 일부만 참여시킴
    #rate : 탈락시킬 데이터의 비율
    dropout_emb = tf.keras.layers.Dropout(rate = 0.2)(embedding_layer)
    #Conv1D : 합성곱 신경망...? 뉴럴네티워크 구성??
    conv = tf.keras.layers.Conv1D(filters=32,
                                  kernel_size=3,
                                  padding='same',
                                  activation=tf.nn.relu)(dropout_emb)
    pool = tf.keras.layers.GlobalMaxPool1D()(conv)
    #히든레이어 생성????
    hidden = tf.keras.layers.Dense(units=250, activation=tf.nn.relu)(pool)   
    dropout_hidden = tf.keras.layers.Dropout(rate=0.2)(hidden, training = TRAIN)
    logits = tf.keras.layers.Dense(units=1)(dropout_hidden)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
        
    if TRAIN:
        print("=======================train")
        #만약 training이 예기치못하게 멈췄을 경우 다시 시작할때 사용하는 변수
        global_step = tf.train.get_global_step()
        #cross_entropy = 확률변수의 평균 정보량 = 불확실성의 정도
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        #모델 최적화
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)
        #사용한 모델 객체 리턴
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)
    
    elif EVAL:
        print("=======================eval")
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        #활성화 함수 지정(sigmoid)
        #sigmoid와 logits은 역함수 관계
        pred = tf.nn.sigmoid(logits)
        #유사도 계산
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    elif PREDICT:
        print("=======================test")
        return tf.estimator.EstimatorSpec(mode=mode, predictions={'prob': tf.nn.sigmoid(logits), 
                                                                 'logits': logits})

In [13]:
def mapping_fn(X, Y):
    input, label = {'x':X}, Y #model_fn에서 Embedding할 때 'x' 를 사용
    return input, label
#===================================================================================
def train_input_fn(): ## Estimator 모델에서 사용되는 데이터 입력 파이프라인
    #Dataset생성 : 입력된 텐서로부터 slices 생성
    dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))
    #데이터들을 램던으로 섞음(큰수 입력)
    dataset = dataset.shuffle(buffer_size=len(train_data))
    #데이터를 읽어올 개수를 지정함 
    dataset = dataset.batch(BATCH_SIZE)
    #데이터 적절한 형식으로 매핑
    dataset = dataset.map(mapping_fn)
    #데이터를 읽다가 마지막에 도달하면 다시 처음부터 조회
    datset = dataset.repeat(count=NUM_EPOCHS)
    #iterator 생성
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next() #다음 항목에 연결되어 있는 tf.Tensor 객체를 리턴
#===================================================================================
def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((eval_data, eval_label))
    dataset = dataset.shuffle(buffer_size=len(eval_data))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()
#===================================================================================
def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_data, test_label))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [14]:
def const(att):
    train_data = 'train_data_'+att+'.npy'
    train_label = 'train_label_'+att+'.npy'
    train_configs = 'train_'+att+'_configs.json'
    
    test_data = 'test_data_'+att+'.npy'
    test_label = 'test_label_'+att+'.npy'
    test_configs = 'test_'+att+'_configs.json'
    
    train_data_loc = np.load(open(DIR_PATH+train_data, 'rb'))
    train_label_loc = np.load(open(DIR_PATH+train_label, 'rb'))
    train_configs_loc = json.load(open(DIR_PATH+train_configs, 'r'))
    
    test_data_loc = np.load(open(DIR_PATH+test_data, 'rb'))
    test_label_loc = np.load(open(DIR_PATH+test_label, 'rb'))
    test_configs_loc = json.load(open(DIR_PATH+test_configs, 'r'))
    
    return train_data_loc, train_label_loc, train_configs_loc, test_data_loc, test_label_loc, test_configs_loc

In [15]:
att = ['과제', '시험', '출석', '수업', '인성']
tmp = []
res = []
f = open("C:/jupyter_project/Estimation _Analysis_with_Tensorflow_and_Natural_Language_Processing/data/result/res.txt", 'w')
for i in att:
    train_data, train_label, train_configs, test_data, test_label, test_configs = const(i)
    VOCAB_SIZE = train_configs['vocab_size']
    train_data, eval_data, train_label, eval_label = train_test_split(train_data, train_label, test_size = TEST_SPLIT, random_state = RNG_SEED)
        
    est = tf.estimator.Estimator(model_fn,model_dir="C:/jupyter_project/Estimation _Analysis_with_Tensorflow_and_Natural_Language_Processing/data/checkpoint/cnn_model_"+i) #모델 생성 
    
    est.train(train_input_fn) # 모델 훈련
    
    valid = est.evaluate(eval_input_fn) # 모델 검증 
    
    predict = est.predict(test_input_fn) # 모델 평가
    
    #===================================================================================================
    positive = 0
    negative = 0
    piont = 0
    f.write(i)
    for k in predict:
        tmp.append(k['prob'])
        if(k['prob'] > 0.5):
            positive = positive + 1
            f.write("\nlabel : 1 긍정")
            f.write("prob : {}".format(k['prob']))
        else:
            negative = negative + 1
            f.write("\nlabel : 0 부정")
            f.write("prob : {}".format(k['prob']))
    if((positive + negative) != 0):
        point = 10*(positive/(positive + negative))
        f.write("\n {:.2f}\n\n".format(point))
    else:
        f.write("\n 해당 속성에 해당하는 값 없음\n\n")
    res.append(tmp)
f.close()
    

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:/jupyter_project/Estimation _Analysis_with_Tensorflow_and_Natural_Language_Processing/data/checkpoint/cnn_model_과제', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000024BB8605160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INF

InvalidArgumentError: indices[0,0] = 43 is not in [0, 4)
	 [[node embedding/embedding_lookup (defined at <ipython-input-12-4f0956f8be74>:11) ]]

Caused by op 'embedding/embedding_lookup', defined at:
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\asyncio\base_events.py", line 438, in run_forever
    self._run_once()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\asyncio\base_events.py", line 1451, in _run_once
    handle._run()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\gen.py", line 781, in inner
    self.run()
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\gen.py", line 742, in run
    yielded = self.gen.send(value)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\IPython\core\interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\IPython\core\interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\IPython\core\interactiveshell.py", line 3049, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\IPython\core\interactiveshell.py", line 3214, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\IPython\core\interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-b658d0374e21>", line 12, in <module>
    est.train(train_input_fn) # 모델 훈련
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 358, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1124, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1154, in _train_model_default
    features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1112, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "<ipython-input-12-4f0956f8be74>", line 11, in model_fn
    embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE, EMB_SIZE)(features['x'])
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 554, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\keras\layers\embeddings.py", line 179, in call
    out = embedding_ops.embedding_lookup(self.embeddings, inputs)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 316, in embedding_lookup
    transform_fn=None)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 133, in _embedding_lookup_and_transform
    result = _clip(array_ops.gather(params[0], ids, name=name),
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\util\dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\ops\array_ops.py", line 3271, in gather
    return params.sparse_read(indices, name=name)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 759, in sparse_read
    self._handle, indices, dtype=self._dtype, name=name)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\ops\gen_resource_variable_ops.py", line 632, in resource_gather
    validate_indices=validate_indices, name=name)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\framework\ops.py", line 3300, in create_op
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\envs\newkjh\lib\site-packages\tensorflow\python\framework\ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): indices[0,0] = 43 is not in [0, 4)
	 [[node embedding/embedding_lookup (defined at <ipython-input-12-4f0956f8be74>:11) ]]
