In [1]:
import os
import pickle
import random
from time import localtime, strftime
from pprint import pprint

import numpy as np
import tensorflow as tf
from collections import OrderedDict

from nets.text_cnn import textcnn

## 1. Loading and transforming the data

### 1.1. Loading the data

In [2]:
# 미리 토크나이징된 데이터 불러오기
with open('/home/ubuntu/workspace/data/sample_naver_movie/134963_tokenizeddocs_scores.pkl', 'rb') as f:
    data = pickle.load(f)
tokenized_documents = data[0]
scores = data[1]

In [3]:
max_document_length = max((len(doc) for doc in tokenized_documents))

In [3]:
# max_document_length 계산하기
max_document_length = 0
for document in tokenized_documents:
    document_length = len(document)
    if document_length > max_document_length:
        max_document_length = document_length

In [10]:
max_document_length

74

In [5]:
# documents: 토크나이징된 결과를 다시 공백으로 concatenate(결합)
# (추후 TensorFlow 내에 있는 VocabularyProcessor 클래스를 사용하기 위함)
documents = []
for document in tokenized_documents:
    documents.append(' '.join([token for token in document]))

In [6]:
pprint(documents[0:3])

['시사회 에서 보고 왔습니다 동화 와 재즈 뮤지컬 의 만남 ! 지루 하지 않고 재밌 습니다',
 '사랑 과 꿈, 그 흐름 의 아름다 움을 음악 과 영상 으로 최 대한 담아 놓았 다. 배우 들 연기 는 두 말할 것 없고',
 '지금 껏 영화 평가 해본 적이 없는 데 진짜 .. 최고 네요! 색감 . 스토리 . 음악 . 연기 모두 ㅜㅜ 최고 입니다 !! !!']


### 1.2. Transfoming the data
tensorflow.contrib.learn.preprocessing 내에 **VocabularyProcessor**라는 클래스를 이용
- 모든 문서에 등장하는 단어들에 인덱스를 할당
- 길이가 다른 문서를 max_document_length로 맞춰주는 역할

In [7]:
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)

In [8]:
# Change data type: list -> np.array
input_data = np.array(list(vocab_processor.fit_transform(documents)))
target_data = np.array(scores)

In [12]:
# input_data는 어떻게 생겼는가?
print(type(input_data))
print(input_data.shape)
print(input_data[6372])

<class 'numpy.ndarray'>
(15603, 74)
[ 848  340  341  353   88  221 4548 1519  362    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [9]:
# target_data는 어떻게 생겼는가?
print(type(target_data))
print(target_data.shape)
print(target_data[0])

<class 'numpy.ndarray'>
(15603,)
9


아래는 TensorFlow의 VocabularyProcessor 클래스로부터 다시 단어를 가져오는 방법을 정리한 것입니다.

In [13]:
# http://stackoverflow.com/questions/40661684/tensorflow-vocabularyprocessor

# Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping

# Sort the vocabulary dictionary on the basis of values(id).
sorted_vocab = sorted(vocab_dict.items(), key = lambda input_data : input_data[1])

# Treat the id's as index into list and create a list of words in the ascending order of id's
# word with id i goes at index i of the list.
vocabulary = list(list(zip(*sorted_vocab))[0])

print(vocabulary[:10])
print(tokenized_documents[0])
print(input_data[0])

['<UNK>', '시사회', '에서', '보고', '왔습니다', '동화', '와', '재즈', '뮤지컬', '의']
['시사회', '에서', '보고', '왔습니다', '동화', '와', '재즈', '뮤지컬', '의', '만남', '!', '지루', '하지', '않고', '재밌', '습니다']
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


## 2. Divide data into training set and test set
여기서는 영화 평점 7점 이하 (class = 0) 와 8점 이상 (class = 1) 으로 나누어서, 각 영화 평가에 클래스를 할당하였습니다.

In [14]:
# Parameters
train_prop = 0.8

In [15]:
# 각 클래스 별 영화 평점 인덱스 저장
class_0 = [i for i in range(len(target_data)) if target_data[i] <= 7]
class_1 = [i for i in range(len(target_data)) if target_data[i] >= 8]

In [16]:
print("Number of sentences with class = 0: %d" % len(class_0))
print("Number of sentences with class = 1: %d" % len(class_1))

Number of sentences with class = 0: 2786
Number of sentences with class = 1: 12817


In [17]:
# 각 클래스 별로 랜덤하게 train/test set의 인덱스 생성
random.seed(1234)
train_class_0 = random.sample(class_0, round(train_prop * len(class_0)))
test_class_0 = [element for element in class_0 if element not in train_class_0]
random.seed(1234)
train_class_1 = random.sample(class_1, round(train_prop * len(class_1)))
test_class_1 = [element for element in class_1 if element not in train_class_1]

In [18]:
# Train set
input_train = input_data[train_class_0 + train_class_1]
target_train = np.r_[[[1,0] for _ in train_class_0] + [[0,1] for _ in train_class_1]]

# Test set
input_test = input_data[test_class_0 + test_class_1]
target_test = np.r_[[[1,0] for _ in test_class_0] + [[0,1] for _ in test_class_1]]

## 3. Build the graph
1. placeholders
2. logits using the network
3. cost function and accuracy 
4. optimizer and training operator
5. collections and saver

In [19]:
# Parameters
embedding_dim = 128
filter_sizes = (3, 4, 5)
num_filters = 32

learning_rate = 0.01

save_dir = 'ckpt_textcnn/'

sequence_length = input_train.shape[1]
num_classes = target_train.shape[1]
vocab_size = len(vocab_processor.vocabulary_)

### 3.1. Placeholders

In [20]:
# Placeholders
X = tf.placeholder(tf.int32, [None, sequence_length], name='placeholder_X')
Y = tf.placeholder(tf.float32, [None, num_classes], name='placeholder_Y')
train_phase = tf.placeholder(tf.bool, name='placeholder_train_phase')

In [21]:
print(X)
print(Y)

Tensor("placeholder_X:0", shape=(?, 74), dtype=int32)
Tensor("placeholder_Y:0", shape=(?, 2), dtype=float32)


### 3.2. Logits

In [22]:
logits, end_points = textcnn(input_placeholder=X, 
                             target_placeholder=Y, 
                             vocab_size=vocab_size, 
                             embedding_dim=embedding_dim,
                             filter_sizes=filter_sizes,
                             num_filters=num_filters,
                             is_training=train_phase,
                             scope='TextCNN')

In [23]:
print("======= logits ======")
print(logits)
print("======= end_points ======")
pprint(end_points)

Tensor("TextCNN/Fully-connected/logits:0", shape=(?, 2), dtype=float32)
OrderedDict([('Embedding',
              <tensorflow.python.ops.variables.Variable object at 0x7f83cca28320>),
             ('Conv-maxpool-3',
              <tf.Tensor 'TextCNN/Conv-maxpool-3/maxpool:0' shape=(?, 1, 1, 32) dtype=float32>),
             ('Conv-maxpool-4',
              <tf.Tensor 'TextCNN/Conv-maxpool-4/maxpool:0' shape=(?, 1, 1, 32) dtype=float32>),
             ('Conv-maxpool-5',
              <tf.Tensor 'TextCNN/Conv-maxpool-5/maxpool:0' shape=(?, 1, 1, 32) dtype=float32>),
             ('Flatten',
              <tf.Tensor 'TextCNN/Flatten/flatten:0' shape=(?, 96) dtype=float32>),
             ('Fully-connected',
              <tf.Tensor 'TextCNN/Fully-connected/logits:0' shape=(?, 2) dtype=float32>)])


### 3.3. Cost function and accuracy

In [24]:
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=logits, name='cross_entropy')
cost = tf.reduce_mean(cross_entropy, name='cost')

In [25]:
prediction = tf.argmax(logits, 1, name='prediction')
correct = tf.equal(prediction, tf.argmax(Y, 1), name='correct')
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')

### 3.4. Optimizer and training operator

In [26]:
optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

### 3.5. Collections and saver

In [27]:
## Collections
tf.add_to_collection(name='placeholder_X', value=X)
tf.add_to_collection(name='placeholder_Y', value=Y)
tf.add_to_collection(name='placeholder_train_phase', value=train_phase)
tf.add_to_collection(name='logits', value=logits)
# tf.add_to_collection(name='end_points', value=end_points)
tf.add_to_collection(name='cross_entropy', value=cross_entropy)
tf.add_to_collection(name='cost', value=cost)
tf.add_to_collection(name='prediction', value=prediction)
tf.add_to_collection(name='correct', value=correct)
tf.add_to_collection(name='accuracy', value=accuracy)
# tf.add_to_collection(name='optimizer', value=optimizer)
tf.add_to_collection(name='train_op', value=train_op)

## Saver
saver = tf.train.Saver(tf.global_variables(), max_to_keep = 10)

In [28]:
save_dir_path = os.path.join(os.getcwd(), save_dir)
if not os.path.isdir(save_dir_path):
    print("There is no %s directory. So I'll make it." % save_dir_path)
    os.mkdir(save_dir_path)
else:
    print("I have already %s directory." % save_dir_path)

There is no /home/ubuntu/workspace/users/lovit/day8/Doc_classification_CNN/ckpt_textcnn/ directory. So I'll make it.


## 4. Training and test

In [29]:
# Parameters
num_epochs = 100
evaluate_interval = 2
save_interval = 10

train_batch_size = 64
test_batch_size = 256

In [27]:
def make_batch_iterator(target_data, batch_size, allow_small_batch=True):
    num_points = len(target_data)
    start_idx = list(range(0, num_points, batch_size))
    end_idx = list(range(batch_size, num_points + 1, batch_size))
    if allow_small_batch and end_idx[-1] < num_points :
        start_idx.append(end_idx[-1])
        end_idx.append(num_points)
    return zip(start_idx, end_idx)

In [28]:
# test_batch_iterator = make_batch_iterator(target_data=target_test, batch_size=test_batch_size)

In [29]:
# for start, end in train_batch_iterator: print(start, '\t', end)

In [30]:
# for step, (start, end) in enumerate(test_batch_iterator): print(step, ': ', start, '\t', end)

In [31]:
# Declare the session.
sess = tf.Session()

In [32]:
# Initialize global variables.
sess.run(tf.global_variables_initializer())

In [33]:
for epoch in range(num_epochs):
    
    print("\n")
    print("Start %d epoch!" % epoch)
    
    train_cost = 0
    
    train_batch_iterator = make_batch_iterator(target_data=target_train, batch_size=train_batch_size)
    for step, (start, end) in enumerate(train_batch_iterator):
    
        batch_X = input_train[start:end]
        batch_Y = target_train[start:end]
        _, tmp_cost = sess.run([train_op, cost], feed_dict={X: batch_X, Y: batch_Y, train_phase: True})
        train_cost += tmp_cost
        
        if (step + 1) % evaluate_interval == 0:
            tmp_acc = sess.run(accuracy, feed_dict={X: batch_X, Y: batch_Y, train_phase: False})
            time = strftime("%Y-%m-%d %H:%M:%S", localtime())
            print("\t(%s) [%d step / %d epoch] train_cost = %g, train_accuracy = %g" % (time, step, epoch, train_cost/(step + 1), tmp_acc))
        
        if (step + 1) % save_interval == 0:
            save_name = save_dir_path + '/model_' + str(epoch) + '_' + str(step)
            saver.save(sess, save_name)
            time = strftime("%Y-%m-%d %H:%M:%S", localtime())
            print("\t(%s) [%d step / %d epoch] Save model." % (time, step, epoch))
    
    train_cost = train_cost / (step + 1)
    time = strftime("%Y-%m-%d %H:%M:%S", localtime())
    print("(%s) [%d epoch] train_cost = %g", (time, epoch, train_cost))
    



Start 0 epoch!
	(2017-04-15 22:46:06) [1 step / 0 epoch] train_cost = 0.467174, train_accuracy = 1
	(2017-04-15 22:46:06) [3 step / 0 epoch] train_cost = 0.287404, train_accuracy = 1
	(2017-04-15 22:46:06) [5 step / 0 epoch] train_cost = 0.206883, train_accuracy = 1
	(2017-04-15 22:46:06) [7 step / 0 epoch] train_cost = 0.162678, train_accuracy = 1
	(2017-04-15 22:46:06) [9 step / 0 epoch] train_cost = 0.134555, train_accuracy = 1
		(2017-04-15 22:46:06) [9 step / 0 epoch] Save model.
	(2017-04-15 22:46:06) [11 step / 0 epoch] train_cost = 0.115093, train_accuracy = 1
	(2017-04-15 22:46:06) [13 step / 0 epoch] train_cost = 0.100473, train_accuracy = 1
	(2017-04-15 22:46:06) [15 step / 0 epoch] train_cost = 0.0893812, train_accuracy = 1
	(2017-04-15 22:46:06) [17 step / 0 epoch] train_cost = 0.0804873, train_accuracy = 1
	(2017-04-15 22:46:06) [19 step / 0 epoch] train_cost = 0.0733224, train_accuracy = 1
		(2017-04-15 22:46:06) [19 step / 0 epoch] Save model.
	(2017-04-15 22:46:06) [

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/kth/anaconda3/envs/zeta-ocr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-33-b7bcf2569795>", line 23, in <module>
    saver.save(sess, save_name)
  File "/home/kth/anaconda3/envs/zeta-ocr/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1375, in save
    self.export_meta_graph(meta_graph_filename)
  File "/home/kth/anaconda3/envs/zeta-ocr/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1403, in export_meta_graph
    graph_def=ops.get_default_graph().as_graph_def(add_shapes=True),
  File "/home/kth/anaconda3/envs/zeta-ocr/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2189, in as_graph_def
    result, _ = self._as_graph_def(from_version, add_shapes)
  File "/home/kth/anaconda3/envs/zeta-ocr/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", li

KeyboardInterrupt: 