# SETUP

In [65]:
import numpy as np
import tensorflow as tf
import collections
import argparse
import time
import os
from six.moves import cPickle

In [66]:
file_path = "/Users/quartz/data/rnn_practice/"
file_train = os.path.join(file_path, "ptb.train.txt")
with open(file_raw, "r") as f:
    data_train = f.read().split()

In [67]:
# language data preprocessing

counter = collections.Counter(data_train)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

print("Type of 'count pairs' is %s and length is %d" 
      % (type(count_pairs), len(count_pairs)))

for i in range(5):
    print(count_pairs[i])

Type of 'count pairs' is <class 'list'> and length is 9999
('the', 50770)
('<unk>', 45020)
('N', 32481)
('of', 24400)
('to', 23638)


In [68]:
# make vocabulary

chars, counts = zip(*count_pairs)
vocab = dict(zip(chars, range(len(chars))))

In [69]:
vocab['to']

4

In [70]:
# load test dataset

file_test = os.path.join(file_path, "ptb.test.txt")
with open(file_test, "r") as f:
    data_test = f.read().split()

In [71]:
# 고유 번호 부여

corpus_train = np.array(list(map(vocab.get, data_train)))
corpus_test = np.array(list(map(vocab.get, data_test)))
check_len = 30
print("\n'corpus_train' looks like %s" % (corpus_train[20:check_len]))


'corpus_train' looks like [9995 9996 9997 9998 9255    1    2   71  392   32]


In [79]:
corpus_train, corpus_test

(array([9969, 9970, 9971, ...,    4,   22,    1]),
 array([101,  13,  23, ..., 346,  21, 504]))

In [81]:
# generate batch dataset

batch_size = 20
seq_length = 20
num_batches_train = int(corpus_train.size / (batch_size * seq_length))
num_batches_test = int(corpus_test.size / (batch_size * seq_length))

corpus_train_reduced = corpus_train[:(num_batches_train*batch_size*seq_length)]
corpus_test_reduced = corpus_test[:(num_batches_test*batch_size*seq_length)]

xdata_train = corpus_train_reduced
ydata_train = np.copy(xdata_train)
ydata_train[:-1] = xdata_train[1:]
ydata_train[-1] = xdata_train[0]

xdata_test = corpus_test_reduced
ydata_test = np.copy(xdata_test)
ydata_test[:-1] = xdata_test[1:]
ydata_test[-1] = xdata_test[0]

In [82]:
xbatches_train = np.split(xdata_train.reshape(batch_size, -1), num_batches_train, 1)
ybatches_train = np.split(ydata_train.reshape(batch_size, -1), num_batches_train, 1)
xbatches_test = np.split(xdata_test.reshape(batch_size, -1), num_batches_test, 1)
ybatches_test = np.split(ydata_test.reshape(batch_size, -1), num_batches_test, 1)

### LSTM 모델 구축

In [75]:
# LSTM 패러미터 지정
vocab_size = len(vocab)
rnn_size = 200
num_layers = 2
grad_clip = 5.

# LSTM 모델 정의
unitcell   = tf.nn.rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=True)
cell       = tf.nn.rnn_cell.MultiRNNCell([unitcell] * num_layers, state_is_tuple=True)
input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
targets    = tf.placeholder(tf.int32, [batch_size, seq_length])
istate     = cell.zero_state(batch_size, tf.float32)

In [85]:
# 가중치, 편향 정의. 워드 임베딩.

tf.to_float(input_data)
with tf.variable_scope('RnnLm', reuse=True):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embedding, input_data, tf.float32))
        inputs = [tf.squeeze(_input, [1]) for _input in inputs]

TypeError: Input 'split_dim' of 'Split' Op has type float32 that does not match expected type of int32.

### TEST

In [58]:
counter = collections.Counter(data_train)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
print ("Type of 'count_pairs' is %s and length is %d"
       % (type(count_pairs), len(count_pairs)))
for i in range(5):
    print (count_pairs[i])

Type of 'count_pairs' is <class 'list'> and length is 9999
('the', 50770)
('<unk>', 45020)
('N', 32481)
('of', 24400)
('to', 23638)


In [59]:
chars, counts = zip(*count_pairs)
vocab = dict(zip(chars, range(len(chars))))

input_file  = os.path.join(file_path, "ptb.test.txt")
with open(input_file, "r") as f:
    data_test = f.read().split()

# 학습 및 테스트 셋에 앞에서 만든 사전을 적용하여 고유번호를 부여합니다. corpus_train = np.array(list(map(vocab.get, data_train)))
corpus_test  = np.array(list(map(vocab.get, data_test)))
check_len = 30
print ("\n'corpus_train' looks like %s" % (corpus_train[20:check_len]))


'corpus_train' looks like [9995 9996 9997 9998 9255    1    2   71  392   32]


In [60]:
batch_size  = 20
seq_length  = 20
num_batches_train = int(corpus_train.size / (batch_size * seq_length))
num_batches_test = int(corpus_test.size / (batch_size * seq_length))
corpus_train_reduced = corpus_train[:(num_batches_train*batch_size*seq_length)]
corpus_test_reduced  = corpus_test[:(num_batches_test*batch_size*seq_length)]
xdata_train = corpus_train_reduced
ydata_train = np.copy(xdata_train)
ydata_train[:-1] = xdata_train[1:]
ydata_train[-1]  = xdata_train[0]
xdata_test = corpus_test_reduced
ydata_test = np.copy(xdata_test)
ydata_test[:-1] = xdata_test[1:]
ydata_test[-1]  = xdata_test[0]

In [61]:
xbatches_train = np.split(xdata_train.reshape(batch_size, -1), num_batches_train, 1)
ybatches_train = np.split(ydata_train.reshape(batch_size, -1), num_batches_train, 1)
xbatches_test = np.split(xdata_test.reshape(batch_size, -1), num_batches_test, 1)
ybatches_test = np.split(ydata_test.reshape(batch_size, -1), num_batches_test, 1)

In [64]:
vocab_size = len(vocab)
rnn_size   = 200
num_layers = 2
grad_clip = 5.

# LSTM 모델을 정의합니다.
unitcell   = tf.nn.rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=True)
cell       = tf.nn.rnn_cell.MultiRNNCell([unitcell] * num_layers, state_is_tuple=True)
input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
targets    = tf.placeholder(tf.int32, [batch_size, seq_length])
istate     = cell.zero_state(batch_size, tf.float32)

# 가중치 및 편향을 정의하고 Word-Embedding 작업을 수행합니다. 
with tf.variable_scope('RnnLm', reuse=tf.AUTO_REUSE):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embedding, input_data))
        inputs = [tf.squeeze(_input, [1]) for _input in inputs]

TypeError: Input 'split_dim' of 'Split' Op has type float32 that does not match expected type of int32.