# 感情分析をRNNでやってみます。

In [10]:
import numpy as np

In [11]:
import tensorflow as tf

In [12]:
with open('./reviews.txt', 'r') as f:
    reviews = f.read()

In [59]:
reviews

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [13]:
reviews[:200]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  '

In [14]:
with open('./labels.txt','r') as f:
    labels = f.read()

In [15]:
labels[:200]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npo'

In [16]:
from string import punctuation
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')

all_text = ' '.join(reviews)
words = all_text.split()

In [17]:
all_text[:200]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s '

In [65]:
# reviews[:200]

In [66]:
# words[:200]

In [18]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)

In [19]:
vocab_to_int = {word:ii for ii,word in enumerate(vocab,1)}

In [20]:
reviews_int = []
for each in reviews:
    reviews_int.append([vocab_to_int[word] for word in each.split()])

In [21]:
# reviews_int レビューを数字の配列に変換する。

In [22]:
# ラベルをベクトルにする

In [23]:
labels = labels.split('\n')
labels = np.array([1 if each =='positive' else 0 for each in labels])

In [24]:
review_lens = Counter([len(x) for x in reviews_int])

In [25]:
review_lens[0]

1

In [26]:
max(review_lens)

2514

In [27]:
non_zero_idx = [ii for ii, review in enumerate(reviews_int) if len(review) != 0]

In [28]:
len(non_zero_idx)

25000

In [29]:
reviews_int[-1]

[]

In [30]:
reviews_int = [reviews_int[ii] for ii in non_zero_idx]

In [31]:
labels = np.array([labels[ii] for ii in non_zero_idx])

In [32]:
seq_len = 200
features = np.zeros((len(reviews_int), seq_len), dtype=int)

In [33]:
for i, row in enumerate(reviews_int):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [34]:
# features[:10,:100]

In [35]:
# データをトレーニング用と検証用に分割する

In [36]:
split_frac = 0.8

In [37]:
split_idx = int(len(features)*0.8)

In [38]:
train_x, val_x = features[:split_idx], features[split_idx:]

In [39]:
train_y, val_y = labels[:split_idx], labels[split_idx:]

In [40]:
test_idx = int(len(val_x)*0.5)

In [41]:
val_x, test_x = val_x[:test_idx], val_x[test_idx:]

In [42]:
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

In [43]:
print("Train set: \t\t{}".format(train_x.shape))

Train set: 		(20000, 200)


In [44]:
print("Validation set: \t{}".format(val_x.shape))

Validation set: 	(2500, 200)


In [45]:
print("Test set: \t\t{}".format(test_x.shape))

Test set: 		(2500, 200)


## グラフの定義

In [46]:
lstm_size = 256
lstm_layers = 1
batch_size = 500
learning_rate = 0.001

In [47]:
n_words = len(vocab_to_int) + 1

graph = tf.Graph()
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32,[None,None], name='inputs')
    labels_ = tf.placeholder(tf.int32,[None,None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')

In [48]:
embed_size = 300

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words,embed_size),-1,1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)


### LSTMセルとレイヤーを定義する

In [49]:
with graph.as_default():
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)

In [50]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell,embed,initial_state=initial_state)

### 推定値の計算と損失関数、最適化処理の定義

In [51]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:,-1],1,activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_,predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### 学習精度の計測

In [52]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### バッチ（指定長のデータ）を返すモジュール定義

In [53]:
def get_batches(x, y, batch_size=100):
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

## トレーニング（学習）

In [54]:
epochs = 10
with graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        for ii, (x,y) in enumerate(get_batches(train_x,train_y, batch_size),1):
            feed = {inputs_: x,
                   labels_: y[:,None],
                   keep_prob: 0.5,
                   initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                     "Iteration: {}".format(iteration),
                     "Training Loss: {:.3f}".format(loss))
                
            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x,y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                           labels_: y[:,None],
                           keep_prob: 1,
                           initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state],feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Value Acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1
    saver.save(sess, "checkpoint/sentiment.ckpt")
        
        
        
        
        
                
            

Epoch: 0/10 Iteration: 5 Training Loss: 0.242
Epoch: 0/10 Iteration: 10 Training Loss: 0.251
Epoch: 0/10 Iteration: 15 Training Loss: 0.223


KeyboardInterrupt: 

In [109]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoint'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x,y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
               labels_: y[:,None],
               keep_prob: 1,
               initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test Accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoint\sentiment.ckpt
Test Accuracy: 0.785
