In [1]:
from gensim import utils
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline

In [19]:
train_data = pd.read_csv('data/IMDB_review_train.csv')
test_data = pd.read_csv('data/IMDB_review_test.csv')
train_texts = list(train_data.text.values)
train_labels = list(train_data.sentiment.values)
test_texts = list(test_data.text.values)
test_labels = list(test_data.sentiment.values)

## Text to Numbers

In [20]:
from tools.text_preprocess import text_clean
tc = text_clean(train_texts)
train_processed = tc.proceed()
tc = text_clean(test_texts)
test_processed = tc.proceed()

Start to process....
Processing Finished! Timing:  146.488
Start to process....
Processing Finished! Timing:  139.029


Process ForkPoolWorker-3:
Process ForkPoolWorker-16:
Process ForkPoolWorker-12:
Process ForkPoolWorker-13:
Process ForkPoolWorker-11:
Process ForkPoolWorker-10:
Process ForkPoolWorker-8:
Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Process ForkPoolWorker-9:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-7:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/richardsun/anaconda3/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/richardsun/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **s

In [22]:
from tools.text_hier_split import sent2words
from tools.token_idx_map import token2idx
sw_train = sent2words(train_processed)
sw_test = sent2words(test_processed)
train_sent_words = sw_train.proceed()
test_sent_words = sw_test.proceed()
ti = token2idx(train_sent_words)
train_sent_idx = ti.proceed()
test_sent_idx = ti.map_text_idx(test_sent_words, ignore_sent=True)

Start mapping words to IDs....
Processing Finished! Timing:  5.131


In [25]:
n_words = len(ti.get_vocab())
print('Total words: %d' % n_words)

Total words: 20001


In [38]:
from tools.sample_generator import generate_samples
MAX_DOCUMENT_LENGTH = 800
gs_train = generate_samples(train_sent_idx, train_labels, MAX_DOCUMENT_LENGTH)
gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)

## Settings

In [70]:
class trainConfig:
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = 64
    hidden_size = 64
    batch_size = 64
    layer_size = 2
    
class testConfig:
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = 64
    hidden_size = 64
    batch_size = 64
    layer_size = 2
    
class singleConfig:
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = 64
    hidden_size = 64#hidden size for hidden state of rnn
    batch_size = 1
    layer_size = 2

In [77]:
train_chunk_num = int(len(train_texts)/trainConfig.batch_size)
test_chunk_num = int(len(test_texts)/trainConfig.batch_size)
remain_num = len(test_texts) - trainConfig.batch_size*test_chunk_num
remain_num

40

## Hierarchical Attention

In [72]:
import functools
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import encoders
def lazy_property(function):
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def decorator(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)

    return decorator

In [73]:
from tensorflow.contrib import rnn
class AttRNN_Model:
    def __init__(self, config, x, y, lengths, is_training=True):
        self.x = x
        self.y = y
        self.vocab_size = config.vocab_size
        self.embed_size = config.embed_size
        self.hidden_size = config.hidden_size
        self.label_size = config.label_size
        self.batch_size = config.batch_size
        self.lengths = lengths
        self.max_doc_len = config.max_doc_len
        self.is_training = is_training
        self.predict
        if is_training:
            self.optimize
        print('Model Initialized!')
    
    @lazy_property
    def cost(self):
        logits = self.inference
        targets = tf.one_hot(self.y, self.label_size, 1, 0)
        targets = tf.cast(targets, tf.float32)
        #Note  tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=activation)
        loss = tf.losses.softmax_cross_entropy(targets, logits)
        return loss
    
    @lazy_property
    def predict(self):
        logits = self.inference
        #probs = tf.nn.softmax(logits)
        predictions = tf.argmax(logits, 1)
        return predictions
    
    @lazy_property
    def correct_num(self):
        prediction = self.predict
        targets = tf.reshape(self.y, [-1])
        targets = tf.cast(targets, tf.int64)
        correct_prediction = tf.equal(prediction, targets)
        correct_num = tf.reduce_sum(tf.cast(correct_prediction, "float"))
        return correct_num
    
    @lazy_property
    def optimize(self):
        with tf.variable_scope('optimizer'):
            cost = self.cost
        #with tf.name_scope('Optimizer'):
            #self._learning_rate = tf.Variable(0.0, trainable=False)
            train_op = tf.train.AdamOptimizer(0.0001).minimize(cost)
            #train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(cost)
            #tvars = tf.trainable_variables()
            #grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 6)
            #optimizer = tf.train.AdamOptimizer(self._learning_rate)
            #train_op = optimizer.apply_gradients(zip(grads, tvars))
        return train_op
    
    @lazy_property
    def inference(self):
        #Create embedding matrix
        with tf.device("/cpu:0"):
            embeddings = tf.get_variable('embedding', [self.vocab_size,  self.embed_size])
            inputs = tf.nn.embedding_lookup(embeddings, self.x)
        if self.is_training:
            inputs = tf.nn.dropout(inputs, 0.5)

        def lstm():
            return rnn.BasicLSTMCell(self.hidden_size, forget_bias=0.0, 
                                      state_is_tuple=True) 
        
        def GRU():
            return rnn.GRUCell(self.hidden_size)
        #lstm_cell = lstm
        #cell = rnn.MultiRNNCell([lstm_cell() for _ in range(2)], 
                                #state_is_tuple=True)
        fw_cell = GRU()
        bw_cell = GRU()
        initial_fw_state = fw_cell.zero_state(self.batch_size, tf.float32)
        initial_bw_state = bw_cell.zero_state(self.batch_size, tf.float32)
        #Bidirectional dynamic RNN with given lengths for each text
        outputs, status = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, inputs,
                                                          initial_state_fw=initial_fw_state,
                                                          initial_state_bw=initial_bw_state,
                                                          sequence_length=self.lengths, dtype=tf.float32)
        #Use bidirectional rnn output as hidden states for words
        #size=batch_size, word_length, word_embedding_size*2
        H = tf.concat([outputs[0], outputs[1]], axis=2)

        
        #Calculate attention weights for each word
        with tf.variable_scope('Self_Attention'):
            #W_att = tf.get_variable('word_attention_weights', [self.hidden_size*2, 64])
            #b_att = tf.get_variable('word_attention_biases', [64])
            W_u = tf.get_variable('attention_softmax_weights', [64, 1])
            S = []
            for i in np.arange(self.batch_size):
                #Calculate the coefficients of attention
                h = H[i, :, :]
                u = tf.layers.dense(h, 64, activation=tf.tanh)
                #u = tf.tanh(tf.matmul(h, W_att) + b_att)
                #Softmax
                A = tf.nn.softmax(tf.matmul(u, W_u))
                #Transform original representation into a sum of weighted hidden states
                s = tf.reduce_sum(A * h, 0)
                S.append(s)
        
        #Put all the elements within the list into a tensor
        S = tf.stack(S)
        
        #Output layer   
        #weights = tf.get_variable('weights', [2*self.hidden_size, self.label_size], dtype=tf.float32)
        #biases = tf.get_variable('biases', [self.label_size], dtype=tf.float32)
        #logits = tf.matmul(S, weights) + biases
        with tf.variable_scope('output_layer'):
            logits = tf.layers.dense(S, self.label_size, activation=None)
        #预测值
        return logits
    
    @property
    def learningRate(self):
        return self._learning_rate
        

In [74]:
graph_attrnn = tf.Graph()
#Create models for training and testing data
with graph_attrnn.as_default():
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        train_data = tf.placeholder(tf.int32, [trainConfig.batch_size, None])
        train_label = tf.placeholder(tf.int32, [trainConfig.batch_size])
        train_lengths = tf.placeholder(tf.int32, [trainConfig.batch_size])
        #Set different models for different buckets
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = AttRNN_Model(trainConfig, train_data, train_label, train_lengths)
            saver=tf.train.Saver()
    with tf.name_scope('test'):
        test_data = tf.placeholder(tf.int32, [testConfig.batch_size, None])
        test_label = tf.placeholder(tf.int32, [testConfig.batch_size])
        test_lengths = tf.placeholder(tf.int32, [testConfig.batch_size])
        single_data = tf.placeholder(tf.int32, [singleConfig.batch_size, None])
        single_label = tf.placeholder(tf.int32, [singleConfig.batch_size])
        single_lengths = tf.placeholder(tf.int32, [singleConfig.batch_size])
        #Set different models for different buckets
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = AttRNN_Model(testConfig, test_data, test_label, test_lengths, False)
            single_model = AttRNN_Model(singleConfig, single_data, single_label, single_lengths, False)

Model Initialized!
Model Initialized!
Model Initialized!


In [75]:
gs_train = generate_samples(train_sent_idx, train_labels, MAX_DOCUMENT_LENGTH)
gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)

In [79]:
import time, os
epochs = 1
#train_chunk_num = 10
file = "ckpt/hirnn.ckpt"
with tf.Session(graph=graph_attrnn) as sess:
    #Initialize parameters
    init = tf.global_variables_initializer()
    sess.run(init)
    if os.path.exists("ckpt/hirnn.ckpt.index"):
        saver.restore(sess, file)
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            #sess.run(tf.assign(learning_rate, 0.002*((0.98)**m)))
            x, y, lengths, _ = gs_train.generate_batch(trainConfig.batch_size)
            feed_dict = {train_data:x, train_label:y, train_lengths:lengths}
            l, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            if i%100 == 0:
                print('Loss:', round(l, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        start_time = end_time
    saver.save(sess,'ckpt/hirnn.ckpt')
    #Calculate Testing Accuracy
    print('Testing...')
    count = 0
    gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)
    for _ in range(test_chunk_num):
        #Traverse each data
        x, y, lengths, _ = gs_test.generate_batch(testConfig.batch_size, False)
        feed_dict = {test_data:x, test_label:y, test_lengths:lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    for _ in range(remain_num):
        #Traverse each data
        x, y, lengths, _ = gs_test.generate_batch(1, False)
        feed_dict = {single_data:x, single_label:y, single_lengths:lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    print(count*1.0/len(test_texts))  

Loss: 0.6934
Loss: 0.6142
Loss: 0.3579
Loss: 0.2406
Epoch 0 time:1072.45
Testing...
Test Samples come to an end!
Testing Time:274.85
0.86724
