In [1]:
from gensim import utils
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
# Load the IMDB review splits, then pull the review texts and sentiment labels
# out of each frame as plain Python lists.
train_data = pd.read_csv('data/IMDB_review_train.csv')
test_data = pd.read_csv('data/IMDB_review_test.csv')

train_texts, train_labels = (
    list(train_data['text'].values),
    list(train_data['sentiment'].values),
)
test_texts, test_labels = (
    list(test_data['text'].values),
    list(test_data['sentiment'].values),
)

## Text to Numbers

In [3]:
from tools.text_preprocess import text_clean

# Run the project's text_clean preprocessing over the training reviews first,
# then over the test reviews, each with its own cleaner instance.
train_processed = text_clean(train_texts).proceed()
tc = text_clean(test_texts)  # `tc` ends up bound to the test-set cleaner, as before
test_processed = tc.proceed()

Start to process....
Processing Finished! Timing:  135.282
Start to process....
Processing Finished! Timing:  248.044


In [4]:
# Flatten the processed training documents into one list of
# whitespace-separated tokens.
words = [token for document in train_processed for token in document.split()]

In [6]:
from collections import Counter

# Tally how many times each token occurs in `words`.
word_freq = Counter()
word_freq.update(words)

In [7]:
# Number of distinct tokens counted in `word_freq`.
len(word_freq)

61194

In [8]:
from tools.text_hier_split import sent2words
from tools.token_idx_map import token2idx

# Second argument to token2idx; presumably the vocabulary size cap (the next
# cell reports 30001 words) -- confirm against tools.token_idx_map.
VOCAB_SIZE = 30000

# Build both splitters first, then run them (train before test).
sw_train, sw_test = sent2words(train_processed), sent2words(test_processed)
train_sent_words, test_sent_words = sw_train.proceed(), sw_test.proceed()

# The index mapping is built from the training split only and then applied,
# unchanged, to the test split.
ti = token2idx(train_sent_words, VOCAB_SIZE)
train_sent_idx = ti.proceed()
test_sent_idx = ti.map_text_idx(test_sent_words, ignore_sent=True)

Start mapping words to IDs....
Processing Finished! Timing:  6.174


In [9]:
# Size of the vocabulary held by the token mapper; the model configs below use
# this value as vocab_size.
vocab = ti.get_vocab()
n_words = len(vocab)
print('Total words: %d' % n_words)

Total words: 30001


In [10]:
from tools.sample_generator import generate_samples
# Document length handed to the sample generators; the model configs reuse it
# as max_doc_len.
MAX_DOCUMENT_LENGTH = 800
# One sample generator per split, built from the index-mapped sentences and
# the matching labels.
gs_train = generate_samples(train_sent_idx, train_labels, MAX_DOCUMENT_LENGTH)
gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)

## Settings

In [11]:
embed_size = 100


class _baseConfig:
    """Hyperparameters shared by every model built from these configs.

    The test and single-sample models are created under the same "Model"
    variable scope as the training model with reuse=True, so every value that
    determines a variable's shape must be identical across the three configs.
    Keeping those values in one place prevents the copies from drifting apart.
    """
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = embed_size  # class body reads the module-level value above
    hidden_size = 250  # hidden size for hidden state of rnn
    batch_size = 64
    layer_size = 2


class trainConfig(_baseConfig):
    """Settings for the training model (all shared defaults)."""


class testConfig(_baseConfig):
    """Settings for the batched evaluation model (all shared defaults)."""


class singleConfig(_baseConfig):
    """Settings for the model that is fed one sample per step."""
    batch_size = 1

In [12]:
# Number of full batches per split. The test split is sized with
# testConfig.batch_size because that is the batch size the evaluation loop
# actually feeds; integer division avoids the float round-trip of int(a / b).
train_chunk_num = len(train_texts) // trainConfig.batch_size
# remain_num is the number of test samples left over after the full batches;
# those are evaluated one at a time with the single-sample model.
test_chunk_num, remain_num = divmod(len(test_texts), testConfig.batch_size)
remain_num

40

## BiGRU

In [15]:
import tensorflow as tf
# NOTE(review): biGRU_Model is neither imported nor defined in the cells
# visible here (execution counts jump from 12 to 15) -- confirm where it comes
# from before relying on Restart & Run All.
graph_rnn = tf.Graph()
# Build the training model and the two evaluation models inside one dedicated graph.
with graph_rnn.as_default():
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        # First use of the "Model" variable scope (reuse=None): the variables are created here.
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = biGRU_Model(trainConfig)
            # The saver is created right after the training model has been built.
            saver=tf.train.Saver()
    with tf.name_scope('test'):
        # Same "Model" scope with reuse=True: both evaluation models share the
        # variables created for the training model instead of creating new ones.
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            # The second positional argument is False for both evaluation models --
            # presumably an is-training flag; confirm against biGRU_Model.
            test_model = biGRU_Model(testConfig, False)
            single_model = biGRU_Model(singleConfig, False)

Model Initialized!
Model Initialized!
Model Initialized!


In [16]:
# Re-create both sample generators with the same arguments as the cell that
# defines MAX_DOCUMENT_LENGTH, so the cells below work with freshly
# constructed generators.
gs_train = generate_samples(train_sent_idx, train_labels, MAX_DOCUMENT_LENGTH)
gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)

In [17]:
import time, os

epochs = 2
# Single source of truth for the checkpoint location: the restore check, the
# restore itself and the final save all derive from `file`.
file = "ckpt_rnn/rnn.ckpt"
ckpt_dir = os.path.dirname(file)

with tf.Session(graph=graph_rnn) as sess:
    init = tf.global_variables_initializer()
    os.makedirs(ckpt_dir, exist_ok=True)
    # Resume from an existing checkpoint when one is present; otherwise start
    # from freshly initialised variables.
    if os.path.exists(file + ".index"):
        saver.restore(sess, file)
    else:
        sess.run(init)
    for m in range(epochs):
        # Reset the clock at the start of every epoch so the printed figure is
        # the duration of this epoch alone, not the time since training began.
        start_time = time.time()
        for i in range(train_chunk_num):
            x, y, lengths, _ = gs_train.generate_batch(trainConfig.batch_size)
            feed_dict = {train_model.x: x, train_model.y: y, train_model.lengths: lengths}
            loss, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            # Report the loss of every 100th batch.
            if i % 100 == 0:
                print('Loss:', round(loss, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))

    # Persist the trained parameters once all epochs have finished.
    saver.save(sess, file)

Loss: 0.6932
Loss: 0.6932
Loss: 0.6927
Loss: 0.579
Epoch 0 time:1383.91
Loss: 0.4184
Loss: 0.4034
Loss: 0.3862
Loss: 0.2532
Epoch 1 time:464.54
Testing...
Test Samples come to an end!
Testing Time:411.29
0.84532


In [None]:
#Calculate Testing Accuracy
with tf.Session(graph=graph_rnn) as sess:
    print('Testing...')
    count = 0
    saver.restore(sess, tf.train.latest_checkpoint('ckpt_rnn/'))
    print('Parameters restored')
    # Rebuild the test generator so that every run of this cell scores the test
    # set from a freshly constructed generator, rather than continuing from
    # wherever a previous run left it. (This replaces an unused `test_gs` that
    # was built from the raw processed text and never read.)
    gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)
    start_time = time.time()
    # Full batches go through the batched evaluation model.
    for _ in range(test_chunk_num):
        x, y, lengths, _ = gs_test.generate_batch(testConfig.batch_size)
        feed_dict = {test_model.x: x, test_model.y: y, test_model.lengths: lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    # The samples that do not fill a whole batch are scored one at a time with
    # the single-sample model.
    for _ in range(remain_num):
        x, y, lengths, _ = gs_test.generate_batch(singleConfig.batch_size)
        feed_dict = {single_model.x: x, single_model.y: y,
                     single_model.lengths: lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    # Accuracy: correctly classified samples over the size of the test set.
    print(count / len(test_processed))