In [1]:
from gensim import utils
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
train_data = pd.read_csv('data/IMDB_review_train.csv')
test_data = pd.read_csv('data/IMDB_review_test.csv')
train_texts = list(train_data.text.values)
train_labels = list(train_data.sentiment.values)
test_texts = list(test_data.text.values)
test_labels = list(test_data.sentiment.values)

## Text to Numbers

In [3]:
from tensorflow.contrib import learn

  return f(*args, **kwds)


In [4]:
MAX_DOCUMENT_LENGTH = 800
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH, min_frequency=2) # tensorflow提供的工具，将数据填充为最大长度，默认0填充
train_sent_idx = np.array(list(vocab_processor.fit_transform(train_texts)))
test_sent_idx = np.array(list(vocab_processor.transform(test_texts)))

In [5]:
n_words = len(vocab_processor.vocabulary_)
n_words

37410

In [4]:
from tools.text_preprocess import text_clean
tc = text_clean(train_texts, is_multiprocess=False)
train_processed = tc.proceed()
tc = text_clean(test_texts, is_multiprocess=False)
test_processed = tc.proceed()

Start to process....
Processing Finished! Timing:  872.21
Start to process....
Processing Finished! Timing:  852.114


In [8]:
words = []
for item in train_processed:
    words.extend(item.split())

In [9]:
from collections import Counter
word_freq = Counter(words)

In [10]:
len(word_freq)

60008

In [2]:
train_data = pd.read_csv('data/intermediate_data/train_processed.csv')
test_data = pd.read_csv('data/intermediate_data/train_processed.csv')
train_processed = list(train_data.text.values)
train_labels = list(train_data.sentiment.values)
test_processed = list(test_data.text.values)
test_labels = list(test_data.sentiment.values)

In [3]:
from tools.text_hier_split import sent2words
from tools.token_idx_map import token2idx
sw_train = sent2words(train_processed)
sw_test = sent2words(test_processed)
train_sent_words = sw_train.proceed()
test_sent_words = sw_test.proceed()
ti = token2idx(train_sent_words, 30000)
train_sent_idx = ti.proceed()
test_sent_idx = ti.map_text_idx(test_sent_words, ignore_sent=True)

Start mapping words to IDs....
Processing Finished! Timing:  5.2


In [4]:
n_words = len(ti.get_vocab())
print('Total words: %d' % n_words)

Total words: 30001


In [5]:
from tools.sample_generator import generate_samples
MAX_DOCUMENT_LENGTH = 800
gs_train = generate_samples(train_sent_idx, train_labels, MAX_DOCUMENT_LENGTH)
gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)

## Settings

In [6]:
class trainConfig:
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = 100
    hidden_size = 250
    batch_size = 64
    layer_size = 2
    
class testConfig:
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = 100
    hidden_size = 250
    batch_size = 64
    layer_size = 2
    
class singleConfig:
    vocab_size = n_words
    max_doc_len = MAX_DOCUMENT_LENGTH
    label_size = 2
    embed_size = 100
    hidden_size = 250#hidden size for hidden state of rnn
    batch_size = 1
    layer_size = 2

In [7]:
train_chunk_num = int(len(train_processed)/trainConfig.batch_size)
test_chunk_num = int(len(test_processed)/trainConfig.batch_size)
remain_num = len(test_processed) - trainConfig.batch_size*test_chunk_num
remain_num

40

## RNNCNN

In [None]:
import functools
import tensorflow as tf
def lazy_property(function):
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def decorator(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)

    return decorator

In [114]:
from tensorflow.contrib import rnn
class RNN_CNN_Model:
    '''
    A CNN followed by a RNN
    CNN can capture n-gram information
    RNN can remember dependency
    '''
    def __init__(self, config, is_training=True):
        self.embed_size = config.embed_size
        self.hidden_size = config.hidden_size
        self.label_size = config.label_size
        self.batch_size = config.batch_size
        self.vocab_size = config.vocab_size
        self.max_doc_len = config.max_doc_len
        self.is_training = is_training
        self.x = tf.placeholder(tf.int32, 
                                [self.batch_size, config.max_doc_len])
        self.y = tf.placeholder(tf.int32, [self.batch_size])
        self.lengths = tf.placeholder(tf.int32, [self.batch_size])
        self.predict
        if is_training:
            self.optimize
        print('Model Initialized!')
    
    @lazy_property
    def cost(self):
        logits = self.inference
        targets = tf.one_hot(self.y, self.label_size, 1, 0)
        targets = tf.cast(targets, tf.float32)
        #Note  tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=activation)
        loss = tf.losses.softmax_cross_entropy(targets, logits)
        return loss
    
    @lazy_property
    def predict(self):
        logits = self.inference
        #probs = tf.nn.softmax(logits)
        predictions = tf.argmax(logits, 1)
        return predictions
    
    @lazy_property
    def correct_num(self):
        prediction = self.predict
        targets = tf.reshape(self.y, [-1])
        targets = tf.cast(targets, tf.int64)
        correct_prediction = tf.equal(prediction, targets)
        correct_num = tf.reduce_sum(tf.cast(correct_prediction, "float"))
        return correct_num
    
    @lazy_property
    def optimize(self):
        with tf.variable_scope('optimizer'):
            cost = self.cost
        #with tf.name_scope('Optimizer'):
            #self._learning_rate = tf.Variable(0.0, trainable=False)
            train_op = tf.train.AdamOptimizer(0.0001).minimize(cost)
            #train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(cost)
            #tvars = tf.trainable_variables()
            #grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 6)
            #optimizer = tf.train.AdamOptimizer(self._learning_rate)
            #train_op = optimizer.apply_gradients(zip(grads, tvars))
        return train_op
    
    @lazy_property
    def inference(self):
        #Create embedding matrix
        with tf.device("/cpu:0"):
            embeddings = tf.get_variable('embedding', [self.vocab_size,  self.embed_size])
            inputs = tf.nn.embedding_lookup(embeddings, self.x)
        if self.is_training:
            #Batch_size, word_length, embed_size
            inputs = tf.nn.dropout(inputs, 0.5)
            
        def GRU():
            return rnn.GRUCell(self.hidden_size)
        #lstm_cell = lstm
        #cell = rnn.MultiRNNCell([lstm_cell() for _ in range(2)], 
                                #state_is_tuple=True)
        fw_cell = GRU()
        bw_cell = GRU()
        initial_fw_state = fw_cell.zero_state(self.batch_size, tf.float32)
        initial_bw_state = bw_cell.zero_state(self.batch_size, tf.float32)
        #Bidirectional dynamic RNN with given lengths for each text
        outputs, status = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, inputs,
                                            initial_state_fw=initial_fw_state,
                                            initial_state_bw=initial_bw_state,
                                            sequence_length=self.lengths, 
                                                          dtype=tf.float32)
        #In a dynamic RNN, if the length is N, the outputs for the words after N are 0
        #And the status are copycats of the last status of the Nth word
        #Use bidirectional rnn output as hidden states for words
        #Outputs, batch_size, max_len, hidden_size*2
        output = tf.concat([outputs[0], outputs[1]], axis=2)
        output_expand = tf.expand_dims(output, 3)
        #print(output)
        #print(output_expand)
        with tf.variable_scope("conv-maxpool"):
            # Convolution Layer
            #batch_size, max_len-1, 1, 256
            h = tf.layers.conv2d(output_expand, 256,
                                       kernel_size=(2, 2*self.hidden_size),
                                       padding='valid',
                                 activation=tf.nn.relu)
                
    
            h_max_pool = tf.nn.max_pool(
                    h,
                    ksize=[1, self.max_doc_len-1, 
                           1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
            h_avg_pool = tf.nn.avg_pool(
                    h,
                    ksize=[1, self.max_doc_len-1, 
                           1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
        
        #h_max_pool_squeeze = tf.squeeze(h_max_pool)
        #h_avg_pool_squeeze = tf.squeeze(h_avg_pool)
        h_pool = tf.concat([h_max_pool, h_avg_pool], axis=3)
        h_pool = tf.squeeze(h_pool, [1, 2])
        
        h_pool_flat = tf.reshape(h_pool, [self.batch_size, -1])
        
        if self.is_training:
            h_pool_flat = tf.nn.dropout(h_pool_flat, 0.5)
        
        
        with tf.variable_scope('output'):
            logits = tf.layers.dense(h_pool_flat, self.label_size, 
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
        
        return logits
    
    @property
    def learningRate(self):
        return self._learning_rate
        

In [115]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
#%load_ext autoreload
#%autoreload 2

In [116]:
#from models.CNN_RNN import CNN_RNN_Model

In [117]:
import tensorflow as tf
graph_rnncnn = tf.Graph()
#Create models for training and testing data
#Create models for training and testing data
with graph_rnncnn.as_default():
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        #Set different models for different buckets
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = RNN_CNN_Model(trainConfig)
            saver=tf.train.Saver()
    with tf.name_scope('test'):
        #Set different models for different buckets
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = RNN_CNN_Model(testConfig, False)
            single_model = RNN_CNN_Model(singleConfig, False)

Model Initialized!
Model Initialized!
Model Initialized!


In [118]:
gs_train = generate_samples(train_sent_idx, train_labels, MAX_DOCUMENT_LENGTH)
gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)

In [121]:
import time, os
epochs = 1
#train_chunk_num = 10
file = "ckpt_rnncnn/rnncnn.ckpt"
with tf.Session(graph=graph_rnncnn) as sess:
    #Initialize parameters
    init = tf.global_variables_initializer()
    if not os.path.exists("ckpt_rnncnn"):
        os.mkdir('ckpt_rnnrnn')
    if os.path.exists("ckpt_rnncnn/rnncnn.ckpt.index"):
        saver.restore(sess, file)
    else:
         sess.run(init)
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            #sess.run(tf.assign(learning_rate, 0.002*((0.98)**m)))
            x, y, lengths, _ = gs_train.generate_batch(trainConfig.batch_size)
            feed_dict = {train_model.x:x, train_model.y:y, train_model.lengths:lengths}
            l, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            if i%100 == 0:
                print('Loss:', round(l, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        
    saver.save(sess, file)

Loss: 0.6931


KeyboardInterrupt: 

In [None]:
#Calculate Testing Accuracy
with tf.Session(graph=graph_rnncnn) as sess:
    print('Testing...')
    count = 0
    #saver = tf.train.import_meta_graph('ckpt_cnn/cnn.ckpt.meta')
    saver.restore(sess,tf.train.latest_checkpoint('ckpt_rnncnn/'))
    print('Parameters restored')
    start_time = time.time()
    test_gs = generate_samples(np.array(test_processed), np.array(test_labels), False)
    for _ in range(test_chunk_num):
        #Traverse each data
        x, y, lengths, _ = gs_test.generate_batch(testConfig.batch_size)
        feed_dict = {test_model.x:x, test_model.y:y, test_model.lengths:lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    for _ in range(remain_num):
        #Traverse each data
        x, y, lengths, _ = gs_test.generate_batch(1)
        feed_dict = {single_model.x:x, single_model.y:y, 
                     single_model.lengths:lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    print(count*1.0/len(test_processed)) 

In [13]:
import time, os
epochs = 5
#train_chunk_num = 10
file = "ckpt_cnnrnn/cnnrnn.ckpt"
with tf.Session(graph=graph_cnn) as sess:
    #Initialize parameters
    init = tf.global_variables_initializer()
    sess.run(init)
    if os.path.exists("ckpt_cnnrnn/cnnrnn.ckpt.index"):
        saver.restore(sess, file)
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            #sess.run(tf.assign(learning_rate, 0.002*((0.98)**m)))
            x, y, lengths, _ = gs_train.generate_batch(trainConfig.batch_size)
            feed_dict = {train_data:x, train_label:y, train_lengths:lengths}
            l, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            if i%100 == 0:
                print('Loss:', round(l, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        start_time = end_time
    saver.save(sess,'ckpt_cnnrnn/cnnrnn.ckpt.index')
    #Calculate Testing Accuracy
    print('Testing...')
    count = 0
    gs_test = generate_samples(test_sent_idx, test_labels, MAX_DOCUMENT_LENGTH)
    for _ in range(test_chunk_num):
        #Traverse each data
        x, y, lengths, _ = gs_test.generate_batch(testConfig.batch_size, False)
        feed_dict = {test_data:x, test_label:y, test_lengths:lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    for _ in range(remain_num):
        #Traverse each data
        x, y, lengths, _ = gs_test.generate_batch(1, False)
        feed_dict = {single_data:x, single_label:y, single_lengths:lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    print(count*1.0/len(test_texts))  

Loss: 0.6931
Loss: 0.6932
Loss: 0.6915
Loss: 0.337
Epoch 0 time:442.62
Loss: 0.2243
Loss: 0.3124
Loss: 0.2129
Loss: 0.2006
Epoch 1 time:432.01
Loss: 0.2789
Loss: 0.1721
Loss: 0.2293
Loss: 0.142
Epoch 2 time:434.29
Loss: 0.0851
Loss: 0.1118
Loss: 0.1221
Loss: 0.1141
Epoch 3 time:434.63
Loss: 0.0558
Loss: 0.1152
Loss: 0.2174
Loss: 0.2588
Epoch 4 time:434.74
Testing...
Test Samples come to an end!
Testing Time:156.30
0.9672


In [69]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

from keras.callbacks import Callback
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


In [74]:
embedding_dim = 300
MAX_NB_WORDS = 10000
MAX_LENGTH = 120
embedding_matrix = np.random.random((MAX_NB_WORDS, embedding_dim))
    
inp = Input(shape=(MAX_LENGTH, ))
x = Embedding(input_dim=MAX_NB_WORDS, output_dim=embedding_dim, input_length=MAX_LENGTH, 
                  weights=[embedding_matrix], trainable=True)(inp)
x = SpatialDropout1D(0.3)(x)
x = Bidirectional(GRU(100, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
outp = Dense(1, activation="sigmoid")(conc)
    
model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [76]:
avg_pool

<tf.Tensor 'global_average_pooling1d_2/Mean:0' shape=(?, 200) dtype=float32>

In [77]:
conc

<tf.Tensor 'concatenate_2/concat:0' shape=(?, 400) dtype=float32>