In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import functools
import sys
import pathlib
import random
from nltk.corpus import stopwords
import re
import numpy as np

import tensorflow as tf

In [2]:
# I used the imdb review dataset from kaggle which has 2 folders of negative and positive reviews with text files for each reviews
# The dataset pipeline implemented has the reading of text file and cleaning and word embedding implemented in it

# https://www.kaggle.com/iarunava/imdb-movie-reviews-dataset

input_file_location = "Path to Input test file"

# Importing Conceptnet Numberbatch word embeddings

In [3]:
embeddings_index = {}
with open('Path to embeddings file', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 418082


In [4]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = {"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"}

# Preparing an appropriate input function for the estimator API implementing text cleaning and applying word embeddings

In [5]:
def get_input_fn(mode, input_file_location, batch_size):
    
    def _clean_text(text):
        text = str(text)
        text = text.lower()

        # Replace contractions with their longer forms 
        if True:
            text = text.split()
            new_text = []
            for word in text:
                if word in contractions:
                    new_text.append(contractions[word])
                else:
                    new_text.append(word)
            text = " ".join(new_text)

        # Format words and remove unwanted characters
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'<br >', ' ', text)
        text = re.sub(r'<br  >', ' ', text)
        text = re.sub(r'\'', ' ', text)
        text = re.sub(r'\\[a-z0-9][a-z0-9][a-z0-9]', '', text)

        # Optionally, remove stop words
        if True:
            text = text.split()
            stops = set(stopwords.words("english"))
            text = [w for w in text if not w in stops]
            text = " ".join(text)

        return text

    def _apply_word_embedding(text):
        # Need to use 300 for embedding dimensions to match CN's vectors.
        text = str(text)
        text = text.lower()
        text = text.split()

        embedding_dim = 300
        nb_words = len(text)
        max_num = 100
        # Create matrix with default values of zero
        word_embedding_matrix = np.zeros((max_num, embedding_dim), dtype=np.float32)
        for i, word in enumerate(text):
            if i < max_num:
                if word in embeddings_index:
                    word_embedding_matrix[i] = embeddings_index[word]
                else:
                    # If word not in CN, create a random embedding for it
                    new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
                    embeddings_index[word] = new_embedding
                    word_embedding_matrix[i] = new_embedding
        return word_embedding_matrix
    
    def _input_fn():
        
        data_root = pathlib.Path(input_file_location)
        all_review_paths = list(data_root.glob('*/*'))
        all_review_paths = [str(path) for path in all_review_paths if '.txt' in str(path)]
        label_names = sorted(item.name for item in data_root.glob('*/') if item.is_dir())
        label_to_index = dict((name, index) for index,name in enumerate(label_names))
        
        random.shuffle(all_review_paths)
        review_count = len(all_review_paths)
        all_review_labels = [pathlib.Path(path).parent.name for path in all_review_paths]
        all_review_labels = [label_to_index[x] for x in all_review_labels]
        label_ds = tf.data.Dataset.from_tensor_slices(all_review_labels)
        
        review_dataset = tf.data.TextLineDataset(all_review_paths)
        review_dataset = review_dataset.map(lambda reviews: tf.py_func(_clean_text, [reviews], [tf.string])[0])
        review_dataset = review_dataset.map(lambda reviews: tf.py_func(_apply_word_embedding, [reviews], [tf.float32])[0])
        full_dataset = tf.data.Dataset.zip((review_dataset, label_ds))
        full_dataset = full_dataset.batch(batch_size).prefetch(1000)
        features, labels = full_dataset.make_one_shot_iterator().get_next()
        
        return features, labels
    
    return _input_fn

# Writing the CNN + Bidirectional-LSTM based model function

In [6]:
def model_fn(features, labels, mode, params):
    
    def _get_input_tensors(features, labels):
        """Converts the input dict into inks, lengths, and labels tensors."""
        # features[ink] is a sparse tensor that is [8, batch_maxlen, 3]
        # inks will be a dense tensor of [8, maxlen, 3]
        # shapes is [batchsize, 2]
        lengths = [100]*params.batch_size
        inks = tf.reshape(features, [10, -1, 3])
        if labels is not None:
            labels = tf.squeeze(labels)
        return inks, lengths, labels

    def _add_conv_layers(inks, lengths):
        """Adds convolution layers."""
        convolved = inks
        for i in range(params.num_conv):
            convolved_input = convolved
            if True:
                convolved_input = tf.layers.batch_normalization(
                                    convolved_input,
                                    training=(mode == tf.estimator.ModeKeys.TRAIN))
            # Add dropout layer if enabled and not first convolution layer.
            if i > 0 and params.dropout:
                convolved_input = tf.layers.dropout(
                    convolved_input,
                    rate=0.2,
                    training=(mode == tf.estimator.ModeKeys.TRAIN))
            convolved = tf.layers.conv1d(
                convolved_input,
                filters=2,
                kernel_size=params.conv_len,
                activation=None,
                strides=1,
                padding="same",
                name="conv1d_%d" % i)
        return convolved, lengths

    def _add_regular_rnn_layers(convolved, lengths):
        """Adds RNN layers."""
        if params.cell_type == "lstm":
            cell = tf.nn.rnn_cell.LSTMCell
        elif params.cell_type == "block_lstm":
            cell = tf.contrib.rnn.LSTMBlockCell
        cells_fw = [cell(params.num_nodes) for _ in range(params.num_layers)]
        cells_bw = [cell(params.num_nodes) for _ in range(params.num_layers)]
        if params.dropout > 0.0:
            cells_fw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_fw]
            cells_bw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_bw]
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                                                        cells_fw=cells_fw,
                                                        cells_bw=cells_bw,
                                                        inputs=convolved,
                                                        sequence_length=lengths,
                                                        dtype=tf.float32,
                                                        scope="rnn_classification")
        return outputs

    def _add_cudnn_rnn_layers(convolved):
        """Adds CUDNN LSTM layers."""
        # Convolutions output [B, L, Ch], while CudnnLSTM is time-major.
        convolved = tf.transpose(convolved, [1, 0, 2])
        lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=5,
            num_units=12,
            dropout=params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0,
                                                        direction="bidirectional")
        outputs, _ = lstm(convolved)
        # Convert back from time-major outputs to batch-major outputs.
        outputs = tf.transpose(outputs, [1, 0, 2])
        return outputs

    def _add_rnn_layers(convolved, lengths):
        """Adds recurrent neural network layers depending on the cell type."""
        if params.cell_type != "cudnn_lstm":
            outputs = _add_regular_rnn_layers(convolved, lengths)
        else:
            outputs = _add_cudnn_rnn_layers(convolved)
        # outputs is [batch_size, L, N] where L is the maximal sequence length and N
        # the number of nodes in the last layer.
        mask = tf.tile(tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),[1, 1, tf.shape(outputs)[2]])
        zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
        outputs = tf.reduce_sum(zero_outside, axis=1)
        return outputs

    def _add_fc_layers(final_state):
        """Adds a fully connected layer."""
        return tf.layers.dense(final_state, params.num_classes)

    # Build the model.
    inks, lengths, labels = _get_input_tensors(features, labels)
    convolved, lengths = _add_conv_layers(inks, lengths)
    final_state = _add_rnn_layers(convolved, lengths)
    logits = _add_fc_layers(final_state)
    # Add the loss.
    cross_entropy = tf.reduce_mean(
                          tf.nn.sparse_softmax_cross_entropy_with_logits(
                              labels=labels, logits=logits))
    # Add the optimizer.
    train_op = tf.contrib.layers.optimize_loss(
        loss=cross_entropy,
        global_step=tf.train.get_global_step(),
        learning_rate=params.learning_rate,
        optimizer="Adam",
        # some gradient clipping stabilizes training in the beginning.
        clip_gradients=params.gradient_clipping_norm,
        summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
    # Compute current predictions.
    predictions = tf.argmax(logits, axis=1)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"logits": logits, "predictions": predictions},
        loss=cross_entropy,
        train_op=train_op,
        eval_metric_ops={"accuracy": tf.metrics.accuracy(labels, predictions)})

# Creating a function to initialize the estimator

In [7]:
def create_estimator_and_specs(run_config):
    """Creates an Experiment configuration based on the estimator and input fn."""
    model_params = tf.contrib.training.HParams(
        num_layers=5,
        num_nodes=12,
        batch_size=10,
        num_conv=2,
        conv_len=3,
        num_classes=2,
        learning_rate=0.01,
        gradient_clipping_norm=10.0,
        cell_type="lstm",
        batch_norm=True,
        dropout=0.2)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=model_params)

    train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn(
        mode=tf.estimator.ModeKeys.TRAIN,
        input_file_location=input_file_location,
        batch_size=10), max_steps=5)

    eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn(
        mode=tf.estimator.ModeKeys.EVAL,
        input_file_location=input_file_location,
        batch_size=10))

    return estimator, train_spec, eval_spec

# Creating and training the estimator

In [8]:
estimator, train_spec, eval_spec = create_estimator_and_specs(
        run_config=tf.estimator.RunConfig(
        model_dir="cnn_outside",
        save_checkpoints_secs=300,
        save_summary_steps=100))

INFO:tensorflow:Using config: {'_model_dir': 'cnn_outside', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 300, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001AA15A5B160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [27]:
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 300.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from outside\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into outside\model.ckpt.
INFO:tensorflow:loss = 1.9240624, step = 1
INFO:tensorflow:Saving checkpoints for 5 into outside\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-12:09:01
INFO:tensorflow:Graph was finalized.
INFO:tensorf

({'accuracy': 0.502, 'global_step': 5, 'loss': 4.1356378}, [])