In [None]:
from __future__ import print_function
import codecs
from collections import namedtuple
import os
from os import path

import numpy as np
import tensorflow as tf

from utils import *

In [None]:
# Directory one level above the current working directory; the data paths
# used below are resolved against it.
parent_dir = os.path.dirname(os.getcwd())

# 1. Load Training Data, Dev Data and Test Data

In [None]:
def load_data(data_path):
    """Lazily reads tokenised sentences and their scores from `data_path`.

    Every non-blank line is lower-cased and split on ``"|"``. All fields but
    the last are returned as the tokens; the last field is parsed as the
    integer score. Lines that contain only whitespace are skipped instead of
    raising a ``ValueError`` from ``int('')``.

    Parameters
    ----------
        data_path : str
            Path to a UTF-8 encoded text file.

    Yields
    ------
        words : list[unicode]
            Lower-cased tokens of one line.

        score : int
            Integer parsed from the last ``|``-separated field.

    Raises
    ------
        ValueError
            If the last field of a non-blank line is not an integer.
    """
    with codecs.open(data_path, encoding='utf-8') as f:
        for line in f:
            # Tolerate empty / whitespace-only lines (e.g. a trailing blank
            # line at the end of the file).
            if not line.strip():
                continue
            fields = line.lower().split("|")
            # int() ignores the trailing newline still attached to the last field.
            yield fields[:-1], int(fields[-1])

In [None]:
# Full training sentences. `zip(*...)` transposes the (words, score) pairs
# yielded by `load_data` into one tuple of token lists and one tuple of scores.
# NOTE(review): the two-name unpacking fails if the file yields no rows.
train_full_sentences_path = path.join(parent_dir, "data/sst/train_sentences.txt")
train_full_sentences, train_full_sentences_scores = zip(*load_data(train_full_sentences_path))

# Training phrases, loaded the same way from a second file.
train_phrases_path = path.join(parent_dir, "data/sst/train_phrases.txt")
train_phrases, train_phrases_scores = zip(*load_data(train_phrases_path))

In [None]:
# Both operands are tuples (unpacked from `zip` when the data was loaded), so
# `+` concatenates them: full sentences first, then phrases. The scores are
# concatenated in the same order so they stay aligned with the sentences.
train_sentences = train_full_sentences + train_phrases
train_scores = train_full_sentences_scores + train_phrases_scores

In [None]:
# Dev set: transposed into a tuple of token lists and a tuple of scores.
dev_full_sentences_path = path.join(parent_dir, "data/sst/dev_sentences.txt")
dev_full_sentences, dev_full_sentences_scores = zip(*load_data(dev_full_sentences_path))

# Test set, loaded the same way.
test_full_sentences_path = path.join(parent_dir, "data/sst/test_sentences.txt")
test_full_sentences, test_full_sentences_scores = zip(*load_data(test_full_sentences_path))

Some example sentences and their scores from the training data. Scores are integers from 0 to 4:
* 0 - most negative
* 2 - neutral
* 4 - most positive

### Example sentences and scores

In [None]:
# Show the first ten full training sentences: tokens joined by single spaces,
# followed by " | " and the sentence's score.
for sentence, score in zip(train_full_sentences[:10], train_full_sentences_scores[:10]):
    print(" ".join(sentence) + " | " + str(score))

# 2. Load embeddings

In [None]:
# Location of the word-embedding text file, resolved against `parent_dir`.
embedding_path = path.join(
    parent_dir, "data/sst/word2vec_filtered_lower-negative300.txt"
)

In [None]:
# Zero vector for padding sentence to a fixed length and for unknown words
weight_vectors = [np.zeros(300, dtype=np.float32)]
word2idx = {u"<unk>" : 0}

# Each line is split at the first space into a word and the remainder; the
# remainder is whitespace-split and parsed as float32. A word's index is the
# position its vector takes in `weight_vectors`.
# NOTE(review): assumes every line carries 300 values and that the file has no
# header line -- confirm against the embedding file.
with codecs.open(embedding_path, encoding='utf-8') as f:
    print('loading word2vec embeddings from %s' % embedding_path)
    for line in f:
        word, vec = line.split(u' ', 1)
        word2idx[word] = len(weight_vectors)
        weight_vectors.append(np.array(vec.split(), dtype=np.float32))

# Random embedding vector for filter padding.
word2idx[u"<filter_padding>"] = len(weight_vectors)
weight_vectors.append(np.random.uniform(-0.25, 0.25, 300).astype(np.float32))

# Random embedding vector for the "." token.
# NOTE(review): if "." already occurs in the embedding file, its index is
# overwritten here and the vector read from the file is no longer referenced.
# The random draws are unseeded, so these two vectors differ between runs.
word2idx[u"."] = len(weight_vectors)
weight_vectors.append(np.random.uniform(-0.25, 0.25, 300).astype(np.float32))

In [None]:
# Stack the per-word vectors into one array; row i holds the vector stored at
# index i of `weight_vectors`. `cnn_model_fn` uses it to initialise the embedding.
we = np.asarray(weight_vectors)

# 3. Transform phrases/sentences to embedding indices

In [None]:
def _to_padded_indices(sentences):
    """Converts token lists to embedding indices and pads them as one batch.

    Token lists with fewer than 10 tokens are converted with ``filter_len=3``;
    all others use the default of `words_to_embedding_index_with_padding`.
    The index of ``<unk>`` is passed as the second argument of
    `pad_sequences` (presumably the padding value -- verify in `utils`).
    """
    index_sequences = []
    for words in sentences:
        if len(words) < 10:
            indices = words_to_embedding_index_with_padding(words, word2idx, filter_len=3)
        else:
            indices = words_to_embedding_index_with_padding(words, word2idx)
        index_sequences.append(indices)
    return pad_sequences(index_sequences, word2idx[u'<unk>'])


# Each split is padded independently of the others.
train_data = _to_padded_indices(train_sentences)
dev_data = _to_padded_indices(dev_full_sentences)
test_data = _to_padded_indices(test_full_sentences)

In [None]:
# Convert the score tuples to int32 arrays; these are passed as `y` to the
# numpy input functions further down.
train_labels = np.asarray(train_scores, dtype=np.int32)
dev_labels = np.asarray(dev_full_sentences_scores, dtype=np.int32)
test_labels = np.asarray(test_full_sentences_scores, dtype=np.int32)

# 4. Define the CNN model

In [None]:
def cnn_model_fn(features, labels, mode):
    """Model function for a multi-branch 1-D CNN sentence classifier.

    Pipeline: embedding lookup (initialised from the module-level `we`
    matrix) -> input dropout -> three parallel `conv1d` branches with kernel
    sizes 3, 4 and 5, each max-reduced over axis 1 -> concatenation ->
    dropout -> 5-unit dense layer producing the logits.

    Parameters
    ----------
    features : Tensor
        Embedding indices. Presumably shaped (batch, sequence_length) --
        TODO confirm against the input functions.
    labels : Tensor or None
        Integer class ids (one-hot encoded with depth 5); not used in
        PREDICT mode.
    mode : tf.estimator.ModeKeys
        TRAIN, EVAL or PREDICT.

    Returns
    -------
    tf.estimator.EstimatorSpec
        Predictions in PREDICT mode, loss and train op in TRAIN mode, loss
        and accuracy metric in EVAL mode.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # 1. Input Data
    input_data = features

    # 2. Word Embedding
    channel = tf.contrib.layers.embed_sequence(
        ids=input_data,
        initializer=tf.constant_initializer(
            value=we,
            dtype=tf.float32),
        trainable=True,
        scope='embedding',
        vocab_size=we.shape[0],
        embed_dim=we.shape[1]
    )
    # 3. Dropout for input layer
    # The result must be bound back to `channel`: the original assigned it to
    # a misspelled name (`chanel`), so the convolutions below consumed the
    # un-dropped embeddings and this layer had no effect.
    channel = tf.layers.dropout(
        inputs=channel,
        rate=0.5,
        training=is_training
    )

    # 4. Convolution
    branches = []
    for branch_index in range(3):
        with tf.variable_scope('CNN_Layer' + str(branch_index)):
            inference = tf.layers.conv1d(
                channel,
                filters=300,  # feature maps in the paper
                kernel_size=3 + branch_index, # filter window
                padding='VALID',
                activation=tf.nn.relu,
                kernel_initializer=tf.random_uniform_initializer(
                                    minval=-0.01,
                                    maxval=0.01,
                                    dtype=tf.float32)
            )
            # Max over axis 1 keeps one value per feature map.
            branch = tf.reduce_max(input_tensor=inference, axis=1)
            branches.append(branch)
    network = tf.concat(values=branches, axis=1)

    # 5. Dropout for penultimate layer
    dropout = tf.layers.dropout(
        inputs=network,
        rate=0.5,
        training=is_training
    )

    # 6. Final layer
    logits = tf.layers.dense(inputs=dropout,
                             kernel_initializer=tf.random_normal_initializer(
                                 mean=0.0,
                                 stddev=0.01
                             ),
                             kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001),
                             units=5)

    # Predictions
    predictions = {
        # Generate predictions
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    # Return predictions if mode is to PREDICT
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=5)
    cross_entropy = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
    # `kernel_regularizer` only registers its penalty in the regularization
    # losses collection; it has to be added to the objective explicitly,
    # otherwise the L2 term configured on the dense layer is never optimised.
    loss = cross_entropy + tf.losses.get_regularization_loss()

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=0.1,
            rho=0.95,
            epsilon=1e-06
        )

        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step()
        )

        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels,
            predictions=predictions["classes"]
        )
    }

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [None]:
class SetWeights(tf.train.SessionRunHook):
    """Session hook that rewrites two variables after every `run` call.

    It clips the norm of the ``dense/kernel`` variable along axis 0 to at
    most 25, and overwrites row 0 of the ``embedding/embeddings`` variable
    (index 0 is the u'<unk>' token in `word2idx`) with zeros.
    """
    def begin(self):
        trainables = tf.GraphKeys.TRAINABLE_VARIABLES
        dense_kernel = tf.get_collection(trainables, scope='dense/kernel')[0]
        embedding_matrix = tf.get_collection(trainables, scope='embedding/embeddings')[0]

        # Op that stores the norm-clipped kernel back into the variable.
        clipped_kernel = tf.clip_by_norm(t=dense_kernel, clip_norm=25, axes=[0])
        self.update_fc = dense_kernel.assign(clipped_kernel)

        # Op that resets the first embedding row to a zero vector.
        zero_row = tf.zeros((1, 300), dtype=tf.float32)
        self.update_we = tf.scatter_update(embedding_matrix, [0], zero_row)

    def after_run(self, run_context, run_values):
        reset_ops = [self.update_fc, self.update_we]
        run_context.session.run(reset_ops)

# 5. Training and Evaluation

In [None]:
# Summary of one training trial: the trial id, the run that reached the best
# dev score, that dev score, and the test score recorded on the same run.
TrialResult = namedtuple(
    'TrialResult',
    ('trial', 'best_run', 'best_dev_score', 'best_test_score'),
)

#### Create the Estimator

In [None]:
def train_cnn_trial(trial,
                    train_input_fn,
                    dev_input_fn,
                    test_input_fn,
                    model_dir="../data/model/cnn",
                    num_runs=50,
                    steps_per_run=1000):
    """Runs one trial for the training of the cnn classifier.

    The estimator is trained for `num_runs` rounds of `steps_per_run` steps.
    After every round it is evaluated on the dev and the test input; the test
    score is recorded only for the round with the best dev score so far.

    Parameters
    ----------
    trial: int
        Trial id; appended to `model_dir` so each trial gets its own
        checkpoint directory.

    train_input_fn: tensorflow.python.estimator.inputs.numpy_io.input_fn

    dev_input_fn: tensorflow.python.estimator.inputs.numpy_io.input_fn

    test_input_fn: tensorflow.python.estimator.inputs.numpy_io.input_fn

    model_dir: str
        Prefix of the checkpoint directory.

    num_runs: int
        Number of train/evaluate rounds (default 50).

    steps_per_run: int
        Training steps per round (default 1000).

    Returns
    -------
    TrialResult
        The trial id, the index of the best round, and the dev and test
        accuracy of that round.
    """
    run_config = tf.estimator.RunConfig(model_dir=model_dir + str(trial),
                                        save_checkpoints_secs=600,
                                        log_step_count_steps=1000,
                                        save_summary_steps=1000,
                                        session_config=tf.ConfigProto(log_device_placement=True))
    cnn_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                            config=run_config)
    set_weights_hook = SetWeights()

    best_dev_score = 0.0
    best_test_score = 0.0
    best_run = 0

    for run in range(num_runs):
        cnn_classifier.train(
            input_fn=train_input_fn,
            steps=steps_per_run,
            hooks=[set_weights_hook]
        )

        dev_score = cnn_classifier.evaluate(input_fn=dev_input_fn)['accuracy']
        test_score = cnn_classifier.evaluate(input_fn=test_input_fn)['accuracy']
        # Model selection is driven by the dev score only; the test score is
        # merely recorded alongside the best dev round.
        if dev_score > best_dev_score:
            best_run = run
            best_dev_score = dev_score
            best_test_score = test_score
            print("Best run: %d | Best dev score: %.4f | Best test score: %.4f" 
                  % (best_run, best_dev_score, best_test_score))

    return TrialResult(trial=trial,
                       best_run=best_run,
                       best_dev_score=best_dev_score,
                       best_test_score=best_test_score)

#### Train the model

In [None]:
# Emit TensorFlow log messages at INFO level and above.
tf.logging.set_verbosity(tf.logging.INFO)

In [None]:
# Training input: shuffled batches of 50, repeated indefinitely
# (num_epochs=None), so the number of steps is set by the caller of train().
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    train_data, train_labels,
    batch_size=50, num_epochs=None, shuffle=True,
)

# Evaluation inputs: a single unshuffled pass over each split.
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
    dev_data, dev_labels,
    num_epochs=1, shuffle=False,
)

test_input_fn = tf.estimator.inputs.numpy_input_fn(
    test_data, test_labels,
    num_epochs=1, shuffle=False,
)

In [None]:
# Number of independent trials; each one trains a fresh estimator in its own
# model directory.
num_trials = 5

trial_results = []
for trial in range(num_trials):
    trial_results.append(
        train_cnn_trial(
            trial=trial,
            train_input_fn=train_input_fn,
            dev_input_fn=dev_input_fn,
            test_input_fn=test_input_fn,
        )
    )

In [None]:
# Test accuracy recorded at the best-dev run of every trial.
test_results = [trial_result.best_test_score for trial_result in trial_results]

# Summary statistics over the trials, scaled by 100 (i.e. reported as
# percentages). `np.std` is used with its default arguments.
test_result = (
    "Number of trials: %d\n"
    "Mean test score: %.4f\n"
    "Standard deviation: %.4f\n"
    "Minimum test score: %.4f\n" 
    "Maximum test score: %.4f"
) % (num_trials,
     (100.0 * np.mean(test_results)),
     (100.0 * np.std(test_results)),
     (100.0 * np.min(test_results)),
     (100.0 * np.max(test_results)))

print(test_result)