In [1]:
import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [2]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
from tensorflow.contrib import learn
#./rt-polaritydata/rt-polarity.pos

In [3]:
#! /usr/bin/env python

import tensorflow as tf
import numpy as np
import os
import time
import datetime
from tensorflow.contrib import learn

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_EVERY=100
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=128
EVALUATE_EVERY=100
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NEGATIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.neg
NUM_CHECKPOINTS=5
NUM_EPOCHS=200
NUM_FILTERS=128
POSITIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.pos



In [4]:
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in FLAGS.__flags.items():
    print("{}={}".format(attr.upper(), value))
print("")



Parameters:
DEV_SAMPLE_PERCENTAGE=0.1
POSITIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.pos
NEGATIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.neg
EMBEDDING_DIM=128
FILTER_SIZES=3,4,5
NUM_FILTERS=128
DROPOUT_KEEP_PROB=0.5
L2_REG_LAMBDA=0.0
BATCH_SIZE=64
NUM_EPOCHS=200
EVALUATE_EVERY=100
CHECKPOINT_EVERY=100
NUM_CHECKPOINTS=5
ALLOW_SOFT_PLACEMENT=True
LOG_DEVICE_PLACEMENT=False



In [5]:
print("Loading data...")
x_text, y = load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

Loading data...


In [6]:
# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
# print(max_document_length)
# print(x_text[0:1])
# print(list(vocab_processor.fit_transform(x_text))[0:1])
print(x[0])


[ 1  2  3  4  5  6  1  7  8  9 10 11 12 13 14  9 15  5 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [7]:
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

In [8]:
x = np.arange(5)
x = np.random.permutation(x)
p = ["abcd" , "asdhkljasd" , "asdhjkkagsd" , "79327981" , "217632613"]
p = np.asarray(p)
p[x]

array(['abcd', 'asdhjkkagsd', '217632613', 'asdhkljasd', '79327981'],
      dtype='<U11')

In [9]:
# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


Vocabulary Size: 18758
Train/Dev split: 9596/1066


In [37]:
def my_func(iterator):
  return (x.split(" ") for x in iterator)

vocab={'hello':3, '.':5, 'world':20, '/' : 10}
sentences= ['hello world . / hello', 'hello']

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=6, vocabulary=vocab, tokenizer_fn = my_func, min_frequency=2)

list(vocab_processor.transform(sentences)) 

[array([ 3, 20,  3,  0,  0,  0]), array([3, 0, 0, 0, 0, 0])]

In [11]:
import numpy as np
from tensorflow.contrib import learn

x_text = ['This is a cat','This must be boy', 'This is a a dog']
tokens = vocab_processor.fit_transform(["a b c", "a\nb\nc", "a, b - c"])max_document_length = max([len(x.split(" ")) for x in x_text])

## Create the vocabularyprocessor object, setting the max lengh of the documents.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)

## Transform the documents using the vocabulary.
x = np.array(list(vocab_processor.fit_transform(x_text)))    
#print(x)
## Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping
# print(vocab_dict)
## Sort the vocabulary dictionary on the basis of values(id).
## Both statements perform same task.
#sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
sorted_vocab = sorted(vocab_dict.items(), key = lambda x : x[1])
## Treat the id's as index into list and create a list of words in the ascending order of id's
## word with id i goes at index i of the list.
print(sorted_vocab)
print(zip(*sorted_vocab))
print(list(zip(*sorted_vocab)))
print(list(list(zip(*sorted_vocab))[0]))
vocabulary = list(list(zip(*sorted_vocab))[0])
# print(vocabulary)
#print(vocabulary)
#print(x)

SyntaxError: invalid syntax (<ipython-input-11-7259b01b16a6>, line 5)

In [65]:
def my_func(iterator):
  return (x.split(" ") for x in iterator)

vocab={'hello':3, '.':5, 'world':20, '/' : 10}
sentences= ['hello world . / hello', 'hello']

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=6, vocabulary=vocab, tokenizer_fn = my_func, min_frequency=2)

list(vocab_processor.transform(sentences)) 

[array([ 3, 20,  5, 10,  3,  0]), array([3, 0, 0, 0, 0, 0])]

In [74]:
def my_func(iterator):
  return (x.split(" ") for x in iterator)

for i in range(10):
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length=7, min_frequency=i)
    sentences= ["a b c d e f","a b c d e","a b c" , "a b", "a"]
    print(list(vocab_processor.transform(sentences))[0] )

[1 2 3 4 5 6 0]
[1 2 3 4 5 0 0]
[1 2 3 0 0 0 0]
[1 2 0 0 0 0 0]
[1 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]


In [62]:
for i in range(10):
    vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length=7, min_frequency=i)
    tokens = vocab_processor.fit_transform(["a b c d e f","a b c d e","a b c" , "a b", "a"])
    print(list(tokens)[0])

[1 2 3 4 5 6 0]
[1 2 3 4 5 0 0]
[1 2 3 0 0 0 0]
[1 2 0 0 0 0 0]
[1 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
[0 0 0 0 0 0 0]
