In [1]:
%matplotlib inline

REPO_ROOT = "/usr/src/app"

import collections
import json
import math
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
import tensorflow as tf

import util
reload(util)



<module 'util' from 'util.pyc'>

In [2]:
# Dataset split sizes are produced elsewhere and shared through a pickle file.
# Pickle data is a byte stream, so the file is opened in binary mode.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# metadata.pickle that this project generated itself.
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "rb") as f:
    size_data = pickle.load(f)

TRAIN_SIZES = size_data["train_sizes"]  # iterated over by the training cells
TEST_SIZE = size_data["test_size"]      # passed as test_size to the graph classes

BATCH_SIZE = 100            # examples fed to the graph per step
VOCAB_SIZE = 10000          # ids 0..VOCAB_SIZE-2 are real tokens, VOCAB_SIZE-1 is the unknown-token id
WORD_EMBEDDING_SIZE = 1600  # width of each word embedding vector
DOC_EMBEDDING_SIZE = 1000   # width of each document embedding vector
CONTEXT_SIZE = 3            # consecutive tokens used as input to predict the following token

VALIDATE_SIZE = 400  # not referenced by any cell visible in this notebook

IOError: [Errno 2] No such file or directory: '/usr/src/app/model-data/metadata.pickle'

In [None]:
# Read the balanced script table from JSON; later cells iterate over its
# records and use each record's "sha" to locate the script file.
with open(REPO_ROOT + "/scripts/table_balanced.json") as table_file:
    data_table = json.load(table_file)

In [None]:
# Count how often every token occurs across all scripts listed in data_table.
counts = collections.Counter()
print("Tokenizing %d scripts..." % len(data_table))
for idx, script in enumerate(data_table):
    # Progress line every 1000 scripts (including the very first one).
    if not idx % 1000:
        print("%d done." % idx)

    script_path = "%s/scripts/%s.js" % (REPO_ROOT, script["sha"])
    with open(script_path) as f:
        counts.update(util.tokenize_js(f.read()))

In [None]:
# Show the 100 most frequent tokens together with their occurrence counts.
for entry in counts.most_common(100):
    # Each entry is a (token, count) pair, which fills both placeholders.
    print("%s: %d" % entry)

In [3]:
# Map each of the VOCAB_SIZE-1 most frequent tokens to its frequency rank
# (0 = most frequent). The one remaining id, VOCAB_SIZE-1, is what numerize()
# returns for every token that is not in this table.
word_table = {
    token: idx
    for idx, (token, _count) in enumerate(counts.most_common(VOCAB_SIZE-1))
}

# Persist the vocabulary. Pickle output is a byte stream, so the file is
# opened in binary mode rather than text mode.
with open("%s/model-data/js-vocab.pickle" % (REPO_ROOT,), "wb") as f:
    pickle.dump(word_table, f)

def numerize(word):
    """Translate a token into its integer vocabulary id.

    Tokens found in ``word_table`` map to their stored id; every other token
    shares the id ``VOCAB_SIZE-1``.
    """
    if word in word_table:
        return word_table[word]
    return VOCAB_SIZE-1

NameError: name 'counts' is not defined

In [4]:
def embedding_lookup(embeddings, table_rows):
    """Turn scripts into feature vectors by summing their token embeddings.

    Args:
        embeddings: 2-D array-like with one embedding row per vocabulary id.
        table_rows: script records handed to ``util.parse_js``; presumably it
            yields one script source per record (the result matrix has
            ``len(table_rows)`` rows) -- confirm against ``util``.

    Returns:
        Array of shape ``(len(table_rows), embed_size)``; row ``i`` is the
        embedding matrix transposed times the token-count vector of script ``i``.
    """
    vocab_size, embed_size = np.shape(embeddings)
    transposed = np.transpose(embeddings)

    features = np.zeros([len(table_rows), embed_size])
    row = 0
    for script in util.parse_js(table_rows):
        # Histogram of token ids: histogram[k] = occurrences of id k.
        histogram = np.zeros(vocab_size)
        for token in util.tokenize_js(script):
            histogram[numerize(token)] += 1

        # Weighted sum of embedding rows, weights being the token counts.
        features[row] = np.matmul(transposed, histogram)
        row += 1

    return features

def save_training_set(embeddings, data_table, train_indices, test_indices, model_name="Word2Vec"):
    """Build the train/test feature and label arrays and pickle them to disk.

    Args:
        embeddings: word embedding matrix forwarded to ``embedding_lookup``.
        data_table: sequence of script records; each record's "flag-any"
            entry is used as the label.
        train_indices: positions in ``data_table`` forming the training set.
        test_indices: positions in ``data_table`` forming the test set.
        model_name: tag placed in the output file name. The default keeps the
            historical ``dataset_Word2Vec_<n>.pickle`` name; pass another tag
            to avoid overwriting that file from a different model.

    Side effects:
        Writes ``REPO_ROOT/model-data/dataset_<model_name>_<n>.pickle`` where
        ``<n>`` is ``len(train_indices)``. The pickle holds a dict with the
        keys "X_train", "Y_train", "X_test" and "Y_test".
    """
    def features_for(indices):
        # One feature vector per selected script.
        return embedding_lookup(embeddings, [data_table[index] for index in indices])

    def labels_for(indices):
        # One label per selected script.
        return np.array([data_table[index]["flag-any"] for index in indices])

    data = {
        "X_train": features_for(train_indices),
        "Y_train": labels_for(train_indices),
        "X_test": features_for(test_indices),
        "Y_test": labels_for(test_indices),
    }

    path = "%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_name, len(train_indices))
    # Pickle output is a byte stream, so the file is opened in binary mode.
    with open(path, "wb") as f:
        pickle.dump(data, f)

In [14]:
# Word2Vec algorithm
class WordEmbeddingGraph(object):
    """TensorFlow graph that learns word embeddings from tokenized scripts.

    Each example consists of ``context_size`` consecutive token ids (input)
    and the token id that follows them (target). The embeddings of the input
    tokens are summed and trained with an NCE loss.

    Args:
        data_table: sequence of script records; each record's "sha" entry
            names the script file under ``REPO_ROOT/scripts``.
        batch_size: number of examples fed per step.
        vocabulary_size: number of distinct token ids.
        embedding_size: width of each word embedding.
        context_size: number of consecutive tokens used as input.
        train_size: number of scripts sampled for training.
        test_size: number of scripts sampled for validation.
    """

    # Number of training batches between two progress reports.
    REPORT_INTERVAL = 2000

    def __init__(self, data_table, batch_size, vocabulary_size, embedding_size, context_size,
                 train_size, test_size):

        # Keep the table on the instance so batch generation uses the table
        # this graph was built for rather than a notebook global.
        self.data_table = data_table
        self.batch_size = batch_size
        self.context_size = context_size
        self.train_size = train_size

        # Seeding the module-level RNG fixes the train/validation split and
        # also makes the position sampling in generate_batch repeatable.
        random.seed(9812)
        indices = random.sample(range(len(data_table)), train_size + test_size)
        self.train_indices = indices[:train_size]
        # Slice from train_size instead of [-test_size:], which would return
        # every index when test_size is 0.
        self.validate_indices = indices[train_size:]

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),
                name="word_embeddings")

            # Row-wise L2 normalization of the embedding matrix.
            norm = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keep_dims=True))
            self.normalized_embeddings = self.embeddings / norm

            nce_weights = tf.Variable(
              tf.truncated_normal([vocabulary_size, embedding_size],
                                  stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

            # Placeholder shapes follow the constructor arguments so they
            # always match the arrays produced by generate_batch.
            self.x_ph = tf.placeholder(tf.int32, shape=[batch_size, context_size])
            self.y_ph = tf.placeholder(tf.int32, shape=[batch_size, 1])

            # Sum of the embeddings of the context tokens.
            embed = tf.add_n(
                [tf.nn.embedding_lookup(self.embeddings, self.x_ph[:, idx])
                 for idx in range(context_size)])

            # Compute the NCE loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(
                tf.nn.nce_loss(nce_weights, nce_biases, embed, self.y_ph,
                               64, vocabulary_size))

            # Full (non-sampled) scores over the vocabulary, used for prediction.
            output = tf.transpose(tf.matmul(nce_weights, tf.transpose(embed))) + nce_biases
            self.y_pred = tf.argmax(tf.nn.softmax(output), 1)

            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(self.loss)

            # Created last so it covers every variable of this graph; train()
            # then does not depend on the ambient default graph or session.
            self.init_op = tf.initialize_all_variables()

    def generate_batch(self, indices, sample_ratio):
        """Yield feed dicts of (context tokens, next token) examples.

        Args:
            indices: positions in the data table of the scripts to read.
            sample_ratio: fraction of the candidate positions of each script
                to sample; at least one position is sampled per usable script.

        Yields:
            ``{self.x_ph: x, self.y_ph: y}`` where ``x`` has shape
            ``(batch_size, context_size)`` and ``y`` has shape
            ``(batch_size, 1)``. Only full batches are yielded; examples left
            over at the end are dropped. Scripts with fewer than
            ``context_size + 1`` tokens are skipped.
        """
        x = np.ndarray(shape=(self.batch_size, self.context_size), dtype=np.int32)
        y = np.ndarray(shape=(self.batch_size, 1), dtype=np.int32)
        batch_idx = 0
        for index in indices:
            with open("%s/scripts/%s.js" % (REPO_ROOT, self.data_table[index]["sha"])) as f:
                js = f.read()

            tokens = [numerize(token) for token in util.tokenize_js(js)]
            if len(tokens) < self.context_size + 1:
                continue

            # Valid start positions are 0..sample_size inclusive.
            sample_size = len(tokens) - 1 - self.context_size
            num_to_sample = max(1, int(math.floor(sample_size * sample_ratio)))
            # Never request more positions than exist (sample_ratio > 1).
            num_to_sample = min(sample_size + 1, num_to_sample)
            for pos in random.sample(range(0, sample_size + 1), num_to_sample):
                x[batch_idx, :] = tokens[pos:pos + self.context_size]
                y[batch_idx, 0] = tokens[pos + self.context_size]
                batch_idx += 1
                if batch_idx == self.batch_size:
                    # Copies keep an already-yielded batch intact while the
                    # buffers are refilled for the next one.
                    yield {self.x_ph: x.copy(), self.y_ph: y.copy()}
                    batch_idx = 0

    def _validation_accuracy(self, session):
        """Return the mean prediction accuracy over sampled validation batches.

        Returns NaN when the validation scripts do not fill a single batch.
        """
        batches = 0
        accuracy_sum = 0.0
        for feed_dict in self.generate_batch(self.validate_indices, 0.01):
            predicted = session.run(self.y_pred, feed_dict=feed_dict)
            expected = feed_dict[self.y_ph][:, 0]
            accuracy_sum += float(np.count_nonzero(expected == predicted)) / self.batch_size
            batches += 1

        if batches == 0:
            return float("nan")
        return accuracy_sum / batches

    def train(self, session):
        """Initialize the variables and make one pass over the training scripts.

        Every ``REPORT_INTERVAL`` batches the mean training loss of that
        interval and the validation accuracy are printed.

        Args:
            session: a TensorFlow session bound to ``self.graph``.
        """
        session.run(self.init_op)

        interval = self.REPORT_INTERVAL
        loss_sum = 0
        for batch_idx, feed_dict in enumerate(self.generate_batch(self.train_indices, 1.0), 1):
            _, cur_loss = session.run([self.optimizer, self.loss], feed_dict=feed_dict)
            loss_sum += cur_loss

            if batch_idx % interval == 0:
                # Exactly `interval` losses were accumulated since the last report.
                print("%d Loss: %f, Accuracy: %f" % (
                    batch_idx,
                    loss_sum / interval,
                    self._validation_accuracy(session)))
                loss_sum = 0

In [15]:
# Train one Word2Vec graph per configured training-set size and store the
# resulting dataset built from its normalized word embeddings.
for train_size in TRAIN_SIZES:
    print("Word2Vec Training size %d" % train_size)

    g = WordEmbeddingGraph(
        data_table=data_table,
        batch_size=BATCH_SIZE,
        vocabulary_size=VOCAB_SIZE,
        embedding_size=WORD_EMBEDDING_SIZE,
        context_size=CONTEXT_SIZE,
        train_size=train_size,
        test_size=TEST_SIZE)

    with tf.Session(graph=g.graph) as session:
        g.train(session)
        # The embeddings must be evaluated while the session is still open.
        save_training_set(
            g.normalized_embeddings.eval(),
            data_table,
            g.train_indices,
            g.validate_indices)

print("Done training.")

Word2Vec Training size 300


KeyboardInterrupt: 

In [16]:
# Project the embedding vectors down to two dimensions with t-SNE so they can
# be plotted.
# NOTE(review): `final_embeddings` is not assigned by any cell visible in this
# notebook, which is why this cell fails with a NameError. It presumably should
# hold a trained embedding matrix (e.g. evaluated normalized word embeddings)
# -- confirm and assign it before running this cell.
model = TSNE(n_components=2, random_state=0)
points = model.fit_transform(final_embeddings)

NameError: name 'final_embeddings' is not defined

In [None]:
# Label for every vocabulary id in id order; the final entry stands for the
# id shared by all out-of-vocabulary tokens.
words = [token for token, _count in counts.most_common(VOCAB_SIZE-1)]
words.append("UNK")

fig, ax = plt.subplots(figsize=(12, 12))
# Markers are drawn with size 0: the call only sets the axis limits, the text
# labels added below are what is actually visible.
ax.plot(points[:100,0], points[:100,1], 'bo', markersize=0)

# Annotate the first 100 vocabulary entries at their projected coordinates.
for i in range(min(100, len(words))):
    ax.annotate(words[i], (points[i, 0], points[i, 1]))

In [15]:
# Doc2Vec algorithm
class DocEmbeddingGraph(object):
    """TensorFlow graph that learns one embedding per script (document).

    Each example consists of the document id, ``context_size`` consecutive
    token ids (input) and the token id that follows them (target). The
    document embedding is concatenated with the sum of the context word
    embeddings and trained with an NCE loss.

    Args:
        data_table: sequence of script records; each record's "sha" entry
            names the script file under ``REPO_ROOT/scripts``.
        batch_size: number of examples fed per step.
        vocabulary_size: number of distinct token ids.
        doc_embedding_size: width of each document embedding.
        word_embedding_size: width of each word embedding.
        context_size: number of consecutive tokens used as input.
        train_size: number of scripts sampled for training.
        test_size: number of scripts sampled as held-out set.
        word_embeddings: optional array; when given it is used as a constant
            instead of a trainable variable.
        nce_weights: optional array; when given it is used as a constant.
        nce_biases: optional array; when given it is used as a constant.
        use_test: when true, document embeddings are learned for the
            held-out scripts instead of the training scripts.
    """

    # Number of training batches between two progress reports.
    REPORT_INTERVAL = 2000

    def __init__(self, data_table, batch_size, vocabulary_size, doc_embedding_size,
                 word_embedding_size, context_size, train_size, test_size,
                 word_embeddings=None, nce_weights=None, nce_biases=None, use_test=False):

        # Keep the table on the instance so batch generation uses the table
        # this graph was built for rather than a notebook global.
        self.data_table = data_table
        self.batch_size = batch_size
        self.context_size = context_size

        # Seeding the module-level RNG fixes the split and also makes the
        # position sampling in generate_batch repeatable.
        random.seed(9812)
        indices = random.sample(range(len(data_table)), train_size + test_size)
        # Slice from train_size instead of [-test_size:], which would return
        # every index when test_size is 0.
        held_out = indices[train_size:]
        if not use_test:
            self.train_indices = indices[:train_size]
            self.test_indices = held_out
        else:
            # The held-out scripts are the ones whose embeddings get fitted.
            self.train_indices = held_out
            # Always defined so the attribute can be read in either mode.
            self.test_indices = []

        self.graph = tf.Graph()

        with self.graph.as_default():
            # One embedding row per script in train_indices, addressed by the
            # script's position in that list.
            self.doc_embeddings = tf.Variable(
                tf.random_uniform([len(self.train_indices), doc_embedding_size], -1.0, 1.0),
                name="doc_embeddings")

            if word_embeddings is None:
                self.word_embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, word_embedding_size], -1.0, 1.0),
                    name="word_embeddings")
            else:
                self.word_embeddings = tf.constant(word_embeddings, name="word_embeddings")

            all_embedding_size = doc_embedding_size + word_embedding_size

            # Row-wise L2 normalization of both embedding matrices.
            norm = tf.sqrt(tf.reduce_sum(tf.square(self.doc_embeddings), 1, keep_dims=True))
            self.normalized_doc_embeddings = self.doc_embeddings / norm

            norm = tf.sqrt(tf.reduce_sum(tf.square(self.word_embeddings), 1, keep_dims=True))
            self.normalized_word_embeddings = self.word_embeddings / norm

            if nce_weights is None:
                self.nce_weights = tf.Variable(
                  tf.truncated_normal([vocabulary_size, all_embedding_size],
                                      stddev=1.0 / math.sqrt(all_embedding_size)))
            else:
                self.nce_weights = tf.constant(nce_weights)

            if nce_biases is None:
                self.nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
            else:
                self.nce_biases = tf.constant(nce_biases)

            # Placeholder shapes follow the constructor arguments so they
            # always match the arrays produced by generate_batch.
            self.x_ph = tf.placeholder(tf.int32, shape=[batch_size, context_size])
            self.doc_ph = tf.placeholder(tf.int32, shape=[batch_size])
            self.y_ph = tf.placeholder(tf.int32, shape=[batch_size, 1])

            doc_embed = tf.nn.embedding_lookup(self.doc_embeddings, self.doc_ph)
            # Sum of the embeddings of the context tokens.
            words_embed = tf.add_n(
                [tf.nn.embedding_lookup(self.word_embeddings, self.x_ph[:, idx])
                 for idx in range(context_size)])

            # Axis-first argument order, as used by the TensorFlow release
            # this notebook targets.
            embed = tf.concat(1, [doc_embed, words_embed])

            # Compute the NCE loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(
                tf.nn.nce_loss(self.nce_weights, self.nce_biases, embed, self.y_ph,
                               64, vocabulary_size))

            # Full (non-sampled) scores over the vocabulary, used for prediction.
            output = tf.transpose(tf.matmul(self.nce_weights, tf.transpose(embed))) + self.nce_biases
            self.y_pred = tf.argmax(tf.nn.softmax(output), 1)

            self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)

            # Created after the optimizer so it also covers the optimizer's
            # own variables; train() then does not depend on the ambient
            # default graph or session.
            self.init_op = tf.initialize_all_variables()

    def generate_batch(self, indices, sample_ratio):
        """Yield feed dicts of (document, context tokens, next token) examples.

        Args:
            indices: positions in the data table of the scripts to read. The
                position of a script within ``indices`` is fed as its
                document id.
            sample_ratio: fraction of the candidate positions of each script
                to sample; at least one position is sampled per usable script.

        Yields:
            ``{self.x_ph: x, self.doc_ph: di, self.y_ph: y}`` with shapes
            ``(batch_size, context_size)``, ``(batch_size,)`` and
            ``(batch_size, 1)``. Only full batches are yielded; examples left
            over at the end are dropped. Scripts with fewer than
            ``context_size + 1`` tokens are skipped.
        """
        x = np.ndarray(shape=(self.batch_size, self.context_size), dtype=np.int32)
        di = np.ndarray(shape=(self.batch_size), dtype=np.int32)
        y = np.ndarray(shape=(self.batch_size, 1), dtype=np.int32)
        batch_idx = 0
        for doc_idx, index in enumerate(indices):
            with open("%s/scripts/%s.js" % (REPO_ROOT, self.data_table[index]["sha"])) as f:
                js = f.read()

            tokens = [numerize(token) for token in util.tokenize_js(js)]
            if len(tokens) < self.context_size + 1:
                continue

            # Valid start positions are 0..sample_size inclusive.
            sample_size = len(tokens) - 1 - self.context_size
            num_to_sample = max(1, int(math.floor(sample_size * sample_ratio)))
            # Never request more positions than exist (sample_ratio > 1).
            num_to_sample = min(sample_size + 1, num_to_sample)
            for pos in random.sample(range(0, sample_size + 1), num_to_sample):
                di[batch_idx] = doc_idx
                x[batch_idx, :] = tokens[pos:pos + self.context_size]
                y[batch_idx, 0] = tokens[pos + self.context_size]
                batch_idx += 1
                if batch_idx == self.batch_size:
                    # Copies keep an already-yielded batch intact while the
                    # buffers are refilled for the next one.
                    yield {self.x_ph: x.copy(), self.doc_ph: di.copy(), self.y_ph: y.copy()}
                    batch_idx = 0

    def train(self, session):
        """Initialize the variables and make one pass over ``train_indices``.

        Every ``REPORT_INTERVAL`` batches the mean training loss of that
        interval is printed.

        Args:
            session: a TensorFlow session bound to ``self.graph``.
        """
        session.run(self.init_op)

        interval = self.REPORT_INTERVAL
        loss_sum = 0
        for batch_idx, feed_dict in enumerate(self.generate_batch(self.train_indices, 1.0), 1):
            _, cur_loss = session.run([self.optimizer, self.loss], feed_dict=feed_dict)
            loss_sum += cur_loss

            if batch_idx % interval == 0:
                # Exactly `interval` losses were accumulated since the last report.
                print("%d Loss: %f" % (batch_idx, loss_sum / interval))
                loss_sum = 0

In [None]:
# Holds the Doc2Vec dataset; all four keys are overwritten on every pass of
# the loop before the dict is pickled for that training size.
model_data = {}

for train_size in TRAIN_SIZES:
    print("Doc2Vec Training size %d" % train_size)

    # Stage 1: learn document embeddings for the training scripts together
    # with the word embeddings and the NCE parameters.
    g = DocEmbeddingGraph(
        data_table, BATCH_SIZE, VOCAB_SIZE, DOC_EMBEDDING_SIZE, WORD_EMBEDDING_SIZE, CONTEXT_SIZE,
        train_size, TEST_SIZE)

    with tf.Session(graph=g.graph) as session:
        g.train(session)

        model_data["X_train"] = g.doc_embeddings.eval()
        model_data["Y_train"] = np.array([data_table[index]["flag-any"] for index in g.train_indices])

        # Evaluated here because they are fed to the second graph as constants.
        word_embeddings = g.word_embeddings.eval()
        nce_weights = g.nce_weights.eval()
        nce_biases = g.nce_biases.eval()

        print("Done training. Embeddings: %s, %s, NCE weights %s, %s" % (
            np.shape(model_data["X_train"]), np.shape(word_embeddings),
            np.shape(nce_weights), np.shape(nce_biases)))

        # NOTE(review): save_training_set writes dataset_Word2Vec_<n>.pickle,
        # the same file the Word2Vec training cell produces -- confirm that
        # overwriting it with these word embeddings is intended.
        save_training_set(g.normalized_word_embeddings.eval(), data_table, g.train_indices, g.test_indices)

    # Stage 2: with word embeddings and NCE parameters frozen, fit document
    # embeddings for the held-out scripts (use_test=True).
    g2 = DocEmbeddingGraph(
        data_table, BATCH_SIZE, VOCAB_SIZE, DOC_EMBEDDING_SIZE, WORD_EMBEDDING_SIZE, CONTEXT_SIZE,
        train_size, TEST_SIZE, word_embeddings, nce_weights, nce_biases, True)

    with tf.Session(graph=g2.graph) as session:
        g2.train(session)

        model_data["X_test"] = g2.doc_embeddings.eval()
        # In use_test mode g2.train_indices holds the held-out scripts.
        model_data["Y_test"] = np.array([data_table[index]["flag-any"] for index in g2.train_indices])

        print("Done projecting test set. %s" % (np.shape(model_data["X_test"]),))

    # Pickle output is a byte stream, so the file is opened in binary mode.
    with open("%s/model-data/dataset_Doc2Vec_%d.pickle" % (REPO_ROOT, train_size), "wb") as f:
        pickle.dump(model_data, f)

Doc2Vec Training size 300
