In [1]:
import os
import collections
import random
import math
import click
import numpy as np
import tensorflow as tf
from six.moves import xrange
import mlflow
from scipy.spatial import distance
from scipy.stats import kurtosis, skew
from bokeh.plotting import figure, show, output_notebook
output_notebook()

In [2]:
# MLflow experiment IDs used by the runs below.
# Presumably obtained once from mlflow.create_experiment('Model') and
# mlflow.create_experiment('Returns'); verify against the tracking store
# before reusing this notebook elsewhere.
model_experiment, returns_experiment = 1, 2

In [3]:
class Word2Vec:
    def __init__(self,filename
                 , vocabulary_size=50000
                 , batch_size=128
                 , skip_window=1
                 , num_skips=2
                 , embedding_size = 128
                 , num_sampled = 64
                 , num_steps = 100001
                ):
        self.filename = filename
        self.vocabulary_size = vocabulary_size
        self.data_index = 0
        self.batch_size = batch_size
        self.skip_window = skip_window  # Number of words to consider left and right.
        self.num_skips = num_skips # Number of times to reuse a window to generate a label
        # self.embedding_size is the length of the word vector, empirically it should be in range [50, 2000].
        self.embedding_size = embedding_size
        self.num_sampled = num_sampled  # Number of negative examples to sample.
        self.num_steps = num_steps # Number of steps the model will be trained for.
        # The below is only for displaynig model accuracy, remove this in production.
        # We pick a random validation set to sample nearest neighbors. 
        # Here we limit the validation samples to the words that have a low numeric ID, which by construction are also the most frequent. 
        # These 3 variables are used only for displaying model accuracy, they don't affect calculation.
        self.valid_size = 16  # Random set of words to evaluate similarity on.
        self.valid_window = 100  # Only pick samples in the head of the distribution.
        self.valid_examples = np.random.choice(self.valid_window, self.valid_size, replace=False)
        self.min_loss = 99999999999
        self.best_embeddings = None

    def run(self):
        self.load_vocabulary()
        self.build_dataset()
        self.build_model()
        self.train_model()

    def load_vocabulary(self):
        self.vocabulary = []
        f = open(self.filename, 'r')
        for l in f.readlines():
            for r in l.split(" "):
                if r.strip() == "":
                    continue
                self.vocabulary.append(r)
        f.close()
        print('Success: Loaded Vocabulary.')
        print('Vocabulary size :', len(self.vocabulary))

    def build_dataset(self):
        # Compose count (a list of lists which maps top n words to their frequencies in a descending fashion)
        k = []
        k.extend(collections.Counter(self.vocabulary).most_common(len(set(self.vocabulary))))
        count = [['UNK', -1]]
        for x in k:
            if x[1] >= 2:
                count.append(x)
        self.vocabulary_size = math.floor(len(count))
        dictionary = {}
        # Compose dictionary (a one-to-one mapping of words to some integer)
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = []
        unk_count = 0
        # Compose data (a list of integers (corresponding to the each word in vocabulary))
        for word in self.vocabulary:
            index = dictionary.get(word, 0)
            # If index equals zero, then its a rare word(UNK).
            if index == 0:
                unk_count += 1
            data.append(index)
        # Update the frequency of rare words(UNK) from -1.
        count[0][1] = unk_count
        # Compose reversed_dictionary ( a one-to-one mapping of integer codes to their string representation(words))
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        self.data = data
        self.count = count
        self.unused_dictionary = dictionary
        self.reverse_dictionary = reverse_dictionary
        print('Success: Built Dataset.')

    def generate_batch(self):
        assert self.batch_size % self.num_skips == 0
        assert self.num_skips <= 2 * self.skip_window
        batch = np.ndarray(shape=(self.batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(self.batch_size, 1), dtype=np.int32)
        # Compute the length/span of the scanning window [ self.skip_window target self.skip_window ]
        span = 2 * self.skip_window + 1
        buffer = collections.deque(maxlen=span)
        # Reset the data_index to zero if the scanning window has reached the end of vocabulary.
        if self.data_index + span > len(self.data):
            self.data_index = 0
        buffer.extend(self.data[self.data_index:self.data_index + span])
        # Update the data_index value so that it points to end of scanning window.
        self.data_index += span
        for i in range(self.batch_size // self.num_skips):
            # Get the indexes for outer context_words surronding the center word.
#             context_words = [w for w in range(span) if w != self.skip_window]
            context_words = []
            for w in range(0, span):
#             for w in range(math.floor(skip_window / 2), span):
                if w == self.skip_window:
                    continue
                context_words.append(w)
            # Select a sub sample of the 'numm_skips' words to be used from the context_words.
            words_to_use = random.sample(context_words, self.num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * self.num_skips + j] = buffer[self.skip_window]
                labels[i * self.num_skips + j, 0] = buffer[context_word]
            # Shift the scanning window to the right by inserting element pointed by data_index.
            if self.data_index == len(self.data):
                buffer.extend(self.data[0:span])
                self.data_index = span
            else:
                buffer.append(self.data[self.data_index])
                self.data_index += 1
        # Adjust data_index by 'span' so that no words are skipped.
        self.data_index = (self.data_index + len(self.data) - span) % len(self.data)
        # batch(row vector) contains the 'self.batch_size' number of integer ids(context words)
        # labels(column vector) contain the integer ids for the center/target words
        return batch, labels

    def build_model(self):
        """Build the TensorFlow 1.x skip-gram graph and store its handles on ``self``.

        Reads ``self.batch_size``, ``self.vocabulary_size``, ``self.embedding_size``,
        ``self.num_sampled`` and ``self.valid_examples``. ``run()`` calls this after
        ``build_dataset()``, which is where ``self.vocabulary_size`` gets its final value.

        Sets:
            self.train_inputs, self.train_labels: int32 placeholders of shape
                [batch_size] and [batch_size, 1].
            self.loss: Mean NCE loss over the batch.
            self.optimizer: Gradient-descent minimize op (learning rate 1.0).
            self.normalized_embeddings: Embedding matrix with each row divided by its L2 norm.
            self.similarity: Product of the normalized validation rows with all
                normalized embeddings (cosine similarity).
            self.graph, self.init: The graph and its variable initializer.
        """
        graph = tf.Graph()
        with graph.as_default():
            # Input data.
            with tf.name_scope('inputs'):
                self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
                self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])
                # NOTE(review): valid_examples are drawn from range(valid_window) in __init__;
                # looks like the lookup below would be out of range if vocabulary_size is
                # smaller than valid_window — confirm for small corpora.
                valid_dataset = tf.constant(self.valid_examples, dtype=tf.int32)
            # Ops and variables pinned to the CPU because of missing GPU implementation
            with tf.device('/cpu:0'):
                # Look up embeddings for inputs.
                with tf.name_scope('embeddings'):
                    # One row per vocabulary entry, initialised uniformly in [-1, 1).
                    embeddings = tf.Variable(tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))
                    # We need to look up the embeddings corresponding to the integer ids in the training inputs.
                    embed = tf.nn.embedding_lookup(embeddings, self.train_inputs)
                # Construct the variables for the NCE loss
                with tf.name_scope('weights'):
                    nce_weights = tf.Variable(tf.truncated_normal([self.vocabulary_size, self.embedding_size], stddev=1.0 / math.sqrt(self.embedding_size)))
                with tf.name_scope('biases'):
                    nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))

            # Define the loss as average NCE loss for the batch.
            # tf.nce_loss automatically draws a new sample of the negative labels each time we evaluate the loss.
            # Explanation of the meaning of NCE loss: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
            with tf.name_scope('loss'):
                self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                                                     labels=self.train_labels, inputs=embed,
                                                     num_sampled=self.num_sampled, num_classes=self.vocabulary_size))
            # Construct the SGD optimizer using a learning rate of 1.0.
            with tf.name_scope('optimizer'):
                self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.loss)
            # Compute the cosine similarity between the validation examples and all embeddings.
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
            self.normalized_embeddings = embeddings / norm
            valid_embeddings = tf.nn.embedding_lookup(self.normalized_embeddings, valid_dataset)
            # NOTE(review): self.similarity is not read anywhere in the visible code.
            self.similarity = tf.matmul(valid_embeddings, self.normalized_embeddings, transpose_b=True)
            # Add variable initializer.
            init = tf.global_variables_initializer()
        self.graph = graph
        self.init = init
        print('Success: Built Model.')

    def train_model(self):
        self.log_to_mlflow()
        try:
            with tf.Session(graph=self.graph) as session:
                # We must initialize graph variables before we use them.
                self.init.run()
                print('Initialized Model Parameters.')
                average_loss = 0
                for step in xrange(self.num_steps):
                    batch_inputs, batch_labels = self.generate_batch()
                    feed_dict = {self.train_inputs: batch_inputs, self.train_labels: batch_labels}
                    _, loss_val = session.run([self.optimizer, self.loss], feed_dict=feed_dict)
                    mlflow.log_metric('step', step)
                    mlflow.log_metric('loss', loss_val)
                    average_loss += loss_val
                    if step % 2000 == 0:
                        if step > 0:
                            average_loss /= 2000
                        # The average loss is an estimate of the loss over the last 2000 batches.
                        print('Average loss at step ', step, ': ', average_loss)
                        average_loss = 0
                    self.final_embeddings = self.normalized_embeddings.eval()
                    if loss_val <= self.min_loss:
                        self.min_loss = loss_val
                        print('Min Loss :', self.min_loss)
                        self.best_embeddings = self.final_embeddings
            print('Success: Trained Model.')
        finally:
#             self.save_and_log_embeddings()
            self.compute_distances()
            self.compute_probabilities()

    def visualize(self, filename='tsne.png'):
        """Project up to 500 embeddings to 2-D with t-SNE, plot, save and log the image.

        Expensive to compute; remove this in production. Uses
        ``self.final_embeddings`` and labels each point with its word from
        ``self.reverse_dictionary``. The figure is saved under the current
        working directory and logged as an MLflow artifact.

        Args:
            filename: Name of the image file to write.
        """
        try:
            from sklearn.manifold import TSNE
            import matplotlib.pyplot as plt
            reducer = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
            shown = min(500, len(self.reverse_dictionary))
            points = reducer.fit_transform(self.final_embeddings[:shown, :])
            words = [self.reverse_dictionary[idx] for idx in xrange(shown)]
            target = os.path.join(os.getcwd(), filename)

            assert points.shape[0] >= len(words), 'More labels than embeddings'
            plt.figure(figsize=(18, 18))  # in inches
            for (x, y), word in zip(points, words):
                plt.scatter(x, y)
                plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
            plt.savefig(target)
            plt.show()

            mlflow.log_artifact(filename)
        except ImportError as ex:
            print('Please install sklearn, matplotlib, and scipy to show embeddings.')
            print(ex)
            
    def save_and_log_embeddings(self):
        classes_filename = 'output_classes.txt'
        embeddings_filename = 'output_embeddings.txt'
        classes = open(classes_filename, 'w')
        embeddings = open(embeddings_filename, 'w')
        for key in self.reverse_dictionary:
            classes.write(str(self.reverse_dictionary[key]) + '\n')
            embeddings.write(' '.join(map(str, self.best_embeddings[0])) + '\n')
        classes.close()
        embeddings.close()
        mlflow.log_artifact(classes_filename)
        mlflow.log_artifact(embeddings_filename)
        print('Success: Saved Embeddings.')
            
    def log_to_mlflow(self):
        mlflow.log_artifact(self.filename)
        mlflow.log_param('vocabulary_size', self.vocabulary_size)
        mlflow.log_param('batch_size', self.batch_size)
        mlflow.log_param('skip_window', self.skip_window)
        mlflow.log_param('num_skips', self.num_skips)
        mlflow.log_param('embedding_size', self.embedding_size)
        mlflow.log_param('num_sampled', self.num_sampled)
        mlflow.log_param('num_steps', self.num_steps)
    
    def compute_distances(self):
        self.distances = distance.cdist(self.best_embeddings, self.best_embeddings, 'euclidean')
        print('Success: Computed Distances.')
        
    def compute_probabilities(self):
        x = 1 / self.distances
        np.fill_diagonal(x, 0)
        self.probabilities = x / x.sum(axis=1, keepdims=True)
        print('Success: Computed Probabilities.')
        
    def draw_distribution(self, num):
        probs = []
        vals = []
        for key in self.unused_dictionary:
            if key == 'UNK':
                continue
            vals.append(float(key))
        vals = sorted(vals)
        keys = []
        for v in vals:
            keys.append(self.unused_dictionary[str(v)])
        for k in keys:
            probs.append(self.probabilities[self.unused_dictionary[str(float(num))]][k])
        vals_strings = []
        for v in vals:
            vals_strings.append(str(v))
        p = figure(x_range=vals_strings, plot_height=500, plot_width=500, title="Return : " + str(float(num)))
        p.vbar(x=vals_strings, top=probs, width=0.9)
        p.xgrid.grid_line_color = None
        p.y_range.start = 0
        mean = np.array(vals).mean()
        std = np.array(vals).std()
        print('Mean :', mean)
        print('Std  :', abs(std))
        p.vbar(x=[str(round(mean, 2))], top=[np.array(probs).max()], width=1.8, color='black')
        p.vbar(x=[str(round(mean + abs(mean - std), 2))], top=[np.array(probs).max()], width=1.8, color='red')
        p.vbar(x=[str(round(mean - abs(mean - std), 2))], top=[np.array(probs).max()], width=1.8, color='red')
        show(p)
        
    def compute_expected_val_probs(self, num, required_prob = 0.1):
        if str(float(num)) not in self.unused_dictionary:
            return {}
        vals = list(self.unused_dictionary.keys())[1:]
        probs = self.probabilities[self.unused_dictionary[str(float(num))]][1:]
        val_to_prob = {}
        for i in range(len(vals)):
            val_to_prob[vals[i]] = probs[i]
        expected_val_probs = {}
        p = 0
        for k, v in sorted(val_to_prob.items(), key=lambda x: x[1], reverse=True):
            if p >= required_prob:
                break
            expected_val_probs[float(k)] = v
            p += v
        return expected_val_probs

    def breadth_search(self, n, change_percent=0.2, required_prob=0.1, top_samples=0):
        l = [k for k in self.unused_dictionary][1:]
        x = round((1 - change_percent) * n, 2)
        y = []
        while True:
            if x <= round((1 + change_percent) * n, 2):
                if str(x) in l:
                    y.append(x)
            else:
                break
            x = round(x + 0.01, 2)
        z = []
        for v in y:
            if str(float(v)) in self.unused_dictionary:
                z.append(v)
        val_probs = {}
        breadth_val_probs = {}
        if top_samples == 0:
            for num in z:
                expected_val_probs = self.compute_expected_val_probs(num, required_prob)
                for key in expected_val_probs:
                    if key in val_probs:
                        val_probs[key] += expected_val_probs[key]
                    else:
                        val_probs[key] = expected_val_probs[key]
            for k, v in sorted(val_probs.items(), key=lambda x: x[1], reverse=True):
                breadth_val_probs[k] = v / len(z)
        else:
            for num in z:
                expected_val_probs = self.compute_expected_val_probs(num, 1)
                for key in expected_val_probs:
                    if key in val_probs:
                        val_probs[key] += expected_val_probs[key]
                    else:
                        val_probs[key] = expected_val_probs[key]
            total = 0
            for key in val_probs:
                total += val_probs[key]
            normalized_val_probs = {}
            for key in val_probs:
                normalized_val_probs[key] = val_probs[key] / total
            count = 0
            for k, v in sorted(normalized_val_probs.items(), key=lambda x: x[1], reverse=True):
                breadth_val_probs[k] = v
                count += 1
                if count >= top_samples:
                    break
        return breadth_val_probs

    def depth_search(self, num_list, change_percent=0.2, required_prob=0.1):
        current_val_probs = {}
        for num in num_list:
            breadth_val_probs = self.breadth_search(num, change_percent, 1, 0)
            for key in breadth_val_probs:
                if key in current_val_probs:
                    current_val_probs[key] *= breadth_val_probs[key]
                else:
                    current_val_probs[key] = breadth_val_probs[key]
            total = 0
            for key in current_val_probs:
                total += current_val_probs[key]
            normalized_val_probs = {}
            for key in current_val_probs:
                normalized_val_probs[key] = current_val_probs[key] / total
            current_val_probs = normalized_val_probs
        depth_val_probs = {}
        p = 0
        for k, v in sorted(current_val_probs.items(), key=lambda x: x[1], reverse=True):
            if p >= required_prob:
                break
            depth_val_probs[float(k)] = v
            p += v
        return depth_val_probs
        

In [6]:
# Training corpus for the selected ticker.
ticker = 'AAPL'
filename = '{}_returns_train.txt'.format(ticker)

# Hyperparameters (use these in production).
embedding_size = 300
skip_window = 20
num_skips = 16
batch_size = 128
num_sampled = 32
# vocabulary_size is not set here: build_dataset() derives it as the number of
# tokens with frequency greater than 1, plus the 'UNK' entry.

In [7]:
num_steps = 2001
# vocabulary_size is passed as None because build_dataset() overwrites it.
model = Word2Vec(
    filename=filename,
    vocabulary_size=None,
    batch_size=batch_size,
    skip_window=skip_window,
    num_skips=num_skips,
    embedding_size=embedding_size,
    num_sampled=num_sampled,
    num_steps=num_steps,
)
with mlflow.start_run(experiment_id=model_experiment):
    model.run()

W0702 16:28:46.797079 140081117902656 deprecation.py:323] From /home/demq/.anaconda3/envs/dq/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Success: Loaded Vocabulary.
Vocabulary size : 1549
Success: Built Dataset.
Success: Built Model.
Initialized Model Parameters.
Average loss at step  0 :  28.162139892578125
Min Loss : 28.16214
Min Loss : 17.595816
Min Loss : 6.589919
Min Loss : 3.537791
Min Loss : 2.2679682
Success: Computed Distances.
Success: Computed Probabilities.




MlflowException: Got invalid value inf for metric 'loss' (timestamp=1562065130516). Please specify value as a valid double (64-bit floating point)

In [8]:
# Plot the learned probabilities of each known return value given a return of 0.
model.draw_distribution(0)

Mean : 0.0028007092531630857
Std  : 0.018561790921332606


In [None]:
# Load the evaluation series: every whitespace-separated token is parsed as a float.
new_returns = []
# The context manager closes the file even if a token fails to parse.
with open('vocabulary_snp_returns.txt', 'r') as f:
    for l in f:
        # split() never yields empty strings, so no blank-token check is needed.
        new_returns.extend(float(r) for r in l.split())

In [None]:
# Mean-reversion check on `new_returns`: a "win" is counted whenever the mean
# of the window ending `future_window` steps ago and the mean of the window
# starting now have opposite signs; everything else (including a zero mean)
# counts as a "loss". `alpha` is the win rate in percent.
#
# The model-based branch (depth_search expectations feeding the TP/FP/TN/FN
# "leader" bookkeeping) is not wired in: its inputs were never computed, so the
# code could not run. The related names are still defined so that anything
# relying on them keeps working; confusion_matrix therefore stays all zeros.
future_window = skip_window
change_percent = 0.2
required_prob = 1
leader_persistence = 2
confusion_matrix = {'TP' : 0, 'FP' : 0, 'TN' : 0, 'FN' : 0}
tp_fp_leader = None
tn_fn_leader = None
tp_fp_leader_counter = 0
tn_fn_leader_counter = 0
win, loss, alpha = 0, 0, 0

# The first position with a complete `past` window is 2 * future_window; the
# last element of the series is never used as a starting position.
for i in range(max(2 * future_window, 0), len(new_returns) - 1):
    past = new_returns[(i - 2 * future_window):(i - future_window)]
    future = new_returns[i:(i + future_window)]
    if not past or not future:
        continue
    past_mean = np.array(past).mean()
    future_mean = np.array(future).mean()
    reverted = (past_mean > 0 and future_mean < 0) or (past_mean < 0 and future_mean > 0)
    if reverted:
        win += 1
    else:
        loss += 1

# Report once at the end instead of printing the running rate on every step.
if (win + loss) != 0:
    alpha = round(win / (win + loss) * 100, 2)
print(confusion_matrix, win, loss, alpha)