In [None]:
# Global constants: calendar-year bounds used when splitting the data
firstYear = 2011   # earliest year admitted to the training split
splitYear = 2014   # last year of the training split; later years form the test split

# Make the TensorFlow Graph

In [None]:
# Hyper-parameters for the recurrent network and its training loop
trainDim, windowLen = 11, 50          # features per time step, time steps per input window
num_hidden = 32                       # size of the recurrent cell
num_epochs, minibatch_size = 25, 64   # training epochs, examples per mini-batch
lr = 0.001                            # learning rate handed to the optimizer
seed = 0                              # random seed constant

In [None]:
# Build the TensorFlow (1.x graph-mode) model: one recurrent layer whose final
# state is later projected to a single logit per input window.
import tensorflow as tf

# Start from an empty default graph so that re-running this cell does not
# collide with the variables "W1"/"b1" created by an earlier run.
tf.reset_default_graph()

# Apply the configured seed so weight initialisation is reproducible.
# This must come after reset_default_graph(): the graph-level seed belongs to
# the current default graph and would be discarded by the reset.
tf.set_random_seed(seed)

# placeholders
inputs = tf.placeholder(tf.float32, shape=(None, windowLen, trainDim), name='inputs')
labels = tf.placeholder(tf.float32, shape=(None, 1), name='labels')
lengths = tf.placeholder(tf.int64, shape=(None,), name='lengths')

# create the cells
#cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
cell = tf.nn.rnn_cell.BasicRNNCell(num_hidden)
# `output` holds the per-step outputs and is overwritten by the read-out layer
# in a later cell; only `states` feeds the rest of the model.
output, states = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, sequence_length=lengths)

# output layer weight matrices
W1 = tf.get_variable("W1", shape=[num_hidden, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable("b1", shape=[1, 1],
                     initializer=tf.zeros_initializer())

In [None]:
# Read-out layer and loss
# Project the state returned by dynamic_rnn to one logit per example.
projected = tf.matmul(states, W1)
output = tf.add(projected, b1)

# Sigmoid cross-entropy per example, summed (not averaged) over the batch.
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=labels)
cost = tf.reduce_sum(per_example_loss)

In [None]:
# Training op and variable initialiser
adam = tf.train.AdamOptimizer(learning_rate=lr)
# Despite its name, `optimizer` is the op returned by minimize(), not the optimizer object.
optimizer = adam.minimize(cost)
# Created after minimize() so it also covers any variables the optimizer added.
init = tf.global_variables_initializer()

# Load in the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain

In [None]:
# Create the competition environment object from the `twosigmanews` Kaggle module.
# `env` is used below to obtain the training data.
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()
print('Done!')

In [None]:
# get_training_data() returns a pair; only the first element is kept, the second is discarded.
market_train_df, _ = env.get_training_data()

In [None]:
# pre-process and scale
cat_cols = ['assetCode']
num_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
            'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
            'returnsOpenPrevMktres10']

from sklearn.preprocessing import StandardScaler
print('scaling numerical columns')
scaler = StandardScaler()

# Fit the scaler on the training years only (firstYear..splitYear inclusive, the
# same range the train/test split uses). Fitting on every row would leak the
# mean/variance of the held-out test years into the training features.
_row_year = pd.to_datetime(market_train_df.time).dt.year
_fit_rows = (_row_year >= firstYear) & (_row_year <= splitYear)
scaler.fit(market_train_df.loc[_fit_rows, num_cols])

# Apply the training-period statistics to every row (train and test alike).
# NOTE: this overwrites the raw columns in place, so re-running the cell scales
# already-scaled data; reload market_train_df before re-running.
market_train_df[num_cols] = scaler.transform(market_train_df[num_cols])

# After standardisation 0 is the training mean, so missing values are imputed with it.
market_train_df[num_cols] = market_train_df[num_cols].fillna(0)

In [None]:
# Derive the binary target and the calendar year of each row
next_returns = market_train_df['returnsOpenNextMktres10']
# y is 1 where the next-period return is strictly positive, else 0
market_train_df['y'] = next_returns.gt(0).values.astype(int)
market_train_df['year'] = pd.to_datetime(market_train_df['time']).dt.year

In [None]:
# Chronological split: years firstYear..splitYear (inclusive) train, later years test.
# Rows from before firstYear end up in neither split.
row_year = market_train_df['year']
train = market_train_df[row_year.between(firstYear, splitYear)]
test = market_train_df[row_year > splitYear]

In [None]:
# function extract all the sequences
import datetime
def sequenceExtraction(df, window_len=None, feature_cols=None):
    """Slice each asset's rows into fixed-length sliding windows.

    For every asset (grouped by ``assetCode``, in order of first appearance)
    and every start offset ``i``, one sample is emitted whose features are
    rows ``i .. i + window_len - 1`` and whose target, return, universe flag
    and timestamp are taken from row ``i + window_len`` (the row immediately
    after the window). Assets with ``window_len`` rows or fewer yield nothing.

    NOTE(review): the target row's own features are not part of its window;
    confirm this one-row offset is intended.

    Parameters
    ----------
    df : DataFrame with columns ``assetCode``, ``time``, ``y``,
        ``returnsOpenNextMktres10``, ``universe`` and the feature columns.
    window_len : int, optional
        Rows per window. Defaults to the notebook-level ``windowLen``.
    feature_cols : list of str, optional
        Feature column names. Defaults to the notebook-level ``num_cols``.

    Returns
    -------
    tuple ``(X, y, r, u, d)`` of equally long lists: feature windows
    (arrays of shape ``(window_len, len(feature_cols))``), targets, returns,
    universe flags and timestamps.
    """
    if window_len is None:
        window_len = windowLen
    if feature_cols is None:
        feature_cols = num_cols

    # prepare to store the data
    X, y, r, u, d = [], [], [], [], []

    # One groupby pass replaces re-filtering the whole frame for every asset.
    # sort=False keeps assets in order of first appearance.
    grouped = df.groupby('assetCode', sort=False, observed=True)
    for cntr, (assetCode, data) in enumerate(grouped):
        # progress log every 100 assets
        if cntr % 100 == 0:
            print(cntr, datetime.datetime.now())

        # Windows assume chronological rows; warn (but continue) otherwise.
        # `is_monotonic` was removed in pandas 2.0, hence the explicit name.
        if not data.time.is_monotonic_increasing:
            print('not sequential time series data!')

        Xvalues = data[feature_cols].values
        outcome = data['y'].values
        rvalues = data['returnsOpenNextMktres10'].values
        uvalues = data['universe'].values
        dvalues = data['time'].values

        # generate a sliding window of data
        for i in range(data.shape[0] - window_len):
            target_row = i + window_len
            X.append(Xvalues[i:target_row])
            y.append(outcome[target_row])
            r.append(rvalues[target_row])
            u.append(uvalues[target_row])
            d.append(dvalues[target_row])

    return (X, y, r, u, d)

# Build the sliding-window samples separately for each split. Each call returns
# five parallel lists: feature windows, targets, returns, universe flags, timestamps.
X_train, y_train, r_train, u_train, d_train = sequenceExtraction(train)
X_test, y_test, r_test, u_test, d_test = sequenceExtraction(test)

# Train neural net model

In [None]:
# mini-batch generation function
from pdb import set_trace as t
import math
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0, random = True):
    """
    Partition (X, Y) into consecutive mini-batches, optionally shuffled first.

    X -- sequence of equally shaped 2-D arrays, one per example
    Y -- sequence of labels aligned with X
    mini_batch_size -- examples per batch; the final batch may be smaller
    seed -- value passed to np.random.seed (the global NumPy RNG is always reseeded)
    random -- shuffle the examples when True, keep the given order when False

    Returns a list of (batch_X, batch_Y) tuples, where batch_X stacks the
    examples along a new leading axis and batch_Y is a column array.
    """
    np.random.seed(seed)
    total = len(X)                  # number of examples

    # Visiting order of the examples
    order = list(np.random.permutation(total)) if random else range(total)

    # Stepping through `order` yields every full batch, then the short remainder (if any).
    batches = []
    for start in range(0, total, mini_batch_size):
        chosen = order[start:start + mini_batch_size]
        stacked_X = np.dstack([X[idx] for idx in chosen]).transpose([2, 0, 1])
        column_Y = np.asarray([Y[idx] for idx in chosen]).reshape(-1, 1)
        batches.append((stacked_X, column_Y))

    return batches

In [None]:
# make the mini-batches
# Pass the configured batch size and seed explicitly rather than relying on the
# helper's defaults, so changing the constants actually takes effect.
# Shuffled batches for training:
minibatches = random_mini_batches(X_train, y_train, mini_batch_size=minibatch_size, seed=seed)
# Unshuffled batches for prediction, so predictions stay aligned with the
# y/r/u/d arrays produced by sequenceExtraction:
trainMiniBatches = random_mini_batches(X_train, y_train, mini_batch_size=minibatch_size,
                                       seed=seed, random=False)
testMiniBatches = random_mini_batches(X_test, y_test, mini_batch_size=minibatch_size,
                                      seed=seed, random=False)

In [None]:
# Convert the per-sample lists (targets, returns, universe flags, timestamps)
# into column arrays of shape (n_samples, 1), for each split.
y_train, r_train, u_train, d_train = (
    np.asarray(values).reshape(-1, 1)
    for values in (y_train, r_train, u_train, d_train)
)

y_test, r_test, u_test, d_test = (
    np.asarray(values).reshape(-1, 1)
    for values in (y_test, r_test, u_test, d_test)
)

In [None]:
# Train the network, then collect its raw logits on the train and test batches
def _collect_logits(sess, batches):
    """Evaluate `output` on each (X, Y) batch in order and return all logits as one flat list."""
    collected = []
    for batch_X, batch_Y in batches:
        # Every window is full length, so each sequence length equals windowLen.
        seqLens = np.repeat(windowLen, batch_Y.shape[0])
        # Inference only: labels are not fed and no training op is run.
        batch_logits = sess.run(output, feed_dict={inputs: batch_X, lengths: seqLens})
        collected += batch_logits.flatten().tolist()
    return collected


testPreds = []
trainPreds = []

with tf.Session() as sess:

    # Initialise all graph variables
    sess.run(init)

    # Training loop: the same pre-built mini-batches are visited in the same order every epoch
    for epoch in range(num_epochs):

        epoch_cost = 0.

        for batch_X, batch_Y in minibatches:
            seqLens = np.repeat(windowLen, batch_Y.shape[0])
            # One optimisation step; also fetch the summed loss of this batch.
            _, minibatch_cost = sess.run(
                [optimizer, cost],
                feed_dict={inputs: batch_X, labels: batch_Y, lengths: seqLens},
            )
            epoch_cost += minibatch_cost

        # Print the cost every epoch
        print ("Cost after epoch %i: %f" % (epoch, epoch_cost))

    # Predictions must be made while the session (and its trained weights) is still open.
    trainPreds += _collect_logits(sess, trainMiniBatches)
    testPreds += _collect_logits(sess, testMiniBatches)

In [None]:
# get the test and train predictions
def _logits_to_probs(logits):
    """Logistic sigmoid of `logits`, computed without overflow.

    Uses the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)). The naive form
    exp(x) / (1 + exp(x)) evaluates to inf / inf = nan for large positive x.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * logits))


# Raw logits -> probabilities in [0, 1] -> confidences in [-1, 1]
temp_train = np.asarray(trainPreds)
probs_train = _logits_to_probs(temp_train)
confidence_train = 2*(probs_train - 0.5)

temp_test = np.asarray(testPreds)
probs_test = _logits_to_probs(temp_test)
confidence_test = 2*(probs_test - 0.5)

In [None]:
def computeSigmaScore(preds, r, u, d):
    """Return mean / standard deviation of the per-day sums of preds * r * u.

    preds, r, u, d are aligned 1-D arrays; d supplies the grouping key ("day").
    The standard deviation is the population one (NumPy's default, ddof=0).
    """
    per_sample = pd.DataFrame({'day': d, 'x_t_i': preds * r * u})
    # one total per distinct day
    daily_totals = per_sample.groupby('day').sum().values.flatten()
    return daily_totals.mean() / daily_totals.std()
    
def computeCrossEntropyLoss(probs, r, eps = 1e-7):
    """Mean Bernoulli log-likelihood of `probs` against labels derived from `r`.

    Labels are 1 where r > 0 and 0 otherwise, the same strict rule used to
    build the training target `y`, so a zero return counts as the negative class.

    NOTE: the value returned is the NEGATIVE of the cross-entropy loss
    (it is <= 0); callers negate it to obtain the loss.

    probs -- predicted probabilities of the positive class
    r     -- realised returns, aligned with probs
    eps   -- probabilities are clipped to [eps, 1 - eps] to keep the logs finite
    """
    labels = (r > 0).astype(int)
    probs_clipped = np.clip(probs, eps, 1.0-eps)
    return(np.mean(labels*np.log(probs_clipped) + (1-labels)*np.log(1-probs_clipped)))

In [None]:
# Test-split metrics: [sigma score, cross-entropy loss]
sigma_test = computeSigmaScore(confidence_test, r_test.flatten(), u_test.flatten(), d_test.flatten())
# the helper's result is negated here to report a loss
xent_test = -computeCrossEntropyLoss(probs_test, r_test.flatten())
[sigma_test, xent_test]

In [None]:
# Train-split metrics: [sigma score, cross-entropy loss]
sigma_train = computeSigmaScore(confidence_train, r_train.flatten(), u_train.flatten(), d_train.flatten())
# the helper's result is negated here to report a loss
xent_train = -computeCrossEntropyLoss(probs_train, r_train.flatten())
[sigma_train, xent_train]