In [None]:
# Year boundaries used when filtering and splitting the data
firstYear = 2011   # rows whose year is below this are dropped
splitYear = 2014   # years <= splitYear go to the training set, later years to the test set


# Make the TensorFlow Graph

In [None]:
# Hyper-parameters of the one-hidden-layer network and its training loop
trainDim = 63          # width of the `inputs` placeholder; must equal the number of feature columns fed in
hidden_size = 32       # number of units in the hidden layer
lr = 0.001             # learning rate handed to the Adam optimizer
num_epochs = 25        # number of passes over the list of mini-batches
minibatch_size = 64    # examples per mini-batch
seed = 0               # random seed

In [None]:
# Build the TensorFlow (1.x) graph: placeholders plus the parameters of a
# one-hidden-layer network.
import tensorflow as tf

# Start from an empty default graph so that re-running this cell does not
# collide with variables created by an earlier run.
tf.reset_default_graph()

# Graph-level seed so the Xavier weight initialisation is reproducible from
# run to run. It has to be set after reset_default_graph(), because the seed
# is a property of the (new) default graph.
tf.set_random_seed(seed)

# placeholders: a batch of feature rows and the matching 0/1 labels
inputs = tf.placeholder(tf.float32, shape=(None, trainDim), name='inputs')
labels = tf.placeholder(tf.float32, shape=(None, 1), name='labels')

# Hidden layer parameters
W1 = tf.get_variable("W1", shape=[trainDim, hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable("b1", shape=[1, hidden_size],
                     initializer=tf.zeros_initializer())

# Output layer parameters (a single logit per example)
W2 = tf.get_variable("W2", shape=[hidden_size, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.get_variable("b2", shape=[1, 1], initializer=tf.zeros_initializer())



In [None]:
# Forward pass: inputs -> sigmoid hidden layer -> one output logit per example
Z1 = tf.matmul(inputs, W1) + b1
A1 = tf.nn.sigmoid(Z1)
Z2 = tf.matmul(A1, W2) + b2

# Mean sigmoid cross-entropy between the logits and the 0/1 labels
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=Z2)
cost = tf.reduce_mean(per_example_loss)

In [None]:
# Training op and variable initialiser.
# Note that `optimizer` is bound to the training op returned by minimize(),
# not to the AdamOptimizer object itself.
adam = tf.train.AdamOptimizer(learning_rate=lr)
optimizer = adam.minimize(cost)
init = tf.global_variables_initializer()

# Load in the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain

In [None]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
# The handle is kept in `env`; the training data is fetched through it in the next cell.
env = twosigmanews.make_env()
print('Done!')

In [None]:
# Fetch the market and news training frames from the competition environment
market_train_df, news_train_df = env.get_training_data()

In [None]:
# Binary outcome: 1 when the next 10-day market-residualised open return is
# strictly positive, else 0.
market_train_df['y'] = (market_train_df['returnsOpenNextMktres10'] > 0).values.astype(int)

# Calendar year of every row, used below for filtering and for the train/test split
for frame in (market_train_df, news_train_df):
    frame['year'] = pd.to_datetime(frame['time']).dt.year

In [None]:
# Discard every row dated before firstYear (rows are removed by index label)
stale_market_rows = market_train_df.index[market_train_df['year'] < firstYear]
market_train_df = market_train_df.drop(index=stale_market_rows)

stale_news_rows = news_train_df.index[news_train_df['year'] < firstYear]
news_train_df = news_train_df.drop(index=stale_news_rows)

In [None]:
# Statistics computed for every aggregated news feature
_NEWS_STATS = ('min', 'max', 'mean', 'std')

# News columns to aggregate, in the order they appear in the result.
# Entries left as comments are features that are currently switched off.
_NEWS_FEATURES = [
    # 'urgency'  (would use ['min', 'count']),
    # 'takeSequence'  (would use ['max']),
    # 'bodySize',
    'wordCount',
    'sentenceCount',
    'companyCount',
    'marketCommentary',
    'relevance',
    'sentimentNegative',
    'sentimentNeutral',
    'sentimentPositive',
    'sentimentWordCount',
    'noveltyCount12H',
    # 'noveltyCount24H',
    # 'noveltyCount3D',
    # 'noveltyCount5D',
    'noveltyCount7D',
    'volumeCounts12H',
    # 'volumeCounts24H',
    # 'volumeCounts3D',
    # 'volumeCounts5D',
    'volumeCounts7D',
]

# Mapping column name -> list of aggregation names, as consumed by groupby().agg()
news_cols_agg = {feature: list(_NEWS_STATS) for feature in _NEWS_FEATURES}

In [None]:
def join_market_news(market_train_df, news_train_df):
    """Attach aggregated news features to the market rows.

    Every news row names one or more asset codes in its ``assetCodes`` string
    (each code wrapped in single quotes). The row is repeated once per named
    code, the columns listed in the module-level ``news_cols_agg`` are
    aggregated per ``(time, assetCode)`` with the statistics given there, and
    the result is left-joined onto ``market_train_df``.

    Parameters
    ----------
    market_train_df : pandas.DataFrame
        Market rows; ``time`` and ``assetCode`` are used as join keys.
    news_train_df : pandas.DataFrame
        News rows; needs ``time``, ``assetCodes`` and every key of
        ``news_cols_agg``. It is not modified, so the function can be called
        repeatedly on the same frame.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``market_train_df`` plus one float32 column per
        (feature, statistic) pair named ``<feature>_<statistic>``. Market rows
        without matching news hold NaN in those columns.
    """
    # Extract the quoted codes from each string (str -> list). A raw string is
    # used so the regex escapes are not interpreted as (invalid) string
    # escapes, and the parsed lists stay local instead of overwriting the
    # caller's 'assetCodes' column.
    asset_code_lists = news_train_df['assetCodes'].str.findall(r"'([\w\./]+)'")

    # Expand: one (news row label, asset code) pair per listed code
    assetCodes_expanded = list(chain.from_iterable(asset_code_lists))
    assetCodes_index = news_train_df.index.repeat(asset_code_lists.apply(len))
    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

    # Create the expanded news frame (each news row repeated for every asset code)
    news_cols = ['time'] + sorted(news_cols_agg.keys())
    news_train_df_expanded = pd.merge(df_assetCodes, news_train_df[news_cols],
                                      left_on='level_0', right_index=True,
                                      suffixes=('', '_old'))

    # Release the intermediate as soon as it is no longer needed
    del df_assetCodes

    # Aggregate the numerical news features per (time, assetCode)
    news_train_df_aggregated = news_train_df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)
    del news_train_df_expanded

    # Convert to float32 to save memory
    news_train_df_aggregated = news_train_df_aggregated.astype(np.float32)

    # Flatten the (feature, statistic) column MultiIndex into 'feature_statistic'
    news_train_df_aggregated.columns = ['_'.join(col).strip() for col in news_train_df_aggregated.columns.values]

    # Left join onto the market rows
    return market_train_df.join(news_train_df_aggregated, on=['time', 'assetCode'])

In [None]:
def get_xy(market_train_df, news_train_df, le=None):
    """Build the feature frame and the clipped return target.

    Parameters
    ----------
    market_train_df, news_train_df : pandas.DataFrame
        Passed straight to ``get_x``; ``market_train_df`` must also contain
        ``returnsOpenNextMktres10``.
    le : tuple or None
        Previously fitted label encoders, forwarded to ``get_x`` so existing
        encodings are reused. ``None`` lets ``get_x`` build new ones.

    Returns
    -------
    (x, y, le)
        ``x`` and ``le`` as returned by ``get_x``; ``y`` is
        ``returnsOpenNextMktres10`` clipped to the range [-1, 1].
    """
    # Forward `le`: it used to be dropped here, so caller-supplied encoders
    # were silently replaced by freshly fitted ones.
    x, le = get_x(market_train_df, news_train_df, le)
    y = market_train_df['returnsOpenNextMktres10'].clip(-1, 1)
    return x, y, le


def label_encode(series, min_count):
    """Map sufficiently frequent values of ``series`` to consecutive integers.

    Values are ranked by ``series.value_counts()``; those occurring at least
    ``min_count`` times receive the codes 0, 1, 2, ... in that ranking order.
    Rarer values are left out of the returned dict.
    """
    counts = series.value_counts()
    frequent = counts[counts >= min_count].index
    return dict(zip(frequent, range(len(frequent))))


def get_x(market_train_df, news_train_df, le=None):
    """Build the model feature frame from market and news data.

    Side effect: the ``time`` column of BOTH input frames is overwritten in
    place with its day-aligned value (see below).

    Parameters
    ----------
    market_train_df : pandas.DataFrame
        Market rows with a datetime ``time`` column plus ``assetCode`` and
        ``assetName``.
    news_train_df : pandas.DataFrame
        News rows with a datetime ``time`` column and whatever
        ``join_market_news`` requires.
    le : tuple or None
        ``(le_assetCode, le_assetName)`` dicts mapping raw values to integer
        codes. When ``None`` they are built from the joined data with
        ``label_encode`` (min_count 10 for codes, 5 for names).

    Returns
    -------
    (x, (le_assetCode, le_assetName))
        ``x`` is the joined frame with ``assetCode``/``assetName`` replaced by
        integer codes (-1 for values missing from the encoder), the columns
        ``returnsOpenNextMktres10``, ``universe`` and ``time`` removed, and
        ``dayofweek``/``month`` added.
    """
    # Split date into before and after 22h (the time used in train data)
    # E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
    #      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22,'h')).dt.ceil('1D')

    # Round time of market_train_df to 0h of the current day
    market_train_df['time'] = market_train_df['time'].dt.floor('1D')

    # Join market and news
    x = join_market_news(market_train_df, news_train_df)

    # Fit the label encoders unless the caller supplied them
    if le is None:
        le_assetCode = label_encode(x['assetCode'], min_count=10)
        le_assetName = label_encode(x['assetName'], min_count=5)
    else:
        # 'unpack' label encoders
        le_assetCode, le_assetName = le

    # Values that are not in the encoder map to NaN and become -1
    x['assetCode'] = x['assetCode'].map(le_assetCode).fillna(-1).astype(int)
    x['assetName'] = x['assetName'].map(le_assetName).fillna(-1).astype(int)

    # Remove the target and the universe flag when they are present.
    # errors='ignore' tolerates their absence without hiding unrelated
    # failures the way a bare `except: pass` would.
    x = x.drop(columns=['returnsOpenNextMktres10', 'universe'], errors='ignore')

    # Calendar features derived from the day-aligned timestamp, which is then dropped
    x['dayofweek'], x['month'] = x.time.dt.dayofweek, x.time.dt.month
    x = x.drop(columns='time')

    return x, (le_assetCode, le_assetName)

In [None]:
# This will take some time...
# Builds the feature frame X, the clipped return target y and the label encoders le.
# Side effect: get_x overwrites the 'time' column of both frames with day-aligned values.
X, y, le = get_xy(market_train_df, news_train_df)

In [None]:
# Keep only the examples that received news features: rows without matching
# news have NaN in the aggregated news columns, wordCount_min being one of them.
has_news = ~np.isnan(X['wordCount_min'])
X_filtered = X[has_news]
y_filtered = y[has_news]

In [None]:
def get_input(market_train, indices):
    """Collect labels and scoring inputs for the rows selected by ``indices``.

    Returns a tuple ``(y, r, u, d)``:
    y -- boolean numpy array, True where returnsOpenNextMktres10 >= 0
    r -- numpy array with the raw returnsOpenNextMktres10 values
    u -- pandas Series holding the 'universe' column
    d -- pandas Series holding the calendar date taken from 'time'
    """
    subset = market_train.loc[indices, ['returnsOpenNextMktres10', 'universe', 'time']]
    returns = subset['returnsOpenNextMktres10']
    y = (returns >= 0).values
    r = returns.values
    u = subset['universe']
    d = subset['time'].dt.date
    return y, r, u, d

# Chronological split: years up to and including splitYear are training rows,
# later years are test rows. r, u and d feed the scoring metric.
market_years = market_train_df['year']
train_indices = market_train_df.index[market_years <= splitYear]
test_indices = market_train_df.index[market_years > splitYear]

y_train, r_train, u_train, d_train = get_input(market_train_df, train_indices)
y_test, r_test, u_test, d_test = get_input(market_train_df, test_indices)

In [None]:
# Restrict labels and scoring inputs to the rows that have news features,
# mirroring the filter that produced X_filtered.
# NOTE: this cell overwrites its own inputs, so it can be executed only once
# per run of the cell above; a second execution would index already-filtered
# arrays with a full-length mask.
train_has_news = ~np.isnan(X[X.year <= splitYear].wordCount_min)
test_has_news = ~np.isnan(X[X.year > splitYear].wordCount_min)

y_train, r_train = y_train[train_has_news], r_train[train_has_news]
u_train, d_train = u_train[train_has_news], d_train[train_has_news]

y_test, r_test = y_test[test_has_news], r_test[test_has_news]
u_test, d_test = u_test[test_has_news], d_test[test_has_news]

In [None]:
# Binarise the clipped returns (1 when >= 0)
y = (y >= 0).astype(int)

# Only floating-point columns are used as network inputs
column_dtypes = X.dtypes
num_cols = X.columns[(column_dtypes == "float64") | (column_dtypes == "float32")]

# Train / test feature matrices, split by year on the news-filtered rows
X_train = X_filtered.loc[X_filtered['year'] <= splitYear, num_cols].values
X_test = X_filtered.loc[X_filtered['year'] > splitYear, num_cols].values

# Labels as column vectors, matching the (None, 1) shape of the labels placeholder
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [None]:
# scale the variables
from sklearn.preprocessing import StandardScaler
print('scaling numerical columns')

# Pass 1: standardise the training features. StandardScaler disregards NaNs
# when fitting and leaves them as NaN when transforming, so the remaining
# NaNs can then be replaced by 0, i.e. by the training column mean.
nan_scaler = StandardScaler()
X_train = nan_scaler.fit_transform(X_train)
X_train = np.nan_to_num(X_train)

# Pass 2: re-standardise, because filling in zeros changed the column variances.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# The test set is only TRANSFORMED with the statistics learned on the training
# set. Re-fitting on the test data (as was done before) standardises train and
# test with different means/variances and leaks test-set statistics into the
# evaluation.
X_test = nan_scaler.transform(X_test)
X_test = np.nan_to_num(X_test)
X_test = scaler.transform(X_test)

# Train neural net model

In [None]:
# mini-batch generation function
import math
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Shuffle (X, Y) together and cut them into consecutive mini-batches.

    Arguments:
    X -- input data, 2-D array of shape (number of examples, input size)
    Y -- labels, 2-D array of shape (number of examples, number of label columns)
    mini_batch_size -- number of examples per mini-batch, integer
    seed -- value passed to np.random.seed before shuffling, so the same seed
            always yields the same batches (this reseeds numpy's global generator)

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples; the last one is
                    smaller when the number of examples is not a multiple of
                    mini_batch_size
    """
    m = X.shape[0]                  # number of examples
    np.random.seed(seed)

    # Apply one shared random row order to X and Y so the pairs stay aligned
    order = np.random.permutation(m)
    shuffled_X = X[order, :]
    shuffled_Y = Y[order, :].reshape((m, Y.shape[1]))

    # Slice off consecutive chunks; slicing past the end simply yields the
    # shorter final batch
    return [
        (shuffled_X[start:start + mini_batch_size, :],
         shuffled_Y[start:start + mini_batch_size, :])
        for start in range(0, m, mini_batch_size)
    ]

In [None]:
# make the mini-batches, honouring the batch size and seed configured with the
# other hyper-parameters instead of relying on the function's own defaults
minibatches = random_mini_batches(X_train, y_train.astype(int).reshape(-1, 1),
                                  mini_batch_size=minibatch_size, seed=seed)

In [None]:
# Train the network inside a session, then compute the output logits for the
# training and test sets
with tf.Session() as sess:

    # Initialise all graph variables
    sess.run(init)

    for epoch in range(num_epochs):

        epoch_cost = 0.

        # One optimizer step per mini-batch; the same list of batches is
        # reused in every epoch
        for minibatch_X, minibatch_Y in minibatches:
            feed = {inputs: minibatch_X, labels: minibatch_Y}
            _, minibatch_cost = sess.run([optimizer, cost], feed_dict=feed)
            epoch_cost += minibatch_cost

        # Reported value is the SUM of the per-batch mean costs of this epoch
        print ("Cost after epoch %i: %f" % (epoch, epoch_cost))

    # Raw output logits (Z2); sess.run returns them wrapped in a one-element list
    trainPreds = sess.run([Z2], feed_dict={inputs: X_train})
    testPreds = sess.run([Z2], feed_dict={inputs: X_test})

In [None]:
# Convert the network's output logits into probabilities and signed confidences

def _stable_sigmoid(logits):
    """Logistic function evaluated as 0.5 * (1 + tanh(x / 2)).

    Mathematically identical to exp(x) / (1 + exp(x)), but tanh saturates
    instead of overflowing, so large logits give 1.0 rather than inf/inf = NaN.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * logits))

# Training set: probability of the positive class, rescaled from [0, 1] to [-1, 1]
temp_train = trainPreds[0].flatten()
probs_train = _stable_sigmoid(temp_train)
confidence_train = 2*(probs_train - 0.5)

# Test set: same transformation
temp_test = testPreds[0].flatten()
probs_test = _stable_sigmoid(temp_test)
confidence_test = 2*(probs_test - 0.5)

In [None]:
def computeSigmaScore(preds, r, u, d):
    """Mean of the daily scores divided by their standard deviation.

    Parameters
    ----------
    preds : array-like
        Prediction (confidence) per row.
    r : array-like
        Return per row.
    u : array-like
        Universe flag per row; rows with 0 contribute nothing to their day.
    d : array-like
        Day label per row, used to group rows.

    Returns
    -------
    float
        ``mean(x_t) / std(x_t)`` where ``x_t`` is the per-day sum of
        ``preds * r * u`` and ``std`` is numpy's default (population) standard
        deviation. Returns 0.0 when that standard deviation is exactly 0
        (e.g. a single day), which is how the competition metric defines the
        degenerate case, instead of dividing by zero.
    """
    x_t_i = preds * r * u
    df = pd.DataFrame({'day': d, 'x_t_i': x_t_i})

    # One score per day
    x_t = df.groupby('day')['x_t_i'].sum().values
    mean = np.mean(x_t)
    std = np.std(x_t)

    # Avoid inf/NaN from a zero denominator
    if std == 0:
        return 0.0
    return mean / std
    
def computeCrossEntropyLoss(probs, r, eps = 1e-7):
    """Mean Bernoulli log-likelihood of the labels ``r >= 0`` under ``probs``.

    Note the sign: the value returned is the mean of
    ``label*log(p) + (1-label)*log(1-p)``, i.e. the NEGATIVE of the
    cross-entropy loss; callers negate it to obtain the loss.

    probs -- predicted probabilities of the positive class
    r     -- returns; a label is 1 where r >= 0 and 0 otherwise
    eps   -- probabilities are clipped to [eps, 1 - eps] before taking logs
    """
    targets = (r >= 0).astype(int)
    p = np.clip(probs, eps, 1.0 - eps)
    log_likelihood = targets * np.log(p) + (1 - targets) * np.log(1 - p)
    return np.mean(log_likelihood)

In [None]:
# Test-set metrics, displayed as [sigma score, cross-entropy loss].
# computeCrossEntropyLoss returns the mean log-likelihood, hence the negation.
[computeSigmaScore(confidence_test, r_test, u_test, d_test), 
 -computeCrossEntropyLoss(probs_test, r_test)]

In [None]:
# Training-set metrics, displayed as [sigma score, cross-entropy loss].
# computeCrossEntropyLoss returns the mean log-likelihood, hence the negation.
[computeSigmaScore(confidence_train, r_train, u_train, d_train), 
 -computeCrossEntropyLoss(probs_train, r_train)]