In [1]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly.tools as tls
from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

tqdm.pandas()

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Load the training data: the competition environment hands back a pair of frames,
# bound here to market_train_df and news_train_df.
from kaggle.competitions import twosigmanews

env = twosigmanews.make_env()
market_train_df, news_train_df = env.get_training_data()

In [None]:
# Sampling the training data.
# Note that the data goes from 2007-02-01 to 2016-12-30.
# Both bounds are exclusive: rows whose time equals dateFrom or dateTo are left out.
dateFrom = '2016-08-01'
dateTo = '2016-12-30'

market_in_window = (dateFrom < market_train_df['time']) & (market_train_df['time'] < dateTo)
news_in_window = (dateFrom < news_train_df['time']) & (news_train_df['time'] < dateTo)

# .copy() makes the samples independent frames. Later cells add columns to them, and
# doing that on a bare slice of the full frame triggers SettingWithCopyWarning.
market_train_df_sample = market_train_df.loc[market_in_window].copy()
news_train_df_sample = news_train_df.loc[news_in_window].copy()

In [None]:
# CLEANING MARKET DATA
# A row is suspicious when |close / open| is >= 2 or <= 0.5. For such a row, whichever of
# open / close lies further from the mean of its assetName group is replaced by that mean
# (on a tie, or when a distance is NaN, close is the one replaced).

# Independent copy so the column writes below never land on a slice of another frame.
market_train_df_sample = market_train_df_sample.copy()
market_train_df_sample['close_to_open'] = np.abs(market_train_df_sample['close'] / market_train_df_sample['open'])
market_train_df_sample['assetName_mean_open'] = market_train_df_sample.groupby('assetName')['open'].transform('mean')
market_train_df_sample['assetName_mean_close'] = market_train_df_sample.groupby('assetName')['close'].transform('mean')

suspicious = (market_train_df_sample['close_to_open'] >= 2) | (market_train_df_sample['close_to_open'] <= 0.5)
open_distance = (market_train_df_sample['assetName_mean_open'] - market_train_df_sample['open']).abs()
close_distance = (market_train_df_sample['assetName_mean_close'] - market_train_df_sample['close']).abs()
open_is_further = open_distance > close_distance

# Columns are addressed by name. The previous row loop wrote through .iloc[i, 5] / .iloc[i, 4]
# with `i` taken from iterrows(), i.e. an index *label* used as a row *position*; on a
# filtered sample that hits the wrong row or runs out of bounds.
replace_open = suspicious & open_is_further
replace_close = suspicious & ~open_is_further
market_train_df_sample['open'] = market_train_df_sample['open'].mask(
    replace_open, market_train_df_sample['assetName_mean_open'])
market_train_df_sample['close'] = market_train_df_sample['close'].mask(
    replace_close, market_train_df_sample['assetName_mean_close'])


In [None]:
# Extracting date only from date time market data.
# The vectorized .dt accessor produces the same datetime.date objects as calling
# x.date() on every element, without a Python-level loop over the rows.
market_train_df_sample['date'] = pd.to_datetime(market_train_df_sample['time']).dt.date

In [None]:
# Cleaning financial news data. Keeping only news touching the asset codes in the market data.
# TODO: Regroup all financial news without market data in an asset code `Market`
# Distinct asset codes of the market sample, as a set for fast membership tests.
assetCodes = set(market_train_df_sample['assetCode'].unique())
def extractCodes(row):
    """Pick out the asset codes of a news row that also occur in `assetCodes`.

    `row.assetCodes` is parsed as text: a leading '{' and trailing '}' are stripped,
    spaces and single quotes are removed, and the remainder is split on commas.

    Returns a tuple `(codes, count)` where `codes` lists the matching codes in
    order of appearance and `count` is `len(codes)`.
    """
    unwrapped = row.assetCodes.strip('{').strip('}')
    candidates = unwrapped.replace(' ', '').replace("'", "").split(',')
    matched = [candidate for candidate in candidates if candidate in assetCodes]
    return matched, len(matched)

def isInAsset(row, names=None):
    """Tell whether `row.assetName` belongs to a collection of asset names.

    Parameters
    ----------
    row : object exposing an `assetName` attribute.
    names : optional container of asset names to test against. When omitted, the
        notebook-level `assetNames` is used.
        NOTE(review): no visible cell defines `assetNames`; without `names` this
        raises NameError unless it is defined elsewhere.

    Returns
    -------
    bool
        Always an explicit True/False (a non-member previously fell through and
        produced None).
    """
    known_names = assetNames if names is None else names
    return row.assetName in known_names
# For every article, list the asset codes it mentions that also exist in the market data,
# together with how many there are.
matches = news_train_df_sample.progress_apply(extractCodes, axis=1)
matched_assets, matched_counts = zip(*matches)
# assign() builds a new frame, so nothing is written onto a slice of another frame.
news_train_df_sample = news_train_df_sample.assign(assets=matched_assets, assetsSize=matched_counts)

# Simple approach for now: keep only the articles related to exactly one known asset.
# Later we will create one extra asset representing the 'market'.
# .copy(): the following cells add columns to the filtered frame.
news_train_df_sample = news_train_df_sample[news_train_df_sample['assetsSize'] == 1].copy()

# Collapse the `assets` list of each row into a single asset code.
def extractElt(row):
    """Return the first entry of `row.assets`."""
    first_asset = row.assets[0]
    return first_asset
news_train_df_sample['assetCode'] = news_train_df_sample.progress_apply(extractElt, axis=1)

In [None]:
# TODO: Feature engineering and dimension reduction should be done at this point.
# TODO: Smooth the financial news data ONLY at this point.

In [None]:
# Grouping financial news data by asset and by date.
# Extracting the date only (vectorized .dt accessor instead of a per-row lambda).
news_train_df_sample['date'] = pd.to_datetime(news_train_df_sample['time']).dt.date
# Constant column: its grouped sum is the number of articles per (assetCode, date).
news_train_df_sample['count'] = 1
# Sum per (assetCode, date). numeric_only=True keeps the sum to numeric/boolean columns;
# without it, recent pandas versions also try to add up the text, list and timestamp columns.
news_train_df_sample = news_train_df_sample.groupby(['assetCode', 'date']).sum(numeric_only=True)

In [None]:
# TODO: regularize by asset

In [None]:
# Indexing market data by (assetCode, date).
# numeric_only=True: averaging only makes sense for numeric columns, and recent pandas
# versions raise when asked to average text columns.
market_train_df_sample = market_train_df_sample.groupby(['assetCode', 'date']).mean(numeric_only=True)

# Concatenating market and news side by side on the shared (assetCode, date) index.
train_df_sample = pd.concat([market_train_df_sample, news_train_df_sample], axis=1)

# We only keep rows where market data is present (some news days might have been weekends).
train_df_sample = train_df_sample[train_df_sample['open'].notnull()]
# Remaining gaps are filled with 0 (no row or column is dropped).
train_df_sample = train_df_sample.fillna(0)
# We don't need the indexes anymore: turn assetCode and date back into columns.
train_df_sample = train_df_sample.reset_index()

In [None]:
# Smoothing all the data using an Exponential Moving Average, separately per asset.
# `decay` is passed to ewm() as `com` (centre of mass).
decay = 0.5
train_df_sample = train_df_sample.set_index('assetCode')
# A list (not a set) is a valid column indexer and keeps the column order stable.
numericalColumns = [column for column in train_df_sample.columns if column != 'date']
# NOTE(review): every column except 'date' is smoothed, which includes the target
# returnsOpenNextMktres10 — confirm this is intended.

# One grouped transform replaces the per-asset loop. Each column is smoothed along the
# rows of its asset, also when an asset has a single row (where .loc[asset, columns]
# returns a Series and the smoothing would run across the columns instead).
train_df_sample[numericalColumns] = (
    train_df_sample
    .groupby(level='assetCode')[numericalColumns]
    .transform(lambda values: values.ewm(com=decay).mean())
)

In [None]:
# Preview the first rows of the market sample (swap in train_df_sample to inspect the joined frame).
market_train_df_sample.head(n=5)

In [None]:
# Creating unique asset id
def create_asset_id_dataframe(market_train_df):
    """Build a lookup table that assigns an `assetId` to each (assetCode, assetName) pair.

    The asset name with its spaces removed serves as the id; when that name is
    'Unknown', the part of the asset code before the first '.' is used instead.
    Prints the number of distinct (assetCode, assetName) pairs.

    Returns a de-duplicated DataFrame with the columns assetCode, assetName
    (spaces removed), assetCodePrefix and assetId.
    """
    unique_assets = market_train_df[['assetCode', 'assetName']].drop_duplicates()
    print("%d unique asset codes" % len(unique_assets))

    compact_names = unique_assets['assetName'].str.replace(' ', '')
    code_prefixes = unique_assets['assetCode'].apply(lambda code: code.split('.')[0])

    unique_assets['assetName'] = compact_names
    unique_assets['assetCodePrefix'] = code_prefixes
    unique_assets['assetId'] = np.where(compact_names == 'Unknown', code_prefixes, compact_names)
    return unique_assets.drop_duplicates()

asset_id_df = create_asset_id_dataframe(market_train_df)[['assetCode','assetId']]
asset_id_df = asset_id_df.set_index('assetCode')

%time
train_df_sample = pd.merge(train_df_sample, asset_id_df, how='inner', left_index=True, right_index=True)
train_df_sample.head()

In [None]:
# Build Models
# Preview of the frame the models are built from.
train_df_sample.head(n=5)

In [None]:
# Integer code per distinct assetId; category codes start at 0, so +1 makes them start at 1.
asset_id_categories = train_df_sample['assetId'].astype('category')
train_df_sample['assetNumber'] = asset_id_categories.cat.codes + 1

In [None]:
def regularizeDataPerAsset(train_df_sample, assetCodes):
    """Mean-centre and range-scale every numerical column separately for each asset.

    For each asset listed in `assetCodes`, each numerical column (every column
    except 'date', 'assetCode' and 'assetId') becomes
    `(value - mean) / (max - min)`, with the statistics taken over that asset's
    rows. Where an asset's column has a zero range, the result is 0 instead of
    the NaN/inf a division by zero would give. Rows of assets not listed in
    `assetCodes` keep their values.

    Parameters
    ----------
    train_df_sample : DataFrame holding 'assetCode' either as its index or as a
        column, and a 'returnsOpenNextMktres10' column. It is not modified.
    assetCodes : iterable of the asset codes to regularize.

    Returns
    -------
    DataFrame
        A new frame with a fresh integer index, 'assetCode' as a column, the
        scaled numerical columns (as floats) and an extra
        'nonRegularizedObservation' column holding the original, unscaled
        'returnsOpenNextMktres10' values.
    """
    # TODO: Use another regularization technique this one will work poorly with outliers.
    # TODO: This should return a map that will be used during forecasting...
    if 'assetCode' in train_df_sample.columns:
        # assetCode is already a column: discard the current index instead of
        # turning it into an extra column that would then get scaled too.
        frame = train_df_sample.reset_index(drop=True)
    else:
        frame = train_df_sample.reset_index()

    excluded = ('date', 'assetCode', 'assetId')
    numericalColumns = [column for column in frame.columns if column not in excluded]

    # Explicit copy: the saved target must not change when its column is rescaled below.
    observation = frame['returnsOpenNextMktres10'].copy()

    values = frame[numericalColumns].astype(float)
    selected = frame['assetCode'].isin(list(assetCodes))
    if selected.any():
        picked = values.loc[selected]
        per_asset = picked.groupby(frame.loc[selected, 'assetCode'])
        spread = per_asset.transform('max') - per_asset.transform('min')
        scaled = (picked - per_asset.transform('mean')) / spread
        # Zero range (constant column or single-row asset): use 0 rather than NaN/inf.
        scaled = scaled.where(spread != 0, 0.0)
        values.loc[selected] = scaled

    frame[numericalColumns] = values
    frame['nonRegularizedObservation'] = observation
    return frame


In [None]:
# Asset codes present in the frame. assetCode is the index after the earlier merge, but
# becomes a column once this cell has run, so both layouts are handled.
if 'assetCode' in train_df_sample.columns:
    present_asset_codes = train_df_sample['assetCode'].unique()
else:
    present_asset_codes = train_df_sample.index.unique()
train_df_sample = regularizeDataPerAsset(train_df_sample, list(present_asset_codes))

In [None]:
# Per-asset scaling of every numerical column by its largest absolute value.
# The steps below (and the following cells) address rows by asset code through the index,
# so restore that index when an earlier step has left assetCode as a regular column.
if 'assetCode' in train_df_sample.columns:
    train_df_sample = train_df_sample.set_index('assetCode')

# A list (not a set) is a valid .loc indexer and keeps the column order stable.
numericalColumns = [column for column in train_df_sample.columns if column not in ('date', 'assetId')]
assets = np.unique(train_df_sample.index.values)

df = train_df_sample.loc[assets, numericalColumns]
# Per-asset summary statistics, taken before scaling.
abs_max_df = df.abs().groupby('assetCode').max()
min_df = df.groupby('assetCode').min()
mean_df = df.groupby('assetCode').mean()
std_df = df.groupby('assetCode').std()

# Divide every row by the per-column absolute maximum of its asset in one aligned operation
# (replaces the per-asset loop). A column whose maximum is 0 yields 0/0 = NaN.
df = df / df.abs().groupby('assetCode').transform('max')

df.describe()

In [None]:
# Independent copy: pd.DataFrame(frame) does not copy the data, so writing the normalized
# values into it could also overwrite train_df_sample.
norm_train_df = train_df_sample.copy()
# .loc needs a list-like of column labels (a set is not a supported indexer).
normalized_columns = list(numericalColumns)
for asset in tqdm(assets):
    # .values copies by position: both sides hold the same asset's rows in the same order.
    norm_train_df.loc[asset, normalized_columns] = df.loc[asset, normalized_columns].values

In [None]:
# Summary statistics of the normalized frame.
norm_train_df.describe()

In [None]:
# Columns that are either the target or identifiers rather than model inputs.
target_and_id_columns = ['returnsOpenNextMktres10','assetCode','date','assetId']

# Move the index back into a column, then order the rows by date.
train_df_sample = train_df_sample.reset_index().sort_values(by=['date'])
labels = train_df_sample['returnsOpenNextMktres10']
non_numeric = train_df_sample[target_and_id_columns]
train = train_df_sample.drop(target_and_id_columns, axis=1)

In [None]:
# First few target values (train_df_sample, non_numeric and train can be inspected the same way).
labels.head(n=5)

In [None]:
# Model inputs and target, taken from the normalized frame in date order.
train = norm_train_df.sort_values(by=['date'])
train = train.fillna(0)
labels = train['returnsOpenNextMktres10']
train = train.drop(['returnsOpenNextMktres10','date','assetId'], axis=1)
# Also remove, when present:
#  - 'nonRegularizedObservation': a copy of the target added by regularizeDataPerAsset,
#    which would leak the label into the features;
#  - 'assetCode' as a column: an identifier, not a numeric model input.
train = train.drop(columns=['nonRegularizedObservation', 'assetCode'], errors='ignore')
# Sanity check (the per-column NaN count used to be computed here and discarded).
assert not train.isna().any().any(), "features still contain NaN after fillna(0)"
labels.describe()

In [None]:
# First rows of the feature matrix fed to the model.
train.head(n=5)

In [None]:
# Model construction and training: one hidden ReLU layer of 100 units feeding a single
# linear output unit (regression on the target).
model = keras.Sequential([
    keras.layers.Dense(100, activation=tf.nn.relu, input_shape=(len(train.columns),)),
    keras.layers.Dense(1),
])

# optimizer='adam' selects Adam by its Keras identifier, so the cell does not depend on
# tf.train.AdamOptimizer, which is only available in TensorFlow 1.x.
# Mean absolute error is reported instead of 'accuracy', which is a classification metric
# and carries no meaning for a real-valued target.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.fit(train, labels, epochs=2)
model.summary()

In [None]:
# Model outputs for every row of the training features (in-sample predictions).
predictions = model.predict(x=train)

In [None]:
# predictions.mean()
# Display the model outputs computed in the previous cell.
predictions

In [None]:
def evaluate(df):
    """Compute the per-date sum of `returnsOpenNextMktres10 * pred`.

    Parameters
    ----------
    df : DataFrame with the columns 'date', 'returnsOpenNextMktres10' and 'pred'.
        A 'yr' column holding the row-wise product is added to it in place.

    Returns
    -------
    DataFrame
        One row per date with the columns 'date' and 'yr' (the summed product).
        The grouped result used to be computed and then discarded, so the
        function returned None.
    """
    # Vectorized product instead of a row-by-row apply.
    df['yr'] = df['returnsOpenNextMktres10'] * df['pred']
    return df[['date','yr']].groupby(['date']).sum().reset_index()

In [None]:
train_df_sample = train_df_sample.sort_values(by=['date'])
# .copy(): columns are added below, which must not happen on a slice of train_df_sample.
score_df = train_df_sample[['date','returnsOpenNextMktres10']].copy()

# NOTE(review): predictions are attached by position. This relies on these rows being in
# the same order as the rows the model predicted on (another frame sorted by date);
# rows sharing a date have no explicit tie-break — confirm the pairing.
assert len(predictions) == len(score_df), "predictions and score rows differ in length"
# Flatten the model output to one value per row before storing it as a column.
score_df['pred'] = np.ravel(predictions)
# Vectorized product instead of a row-by-row apply.
score_df['yr'] = score_df['returnsOpenNextMktres10'] * score_df['pred']

score_df.head()


In [None]:
# Per-date total of yr, then the ratio of its mean to its standard deviation across dates.
sum_score = score_df[['date','yr']].groupby(['date']).sum()
daily_yr = sum_score['yr']
final_score = daily_yr.mean() / daily_yr.std()
final_score

In [None]:
# Per-date sum of yr, shown as a table indexed by date.
score_df.groupby(['date'])[['yr']].sum()