In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

import os
from kaggle.competitions import twosigmanews

In [None]:
# Build the competition environment, then pull both training frames out of it.
env = twosigmanews.make_env()
market_train_df, news_train_df = env.get_training_data()

In [None]:
# Columns removed from the news and market frames before any further processing.
# Both lists are reused later when the per-day observation frames are prepared.
news_drop_col = [
    'sourceTimestamp', 'firstCreated', 'sourceId',
    'companyCount', 'bodySize', 'assetCodes',
]
market_drop_cols = [
    'returnsClosePrevRaw1', 'returnsClosePrevMktres1',
    'returnsClosePrevRaw10', 'returnsClosePrevMktres10', 'universe',
]

# `columns=` is used instead of a positional axis argument, which recent pandas
# releases no longer accept. errors='ignore' keeps the cell re-runnable once the
# columns are already gone.
news_train_df = news_train_df.drop(columns=news_drop_col, errors='ignore')
market_train_df = market_train_df.drop(columns=market_drop_cols, errors='ignore')

# Discard every market row that still has a missing value in a remaining column.
market_train_df = market_train_df.dropna()

In [None]:
from datetime import datetime,time, timedelta
import pytz

# The training window opens at midnight on 2015-01-01, Istanbul local time.
tz = pytz.timezone('Europe/Istanbul')
naive_cut = datetime.strptime('2015-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
cut_time = tz.localize(naive_cut, is_dst=None)

# Keep only news rows strictly newer than the cutoff.
news_is_recent = news_train_df['time'] > cut_time
news_train_df = news_train_df[news_is_recent]

# Market timestamps are parsed first, then filtered against the same cutoff.
market_train_df['time'] = pd.to_datetime(market_train_df['time'])
market_is_recent = market_train_df['time'] > cut_time
market_train_df = market_train_df[market_is_recent]

Data Preprocessing - Merge Data

In [None]:
# df_market is a second name for market_train_df (no copy is made), so the
# clipped target written below is also what market_train_df holds afterwards.
df_market = market_train_df
clipped_target = df_market['returnsOpenNextMktres10'].clip(-1, 1)
df_market['returnsOpenNextMktres10'] = clipped_target
df_market.head()

In [None]:
from datetime import datetime,time, timedelta

def time2market(x):
    """Return the market-snapshot label a news timestamp should be joined to.

    The label is a string of the form 'YYYY-MM-DD 22:00:00+00:00'. News stamped
    at or before 22:00:00 is assigned to the snapshot of its own calendar day;
    anything later is assigned to the next day's snapshot.

    The cutoff equals the time written into the label. A later cutoff (the
    previous 22:30) attached news published after 22:00 to a snapshot stamped
    before that news existed.

    Args:
        x: object exposing ``date()`` and ``time()`` (``datetime.datetime`` or
            ``pandas.Timestamp``). The wall-clock values are used as-is; no
            timezone conversion is performed here.

    Returns:
        str: the snapshot label for ``x``.
    """
    snapshot_day = x.date()
    if x.time() > time(22, 0):
        # Published after today's snapshot: roll over to tomorrow's.
        snapshot_day = snapshot_day + timedelta(days=1)
    return snapshot_day.strftime('%Y-%m-%d 22:00:00+00:00')


# Tag every news row with the market snapshot it belongs to. Market rows keep
# their own timestamp as the join key.
news_train_df['time2market'] = news_train_df['time'].apply(time2market)
market_train_df['time2market'] = market_train_df['time']

In [None]:
# Preview the first news rows, which now carry the time2market join key.
news_train_df.head()

In [None]:
# Identifier and join-key columns that must not reach the network as features.
drop_cols4nn = ['time','assetCode','assetName','time2market']

def preprocess_market(market_df, news_df):
    """Join per-snapshot news aggregates onto market rows and return model features.

    News rows are grouped by ('time2market', 'assetName'). The novelty/volume
    count columns and 'relevance' are averaged, and the number of news rows in
    each group is reported as 'newsCount'. The aggregates are left-joined onto
    ``market_df``, so every market row is kept, in its original order.

    Args:
        market_df: market rows with at least 'time2market' and 'assetName'.
            Not modified.
        news_df: news rows with 'time2market', 'assetName', 'urgency',
            'relevance' and the noveltyCount*/volumeCounts* columns. Not modified.

    Returns:
        pandas.DataFrame: the market columns followed by the news aggregates,
        'newsCount' and 'newsDataAvailability' (1 when a news group matched the
        row, otherwise 0). Missing values are replaced by 0 and the columns
        listed in ``drop_cols4nn`` are removed.
    """
    # aggregate news data =============
    mean_cols = [
        'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D',
        'noveltyCount5D', 'noveltyCount7D',
        'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D',
        'volumeCounts5D', 'volumeCounts7D',
        'relevance',
    ]
    # Insertion order fixes the output column order, which the scaler and the
    # network depend on; 'urgency' stays last, as before.
    agg_spec = {col: 'mean' for col in mean_cols}
    agg_spec['urgency'] = 'count'

    news_numeric_df = (
        news_df.groupby(['time2market', 'assetName'])
        .agg(agg_spec)
        .reset_index()
        # Rename by name rather than by position so reordering cannot mislabel it.
        .rename(columns={'urgency': 'newsCount'})
    )

    # merge table =============
    # utc=True yields a tz-aware key regardless of how the installed pandas
    # parses offset strings; the cast pins the resolution for the join.
    news_numeric_df['time2market'] = (
        pd.to_datetime(news_numeric_df['time2market'], utc=True)
        .astype('datetime64[ns, UTC]')
    )

    merged_df = pd.merge(market_df, news_numeric_df, how='left', on=['time2market', 'assetName'])

    # create newsDataAvailability column =============
    # Must be computed before fillna, while unmatched rows are still NaN.
    merged_df['newsDataAvailability'] = merged_df['volumeCounts7D'].notna().astype(int)

    # convert NA to 0 =============
    merged_df = merged_df.fillna(0)

    # left columns for NN  =============
    merged_df = merged_df.drop(columns=drop_cols4nn, errors='ignore')

    return merged_df

In [None]:
# Build the training frame by joining news aggregates onto the market rows,
# then preview the first rows of the result.
merged_df = preprocess_market(market_train_df,news_train_df)
merged_df.head()

Simple Neural Network - Market Data

In [None]:
# function for min-max normalization of stock
import sklearn
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Separate the regression target from the feature columns.
target_col = 'returnsOpenNextMktres10'
feature_frame = merged_df.drop(columns=[target_col])
X = feature_frame.values
y = merged_df[target_col].values

# Fit the min-max scaler on the training features; the same fitted scaler is
# applied to the observation features at prediction time.
min_max_scaler = sklearn.preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)

In [None]:
#imports
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense

# Fully connected regressor: three ReLU hidden layers (50, 100, 50 units)
# feeding a single linear output unit.
hidden_widths = (50, 100, 50)

model = Sequential()
for width in hidden_widths:
    model.add(Dense(width, activation='relu'))
model.add(Dense(1))

# Mean squared error against the target, optimised with Adam.
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Fit for ten epochs, feeding rows in their existing order (shuffling is off)
# and logging one line per epoch.
fit_options = dict(epochs=10, shuffle=False, verbose=2)
model.fit(X, y, **fit_options)

Prediction

In [None]:
# Iterable of prediction days supplied by the competition environment. The loop
# later in the notebook unpacks each item as
# (market_obs_df, news_obs_df, predictions_template_df).
days = env.get_prediction_days()
#(market_obs_df, news_obs_df, predictions_template_df) = next(days)

In [None]:
def make_market_predictions(predictions_template_df, market_obs_df, news_obs_df):
    """Score one prediction day and submit the result to the environment.

    The observation frames go through the same column pruning, time2market
    tagging and news/market merge as the training data. The features are scaled
    with the already-fitted ``min_max_scaler`` and scored by ``model``.

    Args:
        predictions_template_df: frame whose 'confidenceValue' column is
            overwritten (in place) with the predictions and then handed to
            ``env.predict``. Its rows are assumed to be in the same order as
            ``market_obs_df`` — TODO confirm against the environment docs.
        market_obs_df: market observations for the day. Not modified.
        news_obs_df: news observations for the day. Not modified.

    Returns:
        None.
    """
    # `columns=` rather than a positional axis, which recent pandas rejects.
    # drop() returns new frames, so the caller's frames are left untouched.
    news_obs_df = news_obs_df.drop(columns=news_drop_col, errors='ignore')
    market_obs_df = market_obs_df.drop(columns=market_drop_cols, errors='ignore')

    news_obs_df['time2market'] = news_obs_df['time'].apply(time2market)
    market_obs_df['time2market'] = market_obs_df['time']

    features_df = preprocess_market(market_obs_df, news_obs_df)
    # Defensive: make sure no identifier column reaches the scaler.
    features_df = features_df.drop(columns=drop_cols4nn, errors='ignore')
    X_obs = min_max_scaler.transform(features_df.values)

    # The network has a single output unit, so predict() yields one column;
    # flatten it to one value per row before assigning it to a column.
    predicted = np.asarray(model.predict(X_obs)).ravel()
    predicted[np.isnan(predicted)] = 0

    # The training target was clipped to [-1, 1]; hold the predictions to the
    # same range. NOTE(review): the competition is understood to require
    # confidenceValue within [-1, 1] — confirm.
    predicted = np.clip(predicted, -1, 1)

    # Bracket assignment always targets the column; attribute assignment would
    # silently create a plain attribute if the column were missing.
    predictions_template_df['confidenceValue'] = predicted
    env.predict(predictions_template_df)

In [None]:
#(market_obs_df, news_obs_df, predictions_template_df) = next(days)
#make_market_predictions(predictions_template_df, market_obs_df, news_obs_df)

In [None]:
# Walk through every prediction day. make_market_predictions both scores the
# day and submits it through env.predict, so no extra call is needed here.
for day_batch in days:
    market_obs_df, news_obs_df, predictions_template_df = day_batch
    make_market_predictions(predictions_template_df, market_obs_df, news_obs_df)
print('Done!')

Submission

In [None]:
# Ask the competition environment to write out the submission file.
env.write_submission_file()

In [None]:
import os
# Show the names in the working directory that contain '.csv'.
csv_outputs = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_outputs)