This implementation is based heavily on the Keras library. Certain aspects, such as the testing structure, are taken from scikit-learn so that the results of the Keras implementation can be compared against our scikit-learn implementation.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the entries of the input directory so the available data files are visible in the cell output
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Import the Two Sigma news competition module and create the environment object;
# the next cell pulls the training data from `env`.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

In [None]:
# Fetch the training data from the competition environment.
# The call yields a pair and only its first element (the market frame) is kept;
# presumably the discarded second element is the news data (confirm against the API).
market_train_df, _ = env.get_training_data()

<font size = 5>**Data Pre-Processing**</font>

In [None]:
# process data
def process_merged_data(df):
    """Return the complete (NaN-free) rows of ``df`` restricted to the model columns.

    Rows are dropped if they contain a NaN in *any* column of ``df``, not only in
    the selected ones. The input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the columns listed in ``selected_columns``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with the timestamp, the eight trailing-return columns
        and the target column ``returnsOpenNextMktres10``, in that order.
    """
    selected_columns = [
        'time',
        'returnsClosePrevRaw1',
        'returnsOpenPrevRaw1',
        'returnsClosePrevMktres1',
        'returnsOpenPrevMktres1',
        'returnsClosePrevRaw10',
        'returnsOpenPrevRaw10',
        'returnsClosePrevMktres10',
        'returnsOpenPrevMktres10',
        'returnsOpenNextMktres10',
    ]
    # Drop incomplete rows first, then narrow down to the columns of interest
    complete_rows = df.dropna()
    return complete_rows[selected_columns]

# Apply the NaN drop and column selection to the raw market training frame.
# NOTE: despite the name, no outliers have been removed at this point; the
# outlier-removal cell further down reassigns this same variable.
market_data_no_outlier = process_merged_data(market_train_df)

Removing outliers - **MAY REMOVE LATER**

In [None]:
def remove_outlier(df,column_list,lower_percentile,upper_percentile):
    """Trim rows that fall outside a percentile band, one column after another.

    The columns are processed sequentially: the percentiles for each column are
    computed on the rows that survived the previous columns, and only rows whose
    value lies strictly between the two percentile values are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified, a filtered frame is returned.
    column_list : sequence of str
        Columns to trim, in processing order.
    lower_percentile, upper_percentile : float
        Percentiles (0-100) passed to ``np.percentile`` as exclusive bounds.

    Returns
    -------
    pandas.DataFrame
        The rows that passed every column's band (``df`` itself if
        ``column_list`` is empty).
    """
    for column in column_list:
        values = df[column]
        upper_bound = np.percentile(values, upper_percentile)
        lower_bound = np.percentile(values, lower_percentile)
        inside_band = (values < upper_bound) & (values > lower_bound)
        df = df[inside_band]
    return df
# The four raw-return columns whose extreme values are trimmed
outlier_removal_list = [
    'returnsClosePrevRaw1',
    'returnsOpenPrevRaw1',
    'returnsClosePrevRaw10',
    'returnsOpenPrevRaw10',
]
# Keep only rows lying strictly between the 2nd and 98th percentile of each listed column.
# NOTE: this rebinds market_data_no_outlier from itself, so re-running the cell trims again.
market_data_no_outlier = remove_outlier(market_data_no_outlier, outlier_removal_list, 2, 98)
# Report the row count of the raw frame versus the trimmed frame
print(
    "Number of data decreased from ",
    len(market_train_df['returnsOpenNextMktres10']),
    " to ",
    len(market_data_no_outlier['returnsOpenNextMktres10']),
)

Inspecting the columns that remain after rows containing NaN values were dropped from the Market Data

In [None]:
# Display the column names that remain after pre-processing and outlier removal
market_data_no_outlier.columns.values

Final Feature Selection

In [None]:
# Model inputs: the three 10-day trailing-return columns (independent copy)
X = market_data_no_outlier.loc[
    :,
    ['returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10'],
].copy()
# Regression target, kept as a single-column frame (independent copy)
y = market_data_no_outlier.loc[:, ['returnsOpenNextMktres10']].copy()
# Timestamp of every row, kept as a single-column frame aligned with X and y
time = market_data_no_outlier.loc[:, ['time']].copy()

In [None]:
# Confirm which columns ended up in the feature frame X
X.columns.values

In [None]:
# Confirm which column ended up in the target frame y
y.columns.values

In [None]:
# Confirm which column ended up in the timestamp frame
time.columns.values

Standardize Data

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_data(df,features):
    """Return a copy of ``df`` in which the ``features`` columns are standardized.

    Each listed column is transformed with a freshly fitted ``StandardScaler``.
    All other columns are carried over unchanged. The input frame is not
    modified, so re-running the calling cell always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``features``.
    features : sequence of str
        Names of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    scaled = df.copy()
    scaled[features] = StandardScaler().fit_transform(scaled[features])
    return scaled

market_data_no_outlier_scaled = scale_data(market_data_no_outlier,X.columns.values)
# X was copied out of market_data_no_outlier before this cell ran, so the
# standardized values have to be pulled back into X explicitly; otherwise the
# network would be trained on the unscaled features.
X = market_data_no_outlier_scaled[X.columns].copy()
# NOTE(review): the scaler is fitted on all rows, including those that later
# serve as validation folds; fit it per training fold if that leakage matters.

Splitting Data into Training and Validation - Cross-validation using k-fold testing

In [None]:
from sklearn.model_selection import KFold
# 5-fold splitter; no shuffle or seed argument is passed, so KFold's defaults apply
kf = KFold(n_splits = 5) # Define the split - into 5 folds 
# Show the splitter's configuration in the cell output
print(kf) 

In [None]:
# (number of rows, number of feature columns) that will be split into folds
X.shape

<font size="6">**Run Neural Network:**</font>  
<font size="6"></font>

Network Definition

In [None]:
# Model
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization,Input
from keras.optimizers import Adam

# Build the network in one go: two 5-unit ReLU hidden layers and a single output unit
model = Sequential([
    # First hidden layer; it also declares the input width (one value per column of X)
    Dense(5, input_shape=(X.shape[1],), activation='relu'),
    # Second hidden layer
    Dense(5, activation='relu'),
    # Output layer: one unit, no activation argument given
    Dense(1),
])
# Train with Adam (learning rate 0.001) on mean squared error, then print the layer summary
optimizer = Adam(lr=0.001)
model.compile(loss='mean_squared_error', optimizer=optimizer)
model.summary()

**TODO:** Try to understand early stopping and callbacks

In [None]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

# Checkpointing is currently disabled; the lines are kept for reference.
# checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
# checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_acc', verbose = 1, save_best_only = True, mode ='auto')
# callbacks_list = [checkpoint,early_stopping]

# Stop a fit once val_loss has not improved for 5 epochs; restore_best_weights=True
# asks Keras to go back to the best weights it saw in that case.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto',restore_best_weights=True)
callbacks_list = [early_stopping]

# Snapshot the current weights so that every fold starts from the same state.
# Without this reset the single `model` keeps training across folds, which means
# each validation fold has already been used for training in an earlier fold.
# (Re-run the model-definition cell before re-running this one to start from
# freshly initialised weights.)
initial_weights = model.get_weights()
fold_val_losses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_time, test_time = time.iloc[train_index], time.iloc[test_index]

    # Start this fold from the snapshot and with a fresh optimizer of the same
    # type and settings, so no state leaks in from the previous fold.
    model.set_weights(initial_weights)
    model.compile(
        optimizer=model.optimizer.__class__.from_config(model.optimizer.get_config()),
        loss=model.loss,
    )

    # Change epoch number: for testing purposes
    history = model.fit(
        x=X_train.values,
        y=y_train.values,
        epochs=5,
        shuffle=True,
        validation_data=(X_test.values, y_test.values),
        callbacks=callbacks_list,
    )
    # Keep the best validation loss reached in this fold
    fold_val_losses.append(min(history.history['val_loss']))

# After the loop `model` holds the weights of the last fold, and X_test / y_test
# are that fold's held-out rows; the cells below rely on exactly those names.
print('Validation loss per fold:', fold_val_losses)
print('Mean validation loss:', np.mean(fold_val_losses))


Sanity Checking:

In [None]:
# Sanity check: put the first held-out target next to the model's prediction for
# that same row. Both dict values are 1-D (a one-element Series and a flattened
# array), so the frame gets exactly one row labelled with y_test's index.
sanity_check = pd.DataFrame({
    'y_real': y_test.iloc[:1, 0],
    'y_pred': model.predict(X_test.values[:1]).ravel(),
})
sanity_check

Confidence Value creation:

In [None]:
# Import only the timer function: a plain `import time` here would rebind the
# global name `time`, which the feature-selection cell uses for the timestamp
# frame that the k-fold loop slices with `time.iloc[...]`.
from time import perf_counter


def _min_max_scale(values, offset=0.0):
    """Min-max scale a 1-D array into ``[offset, offset + 1]``.

    The smallest element maps to ``offset`` and the largest to ``offset + 1``.
    An empty array is returned as an empty copy. If all elements are equal the
    span is treated as 1, so every element maps to ``offset`` instead of
    producing a division by zero.
    """
    if values.size == 0:
        return values.copy()
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        span = 1
    return offset + (values - lowest) / span


start_time = perf_counter()
# Flatten the (n, 1) model output to a 1-D array of raw predictions
my_pred_test = model.predict(X_test.values).ravel()

positive_mask = my_pred_test >= 0
negative_mask = my_pred_test < 0
# Boolean-mask indexing yields copies, so my_pred_test keeps the raw predictions
# that the plotting and metrics cells read.
positive_pred = _min_max_scale(my_pred_test[positive_mask])               # scaled into [0, 1]
negative_pred = _min_max_scale(my_pred_test[negative_mask], offset=-1.0)  # scaled into [-1, 0]

# Confidence values in [-1, 1], aligned row-for-row with my_pred_test
# (entries matched by neither mask, e.g. NaN, stay at 0).
confidence_test = np.zeros_like(my_pred_test)
confidence_test[positive_mask] = positive_pred
confidence_test[negative_mask] = negative_pred

elapsed_time = perf_counter() - start_time
print('It took', elapsed_time/60, 'minutes make predictions and scale them to confidence interval')

In [None]:
# def make_my_prediction(x):
#     my_pred = (model.predict(x)).reshape(1,-1)[0]
#     my_pred[my_pred>0]=1
#     my_pred[my_pred<0]=-1
#     return my_pred

In [None]:
# my_pred_test = make_my_prediction(X_test)
# my_pred_train = make_my_prediction(X_train)

In [None]:
# Compare shapes: the predictions are a flat (n,) vector, the targets an (n, 1) column.
# The earlier np.reshape(my_pred_test, (-1, 1)) call was dropped: its result was
# never assigned, so it had no effect on my_pred_test.
print(my_pred_test.shape, ",", y_test.values.shape)

In [None]:
import matplotlib.pyplot as plt
# Pair every held-out target with the model's prediction for the same row
data = pd.DataFrame({'y_real':y_test.iloc[:,0],'y_predicted':my_pred_test})
# Select by column name rather than position so the result does not depend on
# the column order the DataFrame constructor happens to produce.
y_real = data['y_real']
# Only the first 100 rows are plotted (and reused by the statistics cell below)
sample_y = y_real.iloc[:100]
sample_pred = data['y_predicted'].iloc[:100]
# Hand-tuned affine rescaling of the predictions, used for the plot only
# NOTE(review): the constants 50 and -0.2 look ad hoc; confirm their purpose.
scaled_pred = sample_pred*50 - 0.2
# x positions follow the actual sample length, so a fold with fewer than 100 rows still plots
t = range(len(sample_y))
# Labels are required for plt.legend() to have entries to show
plt.plot(t, scaled_pred, label='Predicted (scaled: x50 - 0.2)')
plt.plot(t, sample_y, label='Real')
plt.legend()
plt.title('Real Return Values vs. Predicted\nReturn Values for Keras Neural Network')
# NOTE(review): the x axis is the row position within the validation fold;
# whether consecutive rows correspond to consecutive days is not established here.
plt.xlabel('Time (Days)')
plt.ylabel('10 Day Leading Market Adjusted Return')

plt.savefig('KerasPlot.png')
plt.show()

Statistics:

In [None]:
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import mean_absolute_error

# Error metrics between the real and predicted values of the 100-row sample
# defined in the plotting cell (sample_y / sample_pred).
# NOTE(review): presumably these should eventually be computed on the whole
# validation fold (y_test / my_pred_test) rather than the sample - confirm.
print('mean_absolute_error is', mean_absolute_error(sample_y, sample_pred))
print('mean_squared_error is', mean_squared_error(sample_y, sample_pred))

Sigma Score:

In [None]:
# # sigma_score function is considered as a custom evaluation metric for xgboost
# # example of how custom evaluation function is incorporated into xgboost's training can be found here : https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
# def sigma_score(preds,dval,df):
    
#     # get y_target values
#     labels = dval
#     # call time parameter to be used for grouping, so that we can add x_t values for each day
#     df_time = df
    
#     #calculate x_t and score as specified by the competition
#     x_t = pd.Series(preds*labels)
#     x_t_sum = x_t.groupby(df_time).sum()    
#     score = (x_t_sum.mean())/(x_t_sum.std())
#     return 'sigma_score', round(score,5)

# print("Testing......\n")
# my_pred_test = make_my_prediction(X_test.values)
# print("test : ",sigma_score(my_pred_test,y_test,test_time))

# my_pred_train = make_my_prediction(X_train.values)
# print("train : ",sigma_score(my_pred_train,y_train,train_time))