In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.neural_network import MLPRegressor

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Import the Two Sigma news competition module and create the environment
# object; a later cell calls env.get_training_data() on it.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

In [None]:
from sklearn.model_selection import KFold # import KFold to split data set into K splits
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# distribution of confidence as a sanity check: they should be distributed as above
import time
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Load the TRAINING data from the competition environment as two frames.
# mt_df is the market data processed below; nt_df is not referenced again in
# this notebook section (presumably the news data -- TODO confirm).
(mt_df, nt_df) = env.get_training_data()

# Processing Market Data
Removing rows with NaN/null values and outliers

In [None]:
##### Analyze the null and NaN values in the dataset - they are the same
def dropNull(mt_df):
    """Summarise missing values per column (despite the name, nothing is dropped).

    Returns a DataFrame indexed by column name, ordered by descending null
    count, with two columns: 'Number of Null Entries' and
    'Percentage of Null Entries' (percentage of all rows, rounded to 2 decimals).
    """
    row_count = mt_df.shape[0]
    null_counts = mt_df.isnull().sum().sort_values(ascending=False)
    null_share = round(null_counts / row_count * 100, 2)
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Number of Null Entries', 'Percentage of Null Entries'],
    )
    return summary

# Per-column null counts and percentages for the market data, before cleaning
mt_df_null = dropNull(mt_df) # Analyzing Null values
print(mt_df_null)

In [None]:
# Drop every row that contains a null value.
# NOTE: this rebinds mt_df, so the unfiltered frame is no longer reachable
# under this name after the cell has run.
mt_df = mt_df.dropna()

# Sanity check: recompute the null summary on the cleaned frame
mt_df_null = dropNull(mt_df) # Analyzing Null values
print(mt_df_null)


# Putting all functions in this next cell

In [None]:

from sklearn.preprocessing import StandardScaler

# Function to remove outliers in defined columns
def remove_outlier(df,column_list,lower_percentile,upper_percentile):
    """Keep only rows strictly inside the percentile band of each listed column.

    Columns are processed in the order given, and each column's bounds are
    computed on the rows that survived the filters of the columns before it.
    Both comparisons are strict, so rows equal to a bound are removed too.
    The input frame is not modified; the filtered frame is returned.
    """
    filtered = df
    for column in column_list:
        values = filtered[column]
        high_cut = np.percentile(values, upper_percentile)
        low_cut = np.percentile(values, lower_percentile)
        inside_band = (values < high_cut) & (values > low_cut)
        filtered = filtered[inside_band]
    return filtered

# split data into features and target values
def get_target_data(df, features):
    """Split a market frame into model inputs and the prediction target.

    Returns (x, y): x holds 'time' followed by the requested feature columns,
    y holds 'time' and 'returnsOpenNextMktres10'. Both keep 'time' so that
    later steps can group rows by it.
    """
    input_columns = ['time'] + features
    target_columns = ['time', 'returnsOpenNextMktres10']
    return df[input_columns], df[target_columns]

# Standardizing/Scaling the Data so that mean ~= 0 and standard deviation is 1
def scale_data(X_train, X_test, features):
    """Standardise the feature columns using statistics from the training set.

    The scaler is fitted on X_train only, so nothing from X_test influences
    the scaling. The feature columns of BOTH frames are overwritten in place,
    and the same two frames are returned.
    """
    train_only_scaler = StandardScaler().fit(X_train[features])
    for frame in (X_train, X_test):
        frame[features] = train_only_scaler.transform(frame[features])
    return X_train, X_test

def make_my_prediction(x, model=None):
    """Predict with the network and rescale the outputs to confidences in [-1, 1].

    Non-negative predictions are min-max scaled onto [0, 1]; negative
    predictions are min-max scaled onto [-1, 0] (the most negative one maps
    to -1). The order of the returned values matches the rows of ``x``.

    Parameters
    ----------
    x : array-like
        Feature rows accepted by the estimator's ``predict``.
    model : object with a ``predict`` method, optional
        Estimator to use. When omitted, the notebook-level ``mlpR`` is used.

    Returns
    -------
    numpy.ndarray
        1-D float array of scaled confidence values.
    """
    start_time = time.time()
    estimator = mlpR if model is None else model
    # np.array makes a private copy, flattened to 1-D, that is safe to overwrite.
    my_pred = np.array(estimator.predict(x), dtype=float).reshape(-1)

    def _unit_scale(values):
        # Min-max scale onto [0, 1]. A group without spread (one value, or all
        # values equal) maps to 0 instead of producing 0/0 = NaN.
        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            return np.zeros_like(values)
        return (values - lowest) / span

    # Reading through a boolean mask returns a copy, so the scaled values have
    # to be written back through the mask; scaling the copies alone would
    # leave my_pred (and therefore the return value) unscaled.
    is_positive = my_pred >= 0
    is_negative = my_pred < 0
    # Empty groups are skipped: min()/max() of an empty array would raise.
    if is_positive.any():
        my_pred[is_positive] = _unit_scale(my_pred[is_positive])
    if is_negative.any():
        my_pred[is_negative] = _unit_scale(my_pred[is_negative]) - 1.0

    elapsed_time = time.time() - start_time
    print('It took', elapsed_time/60, 'minutes make predictions and scale them to confidence interval')

    return my_pred

# sigma_score: evaluation metric computed from confidences and realised returns.
# It returns a (name, value) pair, the shape used by xgboost custom evaluation
# functions (example: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py),
# but it can equally be called directly.
def sigma_score(preds,dval,df):
    """Mean of the per-time sums of ``preds * dval`` divided by their std.

    Parameters
    ----------
    preds : array-like
        Confidence values, one per row.
    dval : array-like or pandas.Series
        Realised target values, one per row, in the same order as ``preds``.
    df : array-like or pandas.Series
        Time key of each row; rows sharing a key are summed together.

    Returns
    -------
    tuple
        ``('sigma_score', score)`` with the score rounded to 5 decimals.
        The score is 0.0 when the standard deviation of the per-time sums is
        zero or undefined (for example when there is only one time group),
        instead of inf/NaN.
    """
    # get y_target values
    labels = dval
    # time key used for grouping, so that x_t values are added up per day
    df_time = df

    # Element-wise product; when labels is a Series the result keeps its index,
    # which the groupby below uses to align the time keys.
    x_t = pd.Series(preds*labels)
    x_t_sum = x_t.groupby(df_time).sum()
    spread = x_t_sum.std()
    if pd.isna(spread) or spread == 0:
        return 'sigma_score', 0.0
    score = (x_t_sum.mean())/spread
    return 'sigma_score', round(score,5)


In [None]:
# Columns whose extreme values are filtered out before training. Only the
# market-residual return columns and the target are active; the raw-return
# columns are left commented out. This set was chosen by trial and error:
# raw outliers did not seem to impact some residual outliers, but the reverse was true.
outlier_removal_list = [#'returnsClosePrevRaw1',
         #'returnsOpenPrevRaw1',
         'returnsClosePrevMktres1',
         'returnsOpenPrevMktres1',
         #'returnsClosePrevRaw10',
         #'returnsOpenPrevRaw10',
         'returnsClosePrevMktres10',
         'returnsOpenPrevMktres10',   
        'returnsOpenNextMktres10']
# Feature columns selected from the market data (passed to get_target_data below)
features = ['returnsClosePrevRaw1',
         'returnsOpenPrevRaw1',
         'returnsClosePrevMktres1',
         'returnsOpenPrevMktres1',
         'returnsClosePrevRaw10',
         'returnsOpenPrevRaw10',
         'returnsClosePrevMktres10',
         'returnsOpenPrevMktres10']  
# Original feature values: summary statistics before outlier removal
mt_df[features].describe().transpose()



In [None]:
# Keep only rows strictly between the 2nd and 98th percentile of every column
# in outlier_removal_list (applied column by column).
mt_df_no_outlier = remove_outlier(mt_df,outlier_removal_list,2,98)

# feature values without outliers
mt_df_no_outlier[features].describe().transpose()

In [None]:
# # Split data into n sets for cross validation
from sklearn.model_selection import KFold # import KFold

# Split the outlier-filtered market data into a feature frame and a target
# frame; both keep the 'time' column.
mt_df_features,mt_df_target = \
                        get_target_data(mt_df_no_outlier, features)

def split_data_Kfold(X, y, n_splits, NNfeatures=None):
    """Return the first K-fold train/test split of the feature and target frames.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain every column named in ``NNfeatures``.
    y : pandas.DataFrame
        Target frame with a 'time' column; the column at position 1 is
        returned as the target.
    n_splits : int
        Number of folds handed to ``KFold``.
    NNfeatures : list of str, optional
        Columns of ``X`` to return as model inputs. This argument used to be
        overwritten by a hard-coded list inside the function, so the caller's
        selection was silently ignored; it is now honoured. When omitted, that
        former hard-coded subset is used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, train_t, test_t)`` where the
        X parts contain only ``NNfeatures``, the y parts are the target column
        and the t parts are the matching 'time' values.
    """
    if NNfeatures is None:
        # Fallback: the subset this function previously hard-coded.
        NNfeatures = ['returnsClosePrevRaw10',
                      'returnsOpenPrevRaw10',
                      'returnsOpenPrevMktres10']
    else:
        NNfeatures = list(NNfeatures)

    kf = KFold(n_splits)
    # Only the first fold is used (a single hold-out split); the original
    # returned from inside its loop on the first iteration, which is the same.
    # y is passed along as before.
    train_index, test_index = next(iter(kf.split(X, y)))

    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index,:], y.iloc[test_index,:]
    train_t = y_train['time']
    test_t = y_test['time']
    return  X_train[NNfeatures], X_test[NNfeatures], y_train.iloc[:,1], y_test.iloc[:,1],\
                                                            train_t, test_t

# Split features, targets and their 'time' values into a training part and a
# held-out test part using split_data_Kfold with 5 folds.
n_splits = 5
mt_features_train, mt_features_test, mt_target_train, mt_target_test, train_time, test_time \
            = split_data_Kfold(mt_df_features,mt_df_target, n_splits, features)

In [None]:
# Before cross validation
# Multi-layer Perceptron (MLP) Regressor
from sklearn.neural_network import MLPRegressor
# Suppose there are n training samples, m features, k hidden layers, 
# each containing h neurons - for simplicity, and o output neurons. 
# The time complexity of backpropagation is O(n⋅m⋅h^k⋅o⋅i), where i is the number of iterations

# Creating the model: four hidden layers of 5 units each, at most 300 iterations.
# random_state is fixed so that the trained model, and every metric computed
# from it below, is reproducible across runs.
mlpR = MLPRegressor(hidden_layer_sizes = (5,5,5,5),
                   max_iter=300,
                   random_state=42)

In [None]:
import time
# Training the model on the training split and reporting the wall-clock
# training time in minutes.
start_time = time.time()
mlpR.fit(mt_features_train,mt_target_train)
elapsed_time = time.time() - start_time
print('It took', elapsed_time/60, 'minutes to train the neural network')

In [None]:
# Display the layer count reported by the fitted estimator
mlpR.n_layers_

In [None]:
# testing and evaluating the model
from sklearn.metrics import classification_report,confusion_matrix
# Raw network outputs for the held-out features; the bare name on the last
# line displays the array as the cell output.
mt_target_pred = mlpR.predict(mt_features_test)
mt_target_pred

In [None]:
import matplotlib.pyplot as plt
# Compare real and predicted target values for the first rows of the test split.
data = pd.DataFrame({'y_real':mt_target_test,'y_pred':mt_target_pred})
# Residuals; only used by the optional plot that is commented out below.
diff = data.iloc[:,0]-(data.iloc[:,1])
# Show at most 100 rows, so a shorter test split cannot cause a length mismatch.
n_shown = min(100, len(data))
t = range(n_shown)
# Both curves are labelled so plt.legend() has entries to show. Predictions are
# multiplied by 10 (presumably so they are visible next to the real values),
# and the label says so.
plt.plot(t, data.iloc[0:n_shown, 0], label='Real')
plt.plot(t, 10*data.iloc[0:n_shown, 1], label='Predicted (x10)')
#plt.plot(diff.iloc[0:n_shown])
# plt.plot(mt_target_train.iloc[0:n_shown])
plt.legend()
# The model is scikit-learn's MLPRegressor, not a Keras network.
plt.title('Real Return Values vs. Predicted\nReturn Values for MLPRegressor Neural Network')
# NOTE(review): the x axis is the row position within the test split; 'Days'
# is only accurate if every row is a distinct day -- confirm.
plt.xlabel('Time (Days)')
plt.ylabel('10 Day Leading Market Adjusted Return')
plt.show()

# Getting Regression metrics

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Error of the raw network outputs (mt_target_pred) against the held-out targets
print('mean_absolute_error is', mean_absolute_error(mt_target_test, mt_target_pred))
print('mean_squared_error is', mean_squared_error(mt_target_test, mt_target_pred))

In [None]:
# Compute sigma_score on the held-out split and on the training split, using
# the values returned by make_my_prediction and each split's own time column
# as the grouping key.
mt_pred_test = make_my_prediction(mt_features_test)
print("test : ",sigma_score(mt_pred_test,mt_target_test,test_time))
mt_pred_train = make_my_prediction(mt_features_train)
print("train : ",sigma_score(mt_pred_train,mt_target_train,train_time))

In [None]:
# Display the estimator's parameter settings as the cell output
mlpR.get_params(deep=True)

In [None]:
# for (market_obs_df, _, predictions_template_df) in env.get_prediction_days():  
#     features = [#'returnsClosePrevRaw1',
# #          'returnsOpenPrevRaw1',
# #          'returnsClosePrevMktres1',
# #          'returnsOpenPrevMktres1',
#          'returnsClosePrevRaw10',
#          'returnsOpenPrevRaw10',
#          'returnsClosePrevMktres10',
#          'returnsOpenPrevMktres10']  
#     market_obs_df_scaled = scale_data(market_obs_df,features)    
#     x_submission = market_obs_df_scaled[features].copy()
#     # fill in NaN values with mean of rest of the values
#     for i in range(len(features)):
#          x_submission[features[i]]= x_submission[features[i]].fillna(x_submission[features[i]].mean())
#     predictions_template_df['confidenceValue'] = make_my_prediction(x_submission)
#     env.predict(predictions_template_df)
#     del x_submission
# print('Done!')
# # Write submission file    
# env.write_submission_file()