In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import IncrementalPCA
import gc
gc.enable()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Improve dataset loading speed with the datatable package

In [None]:
!pip install datatable
import datatable as dt

Jordan Crabbe<br>
# **Basic NN Submission**<br>
Implementation of an MLP for the Jane Street problem, with some feature engineering revolving around the NaN values in the training set.

Bring in the training/test sets:

In [None]:
%%time
folder_path = '../input/jane-street-market-prediction/'

# Parse train.csv a single time and slice both frames out of it; the file was
# previously read from disk twice (once for the features, once for the targets).
train = dt.fread(folder_path + 'train.csv').to_pandas()

# Columns 2..6 of train.csv are used as the targets.
targets = train.iloc[:, 2:7]
targets = targets.astype({c: np.float32 for c in targets.select_dtypes(include='float64').columns}) ## reduce memory use

# Features are column 1, columns 7..136 and finally column 0 moved to the end
# (the trailing column is dropped again inside featureEngineering).
features = train.iloc[:, [1]+list(range(7,137))+[0]]

# Both frames have been extracted, so drop the full frame before the float32
# conversion allocates another copy of the features.
del train
gc.collect()

features = features.astype({c: np.float32 for c in features.select_dtypes(include='float64').columns}) ## reduce memory use

features_details = dt.fread(folder_path + 'features.csv').to_pandas()
sample = dt.fread(folder_path + 'example_sample_submission.csv').to_pandas()
test_data = dt.fread(folder_path + 'example_test.csv').to_pandas()

featureEngineering accepts a df of features in the same format as provided by the submission API and returns a np array of modified and added features:

In [None]:
# Number of missing values in each column, ignoring the trailing column
nulls = features.iloc[:, :-1].isna().sum(axis = 0)
# Labels of the columns that contain at least one missing value
nulls_list = nulls.index[nulls.gt(0)].tolist()

# Per-column averages (same columns as above), used later as NaN fill values
means = features.iloc[:, :-1].mean(axis = 0).to_numpy()

In [None]:
def featureEngineering(features, nulls_index, averages) :
    """Fill missing values with per-column averages and append missing-value flags.

    Parameters
    ----------
    features : pandas.DataFrame
        Input frame. Its last column is dropped before imputation.
    nulls_index : list
        Labels of the columns for which a 0/1 "value is missing" flag is generated.
    averages : numpy.ndarray
        Fill values, broadcast against every column of `features` except the last.

    Returns
    -------
    numpy.ndarray
        The imputed columns followed by one flag column per entry of `nulls_index`.
    """
    # 1 where the original value is missing, 0 otherwise
    missing_flags = features.loc[:, nulls_index].isnull().mul(1).to_numpy()

    # Shift by the averages so that a NaN replaced by 0 ends up equal to the
    # column average once the shift is undone. Note that np.nan_to_num also
    # replaces infinities with large finite numbers.
    centred = features.iloc[:, :-1].to_numpy() - averages
    imputed = np.nan_to_num(centred, nan = 0) + averages

    return np.concatenate((imputed, missing_flags), axis = 1)

Deal with NaNs in *features*, create is-NaN indicator features and standardise all features.

In [None]:
# Impute the NaNs and append the missing-value flags; `features` becomes a NumPy array
features = featureEngineering(features, nulls_list, means)

# Fit the scaler on the training matrix and keep it: the same fitted scaler is
# applied to every test batch in the submission loop.
scaler = StandardScaler().fit(features)
features = scaler.transform(features)

In [None]:
# Reduce the standardised matrix to 90 components, fitting the PCA
# incrementally on batches of one fifth of the rows.
ipca = IncrementalPCA(
    n_components = 90,
    batch_size = len(features) // 5,
    copy = False,
)
features = ipca.fit(features).transform(features)

Train neural net *reg* on *features*

In [None]:
# MLP with hidden layers of 80, 80 and 20 tanh units, limited to 12 iterations;
# the regression target is the last column of `targets`.
reg = MLPRegressor(
    hidden_layer_sizes = (80, 80, 20),
    activation = 'tanh',
    max_iter = 12,
    random_state = 1,
    verbose = True,
)
reg = reg.fit(features, targets.values[:, -1])

In [None]:
## Predictor turns the output of reg into a 0/1 action by thresholding at 0
def predictor(array) :
    """Map numeric predictions to integer actions.

    Parameters
    ----------
    array : array-like
        Predicted values (any shape).

    Returns
    -------
    numpy.ndarray
        Array of dtype 'i' with the same shape as the input, holding 1 where
        the prediction is strictly greater than 0 and 0 everywhere else
        (including NaN).

    The input is left unmodified: the comparison is vectorised and produces a
    new array instead of overwriting the caller's data element by element.
    """
    return (np.asarray(array) > 0).astype('i')

In [None]:
import janestreet

# Create the competition environment and the iterator that loops over the test set
env = janestreet.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    # Same preprocessing chain as for the training data:
    # impute + missing-value flags -> standardise -> PCA projection
    test_df = ipca.transform(
        scaler.transform(featureEngineering(test_df, nulls_list, means))
    )
    # Threshold the regressor's output into 0/1 actions and submit this batch
    sample_prediction_df.action = predictor(reg.predict(test_df))
    env.predict(sample_prediction_df)