* Single Neural Network version: https://www.kaggle.com/binhlc/jane-street-tensorflow-dense
* Highest score: 7736
* Time: 2.5 hours on GPU

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import gc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Each column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is replaced (on ``df`` itself) by a copy cast to the smallest
    candidate dtype whose representable range contains the column's min and
    max. Other columns (including int8, unsigned and non-numeric ones) are
    left untouched.

    Note that the float range check only looks at magnitude: casting to
    float16/float32 can lose precision even when the values are in range.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the final memory usage and the percentage
            saved.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a dtype's min/max still
            # fits in that dtype. If nothing matches (e.g. an empty column
            # whose min/max are NaN) the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame reports 0% instead of
        # dividing by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Load the competition training set and shrink its dtypes right away.
train = reduce_mem_usage(pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv'))

# Every column whose name contains 'feature' is a model input.
features = [name for name in train.columns if 'feature' in name]

# Widen the float16 columns produced by reduce_mem_usage back to float32
# (presumably so the mean/fill steps below do not run in half precision).
half_precision_cols = train.select_dtypes(include='float16').columns
train = train.astype(dict.fromkeys(half_precision_cols, np.float32))

# Fill missing values with the per-column mean of the whole training set.
column_means = train.mean()
train = train.fillna(column_means)

# Per-feature means (first feature column skipped); the submission loop uses
# them to impute NaNs in test batches.
f_mean = train[features[1:]].values.mean(axis=0)

# Keep only rows with date > 85, then drop rows whose weight is zero.
train = train.query('date > 85').reset_index(drop=True)
train = train.loc[train['weight'] != 0]

In [None]:
n_folds = 5
seed = 42

# Optional min-max scaling of the feature columns to [0, 1]; off by default.
ScaleData = False
if ScaleData:
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(train[features])
    train.loc[:, features] = scaler.transform(train[features])

# Full feature matrix and binary target (1 when resp is positive).
X = train[features]
y = (train['resp'].values > 0).astype(int)

# Disabled alternative: free `train` and take the first stratified fold.
#del train
#gc.collect()
#skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
#train_index, test_index = next(skf.split(X, y))
#X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#y_train, y_test = y[train_index], y[test_index]

# Label column: 1 when weight * resp is positive, otherwise 0.
train['action'] = (train['weight'].values * train['resp'].values > 0).astype('int')

# Time-based split: dates below 400 for training, 400 and later held out.
before_400 = train['date'] < 400
from_400 = train['date'] >= 400
X_train = train.loc[before_400, features]
X_test = train.loc[from_400, features]
y_train = train.loc[before_400, 'action']
y_test = train.loc[from_400, 'action']

# The training split above is immediately replaced by all rows, so only
# X_test / y_test keep the date-based selection.
X_train = train[features]
y_train = train['action']

In [None]:
# Class balance of the hold-out labels as {label: row count}.
# (Disabled alternative: np.histogram(y_train, bins=2))
unique, counts = np.unique(y_test, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
import lightgbm as lgb

# Number of boosting iterations; no early stopping is used below.
NUM_ROUND = 2000

# LightGBM hyper-parameters. max_depth (16) and max_bin (255) were tried at
# some point and are left at their library defaults here.
params = {
    "objective": "cross_entropy",
    "metric": "auc",
    "random_state": seed,
    "learning_rate": 0.05,
    "num_leaves": 511,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 20,
    "min_data_per_group": 90,
    "lambda_l1": 20,
    "lambda_l2": 5,
    "device": "gpu",
}

# Train on the full feature matrix X / target y. The validation set
# (X_test, y_test), early stopping (30 rounds) and periodic evaluation
# output (every 50 rounds) are deliberately not passed.
trainData = lgb.Dataset(data=X, label=y, free_raw_data=False)
model = lgb.train(
    params=params,
    train_set=trainData,
    num_boost_round=NUM_ROUND,
)

# Disabled XGBoost alternative, kept inside a string literal so that none of
# it executes. If re-enabled it would train on X_train / y_train and evaluate
# on X_test / y_test with early stopping, replacing `model`, `params`,
# `NUM_ROUND` and `trainData` defined above.
'''
import xgboost as xgb

params = {
    #'objective': 'binary:logistic',
    'learning_rate': 0.05,
    "eval_metric": "auc",
    #'tree_method': 'gpu_hist',
    #'gpu_id': 0,
    "random_state": seed
}

NUM_ROUND = 500
trainData = xgb.DMatrix(X_train, label = y_train)
validData = xgb.DMatrix(X_test, label = y_test)
evallist = [(trainData, 'train'),(validData, 'eval')]
model = xgb.train(params, trainData, NUM_ROUND, evallist, early_stopping_rounds = 20, verbose_eval = 50)
'''

In [None]:
# Create submission
import janestreet
from tqdm.notebook import tqdm
# Presumably resets the API's "already called" guard so make_env() can be
# invoked again in the same session -- TODO confirm against the janestreet module.
janestreet.competition.make_env.__called__ = False
env = janestreet.make_env()
iter_test = env.iter_test()

# Score each test batch and submit one action per batch.
for (test_df, sample_prediction_df) in tqdm(iter_test):
    if test_df.iloc[0].weight > 0:
        # Explicit copy so the in-place imputation below never depends on
        # whether .values hands back a writable buffer.
        test_np = test_df.loc[:, features].to_numpy(copy=True)

        # Impute missing values with the training means. The first feature
        # column is skipped because f_mean was built from features[1:].
        if np.isnan(test_np[:, 1:].sum()):
            test_np[:, 1:] = np.nan_to_num(test_np[:, 1:]) + np.isnan(test_np[:, 1:]) * f_mean
            test_df[features] = test_np

        # Scale only AFTER imputing: f_mean holds raw (unscaled) means, and
        # the training data was mean-filled before the scaler was fitted and
        # applied. Scaling first would mix raw-scale fill values into
        # already-scaled rows.
        if ScaleData:
            test_df[features] = scaler.transform(test_df[features])

        # Disabled XGBoost variant:
        # action = model.predict(xgb.DMatrix(test_df[features]))[0]
        action = model.predict(test_df[features])[0]
        sample_prediction_df.action = 1 if action > 0.5 else 0
    else:
        # Batches whose first row has non-positive weight are never traded.
        sample_prediction_df.action = 0
    env.predict(sample_prediction_df)
