In [None]:
import json
from datetime import datetime
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
import tensorflow.keras.backend as K
from tensorflow import keras
from sklearn.model_selection import KFold
# Timestamp tag for this run; later cells embed it in the names of the
# files they write (config JSON, K-fold stats CSV, model weights JSON).
now = f"{datetime.now():%Y%m%d_%H%M%S}"
print(now)

In [None]:
# Read the training CSV with datatable, convert to pandas, keep only rows
# with a strictly positive weight, and drop the resp_1..resp_4 columns
# (only `resp` is used further down).
resp_extra_cols = ["resp_" + str(r) for r in range(1, 5)]
df = (
    dt.fread("../input/jane-street-market-prediction/train.csv")
    .to_pandas()
    .query('weight > 0')
    .drop(columns=resp_extra_cols)
)
df.shape

In [None]:
def __profit_maximizer__(y_true, y_pred):
    """Keras loss: the negated sum of ``y_true * y_pred`` over the batch.

    Minimizing this value maximizes the summed element-wise product of the
    targets and the model outputs.

    Parameters
    ----------
    y_true : tensor
        Target values supplied to ``fit``.
    y_pred : tensor
        Model outputs for the same rows.

    Returns
    -------
    tensor
        Scalar loss, ``-sum(y_true * y_pred)``.
    """
    return -K.sum(y_true * y_pred)

def __create_NN_model__(NN_params, input_shape):
    """Build an uncompiled feed-forward ``keras.Sequential`` network.

    The network is: optional input dropout, then one
    ``Dense -> Dropout`` pair per entry of ``NN_params["layers"]``, then a
    single sigmoid output unit.

    Parameters
    ----------
    NN_params : dict
        Reads the following keys:

        * ``"layers"``   - number of units of each hidden Dense layer.
        * ``"actifun"``  - activation of each hidden Dense layer.
        * ``"L2"``       - L2 kernel-regularization factor of each hidden layer.
        * ``"dropout"``  - dropout rate applied after each hidden layer.
        * ``"pre_drop"`` - dropout rate applied to the inputs, or ``None``
          to skip input dropout.

        The per-layer lists are indexed by position in ``"layers"``, so each
        must be at least as long as ``"layers"`` (``IndexError`` otherwise).
    input_shape : int
        Number of input features.

    Returns
    -------
    keras.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    model = keras.Sequential()

    # Declare the input explicitly as the first entry. Previously the shape
    # was passed via `input_shape=` on the first Dense layer, which is only
    # honoured when that Dense is the first layer of the model: with
    # `pre_drop` set, a Dropout layer came first and the shape was ignored,
    # leaving the model unbuilt until its first call.
    model.add(keras.Input(shape=(input_shape,)))

    if NN_params["pre_drop"] is not None:
        model.add(keras.layers.Dropout(NN_params["pre_drop"]))

    for l in range(len(NN_params["layers"])):
        model.add(keras.layers.Dense(
            NN_params["layers"][l],
            activation=NN_params["actifun"][l],
            kernel_regularizer=keras.regularizers.l2(NN_params["L2"][l])))
        model.add(keras.layers.Dropout(NN_params["dropout"][l]))

    model.add(keras.layers.Dense(1, activation='sigmoid'))
    return model

# Network / optimizer settings consumed by __create_NN_model__ and by the
# compile/fit calls in the training cells.
NN_params = dict(
    lr=0.0055,                        # Adam learning rate
    n_epoch=49,                       # epochs per fit
    n_batch=2,                        # batches per epoch: batch_size = ceil(n_rows / n_batch)
    pre_drop=None,                    # dropout rate on the inputs; None disables it
    layers=[60, 80, 120],             # units of each hidden Dense layer
    actifun=['relu', 'relu', 'relu'],  # activation of each hidden layer
    dropout=[0.01, 0.03, 0.05],       # dropout rate after each hidden layer
    L2=[0.001, 0.003, 0.005],         # L2 kernel-regularization factor per hidden layer
)

# Settings for target construction and cross-validation.
oth_params = dict(
    weight_pwr=0.6,                       # target = resp * weight ** weight_pwr
    seed=42,                              # random_state of the KFold shuffle
    n_fold=5,                             # number of KFold splits
    thresholds=[0.5, 0.75, 0.9, 0.95],    # prediction cut-offs above which action = 1
)

# Persist the full configuration next to the other artifacts of this run.
all_params = dict(NN_params=NN_params, oth_params=oth_params)
with open("D%s_train_config.json"%now, 'w') as config_file:
    config_file.write(json.dumps(all_params))

In [None]:
# Training target: the response scaled by a power of the row weight.
df.loc[:, "target"] = df['resp'] * (df['weight'] ** oth_params["weight_pwr"])
X_col = [f for f in df.columns if f.startswith("feature")]

# Folds are built over whole dates so that no date is split between train
# and test. The date list is taken from the data rather than hard-coded as
# 0..499, so dates outside that range are not silently dropped from every fold.
dates = np.sort(df['date'].unique())
kf = KFold(n_splits=oth_params["n_fold"], random_state=oth_params["seed"], shuffle=True)
results = {}

for i, (train_index, test_index) in enumerate(kf.split(dates)):

    # Release Keras state left behind by the previous fold's model so
    # memory does not grow with the number of folds.
    K.clear_session()

    # Train/test split. KFold yields *positions* into `dates`; map them back
    # to date values, and compute each row mask once.
    train_mask = df['date'].isin(dates[train_index])
    test_mask = df['date'].isin(dates[test_index])

    x_train = np.nan_to_num(df.loc[train_mask, X_col].values)
    y_train = df.loc[train_mask, "target"].values

    x_test = np.nan_to_num(df.loc[test_mask, X_col].values)
    # Explicit copy: columns are added to pred_df below, which must neither
    # touch `df` nor trigger SettingWithCopyWarning.
    pred_df = df.loc[test_mask, ['date', 'resp', 'weight']].copy()
    stats = {'resp_mean': pred_df['resp'].mean(),
             'resp_median': pred_df['resp'].median(),
             'resp_std': pred_df['resp'].std(),
             'wgt_median': pred_df['weight'].median()}

    # Model training
    model = __create_NN_model__(NN_params, input_shape=x_train.shape[1])
    model.compile(loss=__profit_maximizer__,
                  optimizer=keras.optimizers.Adam(learning_rate=NN_params["lr"]))
    model.fit(x_train, y_train,
              epochs=NN_params["n_epoch"],
              # n_batch is the number of batches per epoch, not the batch size.
              batch_size=int(np.ceil(x_train.shape[0] / NN_params["n_batch"])),
              verbose=0)

    # Predictions and utility. The model ends in a single output unit, so
    # flatten the predictions to one value per row before using them as a column.
    preds = model.predict(x_test).ravel()
    utility = {}
    for th in oth_params["thresholds"]:
        pred_df["action"] = (preds > th).astype(int)
        pred_df['profit'] = pred_df['weight'] * pred_df['resp'] * pred_df['action']
        daily_profit = pred_df.groupby('date')['profit'].sum()

        # compute utility and its components
        # (250 is presumably the number of trading days per year - TODO confirm)
        p = np.sum(daily_profit)
        v = np.sqrt(np.sum(daily_profit ** 2)*len(daily_profit)/250)
        # If no trade is taken at this threshold every daily profit is 0 and
        # v == 0; report a ratio of 0 instead of the NaN produced by 0/0.
        t = p / v if v > 0 else 0.0
        u = min(max(0, t), 6) * p
        print("Utility in fold %d with %d%% threshold: %.2f"%(i+1, int(th*100), u))
        utility["T_"+str(int(th*100))] = {'profit': p, 'volatility': v, 'ratio': t, 'utility': u}

    results["K"+str(i+1)] = {'utility': utility, 'stats': stats}

In [None]:
# Flatten `results` into one record per (fold, threshold): the utility
# components first, then the threshold and fold labels, then the fold's
# descriptive statistics.
d_list = [
    {**fold_utility,
     'threshold': threshold_label,
     'kfold': fold_label,
     **fold_result['stats']}
    for fold_label, fold_result in results.items()
    for threshold_label, fold_utility in fold_result['utility'].items()
]

# Sort by the (string) threshold and fold labels, save, and show the table.
u_df = pd.DataFrame(d_list).sort_values(['threshold', 'kfold'])
u_df.to_csv("D%s_Kfold_stats.csv"%now, index=False)
display(u_df)

In [None]:
# Final fit: use every row of `df` (no hold-out) with the same settings as
# the cross-validation folds.
x_train = np.nan_to_num(df[X_col].values)
y_train = df["target"].values

# n_batch is the number of batches per epoch, so derive the batch size from it.
full_batch_size = np.ceil(x_train.shape[0] / NN_params["n_batch"]).astype(int)

# Model training
model = __create_NN_model__(NN_params, input_shape=x_train.shape[1])
model.compile(
    loss=__profit_maximizer__,
    optimizer=keras.optimizers.Adam(learning_rate=NN_params["lr"]),
)
model.fit(
    x_train,
    y_train,
    epochs=NN_params["n_epoch"],
    batch_size=full_batch_size,
    verbose=1,
)

# Save model: dump the weight arrays as nested lists under the keys
# w0, b0, w1, b1, ... (the +1 accounts for the output layer).
# Assumes get_weights() yields a kernel followed by a bias for each Dense
# layer, in layer order - TODO confirm.
n_layer = len(NN_params["layers"]) + 1
names = [kind + str(idx) for idx in range(n_layer) for kind in ["w", "b"]]
weights = {
    names[pos]: array.tolist()
    for pos, array in enumerate(model.get_weights())
}
with open("D%s_model_weights.json"%now, "w") as mwf:
    json.dump(weights, mwf)