# Trabalho Final da Disciplina de Aprendizado de Máquina - Jane Street Market Prediction

Autor: Michel Ricardo Meyer

Este Notebook foi feito para ser executado dentro do ambiente do Kaggle.

In [None]:
# Run this notebook inside the Kaggle environment: it reads the competition data
# from /kaggle/input and uses the `janestreet` submission API.

## Pacotes

import numpy as np
import pandas as pd
import dask.dataframe as dd
import janestreet
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.impute import SimpleImputer


## Functions
def utility_score(date, weight, resp, action):
    """Return the competition utility score for a set of trading decisions.

    Source: https://www.kaggle.com/gogo827jz/jane-street-super-fast-utility-score-function

    For every distinct value in ``date`` the daily profit is
    ``p_i = sum(weight * resp * action)`` over the rows of that day. Then::

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_days)
        u = clip(t, 0, 6) * sum(p_i)

    Args:
        date: 1-D array-like with the day identifier of each row.
        weight: 1-D array-like with the weight of each row.
        resp: 1-D array-like with the realised return of each row.
        action: 1-D array-like with the decision for each row (0 or 1).

    Returns:
        The utility ``u`` as a float. Returns 0.0 when there are no rows or
        when every daily profit is zero (the ratio ``t`` is undefined there;
        the unguarded formula would yield NaN or divide by zero).
    """
    # Work positionally on plain arrays so pandas index alignment never interferes.
    date = np.asarray(date).ravel()
    row_profit = (
        np.asarray(weight, dtype=float).ravel()
        * np.asarray(resp, dtype=float).ravel()
        * np.asarray(action, dtype=float).ravel()
    )

    # One pass: map each row to its day index, then sum the profits per day.
    days, day_index = np.unique(date, return_inverse=True)
    count_i = len(days)
    if count_i == 0:
        return 0.0
    Pi = np.bincount(day_index, weights=row_profit, minlength=count_i)

    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # Every daily profit is zero, so the utility is zero as well.
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u
print("Pacotes e Funções OK")

## Data
# Read the competition training CSV through dask, materialise it with
# `.compute()`, and then order the rows by `ts_id`.
treino = dd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv').compute()
treino = treino.sort_values('ts_id')

# Every column whose name contains "feature" is used as a model input.
features = list(filter(lambda nome: "feature" in nome, treino.columns))
print("Dados OK")

## NaNs
# Per-feature median, used to fill missing values in the training data here and
# in the test rows at submission time (so it must stay a plain array ordered
# like `features`).
# The median skips NaN column by column, so every observed value contributes.
# Dropping every row that has a NaN in *any* feature first would throw away
# valid observations and gives NaN medians when no row is fully complete.
mediana = treino.loc[:, features].median(axis=0).to_numpy()

# Column name -> fill value, the mapping form `DataFrame.fillna` expects.
valoresNaN = dict(zip(features, mediana))

treino = treino.fillna(value=valoresNaN)
print("NaNs OK")

## Model - Deep Neural Network
# Definitions
# Four hidden layers whose widths are multiples of the number of input features.
hidden_units = [len(features) * 3, len(features) * 7, len(features) * 7, len(features) * 3]
# One rate for the input dropout followed by one rate per hidden layer.
dropout_rates = [0.035 * 3, 0.035 * 3, 0.035 * 7, 0.035 * 7, 0.035 * 3]

# The input shape is given as a tuple (one vector of features per sample).
inp = tf.keras.layers.Input(shape=(len(features),))
x = tf.keras.layers.BatchNormalization()(inp)
x = tf.keras.layers.Dropout(dropout_rates[0])(x)

# Hidden stack: Dense -> BatchNormalization -> swish -> Dropout.
for unidades, taxa in zip(hidden_units, dropout_rates[1:]):
    x = tf.keras.layers.Dense(unidades)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
    x = tf.keras.layers.Dropout(taxa)(x)

# Single-unit regression head.
# NOTE(review): swish is bounded below (about -0.28), so the network cannot
# output anything smaller than that - confirm this is acceptable for `resp`.
x = tf.keras.layers.Dense(1)(x)
outp = tf.keras.layers.Activation('swish')(x)


# Training

batch_size = 2 ** 10 * 4

dnn = tf.keras.models.Model(inputs=inp, outputs=outp)

# Only rows with weight > 1 are used for fitting. The arrays are identical for
# every learning-rate stage, so they are built once instead of once per stage.
mascara_treino = treino.loc[:, "weight"] > 1
X_treino = np.asarray(treino.loc[mascara_treino, features]).reshape(-1, len(features))
y_treino = np.asarray(treino.loc[mascara_treino, "resp"]).reshape(-1, 1)

# Three stages on the same `dnn` object with a decreasing learning rate:
# 1e-3, then about 3.16e-4, then 1e-4.
for expoente in [1, 1.5, 2]:
    learning_rate = 1e-2 / 10 ** expoente

    dnn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss=tf.keras.losses.MeanSquaredError())

    # validation_split holds out the last 10% of the arrays passed in.
    dnn.fit(X_treino, y_treino,
            validation_split=0.1,
            epochs=15, batch_size=batch_size)

# Release the training copies before the next steps allocate memory.
del mascara_treino, X_treino, y_treino

modelo = dnn

# Search for the best cut-off point
# The last 100000 rows of `treino` are scored with the fitted model, and each
# percentile of the predictions (0%..99%) is tried as the decision threshold.
# NOTE(review): the rows of this tail with weight > 1 were also used to fit the
# model, so the cut-off is tuned at least partly in-sample.
validacao = treino.tail(100000)
previstos = modelo.predict(validacao.loc[:, features].values).reshape(-1)
quantis = np.quantile(previstos, q=np.arange(100) / 100)

# Extract the columns once instead of once per candidate threshold.
datas_val = validacao.loc[:, "date"].to_numpy()
pesos_val = validacao.loc[:, "weight"].to_numpy()
resp_val = validacao.loc[:, "resp"].to_numpy()

# Utility score obtained when acting on every row predicted above threshold `q`.
medias = [utility_score(datas_val, pesos_val, resp_val, (previstos > q) * 1)
          for q in quantis]

del validacao, datas_val, pesos_val, resp_val
print("Modelo OK")

## Submission
env = janestreet.make_env()
iter_test = env.iter_test()

# Use the threshold with the highest utility score. np.argmax would select a NaN
# score if one were present, so NaN scores are ranked last instead.
pontuacoes = np.asarray(medias, dtype=float)
pontuacoes = np.where(np.isnan(pontuacoes), -np.inf, pontuacoes)
corte = quantis[np.argmax(pontuacoes)]

for (test_df, sample_prediction_df) in iter_test:
    # Impute missing features with the training medians (broadcast over rows).
    entrada = test_df.loc[:, features].to_numpy(dtype=float)
    entrada = np.where(np.isnan(entrada), mediana, entrada)

    # verbose=0 keeps the per-batch predict calls from writing progress output.
    previsao = modelo.predict(entrada, verbose=0).reshape(-1)

    # Strict ">" matches the comparison used when the thresholds were scored.
    sample_prediction_df["action"] = (previsao > corte).astype(int)
    env.predict(sample_prediction_df)
print("SUBMETIDO!!")