In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import numpy as np
import pandas as pd
from numba import jit, njit

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from pytorch_lightning import Trainer
from pytorch_lightning.metrics.functional.classification import auroc
from torch.optim.lr_scheduler import LambdaLR
from torch.utils import checkpoint
from torch.utils.data import Dataset
from transformers.configuration_utils import PretrainedConfig
import random
from pytorch_lightning import loggers as pl_loggers

# Module-level logger built from the stdlib `logging` module.
# NOTE: both `logging` and `logger` are rebound further down in this cell by
# `from transformers.utils import logging` / `logging.get_logger(__name__)`.
logger = logging.getLogger(__name__)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import os
import warnings
from typing import List, Optional, Tuple
from torch.optim.optimizer import Optimizer
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers.activations import ACT2FN
from transformers.file_utils import ModelOutput
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_utils import (
    Conv1D,
    PreTrainedModel,
    SequenceSummary,
    find_pruneable_heads_and_indices,
    prune_conv1d_layer,
)
from transformers.utils import logging

# Rebinds `logger` using the transformers logging helper imported just above
# (this replaces the stdlib logger created earlier in the cell).
logger = logging.get_logger(__name__)

# Documentation placeholders naming the GPT-2 config / tokenizer classes.
# They are not referenced anywhere else in the visible notebook.
_CONFIG_FOR_DOC = "GPT2Config"
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"

# Identifiers of the published GPT-2 checkpoints.
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "distilgpt2",
    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
]

In [None]:
# Read only the first 20,000 rows of the competition training file.
train_df = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv', nrows=20000)

In [None]:
from tqdm import tqdm

# NOTE(review): presumably the embedding width of the trained model; it is not
# referenced anywhere else in the visible notebook.
emb_dim=384
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the dictionary of preprocessing artefacts (column lists, lookup tables,
# per-column mean/std) onto the CPU.
# NOTE: `torch.load` unpickles the file, so only load artefacts from a trusted source.
dicts = torch.load('../input/janestreet-filesv1/transformation_dicts_021921n.pth', map_location='cpu')
# Per-column statistics; entries are read as transformations[col]['mean'] / ['std'].
transformations = dicts['transformations_dict']

def get_lists(dicts):
    """Unpack the preprocessing artefacts stored in ``dicts``.

    Args:
        dicts: Mapping that contains the keys ``'floats'``, ``'strings'``,
            ``'dep_var'``, ``'keys_dict'``, ``'float_keys'`` and ``'f_mean'``.

    Returns:
        A 6-tuple with the values of those keys, in exactly that order.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    ordered_keys = ('floats', 'strings', 'dep_var', 'keys_dict', 'float_keys', 'f_mean')
    return tuple(dicts[key] for key in ordered_keys)

# Expose the individual preprocessing artefacts as module-level names; the
# functions defined below read several of them as globals or default arguments.
float_cols, string_cols, dep_var, str_dicts, float_dicts, f_mean = get_lists(dicts)

def normalize_df(df_in):
    """Return a standardised copy of ``df_in``.

    The raw ``weight`` column is first preserved as ``decoded_weight``. Every
    column listed in the module-level ``float_cols`` is then replaced by
    ``(value - mean) / std`` using the statistics stored in
    ``dicts['transformations_dict']``. The input frame is not modified.

    Args:
        df_in: DataFrame containing ``weight`` and every column in ``float_cols``.

    Returns:
        A new DataFrame with the normalised columns and ``decoded_weight``.
    """
    df_new = df_in.copy()
    df_new['decoded_weight'] = df_new['weight']
    transformation_dict = dicts['transformations_dict']
    with tqdm(total=len(float_cols)) as pbar:
        for col in float_cols:
            # float() makes the arithmetic independent of how the statistics were
            # serialised (Python float, NumPy scalar, 0-dim tensor, ...).
            mean = float(transformation_dict[col]['mean'])
            std = float(transformation_dict[col]['std'])
            # Vectorised column arithmetic instead of a per-element Python lambda.
            df_new[col] = (df_new[col] - mean) / std
            pbar.update(1)
    return df_new


def fill_nans(df_in):
    """Return a copy of ``df_in`` with NaNs in the float columns filled.

    For every column in the module-level ``float_cols`` except the first one,
    missing values are replaced by ``(f_mean - mean) / std``, where ``f_mean``
    comes from ``dicts['f_mean']`` (indexed by the column's position in
    ``float_cols``) and ``mean`` / ``std`` come from
    ``dicts['transformations_dict']``. The first float column is left untouched.
    The input frame is not modified.

    Args:
        df_in: DataFrame containing the columns in ``float_cols[1:]``.

    Returns:
        A new DataFrame with the NaNs of those columns filled.
    """
    df_newer = df_in.copy()
    transformation_dict = dicts['transformations_dict']
    f_means = dicts['f_mean']
    cols_to_fill = float_cols[1:]
    # The bar total matches the columns actually processed, so it reaches 100%.
    with tqdm(total=len(cols_to_fill)) as pbar:
        # start=1 keeps `position` aligned with the column's index in float_cols.
        for position, col in enumerate(cols_to_fill, start=1):
            mean = transformation_dict[col]['mean']
            std = transformation_dict[col]['std']
            # Local name chosen so the module-level `f_mean` is not shadowed.
            fill_value = (f_means[position] - mean) / std
            df_newer[col] = df_newer[col].fillna(fill_value)
            pbar.update(1)
    return df_newer


def get_y_vals(df_in):
    """Return a copy of ``df_in`` with five binary target columns appended.

    ``action_0`` .. ``action_4`` are 1 where the matching response column
    (``resp_1``, ``resp_2``, ``resp_3``, ``resp``, ``resp_4`` in that order)
    is strictly greater than 1.5e-3, and 0 otherwise.

    Args:
        df_in: DataFrame containing the five response columns.

    Returns:
        A new DataFrame; the input frame is left unchanged.
    """
    cutoff = 1.5e-3
    response_order = ('resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4')
    labelled = df_in.copy()
    for position, resp_name in enumerate(response_order):
        labelled['action_{}'.format(position)] = labelled[resp_name].gt(cutoff).astype(int)
    return labelled


# Keep a random 10% of the loaded rows. The fixed seed makes the sample (and
# everything derived from it) reproducible across kernel restarts.
# NOTE: this cell rebinds `train_df`, so re-running it without reloading the
# data samples the already-sampled frame again.
train_df = train_df.sample(frac=.1, random_state=42)
# Standardise the float columns, fill the remaining NaNs, then add the binary targets.
train_df = normalize_df(train_df)
train_df = fill_nans(train_df)
train_df = get_y_vals(train_df)

# Creating dataset to test on valid data

In [None]:
class StructuredDataset(Dataset):
    """Map-style dataset that encodes one DataFrame row per item as integer ids.

    Each item is a tuple ``(str_ids, float_ids, attention_mask)`` of int64
    tensors. Float values are rounded to one decimal and looked up in
    ``dicts['float_keys']``; string/categorical values are cast to ``int`` and
    looked up in ``dicts['keys_dict'][col]['word2key']``. No target is returned.

    Args:
        df: Source DataFrame; one row per sample.
        float_cols: Names of the columns encoded through ``dicts['float_keys']``.
        string_cols: Names of the columns encoded through ``dicts['keys_dict']``.
        dicts: Mapping holding the ``'float_keys'`` and ``'keys_dict'`` lookup tables.
        seq_len: Length of the all-ones attention mask returned with every item.
            Defaults to 132, the value previously hard-coded.
    """

    def __init__(self,
                 df,
                 float_cols,
                 string_cols,
                 dicts,
                 seq_len=132):
        self.df = df
        self.float_cols = float_cols
        self.string_cols = string_cols
        self.dicts = dicts
        self.seq_len = seq_len

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return self.df.shape[0]

    def lookup_floats(self, floats_list):
        """Encode float values as ids: round to 1 decimal, look up in ``float_keys``.

        Raises:
            KeyError: If a rounded value is not present in the lookup table.
        """
        float_keys = self.dicts['float_keys']
        # zip truncates to the shorter of the column list and the value list.
        encoded_list = [float_keys[np.round(value, 1)]
                        for _, value in zip(self.float_cols, floats_list)]
        return torch.LongTensor(encoded_list)

    def lookup_strs(self, strings):
        """Encode categorical values as ids through each column's ``word2key`` table.

        Raises:
            KeyError: If a column or value is not present in the lookup tables.
        """
        keys_dict = self.dicts['keys_dict']
        encoded_list = [keys_dict[col]['word2key'][int(value)]
                        for col, value in zip(self.string_cols, strings)]
        return torch.LongTensor(encoded_list)

    def __getitem__(self, idx):
        row = self.df.iloc[idx, :]
        strings = row[self.string_cols].tolist()
        # A row of a mixed-dtype frame is object-typed, on which np.isnan raises
        # TypeError; force float64. copy=True guarantees a writable private
        # buffer, so the in-place NaN replacement can never touch the DataFrame.
        floats = row[self.float_cols].to_numpy(dtype=np.float64, copy=True)
        # Missing values are mapped to the sentinel 999.0 before the key lookup.
        floats[np.isnan(floats)] = 999.0

        float_cols_array = self.lookup_floats(floats)
        str_cols_array = self.lookup_strs(strings)

        attention_mask = torch.ones(self.seq_len).to(torch.int64)
        return str_cols_array.to(torch.int64), float_cols_array.to(torch.int64), attention_mask
    

# LOAD DATASET AND DATALOADER FOR TESTING
bs=128
validation_data = pd.read_hdf('../input/janestreet-filesv1/valid_feather_z12_only.hdf')
# Keep only whole batches. Slicing with an explicit row count (instead of
# `[:-(n % bs)]`) stays correct when the row count is already a multiple of
# `bs`: a remainder of 0 would otherwise turn into `[:0]` and drop every row.
n_rows_in_full_batches = (validation_data.shape[0] // bs) * bs
validation_data = validation_data.iloc[:n_rows_in_full_batches]

ds = StructuredDataset(validation_data,
                       float_cols,
                       string_cols,
                       dicts)

# Order must be preserved (shuffle=False) because predictions are later joined
# back to `validation_data` by position.
dataloader = torch.utils.data.DataLoader(ds,
                                         batch_size=bs,
                                         shuffle=False,
                                         drop_last=False)

In [None]:
# Load the neural net, which was converted to TorchScript to speed up inference.
# The file name indicates a trace for batch size 128; weights are mapped to the
# CPU here and moved to `device` in the inference cell.

traced_model_bs128 = torch.jit.load('../input/janestreet-filesv1/traced_model_size128.pt', map_location='cpu')


In [None]:
# Run the traced model over the whole validation DataLoader and collect the
# per-batch outputs as NumPy arrays.
model_outs_list = []

traced_model_bs128.to(device)
traced_model_bs128.eval()

# Inference only: disabling autograd avoids building a graph for every batch,
# which saves memory and time. The bar total is the real number of batches.
with torch.no_grad(), tqdm(total=len(dataloader)) as pbar:
    for n, batch in enumerate(dataloader):
        x, y, z = batch
        x, y, z = x.to(device), y.to(device), z.to(device)
        outs = traced_model_bs128(x, y, z)
        model_outs_list.append(outs.detach().cpu().numpy())
        pbar.update(1)

In [None]:
# CREATING DATAFRAME WITH INFERENCES ON VALIDATION SET
th = 0.5
outs_array = np.concatenate(model_outs_list)
# Binarise the model outputs: 1 where the output exceeds the threshold.
outs_array =  np.where(outs_array > th, 1, 0)

# The predictions are joined to the validation rows by position, so the two
# must have the same length.
assert outs_array.shape[0] == validation_data.shape[0], 'prediction / validation row count mismatch'

answers = pd.DataFrame(outs_array).rename({0: 'action_0a', 1: 'action_1a', 2: 'action_2a', 3: 'action_3a', 4: 'action_4a'}, axis=1)
answers['keys'] = answers.index
answers.to_hdf('/kaggle/working/validation_inference.hdf', key='keys')

# `pd.concat(axis=1)` aligns on the index. `answers` has a fresh 0..n-1 index,
# so the validation frame is given the same positional index first; otherwise a
# non-default index would misalign the rows and fill them with NaN.
val_inference = pd.concat([validation_data.reset_index(drop=True), answers], axis=1)
cols = val_inference.columns.tolist()

# GET COMPARISON COLUMNS
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
val_actions = ['action_0', 'action_1', 'action_2', 'action_3', 'action_4']
inference_cols = ['action_0a', 'action_1a', 'action_2a', 'action_3a', 'action_4a']

# Convert matches to bools
compare1 = val_inference[val_actions].copy()
compare2 = val_inference[inference_cols].copy()
accuracy_df = pd.DataFrame(compare1.to_numpy()==compare2.to_numpy())

# Get accuracy on validation set: a row counts as correct when the majority
# vote (at least 3 of the 5 actions) of the predictions equals that of the labels.
predicted_majority = val_inference[inference_cols].sum(axis=1) >= 3
actual_majority = val_inference[val_actions].sum(axis=1) >= 3
accuracy_pct = (predicted_majority == actual_majority).sum() / val_inference.shape[0] * 100
print(str(round(accuracy_pct, 2)) + '% accuracy on validation')

In [None]:
# The utility score for each individual resp column is > 0 on the validation set.

# Since the scores for all resp columns are > 0, why doesn't Kaggle return a score for the submission?

In [None]:
# Utility Score on Validation Set
val_inference['resp_mean'] = val_inference[resp_cols].mean(axis=1)
val_inference['resp_sum'] = val_inference[resp_cols].sum(axis=1)

# Loop-invariant values: the trade signal (majority vote of the five predicted
# actions) and the number of distinct days do not depend on the response column.
val_inference['buy_sell_signal'] = val_inference[inference_cols].sum(axis=1)>=3
days_in_val = validation_data['date'].unique().shape[0]

for resp in resp_cols + ['resp_mean', 'resp_sum']:
    # Per-row return contribution: weight * resp * action.
    val_inference['pi'] = val_inference['decoded_weight']*val_inference[resp]*val_inference['buy_sell_signal']

    # The competition metric defines p_i as the sum of the contributions of one
    # *date*, so rows are aggregated per day before the t-statistic is computed.
    daily_pi = val_inference.groupby('date')['pi'].sum()
    total_pi = daily_pi.sum()
    pi_norm = ((daily_pi**2).sum())**(1/2)

    # With no trades at all the norm is 0; report t = 0 instead of dividing by zero.
    t_score = (total_pi/pi_norm)*((250/days_in_val)**(1/2)) if pi_norm > 0 else 0.0
    utility_score = min(max(t_score,0),6)*total_pi
    print('{} utility score: '.format(resp) + str(round(utility_score, 2)))

# Below is the code I implemented to pre-process the streams from the competition

In [None]:
# Processing for data from competition stream

def get_col_transforms(transformations=transformations, float_cols=float_cols):
    """Collect the per-column normalisation statistics as two float64 arrays.

    Args:
        transformations: Mapping ``column -> {'mean': ..., 'std': ...}``.
        float_cols: Column names, in the order the arrays should follow.

    Returns:
        ``(mean_array, std_array)``, each with one entry per name in ``float_cols``.
    """
    def _gather(stat_name):
        # One value per float column, in float_cols order.
        return np.array([transformations[name][stat_name] for name in float_cols], dtype=np.float64)

    means = _gather('mean')
    stds = _gather('std')
    return means, stds

# Module-level statistics arrays; the functions defined below bind them as
# default arguments at definition time.
mean_array, std_array = get_col_transforms(transformations=transformations, float_cols=float_cols)

@jit(nopython=True)
def transform_test_df(tensor, mean_array=mean_array, std_array=std_array):
    """Standardise ``tensor`` element-wise: ``(tensor - mean_array) / std_array``.

    Compiled with numba in nopython mode. The default ``mean_array`` and
    ``std_array`` are the module-level arrays as they exist when this function
    is defined.
    NOTE(review): assumes ``tensor`` is a numeric NumPy array that broadcasts
    against the statistics arrays - confirm against the caller.
    """
    out1 = np.divide(np.subtract(tensor,mean_array),std_array)
    return out1

def lookup_floats(floats_list, dicts=dicts, float_cols=float_cols):
    """Encode float values as integer ids.

    Every value is rounded to one decimal and looked up in
    ``dicts['float_keys']``. Values are paired with ``float_cols`` only to bound
    the number of items (``zip`` stops at the shorter sequence).

    Returns:
        A ``torch.LongTensor`` with one id per encoded value.

    Raises:
        KeyError: If a rounded value is missing from the lookup table.
    """
    encoded_ids = []
    for _col_name, value in zip(float_cols, floats_list):
        rounded_key_table = dicts['float_keys']
        encoded_ids.append(rounded_key_table[np.round(value, 1)])
    return torch.LongTensor(encoded_ids)
    
def lookup_strs(strings_list, dicts=dicts, string_cols=string_cols):
    """Encode categorical values as integer ids.

    Each value is cast to ``int`` and looked up in the ``word2key`` table of its
    column, i.e. ``dicts['keys_dict'][column]['word2key']``.

    Returns:
        A ``torch.LongTensor`` with one id per encoded value.

    Raises:
        KeyError: If a column or value is missing from the lookup tables.
    """
    encoded_ids = []
    for col_name, raw_value in zip(string_cols, strings_list):
        word_to_key = dicts['keys_dict'][col_name]['word2key']
        encoded_ids.append(word_to_key[int(raw_value)])
    return torch.LongTensor(encoded_ids)


def process_data(df, mean_array=mean_array, std_array=std_array, string_cols=string_cols, float_cols=float_cols, dicts=dicts, transformations=transformations, f_mean=f_mean):
    """Encode the first row of ``df`` into the three tensors the model consumes.

    Steps: read the categorical and float values of row 0, replace +/-inf and
    NaN floats by the matching entry of ``f_mean``, standardise them with
    ``mean_array`` / ``std_array``, then map both groups to integer ids.

    Args:
        df: DataFrame whose first row is encoded.
        mean_array: Per-float-column means, in ``float_cols`` order.
        std_array: Per-float-column standard deviations, in ``float_cols`` order.
        string_cols: Names of the categorical columns.
        float_cols: Names of the float columns.
        dicts: Mapping holding the ``'float_keys'`` and ``'keys_dict'`` lookup tables.
        transformations: Unused; kept so existing callers keep working.
        f_mean: Per-float-column fill values used for missing entries.

    Returns:
        ``(str_cols_array, float_cols_array, attention_mask)``, each with a
        leading batch dimension of 1. Only ``attention_mask`` is moved to
        ``device``; the other two are returned on the CPU.
    """
    first_row = df.iloc[0, :]
    strings = first_row[string_cols].tolist()
    # Force a writable float64 buffer: the row may be object-typed, and the
    # values are modified in place below.
    floats = (first_row[float_cols]
              .replace([np.inf, -np.inf], np.nan)
              .to_numpy(dtype=np.float64, copy=True))

    # np.asarray lets `f_mean` be a list (or any array-like) while still
    # supporting boolean-mask indexing.
    fill_values = np.asarray(f_mean, dtype=np.float64)
    missing = np.isnan(floats)
    floats[missing] = fill_values[missing]

    # Pass the caller-supplied statistics and lookup tables through explicitly;
    # relying on the helpers' defaults would silently ignore these arguments.
    floats_list = transform_test_df(floats, mean_array, std_array)

    float_cols_array = lookup_floats(floats_list, dicts, float_cols)
    float_cols_array = float_cols_array.unsqueeze(0)

    str_cols_array = lookup_strs(strings, dicts, string_cols)
    str_cols_array = str_cols_array.unsqueeze(0)

    attention_mask = torch.ones(132).to(torch.int64)
    attention_mask = attention_mask.unsqueeze(0).to(device)

    return str_cols_array, float_cols_array, attention_mask

# Code used to submit model outputs to kaggle:

In [None]:
import traceback

import janestreet
env = janestreet.make_env()

traced_model = torch.jit.load('../input/janestreet-filesv1/traced_model_size1.pt', map_location='cpu')

th = 0.5
traced_model.eval()
traced_model.to(device)
mean_array, std_array = get_col_transforms(transformations=transformations, float_cols=float_cols)

# Number of rows for which the prediction failed and the fallback action 0 was used.
n_failures = 0

for (test_df, pred_df) in tqdm(env.iter_test()):
    # Default: do not trade. Used for zero-weight rows and when prediction fails.
    action = 0
    try:
        if test_df.weight.iloc[0] > 0:
            x, y, z = process_data(test_df, mean_array, std_array)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                out2 = traced_model(x.to(device), y.to(device), z.to(device))
            pred = out2.detach().cpu().numpy()
            # Trade when at least 3 of the model outputs exceed the threshold.
            action = int((pred > th).sum() >= 3)
    except Exception:
        # Catch only real errors (not KeyboardInterrupt/SystemExit) and make the
        # cause visible: a silently swallowed error here turns every prediction
        # into 0. The full traceback is printed once to avoid flooding the output.
        n_failures += 1
        print('exception')
        if n_failures == 1:
            traceback.print_exc()

    pred_df.action = action
    # Submit exactly once per row, outside the try block, so a failure inside
    # `env.predict` is never followed by a second `predict` call for the same row.
    env.predict(pred_df)