In [None]:
# Install the datatable 0.11.0 wheel from the local ../input/python-datatable directory.
# Both stdout and stderr of pip are redirected to /dev/null, so an installation failure
# only becomes visible when `import datatable` below fails.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

import datatable as dt

In [None]:
# ----- Import common library -----
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from tqdm import tqdm_notebook as tqdm
from glob import glob
import gc
import pickle
from time import time
import json
import pytz
import random
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
import warnings
# Suppress every warning for the remainder of the notebook session.
warnings.filterwarnings('ignore')
from IPython.core.display import display

In [None]:
import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
import numpy as np
import pandas as pd
import json

def read_data(data_dir='/kaggle/input/jane-street-market-prediction'):
    """Load the four competition CSV files from ``data_dir``.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv``, ``features.csv``,
        ``example_test.csv`` and ``example_sample_submission.csv``.
        The default is the path the notebook originally hard-coded, so
        calling ``read_data()`` with no arguments behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, features, example_test, example_sample_submission)``.
    """
    # train.csv is read with datatable and then converted to pandas;
    # the three smaller files go straight through pandas.
    train = dt.fread(f'{data_dir}/train.csv').to_pandas()
    features = pd.read_csv(f'{data_dir}/features.csv')
    example_test = pd.read_csv(f'{data_dir}/example_test.csv')
    example_sample_submission = pd.read_csv(f'{data_dir}/example_sample_submission.csv')
    return train, features, example_test, example_sample_submission

# Load all four tables into module-level variables used by the cells below.
train, features, example_test, example_sample_submission = read_data()

In [None]:
# Display the training DataFrame as this cell's output.
train

In [None]:
# Display the contents of features.csv as this cell's output.
features

In [None]:
# Display the contents of example_test.csv as this cell's output.
example_test

In [None]:
# Display the contents of example_sample_submission.csv as this cell's output.
example_sample_submission

In [None]:
# Binary label: 0 where `resp` is negative, 1 everywhere else.
negative_resp = train['resp'] < 0
train['action'] = np.where(negative_resp, 0, 1)

# Keep only the rows that have a positive weight and a date greater than 0.
keep_mask = (train['weight'] > 0) & (train['date'] > 0)
train = train.loc[keep_mask, :]

In [None]:
# Name of the label column the model is trained on.
target = 'action'
# Model inputs: every column of `train` whose name contains the substring 'feature'.
train_feats = [name for name in train.columns if 'feature' in name]

In [None]:
# Work on an independent copy named `train_df`, then drop the original name and
# ask the garbage collector to reclaim whatever became unreachable.
# NOTE(review): `train` was just produced by a boolean `.loc` selection, so this
# extra copy may be unnecessary and temporarily holds two frames — confirm.
train_df = train.copy()
del train
# As the last expression of the cell, the return value of gc.collect() is shown as output.
gc.collect()

In [None]:
import lightgbm as lgb
from lightgbm.callback import _format_eval_result
import logging
from logging import getLogger
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.model_selection import KFold

def metric(y_true, y_pred):
    """Score predictions against the true labels with ``roc_auc_score``."""
    auc = roc_auc_score(y_true, y_pred)
    return auc

def postprocess(y_pred):
    """Clamp predictions into the closed interval [0, 99999]."""
    lower_bound, upper_bound = 0, 99999
    return np.clip(y_pred, lower_bound, upper_bound)

# Buffer sized for out-of-fold predictions; it is allocated here but this cell never writes to it.
oof = np.zeros(len(train_df))

for SEED in [2021]:
    # A single positional hold-out split: the first 80% of the rows are used for
    # training and the remaining 20% for validation.
    split_at = int(len(train_df) * 0.8)
    folds = [(range(split_at), range(split_at, len(train_df)))]
    for fold_id, (train_index, valid_index) in enumerate(folds):

        ### ------------- PREPARATION TRAIN VALID TEST DATASET
        # Select each row subset once and take both features and label from it,
        # instead of materialising the same subset twice.
        train_rows = train_df.iloc[train_index]
        valid_rows = train_df.iloc[valid_index]
        # Missing feature values are replaced with the sentinel -999; the
        # submission loop applies the same sentinel to the test features.
        train_x, train_y = train_rows[train_feats].fillna(-999), train_rows[target]
        valid_x, valid_y = valid_rows[train_feats].fillna(-999), valid_rows[target]
        del train_rows, valid_rows

        print(f'train_x.shape = {train_x.shape}, train_y.shape = {train_y.shape}')
        print(f'valid_x.shape = {valid_x.shape}, valid_y.shape = {valid_y.shape}')

        dtrain = lgb.Dataset(train_x, label=train_y)
        dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)
        # Both sets are evaluated so the log shows validation and training AUC.
        valid_sets = [dval, dtrain]

        trainparams = {
            "params": {
                "objective" : "binary",
                "boosting" : "gbdt",
                "metric" : "auc",
                "max_depth": 7,
                "min_data_in_leaf": 50,
                "reg_alpha": 0.1,
                "reg_lambda": 1,
                "num_leaves" : 31,
                "learning_rate" : 0.1,
                "bagging_fraction" : 0.8,
                # LightGBM only performs bagging when bagging_freq is non-zero;
                # without it the bagging_fraction above is silently ignored.
                "bagging_freq": 1,
                "feature_fraction" : 0.8,
                "seed": SEED,
                #"num_threads": 8
                #"device": "gpu",
                "verbosity": -1
            },
            "train_set": dtrain,
            "valid_sets": valid_sets,
            "num_boost_round" : 5000,
            "fobj": None,
            "feval": None,
            "early_stopping_rounds": 500,
            "verbose_eval": 100,
            "categorical_feature": [],
        }

        print('trainparams={}'.format(trainparams))
        model = lgb.train(
            **trainparams
        )
        print('end training.')

        ### ------------- PREDICTION VALIDATION
        #valid_preds = model.predict(valid_x, num_iteration=model.best_iteration)

        ### ------------- FEATURE IMPORTANCE
        # Rank features by total gain, most important first.
        feature_importance = sorted(
            zip(model.feature_name(), model.feature_importance(importance_type='gain')),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for rank, item in enumerate(feature_importance):
            print('Feature importance {}: {}'.format(rank, str(item)))

        # `model` is deliberately kept alive after the loop: the submission
        # loop later in the notebook calls model.predict().

In [None]:
import janestreet
# Create the competition environment and obtain the iterator that yields test batches.
# NOTE(review): presumably make_env() may only be called once per kernel session — confirm
# against the janestreet module before re-running this cell.
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
import numpy as np
from numba import njit

@njit
def fillna_npwhere_njit(array, values):
    """Return ``array`` with its NaN entries replaced by ``values``.

    Parameters
    ----------
    array : numpy array of floats
        Data that may contain NaN.
    values : scalar or array
        Replacement used wherever ``array`` is NaN (the notebook passes -999).

    Returns
    -------
    The input array as is when its sum is not NaN, otherwise the result of
    ``np.where`` with the NaN positions filled in.
    """
    # The sum is NaN whenever at least one element is NaN, so a single reduction
    # serves as a quick test that lets NaN-free inputs skip the np.where call.
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

In [None]:
from tqdm.notebook import tqdm

# Submission loop: every item from iter_test is a (test frame, prediction frame) pair.
# env.predict() is called exactly once per item, whichever branch is taken.
for i, (test_df, sample_prediction_df) in enumerate(tqdm(iter_test)):
    sample_prediction_df.action = sample_prediction_df.action.astype(int)
    # Only the weight of the first row is inspected.
    # NOTE(review): assumes each batch holds a single row — TODO confirm.
    wt = test_df.iloc[0].weight
    if(wt == 0):
        # Zero-weight rows skip the model entirely and get action 0.
        sample_prediction_df.action = int(0)
    else:
        # Select the columns whose name contains 'feature' and convert them to a NumPy array.
        test_x = test_df.loc[:, test_df.columns.str.contains('feature')].values # https://www.kaggle.com/c/jane-street-market-prediction/discussion/210680, but a scoring error occurred
        # Fill NaN with -999, the same sentinel used for the training features.
        test_x=fillna_npwhere_njit(test_x,-999)
        # The pandas-based variant below was abandoned because it led to a scoring error.
        #test_x = test_df[train_feats] ### a scoring error occurred
        #test_x = test_x.fillna(-999) ### a scoring error occurred
        # `model` is the booster left in scope by the training cell.
        preds = model.predict(test_x, num_iteration=model.best_iteration)
        # Threshold the model output at 0.5 to obtain the 0/1 action.
        sample_prediction_df.action = (preds > 0.5).astype(int)
    env.predict(sample_prediction_df)

In [None]:
# Read ./submission.csv back and show its first rows as a sanity check.
# NOTE(review): the file is presumably written by the janestreet environment during
# the env.predict() calls above — confirm.
submission = pd.read_csv('./submission.csv')
submission.head()