In [None]:
# --- Run-mode switches -------------------------------------------------------
# DEBUG:    when True, later cells keep only a sliver of the training frame and
#           use a toy LightGBM configuration.
# TRAINING: when True, models are fitted and compiled inside this notebook;
#           when False, pre-compiled treelite libraries are loaded instead.
DEBUG, TRAINING = False, False

# Positions (in training order) of the predictors kept for inference.
tree_model_idx = [3]

# Directory with the pre-compiled treelite libraries; only defined when the
# notebook does not train its own models.
if not TRAINING:
    treelite_model_path = '../input/lgbstock-treelite'

In [None]:
!pip --quiet install ../input/treelite/treelite-0.93-py3-none-manylinux2010_x86_64.whl

In [None]:
!pip --quiet install ../input/treelite/treelite_runtime-0.93-py3-none-manylinux2010_x86_64.whl

In [None]:
# treelite
import treelite
import treelite_runtime 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
#plt.style.use('fivethirtyeight')
import xgboost as xgb
import sklearn
import tqdm
import random
import janestreet
import tensorflow as tf

In [None]:
# Single seed value shared by every random number generator seeded below.
SEED = 1111

In [None]:
# Seed Python's stdlib RNG, NumPy's global RNG and TensorFlow's global RNG
# (in that order) with the same value.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(SEED)

In [None]:
# Load the competition's training table into memory.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jane-street-market-prediction/train.csv",
)

In [None]:
# Keep only rows with date > 85 and renumber the index from zero.
train = train.query('date > 85').reset_index(drop=True)

# In DEBUG mode work on the first 1/1000th of the rows only.
if DEBUG:
    train = train[:len(train) // 1000]

# Drop zero-weight rows. The explicit .copy() makes the filtered frame an
# independent object, so the column added below is never written through a
# frame pandas may treat as a slice of its parent.
train = train[train['weight'] != 0].copy()

# Replace every NaN with the mean of its column (computed on the filtered rows).
# Non-inplace form: the result is rebound instead of mutating the frame in place.
train = train.fillna(train.mean())

# Binary label: 1 where `resp` is strictly positive, else 0.
train['action'] = (train['resp'].values > 0).astype(int)

# Every column whose name contains "feature" is a candidate model input.
features = [c for c in train.columns if "feature" in c]


In [None]:
# Exclude feature_0 from the model inputs. Written as a filter rather than
# list.remove() so that re-running this cell is a no-op instead of raising
# ValueError once the name is already gone.
features = [name for name in features if name != 'feature_0']

In [None]:
#features.remove('feature_48')
#features.remove('feature_45')
#features.remove('feature_3')

In [None]:
# Use the sample weight as an extra model input. The membership guard keeps the
# cell idempotent: a second run would otherwise append 'weight' again and
# duplicate the column in every feature matrix built from `features`.
if 'weight' not in features:
    features.append('weight')


In [None]:

# Response columns; each one becomes a separate binary target.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# Column means of every model input except the first entry of `features`.
# NOTE(review): feature_0 has already been removed from `features` by this
# point, so the [1:] slice skips whatever is now first in the list — confirm
# that this offset is still intended.
f_mean = train[features[1:]].values.mean(axis=0)

# Feature matrix, one column per entry of `features`.
X_train = train[features].values

# Target matrix with one 0/1 column per response: 1 where the response is
# strictly positive.
y_train = (train[resp_cols].values > 0).astype('int')

In [None]:
# LightGBM hyper-parameters. DEBUG only shrinks tree capacity (num_leaves,
# max_bin) so a run finishes quickly; every other setting is shared.
params = {
    "num_leaves": 2 if DEBUG else 300,
    "max_bin": 2 if DEBUG else 450,
    "feature_fraction": 0.52,
    # NOTE(review): no bagging_freq is set; the LightGBM parameter docs state
    # that bagging_fraction is only applied when bagging_freq is non-zero —
    # confirm whether row subsampling was meant to be active.
    "bagging_fraction": 0.52,
    "objective": "binary",
    "learning_rate": 0.05,
    "boosting_type": "gbdt",
    "metric": "auc",
}

# Compiler treelite uses to build the shared libraries (loop-invariant).
toolchain = 'gcc'

models = []            # LightGBM boosters, filled only when TRAINING
treelite_models = []   # one treelite predictor per column of y_train
for i in range(y_train.shape[1]):
    resp_col = resp_cols[i]
    lib_name = f'mymodel_{resp_col}.so'

    if TRAINING:
        # Stratified 80/20 split; both parts are passed as validation sets and
        # training stops after 50 rounds without improvement.
        xtr, xval, ytr, yval = train_test_split(
            X_train, y_train[:, i], test_size=0.2, stratify=y_train[:, i])

        d_train = lgbm.Dataset(xtr, label=ytr)
        d_eval = lgbm.Dataset(xval, label=yval, reference=d_train)
        clf = lgbm.train(params, d_train, valid_sets=[d_train, d_eval],
                         num_boost_round=1000, early_stopping_rounds=50,
                         verbose_eval=50)

        # Keep the booster both on disk and in memory.
        clf.save_model("weights_target_" + resp_col)
        models.append(clf)

        # Convert to treelite: dump the booster as text, load it as a treelite
        # model and compile it into a shared library. `clf` is already a
        # Booster (lgbm.train returns one), so it is saved directly rather
        # than through a `booster_` attribute.
        model_txt = f'lightgbm_{resp_col}.txt'
        clf.save_model(model_txt)
        lib_path = f'./{lib_name}'
        treelite.Model.load(model_txt, model_format='lightgbm').export_lib(
            toolchain=toolchain, libpath=lib_path,
            params={'parallel_comp': 32}, verbose=False)
    else:
        # Inference-only run: use the library compiled by an earlier run.
        lib_path = f'{treelite_model_path}/{lib_name}'

    # Both modes end by loading the compiled library as a runtime predictor.
    treelite_model = treelite_runtime.Predictor(lib_path, verbose=True)
    treelite_models.append(treelite_model)

In [None]:
# Narrow the predictor list down to the positions named in tree_model_idx.
# NOTE: this rebinds `treelite_models` to the narrowed list, so running the
# cell again indexes into the already-narrowed list.
_kept_predictors = []
for _position in tree_model_idx:
    _kept_predictors.append(treelite_models[_position])
treelite_models = _kept_predictors

In [None]:
# # tree_model = joblib.load(f'../input/model-tree/model_lightgbm_fa_2.hdf5')
# # to treelite
# tree_model.booster_.save_model(f'lightgbm_{resp_cols[i]}.txt')
# treelite_model = treelite.Model.load(f'lightgbm_{resp_cols[i]}.txt', model_format='lightgbm')
# toolchain = 'gcc'
# treelite_model.export_lib(toolchain=toolchain, libpath=f'./mymodel_{resp_cols[i]}.so',
#                   params={'parallel_comp': 32}, verbose=False)
# treelite_model = treelite_runtime.Predictor(f'./mymodel_{resp_cols[i]}.so', verbose=True)


In [None]:
# Gain-based feature importance of `clf`, i.e. the booster left over from the
# last iteration of the training loop. Only available after a training run.
if TRAINING:
    fig, ax = plt.subplots(figsize=(25, 50))
    lgbm.plot_importance(
        clf,
        ax=ax,
        max_num_features=130,
        importance_type='gain',
    )
    plt.show()

In [None]:
# `tqdm` is bound to the module by the notebook's import cell; rebind it to the
# progress-bar callable used below.
from tqdm import tqdm

f = np.median   # collapses the averaged model outputs to a single score
th = 0.5000     # score at or above which the action is 1

env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        # Writable copy of the row's model inputs, in `features` order.
        x_tt = test_df.loc[:, features].to_numpy(copy=True)

        # The training frame had its NaNs replaced by column means, so the
        # predictors never saw a missing value. Apply the same imputation here
        # instead of feeding raw NaNs to the models. f_mean is computed over
        # features[1:], so it lines up with the trailing columns of x_tt; the
        # offset is derived from the shapes rather than hard-coded.
        offset = x_tt.shape[1] - f_mean.shape[0]
        tail = x_tt[:, offset:]
        missing = np.isnan(tail)
        if missing.any():
            x_tt[:, offset:] = np.where(missing, f_mean, tail)

        # GBDT inference with treelite: average over the selected predictors.
        batch = treelite_runtime.Batch.from_npy2d(x_tt)
        pred = np.mean([model.predict(batch) for model in treelite_models], axis=0)

        pred = f(pred)
        pred_df.action = np.where(pred >= th, 1, 0).astype(int)
    else:
        # Zero-weight rows are never acted on.
        pred_df.action = 0
    env.predict(pred_df)

In [None]:
#preds = clf.predict(xtr)
#pred_labels = np.rint(preds)


    
#accuracy = sklearn.metrics.accuracy_score(ytr, pred_labels)
