In [None]:
import os
import re
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import lightgbm

In [None]:
# Relative path to the competition's training CSV.
TRAIN_CSV = '../input/jane-street-market-prediction/train.csv'

# Read the full training table into a DataFrame.
data = pd.read_csv(TRAIN_CSV)

In [None]:
# Column groups, resolved from the header of the loaded training frame.
# Every column whose name starts with 'feature_' is a model input.
FEATURES = [f for f in data.columns if re.match('^feature_', f)]
# Auxiliary targets. The pattern used to be '^rep_', which looks like a typo:
# with TARGET = 'resp' the auxiliary columns are presumably 'resp_1', 'resp_2', ...
# and '^rep_' would always yield an empty list.
# NOTE(review): confirm against the CSV header.
AUC_TARGET = [f for f in data.columns if re.match('^resp_', f)]
DATE = 'date'
WEIGHT = 'weight'
TARGET = 'resp'

In [None]:
# Sentinel written in place of missing feature values; the submission loop
# applies the same fill so training and inference inputs stay consistent.
NAN = -9999

# Train only on rows with strictly positive weight. The mask is built once and
# rows/columns are selected in a single .loc call, instead of filtering the
# whole frame twice through chained indexing.
has_weight = data[WEIGHT] > 0.0

X = data.loc[has_weight, FEATURES].fillna(NAN).values
# Binary label: True where the response is strictly positive.
y = data.loc[has_weight, TARGET].values > 0.0

In [None]:
# Gradient-boosted binary classifier predicting whether the response is positive.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0; it is not set here, so row subsampling may be
# inactive — confirm intent.
model = lightgbm.LGBMClassifier(
    n_estimators=150,
    learning_rate=0.05,
    num_leaves=35,
    colsample_bytree=0.6,
    subsample=0.8,
    reg_alpha=0.5,
    reg_lambda=2.0,
)
# The evaluation set is the training data itself, so the AUC reported during
# fitting is an in-sample figure.
model.fit(X, y, eval_metric='auc', eval_set=(X, y))

# Plot

In [None]:
# In-sample predicted probability of the positive class (second column of
# predict_proba), computed on the same rows the model was fitted on.
pred = model.predict_proba(X)[:, 1]

# Explicit figure/axes instead of implicit pyplot state, with labels so the
# figure stands on its own.
fig, ax = plt.subplots()
ax.hist(pred, bins=100, density=True, color='tab:red', alpha=0.5)
# Dashed marker at probability 0.5, spanning the histogram's current y-range.
ax.vlines(0.5, *ax.get_ylim(), color='k', alpha=0.5, linestyle='--')
ax.set(
    xlabel='Predicted probability of positive resp',
    ylabel='Density',
    title='In-sample predicted probabilities',
);

# Save

In [None]:
# Persist the fitted model. The context manager closes the file handle, so the
# pickle is fully flushed to disk instead of relying on garbage collection.
with open(os.path.join('../working', 'model.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# Submission

In [None]:
# Decision threshold: in the submission loop, rows whose predicted probability
# is strictly greater than THR get action 1, all others get action 0.
THR = 0.5

In [None]:
# Imported in this cell rather than with the other imports.
# NOTE(review): presumably because the module only exists inside the
# competition runtime — confirm.
import janestreet

env = janestreet.make_env()
iter_test = env.iter_test()

# Score each batch yielded by the environment and hand the decision back.
for test_df, sample_prediction_df in tqdm(iter_test):
    # Same missing-value sentinel as used for the training matrix.
    batch_features = test_df[FEATURES].fillna(NAN).values
    batch_proba = model.predict_proba(batch_features)[:, 1]
    # 1 when the positive-class probability exceeds the threshold, else 0.
    sample_prediction_df.action = (batch_proba > THR).astype(int)
    env.predict(sample_prediction_df)