In [None]:
import os
import pandas as pd
import numpy as np
import time

In [None]:
import re
import pickle
import numba

import datatable as dtable

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from sklearn import linear_model

In [None]:
@numba.njit(fastmath = True)
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score of a set of trading decisions.

    Daily profit is p_i = sum(weight * resp * action) over the rows of day i.
    With t = sum(p_i) / sqrt(sum(p_i^2)) * sqrt(250 / n_days), the score is
    clip(t, 0, 6) * sum(p_i).

    Parameters
    ----------
    date : 1-D non-negative integer array, one day identifier per row.
    weight, resp : 1-D float arrays aligned with `date`.
    action : 1-D array aligned with `date`; 1/True to take the trade, 0/False to pass.

    Returns
    -------
    float
        The utility score; 0.0 when every daily profit is zero (e.g. no trade
        was taken), where the ratio defining t would otherwise be 0/0.
    """
    # bincount sums the per-row profit into one bin per date value.
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    sq_sum = np.sum(Pi ** 2)
    # Guard the 0/0 case explicitly: under fastmath NaN handling is not reliable,
    # so the clipping below cannot be trusted to clean it up.
    if sq_sum == 0.0:
        return 0.0
    # bincount returns max(date) + 1 bins, which over-counts the days whenever the
    # slice does not start at date 0 (validation / hold-out splits). The
    # annualisation factor needs the number of days actually present.
    n_days = len(np.unique(date))
    t = total / np.sqrt(sq_sum) * np.sqrt(250 / n_days)
    u = min(max(t, 0), 6) * total
    return u

@numba.njit
def fillna_npwhere_njit(array, values):
    """Return `array` with its NaN entries replaced by the matching entries of `values`.

    The sum of the array is used as a cheap probe: when it is not NaN the
    input is handed back untouched and no element-wise work is done.
    """
    total_is_nan = np.isnan(array.sum())
    if not total_is_nan:
        return array
    nan_mask = np.isnan(array)
    return np.where(nan_mask, values, array)

In [None]:
# Directory holding train.csv. Set the JANE_STREET_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell; the default
# is the original local path.
DATA_DIR = os.environ.get("JANE_STREET_DATA_DIR", "C:\\Users\\LYC\\Desktop\\Jane Street")

# datatable parses the CSV, then the frame is handed over to pandas.
data = dtable.fread(os.path.join(DATA_DIR, "train.csv")).to_pandas()
# Downcast every float64 column to float32.
data = data.astype({c: np.float32 for c in data.select_dtypes(include='float64').columns})
# Keep only rows with a strictly positive weight.
data = data.query('weight > 0')

In [None]:
# Column groups, selected by name prefix.
FEATURES = [f for f in data.columns if f.startswith('feature_')]
# Auxiliary response columns. The previous pattern '^rep_' could not match a
# column sharing the 'resp' stem of TARGET, so this list was always empty.
# NOTE(review): assumes the auxiliary targets are named 'resp_*' - confirm against train.csv.
AUC_TARGET = [f for f in data.columns if f.startswith('resp_')]
DATE = 'date'
WEIGHT = 'weight'
TARGET = 'resp'

# Missing values are imputed with 0. Built directly as a zero Series keyed by
# column name instead of `0 * data.mean(0)`, which scanned the whole frame only
# to discard the result and produced NaN fill values for all-NaN columns.
FILL_NAN_PD = pd.Series(0.0, index=data.columns)
# Same zero fill as a (1, n_features) row vector for array-based code paths.
FILL_NAN = np.zeros((1, len(FEATURES)))

def utility(data, action):
    """Score `action` against the DATE, WEIGHT and TARGET columns of `data`.

    `data` is a DataFrame slice and `action` an array with one entry per row
    of that slice; the column values are forwarded to `utility_score_numba`.
    """
    date_values, weight_values, resp_values = (
        data[column].values for column in (DATE, WEIGHT, TARGET)
    )
    return utility_score_numba(date_values, weight_values, resp_values, action)

In [None]:
# Split by date quantile: rows up to the 80% quantile train the model, rows up
# to the 90% quantile validate it, and everything later is the "other" hold-out.
date_col = data[DATE]
idxT = date_col <= np.quantile(date_col, 0.8)
idxV = ~idxT & (date_col <= np.quantile(date_col, 0.9))
idxO = ~(idxT | idxV)

# Features are NaN-filled with FILL_NAN_PD; the label is 1 when resp > 0, else 0.
X_train = data.loc[idxT, FEATURES].fillna(FILL_NAN_PD).values
y_train = (data.loc[idxT, TARGET] > 0.0).astype('int').values

X_valid = data.loc[idxV, FEATURES].fillna(FILL_NAN_PD).values
y_valid = (data.loc[idxV, TARGET] > 0.0).astype('int').values

X_other = data.loc[idxO, FEATURES].fillna(FILL_NAN_PD).values
y_other = (data.loc[idxO, TARGET] > 0.0).astype('int').values

split_sizes = (idxT.sum(), idxV.sum(), idxO.sum())
print("Train = %d\t\tValid = %d\t\tOther = %d" % split_sizes)

In [None]:
# Baseline classifier: logistic regression with scikit-learn's default
# hyperparameters, fitted on the training split only.
model = linear_model.LogisticRegression()
# Left as the last expression so the cell displays the fitted estimator.
model.fit(X_train, y_train)

# Plot & Summaries

In [None]:
# Distribution of the predicted class-1 probability for each split, overlaid on
# one axis. Each curve is labelled so the splits can be told apart.
plt.figure(figsize=(16, 4))

# Drawing options shared by the three histograms.
hist_style = dict(density=True, alpha=0.5, histtype='step', linewidth=1.5)

pred_train = model.predict_proba(X_train)[:, 1]
plt.hist(pred_train, 100, color='tab:green', label='train', **hist_style)
# pred_valid / pred_other are only defined when their split is non-empty;
# later cells apply the same guards before using them.
if len(X_valid) > 0:
    pred_valid = model.predict_proba(X_valid)[:, 1]
    plt.hist(pred_valid, 100, color='tab:red', label='valid', **hist_style)
if len(X_other) > 0:
    pred_other = model.predict_proba(X_other)[:, 1]
    plt.hist(pred_other, 100, color='tab:blue', label='other', **hist_style)

# Reference line at the neutral 0.5 decision threshold.
plt.vlines(0.5, *plt.ylim(), color='k', alpha=0.5, linestyle='--')
plt.xlim(0.4, 0.6)
plt.xlabel('Predicted probability of resp > 0')
plt.ylabel('Density')
plt.title('Predicted probability by split')
plt.legend();

In [None]:
def _hist_by_target(ax, pred, y, bins, split_name):
    """Draw the predicted-probability histograms of one split, separated by true label."""
    style = dict(density=True, alpha=0.5, histtype='step', linewidth=1.5)
    ax.hist(pred[y >= 0.5], bins, color='tab:green', label='y=1', **style)
    ax.hist(pred[y < 0.5], bins, color='tab:red', label='y=0', **style)
    ax.set_xlim(0.4, 0.6)
    ax.set_title(split_name)
    ax.legend()


# One panel per split. The figure-level title uses suptitle: calling plt.title
# before the first subplot exists attaches it to an implicit full-figure axes
# instead of the figure.
fig, (ax_train, ax_valid, ax_other) = plt.subplots(3, 1, figsize=(16, 10))
fig.suptitle("Prediction by target")

# Same guards as the cell that creates the predictions: pred_valid and
# pred_other only exist when their split is non-empty.
if len(X_train) > 0:
    _hist_by_target(ax_train, pred_train, y_train, 1000, 'train')
if len(X_valid) > 0:
    _hist_by_target(ax_valid, pred_valid, y_valid, 200, 'valid')
if len(X_other) > 0:
    _hist_by_target(ax_other, pred_other, y_other, 200, 'other')

In [None]:
# Utility obtained on each split when trading every row whose predicted
# probability exceeds 0.5. Empty splits are skipped because their predictions
# were never computed.
train_action = pred_train > 0.5
print("Train Score = %.4f" % utility(data[idxT], train_action))
if len(X_valid):
    valid_action = pred_valid > 0.5
    print("Valid Score = %.4f" % utility(data[idxV], valid_action))
if len(X_other):
    other_action = pred_other > 0.5
    print("Other Score = %.4f" % utility(data[idxO], other_action))

In [None]:
# Bootstrap the utility of the hold-out ("other") split over a grid of decision
# thresholds: for every (repetition, threshold) pair, draw a fresh subsample of
# `frac` of the rows without replacement and score it.
frac = 0.6
thr_list = np.linspace(0.47, 0.53, 51)

# Seeded so the selected threshold is reproducible across kernel restarts.
bootstrap_rng = np.random.default_rng(0)

# Slice the hold-out rows once; re-slicing inside the loop copied the frame
# twice per iteration.
data_other = data.loc[idxO]
n_other = len(data_other)
n_sample = int(round(frac * n_other))

# Collect plain records and build the frame once at the end; concatenating
# inside the loop is quadratic in the number of rows.
records = []
for rep in tqdm(range(10)):
    for thr in thr_list:
        # Positional indices, valid for both the frame slice and pred_other.
        sample_idx = bootstrap_rng.choice(n_other, size=n_sample, replace=False)
        ut = utility(data_other.iloc[sample_idx], pred_other[sample_idx] > thr)
        records.append({'rep': rep, 'thr': thr, 'ut': ut})

res = pd.DataFrame(records, columns=['rep', 'thr', 'ut'])

In [None]:
# use the lower bound to choose the threshold
# Per-threshold mean and standard deviation of the bootstrapped utility, plus a
# one-standard-deviation band around the mean.
res_agg = res.groupby('thr')['ut'].agg(['mean', 'std'])
res_agg['lwb'] = res_agg['mean'] - res_agg['std']
res_agg['upb'] = res_agg['mean'] + res_agg['std']

# The winning threshold maximises the lower edge of the band, not the mean.
lower_band = res_agg['lwb']
lwb_best = float(lower_band.max())
thr_best = float(res_agg.index[lower_band.argmax()])
print("Best threshold = %.4f" % thr_best)

plt.figure(figsize=(12, 6))
# Raw bootstrap draws, then the mean curve and its band underneath them.
plt.scatter(res['thr'], res['ut'], color='tab:red', alpha=0.1, zorder=0)
plt.plot(res_agg.index, res_agg['mean'], color='tab:red', zorder=-1)
plt.fill_between(res_agg.index, res_agg['lwb'], res_agg['upb'], color='tab:red', alpha=0.1, zorder=-2)

# Star marker on the selected threshold.
plt.scatter(thr_best, lwb_best, s=400, edgecolor='k', color='gold', marker='*', zorder=+1)

# Reference lines: neutral threshold 0.5 and zero utility.
y_low, y_high = plt.ylim()
plt.vlines(0.5, min(0, y_low), y_high, color='k', linestyle='--', alpha=0.4)
x_low, x_high = plt.xlim()
plt.hlines(0, x_low, x_high, color='k', linestyle='--', alpha=0.4)

plt.xlabel('Thr')
plt.ylabel('Utility')
plt.title('Best threshold');

# Save

In [None]:
# Persist the fitted model. Pickle is tried first; objects that refuse to be
# pickled fall back to their own `save` method.
pkl_path = os.path.join('../working', 'model.pkl')
try:
    # Context manager so the file is flushed and closed even if dumping fails.
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(model, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
except (TypeError, pickle.PicklingError):
    # Do not leave a truncated pickle behind that could be mistaken for a model.
    if os.path.exists(pkl_path):
        os.remove(pkl_path)
    # NOTE(review): presumably meant for Keras-style models exposing save(path);
    # the LogisticRegression fitted in this notebook has no such method. The
    # destination path is passed directly: the previous code opened the not yet
    # existing file in read mode and handed over the file object.
    model.save(os.path.join('../working', 'model.h5'))

# Submission

In [None]:
# Decision threshold used at inference time: the value selected from the
# bootstrapped lower bound of the utility.
THR = thr_best

In [None]:
import janestreet

In [None]:
# The competition environment serves the test set one batch at a time and
# expects a prediction to be submitted for each batch before the next one.
env = janestreet.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # Same preprocessing as training: feature columns only, NaN replaced by 0.
    batch_features = test_df[FEATURES].fillna(0.0).values
    prob_positive = model.predict_proba(batch_features)[:, 1]
    # Trade (action = 1) when the predicted probability exceeds the threshold.
    sample_prediction_df.action = (prob_positive > THR).astype(int)
    env.predict(sample_prediction_df)