In [None]:
import numpy as np
import pandas as pd
from scipy.special import expit as sigmoid
import scipy.linalg

# Response columns; each one is used as a separate binary target.
FEATURES_Y = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
M_Y = len(FEATURES_Y)

# Predictor columns feature_1 .. feature_129 (feature_0 is not included here).
FEATURES_X = [f'feature_{i}' for i in range(1, 130)]
M_X = len(FEATURES_X)

def _class_moments(rows):
    """Return (mean vector, unnormalised scatter matrix, row count) for one class."""
    features = rows[FEATURES_X]
    mean = features.mean(axis=0)
    # Missing entries are replaced by the class mean, so they contribute
    # zero to the centred data below.
    X = features.fillna(mean).values
    centered = X - mean.values
    return mean.values, np.dot(centered.T, centered), X.shape[0]


def fisher_params(df, target):
    """Estimate two-class Fisher discriminant statistics for one target column.

    Rows with ``df[target] > 0`` form class 1 and rows with
    ``df[target] <= 0`` form class 2 (rows whose target is NaN satisfy
    neither comparison and are therefore ignored).  Within each class,
    missing values in the ``FEATURES_X`` columns are replaced by that
    class's column mean before the scatter matrices are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``FEATURES_X`` columns and the ``target`` column.
    target : str
        Name of the column whose sign defines the two classes.

    Returns
    -------
    dict
        ``m1``, ``m2`` : per-class feature means;
        ``me``         : ``m2 - m1``;
        ``S1``, ``S2`` : per-class scatter matrix divided by the class size;
        ``Sw``         : sum of the two unnormalised scatter matrices;
        ``N1``, ``N2`` : number of rows in each class.

    Raises
    ------
    ZeroDivisionError
        If either class contains no rows.
    """
    m1, scatter1, N1 = _class_moments(df[df[target] > 0])
    m2, scatter2, N2 = _class_moments(df[df[target] <= 0])

    # Each scatter matrix is an (n_rows x n_features) product; compute it
    # once per class and reuse it for both the normalised and pooled forms.
    S1 = (1 / N1) * scatter1
    S2 = (1 / N2) * scatter2
    Sw = scatter1 + scatter2
    me = m2 - m1
    return dict(m1=m1, m2=m2, me=me, S1=S1, S2=S2, Sw=Sw, N1=N1, N2=N2)

def fisher_params_list(df):
    """Fit ``fisher_params`` once per response column.

    Returns a list with one parameter dict per entry of ``FEATURES_Y``,
    in the same order as ``FEATURES_Y``.
    """
    per_target = []
    for response in FEATURES_Y:
        per_target.append(fisher_params(df, response))
    return per_target

# Load the training data and keep only rows with date > 85 and weight > 0.
df = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
df = df[(df['date'] > 85) & (df['weight'] > 0)]

# Fit one set of discriminant parameters per value of feature_0 (+1 / -1).
params_pos = fisher_params_list(df.loc[df['feature_0'] == 1])
params_neg = fisher_params_list(df.loc[df['feature_0'] == -1])

# The raw frame is no longer needed once the statistics are extracted.
del df

In [None]:
import gzip
import pickle
# Bundle both fitted parameter sets and write them to a gzip-compressed pickle.
params = {'params_pos': params_pos, 'params_neg': params_neg}
with gzip.open('params.pkl.gz', 'wb') as fout:
    pickle.dump(params, fout)

In [None]:
# Cache of projections keyed by (feature_0, bytes of the availability mask).
MEMO = {}


def fisher_projection(feature_0, mask):
    """Return the discriminant directions and projected class statistics.

    The computation is restricted to the features flagged in ``mask`` and is
    performed once per distinct ``(feature_0, mask)`` pair; later calls
    return the cached result.

    Parameters
    ----------
    feature_0 : scalar
        ``params_pos`` is used when this equals 1, ``params_neg`` otherwise.
    mask : numpy.ndarray
        Truthy entries mark the positions (within the first ``M_X``) of the
        features to keep.

    Returns
    -------
    (w, kwargs)
        ``w`` stacks one solved direction per parameter set along the last
        axis.  ``kwargs`` holds, per parameter set, the projected class
        means (``m1``, ``m2``), the projected standard deviations
        (``std1``, ``std2``) and the class sizes (``N1``, ``N2``).
    """
    cache_key = (feature_0, mask.tobytes())
    cached = MEMO.get(cache_key)
    if cached is not None:
        return cached

    params_list = params_pos if feature_0 == 1 else params_neg
    kept = [pos for pos, present in enumerate(mask[:M_X]) if present]
    block = np.ix_(kept, kept)

    per_target = []
    for params in params_list:
        # Direction solving Sw[kept, kept] @ w = (m2 - m1)[kept].
        direction = scipy.linalg.solve(
            params['Sw'][block], params['me'][kept], assume_a='pos')
        per_target.append((
            direction,
            np.dot(params['m1'][kept], direction),
            np.dot(params['m2'][kept], direction),
            np.sqrt(np.dot(direction, np.dot(params['S1'][block], direction))),
            np.sqrt(np.dot(direction, np.dot(params['S2'][block], direction))),
            params['N1'],
            params['N2'],
        ))

    # Transpose the per-target tuples into per-quantity stacks.
    w, m1, m2, std1, std2, N1, N2 = (
        np.stack(column, axis=-1) for column in zip(*per_target))

    result = (w, dict(m1=m1, m2=m2, std1=std1, std2=std2, N1=N1, N2=N2))
    MEMO[cache_key] = result
    return result

def likelihood_ratio(x, m1, m2, std1, std2, N1, N2):
    """Log ratio of two count-weighted normal densities evaluated at ``x``.

    Computes ``0.5 * (z2**2 - z1**2) + log((std2 * N1) / (std1 * N2))`` with
    ``z_k = (x - m_k) / std_k``.  This equals
    ``log(N1 * Normal(x; m1, std1)) - log(N2 * Normal(x; m2, std2))``, so a
    positive value favours class 1.  All arguments broadcast element-wise.
    """
    z1 = (x - m1) / std1
    z2 = (x - m2) / std2
    quadratic_term = 0.5 * (z2**2 - z1**2)
    scale_term = np.log((std2 * N1) / (std1 * N2))
    return quadratic_term + scale_term

def inference(test_df, threshold=0.502):
    """Decide the action (0 or 1) for a single test row.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Squeezed to a Series before use, so it is expected to hold one row
        containing ``weight``, ``feature_0`` and the ``FEATURES_X`` columns.
    threshold : float, optional
        The action is 1 when the median score is at least this value.
        Defaults to 0.502.

    Returns
    -------
    int
        0 when the row's weight is zero; otherwise 1 if the median over
        targets of ``sigmoid(likelihood_ratio(...))`` reaches ``threshold``,
        else 0.
    """
    row = test_df.squeeze()
    if row['weight'] == 0:
        return 0

    # Select the features once as a float array: a squeezed row can carry
    # object dtype, which the sigmoid ufunc would reject further down.
    values = row[FEATURES_X].to_numpy(dtype=float)
    present = ~np.isnan(values)

    # The projection is fitted only on the features available in this row.
    w, kwargs = fisher_projection(row['feature_0'], present)
    projected = np.dot(values[present], w)
    scores = sigmoid(likelihood_ratio(projected, **kwargs))
    return int(np.median(scores) >= threshold)

In [None]:
from tqdm import tqdm
import janestreet
# Obtain the competition environment that serves the test set.
env = janestreet.make_env()

# For every (test_df, pred_df) pair yielded by the API, store the decision
# in the prediction frame and submit it before moving on.
for (test_df, pred_df) in tqdm(env.iter_test()):
    # Bracket assignment always writes the 'action' column; attribute
    # assignment would only set an instance attribute if the column were absent.
    pred_df['action'] = inference(test_df)
    env.predict(pred_df)