In [None]:
def reduce_memory_usage(df):
    """Downcast every float64 column of ``df`` to float32 and report the savings.

    Prints the frame's memory footprint in MB (as reported by
    ``DataFrame.memory_usage``) before and after the conversion, followed by
    the percentage saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. The conversion goes through ``astype``, so the
        result is returned as a new frame rather than written into ``df``.

    Returns
    -------
    pandas.DataFrame
        Frame whose float64 columns are now float32; columns of any other
        dtype are left as they were.
    """
    bytes_per_mb = 1024**2

    start_memory = df.memory_usage().sum() / bytes_per_mb
    print(f"Memory usage of dataframe is {start_memory} MB")

    # Only float64 columns are targeted; everything else keeps its dtype.
    float64_columns = df.select_dtypes(include='float64').columns
    df = df.astype(dict.fromkeys(float64_columns, np.float32))

    end_memory = df.memory_usage().sum() / bytes_per_mb
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [None]:
# Adjusted predictions based on classifier certainty
import os
from time import time
import multiprocessing
import gc

import numpy as np
from numpy import percentile
import pandas as pd

import xgboost as xgb

import janestreet

In [None]:
# Load the training set, downcast its float64 columns, and keep only the rows
# whose weight is non-zero.
t0 = time()
data = reduce_memory_usage(
    pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
)
data = data.loc[data['weight'] != 0]
print("Load data finished in %0.3fs" % (time() - t0))

# Settings
NAN_VALUE = -9999  # sentinel written in place of missing feature values
features = [col for col in data.columns if 'feature' in col]  # name contains 'feature'
targets = [col for col in data.columns if 'resp' in col]      # name contains 'resp'

# Feature matrix X: the feature columns with missing values replaced by NAN_VALUE.
# (The per-target labels are not built in this cell.)
t0 = time()
X = data[features].fillna(NAN_VALUE)
print("Define data features X and target Y finished in %0.3fs" % (time() - t0))

In [None]:
# Fit one binary XGBoost classifier per entry of `targets`; `models` collects
# them in the same order as `targets`.
models = []

n = 0  # not read anywhere in the visible cells; kept as-is

# Hyperparameters shared by every per-target classifier.
xgb_params = dict(
    n_estimators=500,
    max_depth=11,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=2020,
    tree_method='gpu_hist',
    nthread=multiprocessing.cpu_count(),
)

for target in targets:
    # Binary label: 1 where the target value is strictly positive, 0 otherwise.
    Y = data[target].gt(0).astype(int)
    t0 = time()
    # Train model
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X, Y)
    models.append(model)
    print("Training model finished in %0.3fs" % (time() - t0))

In [None]:
# Per-model weights for combining the classifiers' predictions (five entries each).
# `weighted_weak` is the list the submission loop passes to np.average as `weights`;
# `weighted_strong` is not referenced in the visible cells.
# NOTE(review): entries are matched by position to `models`, which is filled in the
# order of `targets` (the columns whose name contains 'resp', in file order) —
# confirm that the 1.0 weight lines up with the intended target.
weighted_weak =[1.0, 0.291670, 0.392026, 0.625467, 0.813532] # purged correlation
weighted_strong=[1.0, 0.452159, 0.595274, 0.815972, 0.956197] # raw correlation

In [None]:
# Threshold applied to the weighted average of the models' predictions:
# rows at or above it get action 1, the rest get action 0.
accuracy_test = 0.50

# Check the ensemble shape before touching the competition environment, so a
# mismatch surfaces here rather than in the middle of the test iteration.
if len(models) != len(weighted_weak):
    raise ValueError(
        f"Expected one weight per model, got {len(models)} models "
        f"and {len(weighted_weak)} weights"
    )

# Create submission
env = janestreet.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Same preprocessing as for training: feature columns, NaNs -> NAN_VALUE.
    X_test = test_df.loc[:, features].fillna(NAN_VALUE)

    # One prediction array per trained model, in `models` order, so that it
    # lines up positionally with `weighted_weak`.
    # NOTE(review): `predict` is expected to return class labels rather than
    # probabilities; the notebook header mentions "classifier certainty", so
    # confirm whether `predict_proba(X_test)[:, 1]` was intended here.
    votes = [m.predict(X_test) for m in models]

    # Weighted average across models (axis 0), then threshold into a 0/1 action.
    preds = np.average(votes, axis=0, weights=weighted_weak)
    sample_prediction_df["action"] = (preds >= accuracy_test).astype(int)
    env.predict(sample_prediction_df)
