In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook is based on [this](https://www.kaggle.com/wilddave/xgb-starter) starter kit. Thanks for sharing it!

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

import pandas as pd
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

import xgboost as xgb
# Record the library version in the notebook output.
print("XGBoost version:", xgb.__version__)

In [None]:
# Locations of the competition files inside the Kaggle input directory.
train_csv_path = '/kaggle/input/jane-street-market-prediction/train.csv'
features_csv_path = '/kaggle/input/jane-street-market-prediction/features.csv'

print('Reading datasets')
train = pd.read_csv(train_csv_path)
features_meta = pd.read_csv(features_csv_path)
print('Finished reading')

In [None]:
# Report the (rows, columns) shape of both frames that were just loaded.
shape_message = 'Shape of train dataset {} and of features {}'
print(shape_message.format(train.shape, features_meta.shape))

Keep only the rows with `date > 85` and eliminate the 0-weight rows. Further below, a binary `action` column is created: 1 if the summed response `resp_all` is positive and 0 otherwise.

In [None]:
# Keep only rows whose `date` is above 85; `train` is rebound so the
# unfiltered frame is no longer referenced by this name.
train = train[train['date'] > 85]
# Of those, keep the rows whose weight is not zero.
train_weights = train.loc[train['weight'].ne(0)]
del train

# Renumber the rows 0..n-1 in place, discarding the old index.
train_weights.index = pd.RangeIndex(len(train_weights))

In [None]:
# Preview the first ten filtered rows.
train_weights.head(10)

In [None]:
# Sum the five response columns row by row (added left to right).
train_weights['resp_all'] = (
    train_weights['resp']
    .add(train_weights['resp_1'])
    .add(train_weights['resp_2'])
    .add(train_weights['resp_3'])
    .add(train_weights['resp_4'])
)


See how different the signs of the resp and resp_all columns are

In [None]:
# A row counts as "different" when resp_all and resp have strictly opposite signs.
resp_all_values = train_weights['resp_all'].values
resp_values = train_weights['resp'].values
resp_different = np.logical_or(
    np.logical_and(resp_all_values > 0, resp_values < 0),
    np.logical_and(resp_all_values < 0, resp_values > 0),
)
# Drop the array handles so nothing but the frame itself references its data.
del resp_all_values, resp_values
resp_different_count = resp_different.astype(int).sum()

In [None]:
# Report how many rows change sign once resp_1..resp_4 are added to resp.
difference_message = 'There are {0} differences out of {1} trades if we take in account resp_1,2,3,4 '
print(difference_message.format(resp_different_count, train_weights.shape[0]))

Binarize the targets according to resp_all

In [None]:
# Binary target: 1 where the summed response is strictly positive, else 0.
train_weights['action'] = np.where(train_weights['resp_all'].values > 0, 1, 0)
y_full = train_weights['action']

In [None]:
# Count the missing values per column, then keep only the columns that have any.
missing_val_count_by_column = train_weights.isna().sum()
has_missing = missing_val_count_by_column > 0
missing_values_count = missing_val_count_by_column.loc[has_missing]
print('number of columns with missing values ', len(missing_values_count))

In [None]:
# Histogram of the per-column missing-value counts (only columns that have
# at least one missing value are included). Title and axis labels are set so
# the figure can be read on its own.
plt.hist(missing_values_count.values, bins=50)
plt.title('Distribution of missing-value counts across columns')
plt.xlabel('Missing values in the column')
plt.ylabel('Number of columns')
plt.show()

There are a lot of NaNs in the features. Let's address that via feature selection with an XGBoost classifier and scikit-learn's `SelectFromModel`.

In [None]:
# Every column whose name contains the substring 'feature' is a model input.
feature_cols = list(filter(lambda column: 'feature' in column, train_weights.columns))

In [None]:
# Report how many feature columns were found.
n_feature_cols = len(feature_cols)
print('Number of features ', n_feature_cols)

Make a dictionary with number of selections per feature, which will be used when splitting the train df into folds

In [None]:
# One counter per feature, all starting at zero; incremented during fold-wise selection.
features_selection = dict.fromkeys(feature_cols, 0)
print('Length of feature selection dictionary ', len(features_selection))

In [None]:
# Feature columns only, with missing values still present.
X_not_imputed = train_weights.loc[:, feature_cols]

Handle missing values with simple imputer, to be able to select best features

In [None]:
from sklearn.impute import SimpleImputer
# Fill missing values with the imputer's default strategy and rebuild a
# DataFrame carrying the original column names.
simple_imputer = SimpleImputer()

X_imputed = pd.DataFrame(
    simple_imputer.fit_transform(X_not_imputed),
    columns=X_not_imputed.columns,
)

Some cleanup is mandatory, but keep X not imputed for later imputer fitting, when features are selected

In [None]:
# Drop the name of the full training frame; `y_full` and `X_not_imputed`
# were taken from it in earlier cells and remain available.
del train_weights

In [None]:
# Preview the first ten rows of the imputed feature frame.
X_imputed.head(10)

Define a function that returns an XGBoost classifier, so that the same configuration is used both for feature selection and, later, for the model used for prediction.

In [None]:
def make_classifier(verbosity=0):
    """Build the XGBoost classifier used for feature selection and final training.

    Parameters
    ----------
    verbosity : int, default 0
        Forwarded unchanged to ``xgb.XGBClassifier``.

    Returns
    -------
    xgb.XGBClassifier
        A new, unfitted classifier configured with the fixed settings below.
    """
    hyper_params = dict(
        n_estimators=1000,
        max_depth=7,
        learning_rate=0.05,
        missing=None,
        random_state=42,
        tree_method='gpu_hist',
        subsample=0.8,
        colsample_bytree=1,
        eval_metric='auc',
        objective='binary:logistic',
    )
    return xgb.XGBClassifier(verbosity=verbosity, **hyper_params)

Define the function responsible for selecting a subset of features.\
We'll use scikit-learn's `train_test_split` with a train ratio of 80% and a validation ratio of 20%.

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

def select_features_xgb(X, y):
    """Fit an XGBoost classifier on one fold and record which features it keeps.

    The fold is split 80/20 into train and validation parts, the classifier
    from ``make_classifier`` is fitted with early stopping on the validation
    part, and scikit-learn's ``SelectFromModel`` (default threshold) decides
    which columns are kept. Every kept column has its counter in the
    module-level ``features_selection`` dictionary incremented by one.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix of the fold; its column names must be keys of
        ``features_selection``.
    y : array-like
        Target labels aligned with the rows of ``X``.

    Returns
    -------
    list
        Names of the columns selected for this fold (previously nothing was
        returned even though the docstring promised it).
    """
    xgb_model_for_selection = make_classifier()

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                          random_state=42)
    print('Fit xgb classifier for feature selection with shape of X ', X.shape)
    xgb_model_for_selection.fit(X_train, y_train,
                                eval_set=[(X_valid, y_valid)],
                                early_stopping_rounds=3)

    model_for_selection = SelectFromModel(xgb_model_for_selection, prefit=True)
    print('Selecting columns wih best features')
    # Read the selector's own boolean mask instead of round-tripping the data
    # through transform()/inverse_transform() and testing column variance:
    # same columns, without materialising a second full-size frame.
    selected_mask = model_for_selection.get_support()
    selected_columns_list = list(X.columns[selected_mask])
    print('Number of selected columns ', len(selected_columns_list))
    for feature_col in selected_columns_list:
        features_selection[feature_col] += 1

    return selected_columns_list
    

Let's choose the feature columns by partitioning the rows into K contiguous folds and counting, in a dictionary, how many folds selected each feature.

In [None]:
folds = 4
total_rows = X_imputed.shape[0]
print('Starting feature selection with {0} folds and rows of dataframe {1}'.format(folds, total_rows))
# Contiguous row ranges of equal size; the last fold also takes the remainder.
fold_size = total_rows // folds
for k in range(folds):
    left_margin = k * fold_size
    right_margin = total_rows if k == folds - 1 else left_margin + fold_size
    print('Start for fold ', k)
    selected_features_per_fold = select_features_xgb(X_imputed[left_margin:right_margin],
                                                     y_full[left_margin:right_margin])
    print('End fold ', k)


No more need for X_imputed

In [None]:
# Remove the name of the imputed frame; it was only needed by the fold-wise
# feature selection above.
del X_imputed

In [None]:
print('Dictionary of feature selection')
# Show the per-feature selection counts that the heading announces;
# previously only the heading was printed.
print(features_selection)

In [None]:
# Materialise the dict view as a list: a raw dict_values object is not a
# sequence/array, and older Matplotlib versions do not handle it in hist().
selection_counts = list(features_selection.values())
plt.hist(selection_counts, bins=5)
plt.title('How often each feature was selected')
plt.xlabel('Number of folds in which the feature was selected')
plt.ylabel('Number of features')
plt.show()

In [None]:
# Features that were picked in one or more folds, in dictionary order.
features_at_least_1 = [name for name in features_selection if features_selection[name] >= 1]

In [None]:
# Number of features selected in at least one fold.
n_features_at_least_1 = len(features_at_least_1)
print(n_features_at_least_1)

We shall select only features that were selected at least once during the folding selection algorithm above

In [None]:
# Independent copy of the list, used from here on as the final feature set.
all_selected_features = list(features_at_least_1)
print('Number of selected features is ', len(all_selected_features))

Recreate the imputer and fit it to the df with features that will be selected

In [None]:
# Fresh imputer fitted only on the selected columns; the same instance is
# reused later to transform the test batches.
simple_imputer = SimpleImputer()

selected_raw_features = X_not_imputed[all_selected_features]
del X_not_imputed

X_full = pd.DataFrame(
    simple_imputer.fit_transform(selected_raw_features),
    columns=selected_raw_features.columns,
)
del selected_raw_features

print('Shape of X_full is ', X_full.shape)

In [None]:
# 80/20 split with a fixed seed.
split_options = dict(train_size=0.8, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y_full, **split_options)
del X_full

Creating XGBoost classifier to be used for predictions

In [None]:
# Same configuration as used for feature selection, but with verbosity level 1.
print('Creating classifier...', end='')
model = make_classifier(verbosity=1)
print('Finished.')

Train the model

In [None]:
# Fit the final model on the training split. The validation split is passed
# as the evaluation set and `early_stopping_rounds=3` is forwarded to
# xgboost's fit. `%time` is an IPython line magic that reports how long the
# fit call takes.
print('Training classifier...', end='')
%time model.fit(X_train, y_train,\
                eval_set=[(X_valid, y_valid)],\
                early_stopping_rounds=3)
print('Finished.')
print('Done')

And predict with the provided environment in the submission file

In [None]:
import janestreet
env = janestreet.make_env()

print('Creating submissions file...', end='')
rcount = 0
# For each batch: keep the selected columns, impute with the fitted imputer,
# predict, and hand the filled prediction frame back to the environment.
for test_df, prediction_df in env.iter_test():
    test_df_selected_features = test_df[all_selected_features]
    X_test = pd.DataFrame(
        simple_imputer.transform(test_df_selected_features),
        columns=test_df_selected_features.columns,
    )

    prediction_df['action'] = model.predict(X_test)
    env.predict(prediction_df)
    rcount += test_df.shape[0]
print(f'Finished processing {rcount} rows.')