This is a simple XGBoost model that uses only the training data, with no feature engineering, to check the baseline performance.

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import seaborn as sns
# Seed NumPy's global random number generator with a fixed value
np.random.seed(10)

In [None]:
# Load the three competition inputs (training rows, feature metadata and the
# example submission) in one pass over their paths
train, features, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jane-street-market-prediction/train.csv',
        '/kaggle/input/jane-street-market-prediction/features.csv',
        "/kaggle/input/jane-street-market-prediction/example_sample_submission.csv",
    )
)

In [None]:
# NOTE(review): this re-reads the same features.csv that the data-loading cell
# already stored in `features`; the call looks redundant - confirm and drop.
features = pd.read_csv('/kaggle/input/jane-street-market-prediction/features.csv')


In [None]:
# Distinct values found in the `feature` column of the metadata table
features['feature'].unique()

In [None]:
# Quick look at the training data: the first rows, then (rows, columns)
print(train.head(), train.shape, sep="\n")

In [None]:
# Peek at the first five rows of the example submission
sample_submission.head(n=5)

In [None]:
# Histogram of the per-column share of missing values, used to decide which
# columns have too many gaps to keep
missing_share_per_column = train.isna().mean()
plt.hist(missing_share_per_column)

In [None]:
# Keep only the columns where less than 7 percent of the values are missing
# (threshold chosen from the histogram above).
# The per-column missing share is computed once and reused for both the
# values and the boolean filter instead of scanning `train` twice.
missing_share = train.isnull().mean()
final_cols = missing_share[missing_share < 0.07]

In [None]:
# Restrict the training frame to the columns that passed the missing-value filter
kept_columns = final_cols.index
train = train[kept_columns]

In [None]:
# Replace every remaining missing value with the median of its own column
column_medians = train.median()
train = train.fillna(column_medians)

In [None]:
# Plotting correlation between the model input columns

f, ax = plt.subplots(figsize=(10, 8))

# Compute the correlation matrix once; it feeds both the heatmap data and the mask.
feature_corr = train.drop(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4','resp', 'ts_id'], axis = 1).corr()

# The mask is all False, so no cell is hidden. The builtin `bool` is used
# because the `np.bool` alias no longer exists in current NumPy releases.
sns.heatmap(feature_corr,
            mask=np.zeros_like(feature_corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [None]:
# Build the target: action is 1 only when resp_1, resp_2, resp_3, resp_4 and
# resp are all strictly greater than 0, otherwise it is 0
resp_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
all_resp_positive = (train[resp_columns] > 0).all(axis=1)
train['action'] = np.where(all_resp_positive, 1, 0)

In [None]:
# Number of rows whose action is 1
train['action'].sum()

In [None]:
# Candidate hyper-parameter values sampled by the randomized search over XGBoost
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
    learning_rate=[0.02, 0.5],
    n_estimators=[400, 600],
)

In [None]:
# Declaring the XGB classifier.
# `n_jobs` is the scikit-learn style name for the thread-count setting;
# `nthread` is its deprecated alias. The value is unchanged.
xgb = XGBClassifier(n_jobs = -1)

In [None]:
# Fit the default XGB model on every column that is neither an identifier,
# the weight, a resp column nor the target; its feature importances are used
# to remove columns with zero importance
non_input_columns = ['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4','resp', 'ts_id', 'action']
baseline_inputs = train.drop(non_input_columns, axis = 1)
xgb.fit(baseline_inputs, train.action, verbose = 3)

In [None]:
# Pair each model input column with the importance reported by the fitted XGB model
model_input_columns = train.drop(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4','resp', 'ts_id', 'action'], axis = 1).columns
feat_imp = pd.DataFrame({
    "features": model_input_columns,
    "importances": xgb.feature_importances_,
})

In [None]:
# Drop the columns with zero importance, order the rest from most to least
# important and keep only the top 10
feat_imp = (
    feat_imp[feat_imp['importances'] != 0]
    .sort_values(by='importances', ascending=False)
    .head(10)
)

In [None]:
# Show (rows, columns) of the filtered importance table
feat_imp.shape

In [None]:
# Perform a 2-fold randomized search CV that samples 2 parameter combinations
folds = 2
param_comb = 2

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Pass the splitter itself instead of the generator returned by skf.split():
# a generator can only be consumed once, so the search could not be fitted a
# second time. With the fixed random_state the folds are the same either way.
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4,
                                   cv=skf, verbose=3, random_state=1001 )

# Here we go
random_search.fit(train[feat_imp.features], train.action)

In [None]:
# ## Prediction on test data. Please refer to https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation
# import janestreet
# env = janestreet.make_env() # initialize the environment
# iter_test = env.iter_test() # an iterator which loops over the test set
# for (test_df, sample_prediction_df) in iter_test:
#     test_df = train.fillna(test_df.median())
#     test_df['action'] = np.where(test_df['weight'] > 0, xgb.predict(test_df[feat_imp.features]), 0) #make your 0/1 prediction here
#     env.predict(test_df[['action']])

In [None]:
# ## Prediction on test data. Please refer to https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation
# feat_imp = pd.DataFrame({"features":train.drop(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4','resp', 'ts_id', 'action'], axis = 1).columns, "importances" : 1})
# import janestreet
# env = janestreet.make_env() # initialize the environment
# iter_test = env.iter_test(np.where(test_df['weight'] > 0, xgb.predict(test_df[feat_imp.features]), 0)) # an iterator which loops over the test set
# for (test_df, sample_prediction_df) in iter_test:
#     test_df = train.fillna(test_df.median())
#     predictions = test_df[feat_imp.features]
#     sample_prediction_df.action = 0 #make your 0/1 prediction here
# #     sample_prediction_df.action
#     env.predict(sample_prediction_df)

In [None]:
## Prediction on test data. Please refer to https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation
import janestreet

# Columns the tuned model was trained on, in training order.
selected_features = list(feat_imp.features.values)

# Score with the estimator that was fitted on exactly these columns. `xgb`
# itself was fitted on all input columns, so it does not match a frame that
# holds only the selected ones.
model = random_search.best_estimator_

# The training medians do not change between test rows: compute them once
# instead of once per iteration.
train_medians = train[selected_features].median()

env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set
for (test_df, pred_df) in iter_test:
    if test_df['weight'].item() > 0:
        # reindex() keeps only the selected columns and creates any that is
        # absent from this batch as NaN; the gaps are then filled with the
        # training medians.
        X_test = test_df.reindex(columns=selected_features).fillna(train_medians)
        # predict() already returns the 0/1 class labels
        pred_df.action = model.predict(X_test).astype(int)
    else:
        # Rows with a non-positive weight always get action 0
        pred_df.action = 0
    env.predict(pred_df)