# Intro
Welcome to the [Jane Street Market Prediction](https://www.kaggle.com/c/jane-street-market-prediction/data) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/23304/logos/header.png)

This is a starter notebook and will help you to begin with the competition.

We apply some simple feature engineering to handle missing values and start with a simple XGB classifier.

<span style="color: royalblue;">Please upvote the notebook if it helps you. Thank you. </span>

# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

# Create Environment

In [None]:
# Competition API module; only the two calls below are used from it here.
import janestreet
env = janestreet.make_env() # initialize the environment
# iter_test is consumed by the submission loop, which unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test() # an iterator which loops over the test set

# Path

In [None]:
# Folder holding the competition input files; the trailing slash matters
# because file names are appended with plain string concatenation.
path = '/kaggle/input/jane-street-market-prediction/'
# Last expression of the cell: shows the files available in that folder.
os.listdir(path)

# Functions

In [None]:
def plot_timeseries(data, feature):
    """Draw the column `feature` of `data` against its row position.

    A new 10x6 figure is opened for every call; the x axis is simply
    0..len(data)-1, not a timestamp.
    """
    plt.figure(figsize=(10, 6))
    positions = range(len(data))
    plt.plot(positions, data[feature])
    plt.grid()

We use a memory reduction function based on this [notebook](https://www.kaggle.com/unrool/starter-notebook-with-mem-reducing).

In [None]:
def _smallest_fitting_dtype(current, c_min, c_max, candidates):
    """Return the first candidate dtype that is narrower than `current` and
    whose representable range contains [c_min, c_max], or None if none fits.
    """
    for candidate in candidates:
        candidate = np.dtype(candidate)
        # Candidates are ordered narrowest first; never widen a column.
        if candidate.itemsize >= current.itemsize:
            break
        limits = np.finfo(candidate) if candidate.kind == 'f' else np.iinfo(candidate)
        # A NaN bound (empty or all-NaN column) fails both comparisons,
        # so such columns are left untouched.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def memory_reduction(df, use_float16=True):
    """Downcast the numeric columns of `df` in place to reduce memory usage.

    Signed-integer, unsigned-integer and float columns are converted to the
    narrowest dtype of the same kind whose range holds the column's min and
    max (bounds inclusive). Columns of any other dtype (bool, object,
    datetime, pandas extension dtypes, ...) are left unchanged, and a column
    is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target. float16 keeps only about three significant
        decimal digits, so pass False when precision matters more than memory.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': float_candidates,
    }

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are handled; extension dtypes may hold
        # missing values that a NumPy integer cast cannot represent.
        if not isinstance(col_type, np.dtype):
            continue
        candidates = candidates_by_kind.get(col_type.kind)
        if candidates is None:
            continue

        target = _smallest_fitting_dtype(
            col_type, df[col].min(), df[col].max(), candidates
        )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame, which would otherwise print nan.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Load Data

In [None]:
# Read the full training table from the competition folder.
train_csv = os.path.join(path, 'train.csv')
train_data = pd.read_csv(train_csv)
#test_data = pd.read_csv(path+'example_test.csv')

# EDA

In [None]:
# Row count of the training table.
print('number of train samples:', train_data.shape[0])
#print('number of features:', len(feature_data))

In [None]:
# Summary statistics for the sample weight and the response columns.
summary_columns = ['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
train_data[summary_columns].describe()

In [None]:
# Plot the values of feature_4 for all training rows, in row order.
plot_timeseries(train_data, 'feature_4')

# Handle Missing Values

In [None]:
# One boolean per column: True where the column holds at least one NaN.
has_missing = train_data.isnull().any()
# Keep the names of the flagged columns, in their original column order.
cols_with_missing_train = has_missing[has_missing].index.tolist()

In [None]:
# Show only the first four affected columns to keep the output short.
print('train columns with missing data:', cols_with_missing_train[:4])

The features are numerical. There are several techniques to fill the missing data, e.g. setting them to zero or to the column mean.

In [None]:
# Impute every missing entry with the mean of its own column.
# (Alternative that was tried: fill with 0 instead of the mean.)
incomplete = train_data[cols_with_missing_train]
mean = incomplete.mean()
train_data[cols_with_missing_train] = incomplete.fillna(mean)

# Reduce Memory

In [None]:
#train_data = memory_reduction(train_data)


# Prepare Data
We focus on the samples with weight greater than zero and define the binary target based on the column resp.

In [None]:
#train_data['resp_mean'] = train_data[['resp_4', 'resp']].mean(axis=1)

In [None]:
# Keep only the rows with a non-zero weight. The explicit .copy() makes
# train_temp an independent frame, so adding a column below is a plain
# assignment rather than a write to a slice of train_data (which triggers
# pandas' SettingWithCopyWarning, hidden here by the global warnings filter).
train_temp = train_data[train_data['weight'] != 0].copy()
# Binary target: 1 when the response is positive, 0 otherwise.
train_temp['action'] = (train_temp['resp'] > 0).astype('int64')

In [None]:
# Model inputs: every column whose name contains 'feature'. The frame is
# copied because it is modified in place by the scaling step afterwards;
# without the copy those writes go to a slice of train_temp.
# NOTE: the copy costs one extra transient allocation of the feature matrix.
X_train = train_temp.loc[:, train_temp.columns.str.contains('feature')].copy()
# Model target: the binary action column.
y_train = train_temp.loc[:, 'action']

Scale data:

In [None]:
# Names of the 130 input columns: feature_0 ... feature_129.
features = [f'feature_{idx}' for idx in range(130)]

In [None]:
# Standardize the features to zero mean and unit variance.
# Each statistic is computed exactly once and kept in `mean` / `std`, so the
# saved values are precisely the ones applied here and can be reused to
# transform other data the same way.
X_train[features] = X_train[features].astype('float32')
mean = X_train[features].mean(axis=0)
X_train[features] -= mean
# Subtracting the mean does not change the spread, so this is the std of the
# float32 data that was just centred.
std = X_train[features].std(axis=0)
X_train[features] /= std

In [None]:
# Summary statistics of the scaled training features (sanity check of the
# standardization: the feature columns should show mean ~0 and std ~1).
X_train.describe()

# Split Train And Validation Data

In [None]:
# Hold out 10% of the rows for validation; the fixed seed makes the
# (shuffled) split reproducible.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=2020,
)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Report how many rows ended up in each part of the split.
print('number of train samples', X_train.shape[0])
print('number of val samples', X_val.shape[0])

# Model
We use a simple XGB classifier.

In [None]:
# param_grid = {'objective': ['binary:logistic'],
#               'learning_rate': [1/(10**i) for i in range(1, 2)],
#               'max_depth': [i for i in range(9, 11)],
#               'n_estimators': [i*100 for i in range(8, 10)],
#               'random_state': [2020],
#              'tree_method': ['gpu_hist']}
# grid = GridSearchCV(XGBClassifier(), param_grid=param_grid, cv=6)
# grid.fit(X_train, y_train)
# best_params = grid.best_params_
# print('Best score of cross validation: {:.2f}'.format(grid.best_score_))
# print('Best parameters:', best_params)

In [None]:
# Hyper-parameters of the classifier, collected in one place.
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 900,
    'learning_rate': 0.1,
    'random_state': 2020,
    'max_depth': 9,
    'tree_method': 'gpu_hist',
}
model_XGB = XGBClassifier(**xgb_params)
# Fit on the training part of the split only.
model_XGB.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and show the share of correct labels.
preds_val = model_XGB.predict(X_val)
accuracy_score(y_true=y_val, y_pred=preds_val)

# Feature Importance
We want to know how useful the features are for predicting the target variable.

In [None]:
# Importance scores of the fitted model; they are plotted against the
# columns of X_train in the next cell.
importance = model_XGB.feature_importances_

In [None]:
# Horizontal bar chart with one bar per feature, scaled to percent.
feature_names = X_train.columns.values
importance_pct = 100 * importance

plt.figure(figsize=(10, 30))
plt.barh(feature_names, importance_pct, orientation='horizontal')
plt.title('Feature Importance', loc='left')
plt.xlabel('Percentage')
plt.grid()
plt.show()

# Submission

In [None]:
# Walk through the test batches served by the competition API and submit one
# prediction per batch.
for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df.loc[:, test_df.columns.str.contains('feature')]

    # The model was trained on standardized features, so the test features
    # must go through the same transformation using the TRAINING statistics
    # (`mean` / `std` from the scaling step); feeding raw values would put
    # them on a different scale than the learned split thresholds.
    X_test = (X_test - mean) / std
    # Training data had its missing values mean-imputed, so the model never
    # saw NaN. After standardization the training mean maps to 0, which makes
    # 0 the closest equivalent fill value here.
    X_test = X_test.fillna(0)

    #Predict Target
    y_preds = model_XGB.predict(X_test)
    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)