# Import Dependencies

In [None]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import tpe

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
        
import janestreet
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")



# Initialize the environment

In [None]:
# Create the competition environment object; it is used again in the
# submission loop, where predictions are handed back via env.predict(...).
env = janestreet.make_env() 
# Iterator consumed by the submission loop; each item is unpacked there as a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()


# Load Data

In [None]:
# Location of the competition training set, relative to the notebook.
TRAIN_CSV_PATH = '../input/jane-street-market-prediction/train.csv'

print('Loading training data...')
train = pd.read_csv(TRAIN_CSV_PATH)
print('Finished.')

In [None]:
# A notebook cell only renders its last expression, so the preview has to be
# displayed explicitly; otherwise the head() result is computed and discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(train.head())
# Summary statistics of the numeric columns (rendered as the cell output).
train.describe()

In [None]:
# Show the column labels of the training frame (the feature columns selected
# later are those whose name contains 'feature').
train.columns

# Preprocessing

This competition is evaluated on a utility score. Each row in the test set represents a trading opportunity for which you will be predicting an action value, 1 to make the trade and 0 to pass on it. Each trade j has an associated weight and resp, which represents a return.

In [None]:
print('Preprocessing...')
# Keep only rows with non-zero weight (zero-weight rows are excluded from
# training). The explicit .copy() makes the filtered frame independent, so the
# column assignment below is an unambiguous write instead of a chained
# assignment on a slice (which raises SettingWithCopyWarning, currently hidden
# by the global warnings filter).
train = train[train['weight'] != 0].copy()
# Binary target: 1 when the return `resp` is positive, 0 otherwise.
train['action'] = (train['resp'].values > 0).astype('int')



# Feature Engineering


In [None]:
# Number of missing entries in each column of the training frame.
missing_values_count = train.isna().sum()
missing_values_count

In [None]:
# Feature matrix: every column whose name contains 'feature'.
X = train.loc[:, train.columns.str.contains('feature')]
# Per-feature mean, used as the imputation value for missing entries.
f_mean = X.mean()
# fillna returns a new frame rather than modifying X in place, so the result
# must be assigned back; without the assignment the imputation is a no-op.
X = X.fillna(f_mean)
# NOTE(review): the submission loop passes test features to clf.predict
# without this imputation - confirm whether f_mean should be applied there too.

# Target vector: the binary 'action' column created during preprocessing.
y = train.loc[:, 'action']
print('Finished.')

# Split into test and train

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create Model

![](https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2016/07/XGBoost-Plot-of-Single-Decision-Tree-Left-To-Right.png)

Model used [1][2]

In [None]:
print('Creating classifier...')
# Hyperparameters gathered in one mapping so they can be inspected or reused;
# they are forwarded unchanged as keyword arguments to the classifier.
xgb_params = {
    'n_estimators': 400,
    'max_depth': 7,
    'eta': 0.5,
    'missing': None,
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'subsample': 0.8,
    'colsample_bytree': 1,
    'verbosity': 2,
}
clf = xgb.XGBClassifier(**xgb_params)
print('Finished.')

# Train classifier

In [None]:
print('Training classifier...')
# Fit on the complete feature matrix and target; no hold-out set is used
# because the train/test split cell is commented out.
clf.fit(X, y)
#clf.fit(X_train, y_train)
print('Finished.')

# Score Classifier 

In [None]:
#print('Scoring model...')
#y_pred = clf.predict(X_test)
#evaluate predictions
#accuracy = accuracy_score(y_test, y_pred)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Prepare Submission

In [None]:
# For each (test frame, prediction frame) pair yielded by iter_test: select the
# feature columns, predict the action, and submit the filled-in prediction
# frame back through the environment.
for batch_df, submission_df in iter_test:
    is_feature = batch_df.columns.str.contains('feature')
    submission_df.action = clf.predict(batch_df.loc[:, is_feature])
    env.predict(submission_df)

# Hyperopt for Hyperparameter Tuning


hp.choice(label, options) — Returns one of the options, which should be a list or tuple.

hp.randint(label, upper) — Returns a random integer in the range [0, upper).

hp.uniform(label, low, high) — Returns a value uniformly between low and high.

hp.quniform(label, low, high, q) — Returns a value round(uniform(low, high) / q) * q, i.e. a uniform sample rounded to the nearest multiple of q. The result is still a float, so cast it to int where an integer is required.

hp.normal(label, mu, sigma) — Returns a real value that’s normally distributed with mean mu and standard deviation sigma.
Sources: [3][4]

In [None]:
#def hyperparameter_tuning(space):
    #print('Building Model...')
    #model = xgb.XGBClassifier(
        #n_estimators=space['n_estimators'],
        #max_depth=space['max_depth'],
        #min_child_weight=space['min_child_weight'],
        #random_state=42,
        #subsample=space['subsample'],
        #learning_rate=space['learning_rate'],
        #gamma=space['gamma'],
        #colsample_bytree=space['colsample_bytree'],
        #tree_method='gpu_hist'
        #)

    #evaluation = [(X_train, y_train), (X_test, y_test)]

    #model.fit(X_train, y_train,
              #eval_set=evaluation, eval_metric="rmse",
              #early_stopping_rounds=10, verbose=False)
    #print('Finished.')

    #pred = model.predict(X_test)
    #accuracy = accuracy_score(y_test, pred > 0.5)
    #print("SCORE:", accuracy)
    # change the metric if you like
    #return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}


#space = {
        #'max_depth': hp.choice('max_depth', np.arange(10, 20, dtype=int)),
        #'min_child_weight': hp.quniform('min_child', 1, 30, 1),
        #'subsample': hp.uniform('subsample', 0.8, 1),
        #'n_estimators': hp.choice('n_estimators', np.arange(100, 10000, 100, dtype=int)),
        #'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
        #'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        #'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05)
    #}

#trials = Trials()

#best = fmin(fn=hyperparameter_tuning,
                #space=space,
                #algo=tpe.suggest,
                #max_evals=10,
                #trials=trials)

#print(best)

# References

[1]https://www.kaggle.com/hamditarek/market-prediction-xgboost-with-gpu-fit-in-1min

[2]https://machinelearningmastery.com/evaluate-gradient-boosting-models-xgboost-python/

[3]https://medium.com/analytics-vidhya/hyperparameter-tuning-hyperopt-bayesian-optimization-for-xgboost-and-neural-network-8aedf278a1c9

[4]https://www.kaggle.com/henrylidgley/xgboost-with-hyperopt-tuning
********