In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import pathlib
import treelite
import treelite_runtime 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
import optuna   
import cudf
import warnings
# Install a blanket "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation and data-quality warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Read only the first 100,000 rows of the competition training file with cuDF,
# then print a summary of the loaded frame.
traincudf = cudf.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    nrows=100000,
)
traincudf.info()

In [None]:
# Convert the cuDF frame into a pandas DataFrame; the following cells work on `train`.
train = traincudf.to_pandas()

In [None]:
# Remove the name bound to the cuDF frame; `train` now holds the pandas copy.
del traincudf

In [None]:
# Keep only the rows whose 'weight' is non-zero.
nonzero_weight = train['weight'] != 0
train = train.loc[nonzero_weight]

In [None]:
# Binary target: 1 when 'resp' is strictly positive, otherwise 0.
# The comparison is vectorised over the whole column instead of calling a Python
# lambda once per row; the resulting labels are the same (NaN compares as False
# in both forms).
train['action'] = (train['resp'] > 0).astype(int)

In [None]:
#features = [col for col in list(train.columns) if 'feature' in col]

In [None]:
# Feature matrix: every column whose name contains the substring 'feature'.
feature_columns = [col for col in train.columns if 'feature' in col]
X = train[feature_columns]
# Target vector: the binary 'action' label.
y = train['action']

# features

In [None]:
# Import the necessary libraries first
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import Lasso,LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score

In [None]:
# Hold out 20% of the rows for validation. random_state pins the shuffle so that a
# fresh "Restart & Run All" reproduces the same split instead of a new random one.
# NOTE(review): rows are shuffled before splitting — confirm that is acceptable if
# the rows are time-ordered.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Per-column median computed over ALL rows of X (training and hold-out rows alike).
# NOTE(review): `test_median` is not referenced by any other cell visible in this
# notebook; the imputation cell uses `train_median` instead.
test_median = X.median()


In [None]:
# Learn the per-column medians from the training split only, then use those same
# values to fill missing entries in both the training and the hold-out split.
train_median = x_train.median()
x_train, x_test = (part.fillna(train_median) for part in (x_train, x_test))

In [None]:
# Standardise the features using statistics learned from the training split only.
# NOTE(review): scaling mainly benefits linear models; the estimator fitted later in
# this notebook is xgb.XGBClassifier — confirm the scaler is still wanted.
scaler=StandardScaler()
scaler.fit(x_train)

In [None]:
# Scaled training features; this is the array the XGBoost classifiers are fitted on.
train_trans = scaler.transform(x_train)

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# Hyperopt search space for the XGBoost classifier. Entries built with hp.* are
# sampled per trial; 'n_estimators' and 'seed' are fixed values.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

In [None]:
def objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it on the hold-out split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Reads the keys 'n_estimators',
        'max_depth', 'gamma', 'reg_alpha', 'reg_lambda', 'min_child_weight',
        'colsample_bytree' and 'seed'.

    Returns
    -------
    dict
        {'loss': -accuracy, 'status': STATUS_OK}; the accuracy is negated because
        hyperopt minimises the loss.

    Notes
    -----
    Uses the notebook-level `train_trans`, `y_train`, `x_test`, `y_test` and `scaler`.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform yields floats such as 7.0; these parameters are integer-valued here.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Previously sampled by hyperopt but never handed to the model.
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        # Must stay a float: int() truncated every sample from [0.5, 1) to 0,
        # so this dimension was never actually searched.
        colsample_bytree=space['colsample_bytree'],
        # Use the seed declared in the search space so trials are repeatable.
        random_state=space['seed'],
    )

    # Scale the hold-out features once and reuse them for early stopping and scoring.
    test_trans = scaler.transform(x_test)
    evaluation = [(train_trans, y_train), (test_trans, y_test)]

    clf.fit(train_trans, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    # predict() already returns class labels, so no thresholding is needed.
    pred = clf.predict(test_trans)
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
# Run 100 TPE-guided evaluations of `objective` over `space`. Per-trial results are
# recorded in `trials`; the value returned by fmin is kept in `best_hyperparams`.
trials = Trials()
best_hyperparams = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)

In [None]:
# Show the dictionary returned by the hyperopt search.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Build the final XGBoost classifier.
# NOTE(review): these values are hard-coded literals rather than being read from
# `best_hyperparams`, so re-running the search does not change them.
clf = xgb.XGBClassifier(
    colsample_bytree=0.871380814805137,
    gamma=1.4630522178861056,
    max_depth=9,
    min_child_weight=4.0,
    reg_alpha=50.0,
    reg_lambda=0.8077808908136562,
)

In [None]:
# Fit on the scaled training split only (train_trans / y_train), i.e. the rows left
# after the 20% hold-out was removed — not on every loaded row.
clf.fit(train_trans, y_train)  # training split only

In [None]:
from sklearn.metrics import accuracy_score

# Score the final classifier on the scaled hold-out split and print its accuracy.
holdout_scaled = scaler.transform(x_test)
y_predict = clf.predict(holdout_scaled)
print(accuracy_score(y_test, y_predict))

In [None]:
import janestreet
# Set up the competition's submission API.
# NOTE(review): presumably make_env() may only be called once per kernel session —
# confirm before re-running this cell.
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
def fillna_npwhere(array, values):
    """Return `array` with its NaN entries replaced by the matching entries of `values`.

    Parameters
    ----------
    array : numpy.ndarray
        Numeric array that may contain NaN.
    values : array-like
        Replacement values; must broadcast against `array`.

    Returns
    -------
    numpy.ndarray
        `array` itself (same object) when its sum is not NaN, otherwise a new
        array with the NaN positions filled from `values`.
    """
    # Summing first is a cheap test: a NaN anywhere makes the total NaN.
    has_nan = np.isnan(array.sum())
    if not has_nan:
        return array
    nan_mask = np.isnan(array)
    return np.where(nan_mask, values, array)

In [None]:
# Submission loop: for every (test frame, prediction template) pair served by the
# competition iterator, impute -> scale -> predict -> submit.
#
# The live features are kept in their own names so the hold-out split `x_test`
# created earlier is not overwritten; re-running the evaluation cells after this
# loop therefore still scores the real hold-out data.
train_median_values = np.asarray(train_median)  # converted once, outside the loop
for (test_df, sample_prediction_df) in iter_test:
    live_features = test_df.loc[:, test_df.columns.str.contains('feature')].values
    # Fill missing values with the medians learned from the training split.
    live_filled = fillna_npwhere(live_features, train_median_values)
    y_preds = clf.predict(scaler.transform(live_filled))
    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)