# 1) Import important libraries and packages

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 2) Load and clean dataset

In [None]:
# Load a window of the training CSV into `train`: file lines 1..999,999 are
# skipped (line 0, the header, is kept) and the next 1,000,000 rows are read.
train = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    skiprows=range(1, 1_000_000),
    nrows=1_000_000,
)
# Print column dtypes and memory usage
train.info()

In [None]:
# Drop rows with weight == 0.
# Trades with weight = 0 were intentionally included in the dataset for completeness,
# although such trades will not contribute towards the scoring evaluation.
# .copy() makes `train` an independent frame, so the column assignments below
# do not write into a slice of the unfiltered data (SettingWithCopyWarning).
train = train[train['weight'] != 0].copy()
features = [col for col in train.columns if 'feature' in col]

# Build the two modelling targets from 'resp'.
# The evaluation metric is built from pi = sum_j(weight_ij * resp_ij * action_ij),
# so positive values of resp increase pi.
#   'return' : magnitude of the response, log(1 + |resp|); log1p keeps precision for small |resp|
#   'sign'   : 1 when resp > 0, otherwise 0
train['return'] = np.log1p(np.abs(train['resp']))
train['sign'] = np.where(train['resp'] > 0, 1, 0)

# Separate frames for positive and non-positive responses
train_p = train[train['sign'] == 1]
train_n = train[train['sign'] == 0]

In [None]:
# Distribution of 'return' over the rows with sign == 1 (100 bins)
train_p.hist(column='return', bins=100);

In [None]:
# Distribution of 'return' over the rows with sign == 0 (100 bins)
train_n.hist(column='return', bins=100);

In [None]:
# Classifier inputs are the feature columns; the target is the 0/1 'sign' column.
X, y = train[features], train['sign']

# Hold out 20% of the rows as the validation set.
# NOTE(review): no random_state is passed, so the split differs between runs.
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2)

In [None]:
# Per-feature medians are computed on the training split only ...
train_median = train_x.median()
# ... and used to fill missing values in both the training and validation splits.
train_x, valid_x = (split.fillna(train_median) for split in (train_x, valid_x))

In [None]:
# Standardise the features to zero mean and unit variance before PCA.
# The scaler is fitted on the training split only.
scaler = StandardScaler()
train_x_norm = scaler.fit_transform(train_x)

# Exploratory code (kept for reference): explained variance vs. number of components
# pca = PCA()
# comp = pca.fit(train_x_norm)
# plt.plot(np.cumsum(comp.explained_variance_ratio_))
# plt.grid()
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Explained Variance')
# sns.despine();
# # The first 15 principal components explains about 80% of the variation
# # The first 40 principal components explains about 95% of the variation

# Keep the first 60 principal components. The projection is fitted on the
# training split and applied to both the training and the validation split.
pca = PCA(n_components=60)
pca.fit(train_x_norm)
train_x_transform = pca.transform(train_x_norm)
valid_x_transform = pca.transform(scaler.transform(valid_x))

In [None]:
# Wrap the PCA-projected arrays and their 'sign' labels in XGBoost's DMatrix
# container, the input format expected by xgb.train / Booster.predict.
dtrain = xgb.DMatrix(data=train_x_transform, label=train_y)
dvalid = xgb.DMatrix(data=valid_x_transform, label=valid_y)

In [None]:
def objective(trial):
    """Optuna objective for the sign classifier.

    Samples one set of XGBoost hyperparameters from `trial`, trains a booster
    on `dtrain` and scores it on the hold-out validation set.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Accuracy of the rounded predictions on the validation set.
    """
    # xgb.train does not read 'n_estimators' from params; the number of trees
    # is controlled by num_boost_round (default 10). The value is still
    # registered under the name 'n_estimators' so that trial.params can be
    # passed straight to the scikit-learn style XGBClassifier.
    num_boost_round = trial.suggest_int('n_estimators', 200, 600)

    # Booster hyperparameters to be tuned
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
    }

    bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    # binary:logistic outputs probabilities; round them to 0/1 labels
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    # Trials are evaluated on their accuracy on the validation set
    accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
    return accuracy

In [None]:
if __name__ == "__main__":
    # Run up to 25 trials (or 600 seconds) looking for the highest validation accuracy
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=25, timeout=600)
    trial = study.best_trial

#     print("Number of finished trials: ", len(study.trials))
#     print("Best trial:")
#     print("  Value: {}".format(trial.value))
#     print("  Params: ")
#     for key, value in trial.params.items():
#         print("    {}: {}".format(key, value))

# Start from the best trial's sampled hyperparameters and add the fixed
# settings that were not part of the search space.
best_params = trial.params
best_params.update(tree_method='gpu_hist', objective='binary:logistic')

# Fit the final classifier on the PCA-projected training split
optimal_clf = xgb.XGBClassifier(**best_params)
optimal_clf.fit(train_x_transform, train_y)

In [None]:
# # # Plot how the best accuracy evolves with number of trials
# fig = optuna.visualization.plot_optimization_history(study)
# fig.show()

In [None]:
# Regression inputs/targets, built separately for the positive (p) and the
# non-positive (n) response subsets; the target is the 'return' magnitude.
Xp = train_p[features]
Xn = train_n[features]
yp = train_p['return']
yn = train_n['return']

# Hold out 20% of each subset as its validation set
train_xp, valid_xp, train_yp, valid_yp = train_test_split(Xp, yp, test_size=0.2)
train_xn, valid_xn, train_yn, valid_yn = train_test_split(Xn, yn, test_size=0.2)

In [None]:
# Medians come from each subset's training split only
train_median_p = train_xp.median()
train_median_n = train_xn.median()

# Fill missing values in the training and validation splits of each subset
train_xp = train_xp.fillna(train_median_p)
valid_xp = valid_xp.fillna(train_median_p)
train_xn = train_xn.fillna(train_median_n)
valid_xn = valid_xn.fillna(train_median_n)

In [None]:
# Exploratory code (kept for reference): explained variance vs. number of components
# pca_p,pca_n = PCA(),PCA()
# comp_p,comp_n = pca_p.fit(train_xp_norm), pca_n.fit(train_xn_norm)
# plt.plot(np.cumsum(comp.explained_variance_ratio_))
# plt.grid()
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Explained Variance')
# sns.despine();
# # The first 15 principal components explains about 80% of the variation
# # The first 40 principal components explains about 95% of the variation

# Positive subset: standardise, then project onto the first 60 principal
# components. Both transforms are fitted on the training split only and then
# applied to the training and validation splits.
scaler_p = StandardScaler().fit(train_xp)
train_xp_norm = scaler_p.transform(train_xp)
pca_p = PCA(n_components=60).fit(train_xp_norm)
train_xp_transform = pca_p.transform(train_xp_norm)
valid_xp_transform = pca_p.transform(scaler_p.transform(valid_xp))

# Non-positive subset: same pipeline with its own scaler and PCA
scaler_n = StandardScaler().fit(train_xn)
train_xn_norm = scaler_n.transform(train_xn)
pca_n = PCA(n_components=60).fit(train_xn_norm)
train_xn_transform = pca_n.transform(train_xn_norm)
valid_xn_transform = pca_n.transform(scaler_n.transform(valid_xn))

In [None]:
# Wrap the PCA-projected arrays and their 'return' targets in XGBoost's
# DMatrix container, one train/validation pair per subset.
dtrain_p = xgb.DMatrix(data=train_xp_transform, label=train_yp)
dvalid_p = xgb.DMatrix(data=valid_xp_transform, label=valid_yp)

dtrain_n = xgb.DMatrix(data=train_xn_transform, label=train_yn)
dvalid_n = xgb.DMatrix(data=valid_xn_transform, label=valid_yn)

In [None]:
def objective_p(trial):
    """Optuna objective for the positive-return regressor.

    Samples one set of XGBoost hyperparameters from `trial`, trains a booster
    on `dtrain_p` and scores it on the positive-subset validation set.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Root mean squared error on the validation set (lower is better).
    """
    # xgb.train does not read 'n_estimators' from params; the number of trees
    # is controlled by num_boost_round (default 10). The value is still
    # registered under the name 'n_estimators' so that trial.params can be
    # passed straight to the scikit-learn style XGBRegressor.
    num_boost_round = trial.suggest_int('n_estimators', 200, 600)

    # Booster hyperparameters to be tuned
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',
        "objective": "reg:squarederror",
    }

    bst = xgb.train(params, dtrain_p, num_boost_round=num_boost_round)
    preds = bst.predict(dvalid_p)
    rmse = np.sqrt(mean_squared_error(valid_yp, preds))
    return rmse

def objective_n(trial):
    """Optuna objective for the non-positive-return regressor.

    Samples one set of XGBoost hyperparameters from `trial`, trains a booster
    on `dtrain_n` and scores it on the non-positive-subset validation set.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Root mean squared error on the validation set (lower is better).
    """
    # xgb.train does not read 'n_estimators' from params; the number of trees
    # is controlled by num_boost_round (default 10). The value is still
    # registered under the name 'n_estimators' so that trial.params can be
    # passed straight to the scikit-learn style XGBRegressor.
    num_boost_round = trial.suggest_int('n_estimators', 200, 600)

    # Booster hyperparameters to be tuned
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',
        "objective": "reg:squarederror",
    }

    bst = xgb.train(params, dtrain_n, num_boost_round=num_boost_round)
    preds = bst.predict(dvalid_n)
    rmse = np.sqrt(mean_squared_error(valid_yn, preds))
    return rmse

In [None]:
if __name__ == "__main__":
    # objective_p returns the validation RMSE, an error metric, so the study
    # must minimise it; maximising would pick the worst trial.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective_p, n_trials=25, timeout=600)
    trial = study.best_trial

# Start from the best trial's sampled hyperparameters and add the fixed
# settings that were not part of the search space.
p_params = trial.params
p_params['tree_method'] = 'gpu_hist'
p_params['objective'] = "reg:squarederror"
# Fit the final positive-return regressor on the PCA-projected training split
p_reg = xgb.XGBRegressor(**p_params)
p_reg.fit(train_xp_transform, train_yp)

In [None]:
if __name__ == "__main__":
    # objective_n returns the validation RMSE, an error metric, so the study
    # must minimise it; maximising would pick the worst trial.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective_n, n_trials=25, timeout=600)
    trial = study.best_trial

# Start from the best trial's sampled hyperparameters and add the fixed
# settings that were not part of the search space.
n_params = trial.params
n_params['tree_method'] = 'gpu_hist'
n_params['objective'] = "reg:squarederror"
# Fit the final non-positive-return regressor on the PCA-projected training split
n_reg = xgb.XGBRegressor(**n_params)
n_reg.fit(train_xn_transform, train_yn)

# 6) Predict on the test set

In [None]:
# We impute the missing values with the medians
def fillna_npwhere(array, values):
    """Replace NaN entries of `array` with the matching entries of `values`.

    The sum of the array is used as a cheap NaN detector: when it is not NaN
    the input array is returned unchanged (same object); otherwise a new array
    is built in which every NaN position takes the value broadcast from
    `values`.
    """
    needs_fill = np.isnan(array.sum())
    return np.where(np.isnan(array), values, array) if needs_fill else array

In [None]:
import janestreet

# Set up the competition's evaluation environment; `iter_test` is consumed by
# the prediction loop and `env.predict` is used to submit each batch.
# NOTE(review): presumably the environment can only be created once per
# session, so re-running this cell may fail — confirm against the janestreet module.
env = janestreet.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set

In [None]:
# The imputation medians do not change between batches, so extract them once
# instead of rebuilding the arrays on every iteration of the loop.
median_clf_values = train_median[features].values
median_p_values = train_median_p[features].values
median_n_values = train_median_n[features].values

# For every test batch: zero-weight trades get action 0; otherwise the action
# is 1 when the probability-weighted expected return is positive.
for (test_df, sample_prediction_df) in iter_test:
    # Read the weight column directly rather than materialising the whole first row
    wt = test_df['weight'].iloc[0]
    if wt == 0:
        sample_prediction_df.action = 0
    else:
        # Raw feature matrix of the batch, shared by the three pipelines
        raw_features = test_df[features].values

        # Class probabilities: column 0 = sign 0, column 1 = sign 1
        sign = optimal_clf.predict_proba(
            pca.transform(scaler.transform(fillna_npwhere(raw_features, median_clf_values))))
        # Predicted return magnitude under each of the two regressors
        predict_p = p_reg.predict(
            pca_p.transform(scaler_p.transform(fillna_npwhere(raw_features, median_p_values))))
        predict_n = n_reg.predict(
            pca_n.transform(scaler_n.transform(fillna_npwhere(raw_features, median_n_values))))

        # P(sign=1) * magnitude_p - P(sign=0) * magnitude_n
        expected_return = sign[:, 1] * predict_p - sign[:, 0] * predict_n
        sample_prediction_df.action = np.where(expected_return > 0, 1, 0)
    env.predict(sample_prediction_df)

# Acknowledgements
https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

https://www.kaggle.com/saurabhshahane/voting-classifier-beginners

https://www.kaggle.com/harshitt21/jane-street-basic-eda-xgb-baseline

https://www.kaggle.com/eudmar/jane-street-eda-pca-ensemble-methods

https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function?scriptVersionId=48926407

https://github.com/datacamp/Machine-Learning-With-XGboost-live-training/blob/master/notebooks/Machine-Learning-with-XGBoost-solution.ipynb