# Import packages

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit, train_test_split,StratifiedKFold,cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,balanced_accuracy_score,f1_score,log_loss,make_scorer,roc_auc_score
from sklearn.preprocessing import OneHotEncoder

import datatable as dt 
import pandas as pd
pd.set_option('display.max_columns', 500)

import xgboost as xgb
print("XGBoost version:", xgb.__version__)

import warnings 
warnings.filterwarnings("ignore")

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, STATUS_FAIL
from functools import partial

from tqdm.auto import tqdm

# Probe for the RAPIDS GPU stack (cudf + cupy). `if_gpu` records the outcome
# and is read later to pick the CSV reader and the XGBoost tree method.
print('choose accelerate...',end='')
try:
    import cudf
    import cupy as cp
except Exception:
    # Best-effort fallback: any import-time failure means "use the CPU".
    # `except Exception` (instead of a bare `except:`) still lets
    # KeyboardInterrupt / SystemExit propagate.
    if_gpu = False
    print('cpu ready.')
else:
    if_gpu = True
    print('gpu ready.')

# load data

In [None]:
# Create the competition environment and grab its test iterator.
# `iter_test` is consumed by the prediction loop at the end of the notebook.
# NOTE(review): presumably make_env() may only be called once per kernel
# session -- confirm against the janestreet API before re-running this cell.
import janestreet
print('Creating competition environment...', end='')
env = janestreet.make_env()
iter_test = env.iter_test()
print('Finished.')

In [None]:
%%time
# Read the training CSV with cuDF when the GPU stack was imported, otherwise
# with datatable; whichever reader ran, the result is converted to pandas.
train_data_datatable = (
    cudf.read_csv('../input/jane-street-market-prediction/train.csv')
    if if_gpu
    else dt.fread('../input/jane-street-market-prediction/train.csv')
)
train_data = train_data_datatable.to_pandas()
del train_data_datatable  # the intermediate frame is no longer needed

# The auxiliary competition files are read straight into pandas.
features_with_tag = pd.read_csv(
    '../input/jane-street-market-prediction/features.csv'
)
example_test = pd.read_csv(
    '../input/jane-street-market-prediction/example_test.csv'
)
sample_prediction_df = pd.read_csv(
    '../input/jane-street-market-prediction/example_sample_submission.csv'
)
print("Data is loaded!")

# Filter data

In [None]:
# Keep only the rows whose `weight` is non-zero.
train_data = train_data.loc[train_data['weight'].ne(0)]

In [None]:
# Show the training frame as the cell output.
train_data

In [None]:
# Preview the first rows of the example test file.
example_test.head()

# create features

In [None]:
def create_feature_features(data):
    """Select the ``feature*`` columns of ``data`` and mean-impute their gaps.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame with string column labels. Every column whose name
        contains the substring ``'feature'`` is kept.

    Returns
    -------
    pd.DataFrame
        A new frame holding only the selected columns. Each missing value is
        replaced by the mean of its column, computed over ``data`` itself.
        A column with no observed value has no mean and therefore stays NaN
        (this is always the case for a NaN in a single-row frame).
        ``data`` is left untouched.
    """
    feature_features = data.loc[:, data.columns.str.contains('feature')]
    # Non-in-place fill: an in-place fillna on a `.loc` selection is a
    # chained-assignment hazard (it may warn or write through to the caller's
    # frame, depending on the pandas version).
    return feature_features.fillna(feature_features.mean(axis=0))

def create_lag_features(data):
    """Placeholder for lag-based features (not implemented).

    ``data`` is ignored and ``None`` is returned.
    """
    return None

def other_features(data):
    """Placeholder for additional engineered features (not implemented).

    ``data`` is ignored and ``None`` is returned.
    """
    return None

def create_all_features(data):
    """Assemble the model input matrix for ``data``.

    At present the only feature group is the one produced by
    ``create_feature_features``; it is passed through a column-wise
    ``pd.concat`` so that further groups can be appended to the list.

    Parameters
    ----------
    data : pd.DataFrame
        Raw frame handed on to ``create_feature_features``.

    Returns
    -------
    pd.DataFrame
        The concatenated feature groups.
    """
    feature_groups = [create_feature_features(data)]
    return pd.concat(feature_groups, axis=1)

In [None]:
# Build the feature matrix for the training rows.
features = create_all_features(train_data)

In [None]:
# Preview the first rows of the feature matrix.
features.head()

# create target

In [None]:
def create_action(data, onehot=False):
    """Derive the binary action label from the ``resp`` column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing a numeric ``resp`` column.
    onehot : bool, default False
        When true, the labels are expanded with ``pd.get_dummies`` into one
        indicator column per label value present in the data.

    Returns
    -------
    pd.Series or pd.DataFrame
        With ``onehot=False``: an int64 Series (same index as ``data``, name
        ``resp``) that is 1 where ``resp > 0`` and 0 otherwise; NaN compares
        false and so maps to 0. With ``onehot=True``: the dummy-encoded frame.
    """
    # Vectorised comparison; equivalent to applying int(x > 0) row by row.
    action = (data['resp'] > 0).astype('int64')
    if onehot:
        action = pd.get_dummies(action)
    return action

In [None]:
# Binary labels for the training rows (default, non-one-hot form).
target = create_action(train_data)

In [None]:
# Preview the first labels.
target.head()

# split_train_val

In [None]:
# Split features/labels into training and validation parts without shuffling
# the rows; 40% of the rows are held out for validation.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.4, shuffle=False
)

# XGB

HyperOpt搜参

In [None]:
def train_xgb(params, X_train, y_train):
    """Hyperopt objective: fit an XGBoost classifier and score it by CV ROC AUC.

    The classifier is first fitted on the whole training set (this fitted
    instance is what gets returned), then the same configuration is scored
    with ``cross_val_score`` over a 5-fold ``TimeSeriesSplit``.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``xgb.XGBClassifier``. For the complete list
        see https://xgboost.readthedocs.io/en/latest/parameter.html.
        The dict is copied; the caller's object is not modified.
    X_train : pd.DataFrame
        Training set features.
    y_train : pd.Series
        Training set targets.

    Returns
    -------
    dict
        ``'loss'``: the negated mean ROC AUC over the CV folds (hyperopt
        minimises the loss, so a higher AUC gives a lower loss);
        ``'status'``: ``STATUS_OK``;
        ``'model'``: the classifier fitted on all of ``X_train``.
    """
    # Copy so the GPU override below does not leak into the caller's dict.
    params = dict(params)
    if if_gpu:
        params['tree_method'] = 'gpu_hist'

    labels = y_train.values.ravel()
    model = xgb.XGBClassifier(seed=123, **params)

    # NOTE(review): the early-stopping metric is computed on the training
    # data itself (eval_set is the training set), so it is not an
    # out-of-sample stopping criterion. Also, newer XGBoost releases expect
    # `early_stopping_rounds` on the estimator rather than in fit() --
    # confirm against the installed version.
    model.fit(X_train,
              labels,
              eval_set=[(X_train, labels)],
              early_stopping_rounds=50,
              verbose=False)

    # Cross-validate with an iterator that respects time ordering.
    cv_space = TimeSeriesSplit(n_splits=5)
    cv_score = cross_val_score(model,
                               X_train, labels,
                               cv=cv_space,
                               scoring='roc_auc')

    # ROC AUC is non-negative, so the plain mean is used directly.
    mean_of_cv_score = np.mean(cv_score)

    # NOTE: a ValueError -> STATUS_FAIL fallback was sketched for this
    # function (see the commented-out block that follows it) but is disabled,
    # so errors raised during fitting or scoring propagate to the caller.
    return {
        "loss": -mean_of_cv_score,
        "status": STATUS_OK,
        "model": model
    }

#     except ValueError as ex:
#         return {
#             "error": ex,
#             "status": STATUS_FAIL
#         }

def optimize_xgb(X_train, y_train, max_evals=10):
    """Search XGBoost hyperparameters with hyperopt's TPE algorithm.

    Each evaluation calls ``train_xgb`` on a parameter set drawn from the
    search space defined below.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training set features.
    y_train : pd.Series
        Training set targets.
    max_evals : int, default 10
        Maximum number of evaluations passed to ``fmin``.

    Returns
    -------
    tuple
        ``(best_params, trials)``: the dict returned by ``fmin`` (with
        ``tree_method='gpu_hist'`` added when the GPU stack is in use) and
        the ``Trials`` object holding the search history.
    """
    search_space = {
        "n_estimators": hp.randint("n_estimators", 200, 600),
        "max_depth": hp.randint("max_depth", 2, 8),
        "learning_rate": hp.loguniform("learning_rate", -9, -1),
        "subsample": hp.uniform("subsample", 0.8, 1),
        'gamma': hp.uniform('gamma', 0, 10),
        'min_child_weight': hp.uniform('min_child_weight', 0, 10),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    }

    # Bind the data so hyperopt only has to supply the sampled parameters.
    objective = partial(train_xgb, X_train=X_train, y_train=y_train)

    history = Trials()
    winner = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=history,
    )

    # Carry the GPU tree method over so the result can be used as-is.
    if if_gpu:
        winner['tree_method'] = 'gpu_hist'
    print('Best parameters:', winner)

    return winner, history

In [None]:
# Number of hyperopt evaluations for the search below.
n_evals=100
# Run the search on the training split (the `.iloc[:,:]` / `.iloc[:]` slices
# select every row and column) and time it with the IPython `%time` magic.
%time params, trials = optimize_xgb(X_train.iloc[:,:], y_train.iloc[:], max_evals=n_evals)

模型拟合

In [None]:
#fix params
# params={'colsample_bytree': 0.5896884284128014, 'gamma': 1.5524549835224408, 'learning_rate': 0.8417086469365996, 'max_depth': 5, 'min_child_weight': 0.1680835208952257, 'n_estimators': 401, 'subsample': 0.8477610855386742, 'tree_method': 'gpu_hist'}

In [None]:
# Fit a fresh classifier on the training split using `params`.
# NOTE(review): unlike train_xgb, no `seed` is passed here -- confirm whether
# that is intended.
model=xgb.XGBClassifier(**params)
%time model.fit(X_train,y_train)

# 预测与评估

In [None]:
def evaluate(y, yhat, ifpri=True, yproba=None):
    """Compute a dictionary of binary-classification metrics.

    Parameters
    ----------
    y : array-like
        True labels.
    yhat : array-like
        Predicted hard labels; used for accuracy, balanced accuracy and F1.
    ifpri : bool, default True
        When true, print the summary dict before returning it.
    yproba : array-like, optional
        Predicted scores / positive-class probabilities. When given, they are
        used for ``log_loss`` and ``roc_auc_score``, which are designed for
        probabilistic scores. When omitted, ``yhat`` is used for those two
        metrics as well, as before.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'balanced_accuracy'``, ``'f1'``,
        ``'log_loss'`` and ``'roc_auc_score'``.
    """
    # Fall back to the hard labels so existing two-argument calls keep working.
    scores = yhat if yproba is None else yproba
    summary = {
        'accuracy': accuracy_score(y, yhat),
        'balanced_accuracy': balanced_accuracy_score(y, yhat),
        'f1': f1_score(y, yhat),
        'log_loss': log_loss(y, scores),
        'roc_auc_score': roc_auc_score(y, scores),
    }
    if ifpri:
        print(summary)
    return summary

In [None]:
# Hard-label predictions for the validation split.
y_val_predict = model.predict(X_val)

In [None]:
# Score the validation predictions; evaluate() also prints the summary.
summary = evaluate(y_val, y_val_predict)

# Final forecast (not run yet; reserved for the final model)

In [None]:
# Stream the test batches from the competition iterator: build features for
# each batch, predict the action and hand the filled frame back to the
# environment. The loop variable is named `prediction_df` so it does not
# overwrite the `sample_prediction_df` frame loaded earlier.
for test_df, prediction_df in tqdm(iter_test):
    X_test = create_all_features(test_df)
    y_preds = model.predict(X_test)
    # Item assignment always targets the `action` column; attribute
    # assignment would silently create a plain attribute if the column were
    # missing.
    prediction_df['action'] = y_preds
    env.predict(prediction_df)