In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import datatable as dtable
import sklearn
import lightgbm as lgb
import gc
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import optuna

In [None]:
from numba import njit

@njit
def fillna_npwhere_njit(array, values):
    """Return `array` with its NaN entries replaced by `values`.

    When the sum of `array` is not NaN the input is handed back as-is;
    otherwise a new array is built with `np.where`.
    """
    # If the total is not NaN, no individual entry can be NaN either,
    # so the element-wise replacement can be skipped.
    if not np.isnan(array.sum()):
        return array
    missing = np.isnan(array)
    return np.where(missing, values, array)

In [None]:
%%time
## Load the training data

print("loading.....")
df_read = dtable.fread('/kaggle/input/jane-street-market-prediction/train.csv').to_pandas()
print("fill data.....")
# Keep only the rows whose weight is non-zero.
df_read = df_read[df_read["weight"] != 0]
features = [column for column in df_read.columns if "feature" in column]
# Fill missing feature values with the -999 sentinel first: `fillna` returns a
# fresh frame, so attaching the label column afterwards does not write into a
# slice of `df_read` (which would raise SettingWithCopyWarning).
train_data = df_read.loc[:, features].fillna(-999)
# Binary target: 1 when `resp` is positive, 0 otherwise.
train_data["action"] = np.where(df_read["resp"] > 0, 1, 0)
print("loading finish...")


In [None]:
# Report the (rows, columns) of the prepared training frame.
print(f"The shape of data are {train_data.shape}")

In [None]:
# Force a garbage-collection pass before the feature/target frames are built.
gc.collect()

In [None]:
# Separate the model inputs (the feature columns) from the binary "action" target.
X_train, Y_train = train_data[features], train_data["action"]

In [None]:
# Hold out the last 20% of the rows for validation; shuffle=False keeps the
# original row order, so the hold-out is the tail of the data.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    shuffle=False,
)
# def objective(trial):
#     params = {
#         'objective': 'binary',
#         'metric': 'auc',
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
#     }
    
#     trn_data = lgb.Dataset(x_train, label=y_train)
#     val_data = lgb.Dataset(x_test,label = y_test)
#     bst = lgb.train(params, trn_data, 10000, valid_sets=[trn_data, val_data],verbose_eval=-1, early_stopping_rounds = 10) 
#     preds = bst.predict(x_test)
#     pred_labels = np.rint(preds)
    
#     accuracy = sklearn.metrics.accuracy_score(y_test, pred_labels)
#     return accuracy
def objective(trial):
    """Optuna objective: train LightGBM with trial-sampled parameters and score it.

    Reads the notebook-level `x_train`/`y_train` (training split) and
    `x_test`/`y_test` (hold-out split).

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Accuracy of the rounded hold-out predictions against `y_test`.
    """
    # Imported explicitly: a bare `import sklearn` does not load the `metrics`
    # submodule, so `sklearn.metrics` only resolves if something else imported it.
    from sklearn.metrics import accuracy_score

    params = {
              "boosting": "gbdt",
              'num_leaves': trial.suggest_int("num_leaves",2,400),
              'min_data_in_leaf': trial.suggest_int("min_data_in_leaf",2,400),
              'objective': 'binary',  # training objective: binary classification
              'max_depth': trial.suggest_int("max_depth",10,80),
              'learning_rate': 0.01,
              "min_sum_hessian_in_leaf": trial.suggest_int("min_sum_hessian_in_leaf",10,200),
              "bagging_freq": trial.suggest_int("bagging_freq",1,3),
              # fraction of the training rows sampled for bagging (not a feature ratio)
              'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 0.8),
              "bagging_seed": 2021,
              'lambda_l2':  0.01,            # L2 regularisation
              "lambda_l1": 0.01,             # L1 regularisation
              "nthread": -1,                 # thread count; original note: -1 means use all threads
              'metric': {'binary_logloss', 'auc'},  # evaluation metrics reported during training
              "random_state": 2021,          # fixed seed so repeated runs give consistent results
#               'device': 'gpu'  # can speed up training if the GPU build of lightgbm is installed
              }
    # Datasets are rebuilt for every trial from the shared splits.
    trn_data = lgb.Dataset(x_train, label=y_train)
    val_data = lgb.Dataset(x_test,label = y_test)
    bst = lgb.train(params, trn_data, 2000, valid_sets=[trn_data, val_data],verbose_eval=-1, early_stopping_rounds = 10)
    preds = bst.predict(x_test)
    # Round the predicted scores to the nearest integer to obtain 0/1 labels.
    pred_labels = np.rint(preds)

    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy

In [None]:
if __name__ == "__main__":
    # Search for the parameter set that maximises the value returned by `objective`.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=1200, n_trials=10)

    print(f"Number of finished trials:  {len(study.trials)}")
    print("Best trial:")
    # `trial` is reused by the final-training cell further down.
    trial = study.best_trial

    print(f"  Value: {trial.value}")
    print("  Params: ")
    for param_name, param_value in trial.params.items():
        print(f"{param_name}: {param_value}")

In [None]:
%%time
# best_params = {
#         "lambda_l1": 1.5825659672515704,
#     "lambda_l2": 9.840109778310884,
#     "num_leaves": 141,
#     "feature_fraction": 0.5102935735830129,
#     "bagging_fraction": 0.6475779414110898,
#     "bagging_freq": 1,
#     "min_child_samples": 99
#     }
## Train the final model with the best parameters found by the search
# Work on a copy so the trial's own `params` dict is not modified in place.
best_params = dict(trial.params)
best_params.update({
    "objective": "binary",
    "metric": {'binary_logloss', 'auc'},
    "random_state": 2021,
    "device": "cpu",
    "nthread": -1,
    "bagging_seed": 2021,
    "boosting": "gbdt",
    "learning_rate": 0.01,
    # Same fixed regularisation that `objective` applied while scoring each
    # trial, so the final model uses the configuration that was evaluated.
    "lambda_l1": 0.01,
    "lambda_l2": 0.01,
})
trn_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_test, label=y_test)

# Up to 10000 boosting rounds, with early stopping after 20 rounds.
lg_model = lgb.train(best_params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=-1, early_stopping_rounds = 20)

In [None]:
# Drop the names bound to the raw frame and the full feature matrix;
# neither is referenced by the cells that follow.
del df_read, X_train

In [None]:
# Decision threshold: the share of training rows whose label is 1.
Threshold = Y_train.eq(1).sum() / Y_train.shape[0]

In [None]:
import janestreet
# Create the competition environment object; predictions are handed back to it
# via `env.predict(...)` in the submission loop.
env = janestreet.make_env()
# Iterator consumed by the submission loop, which unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
from tqdm import tqdm
for test_df, sample_prediction_df in tqdm(iter_test):
    # Replace missing feature values with the same -999 sentinel used for training.
    feature_matrix = fillna_npwhere_njit(test_df[features].values, -999)
    scores = lg_model.predict(feature_matrix, num_iteration=lg_model.best_iteration)
    # 1 where the predicted score reaches the threshold, 0 otherwise.
    preds = (scores >= Threshold).astype(int)
    sample_prediction_df.action = preds
    env.predict(sample_prediction_df)