diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index 5660ab2e9e..7bf5fd09a7 100644 --- a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -1,24 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sys import fire -from pathlib import Path import qlib import pickle -import numpy as np -import pandas as pd from qlib.config import REG_CN, HIGH_FREQ_CONFIG -from qlib.contrib.model.gbdt import LGBModel -from qlib.contrib.data.handler import Alpha158 -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) - -from qlib.utils import init_instance_by_config, exists_qlib_data + +from qlib.utils import init_instance_by_config from qlib.data.dataset.handler import DataHandlerLP from qlib.data.ops import Operators from qlib.data.data import Cal @@ -96,9 +85,7 @@ def _init_qlib(self): # use yahoo_cn_1min data QLIB_INIT_CONFIG = {**HIGH_FREQ_CONFIG, **self.SPEC_CONF} provider_uri = QLIB_INIT_CONFIG.get("provider_uri") - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN) + GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN, exists_skip=True) qlib.init(**QLIB_INIT_CONFIG) def _prepare_calender_cache(self): diff --git a/examples/hyperparameter/LightGBM/hyperparameter_158.py b/examples/hyperparameter/LightGBM/hyperparameter_158.py index 5e4887a14f..89cc10cc6a 100644 --- a/examples/hyperparameter/LightGBM/hyperparameter_158.py +++ b/examples/hyperparameter/LightGBM/hyperparameter_158.py @@ -1,46 +1,9 @@ import qlib -from qlib.config import REG_CN -from qlib.utils import exists_qlib_data, init_instance_by_config import optuna - -provider_uri = "~/.qlib/qlib_data/cn_data" -if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(scripts_dir)) - from get_data import GetData - - GetData().qlib_data(target_dir=provider_uri, region="cn") -qlib.init(provider_uri=provider_uri, region="cn") - -market = "csi300" -benchmark = "SH000300" - -data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": market, -} -dataset_task = { - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - }, -} -dataset = init_instance_by_config(dataset_task["dataset"]) +from qlib.config import REG_CN +from qlib.utils import init_instance_by_config +from qlib.tests.config import CSI300_DATASET_CONFIG +from qlib.tests.data import GetData def objective(trial): @@ -65,12 +28,19 @@ def objective(trial): }, }, } - evals_result = dict() model = init_instance_by_config(task["model"]) model.fit(dataset, evals_result=evals_result) return min(evals_result["valid"]) -study = optuna.Study(study_name="LGBM_158", storage="sqlite:///db.sqlite3") -study.optimize(objective, n_jobs=6) +if __name__ == "__main__": + + provider_uri = "~/.qlib/qlib_data/cn_data" + GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) + qlib.init(provider_uri=provider_uri, 
region="cn") + + dataset = init_instance_by_config(CSI300_DATASET_CONFIG) + + study = optuna.Study(study_name="LGBM_158", storage="sqlite:///db.sqlite3") + study.optimize(objective, n_jobs=6) diff --git a/examples/hyperparameter/LightGBM/hyperparameter_360.py b/examples/hyperparameter/LightGBM/hyperparameter_360.py index 8b498e912c..bc0cc245df 100644 --- a/examples/hyperparameter/LightGBM/hyperparameter_360.py +++ b/examples/hyperparameter/LightGBM/hyperparameter_360.py @@ -1,46 +1,11 @@ import qlib -from qlib.config import REG_CN -from qlib.utils import exists_qlib_data, init_instance_by_config import optuna +from qlib.config import REG_CN +from qlib.utils import init_instance_by_config +from qlib.tests.data import GetData +from qlib.tests.config import get_dataset_config, CSI300_MARKET, DATASET_ALPHA360_CLASS -provider_uri = "~/.qlib/qlib_data/cn_data" -if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(scripts_dir)) - from get_data import GetData - - GetData().qlib_data(target_dir=provider_uri, region="cn") -qlib.init(provider_uri=provider_uri, region="cn") - -market = "csi300" -benchmark = "SH000300" - -data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": market, -} -dataset_task = { - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha360", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - }, -} -dataset = init_instance_by_config(dataset_task["dataset"]) +DATASET_CONFIG = get_dataset_config(market=CSI300_MARKET, dataset_class=DATASET_ALPHA360_CLASS) def objective(trial): @@ -72,5 +37,13 @@ def objective(trial): return min(evals_result["valid"]) -study = optuna.Study(study_name="LGBM_360", storage="sqlite:///db.sqlite3") -study.optimize(objective, n_jobs=6) +if __name__ == "__main__": + + provider_uri = "~/.qlib/qlib_data/cn_data" + GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) + qlib.init(provider_uri=provider_uri, region=REG_CN) + + dataset = init_instance_by_config(DATASET_CONFIG) + + study = optuna.Study(study_name="LGBM_360", storage="sqlite:///db.sqlite3") + study.optimize(objective, n_jobs=6) diff --git a/examples/model_interpreter/feature.py b/examples/model_interpreter/feature.py new file mode 100644 index 0000000000..a1288e07d2 --- /dev/null +++ b/examples/model_interpreter/feature.py @@ -0,0 +1,32 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ + +import qlib +from qlib.config import REG_CN + +from qlib.utils import init_instance_by_config +from qlib.tests.data import GetData +from qlib.tests.config import CSI300_GBDT_TASK + + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + ################################### + # train model + ################################### + # model initialization + model = init_instance_by_config(CSI300_GBDT_TASK["model"]) + dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) + model.fit(dataset) + + # get model feature importance + feature_importance = model.get_feature_importance() + print("feature importance:") + print(feature_importance) diff --git a/examples/model_rolling/task_manager_rolling.py b/examples/model_rolling/task_manager_rolling.py index 4f3ac04b15..9ef8694bf4 100644 --- a/examples/model_rolling/task_manager_rolling.py +++ b/examples/model_rolling/task_manager_rolling.py @@ -17,63 +17,7 @@ from qlib.workflow.task.collect import RecorderCollector from qlib.model.ens.group import RollingGroup from qlib.model.trainer import TrainerRM - - -data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": "csi100", -} - -dataset_config = { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, -} - -record_config = [ - { - "class": "SignalRecord", - "module_path": "qlib.workflow.record_temp", - }, - { - "class": "SigAnaRecord", - "module_path": "qlib.workflow.record_temp", - }, -] - -# use lgb -task_lgb_config = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - }, - "dataset": dataset_config, - "record": record_config, -} - -# use xgboost -task_xgboost_config = { - "model": { - "class": "XGBModel", - "module_path": "qlib.contrib.model.xgboost", - }, - "dataset": dataset_config, - "record": record_config, -} +from qlib.tests.config import CSI100_RECORD_LGB_TASK_CONFIG, CSI100_RECORD_XGBOOST_TASK_CONFIG class RollingTaskExample: @@ -85,11 +29,13 @@ def __init__( task_db_name="rolling_db", experiment_name="rolling_exp", task_pool="rolling_task", - task_config=[task_xgboost_config, task_lgb_config], + task_config=None, rolling_step=550, rolling_type=RollingGen.ROLL_SD, ): # TaskManager config + if task_config is None: + task_config = [CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG] mongo_conf = { "task_url": task_url, "task_db_name": task_db_name, diff --git a/examples/online_srv/online_management_simulate.py b/examples/online_srv/online_management_simulate.py index 4bb5022ee0..8c9e77bf7f 100644 --- a/examples/online_srv/online_management_simulate.py +++ b/examples/online_srv/online_management_simulate.py @@ -13,63 +13,7 @@ from qlib.workflow.online.strategy import RollingStrategy from qlib.workflow.task.gen import RollingGen from qlib.workflow.task.manage import TaskManager - - -data_handler_config = { - "start_time": "2018-01-01", - "end_time": "2018-10-31", - "fit_start_time": "2018-01-01", - "fit_end_time": "2018-03-31", - "instruments": "csi100", -} - 
-dataset_config = { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2018-01-01", "2018-03-31"), - "valid": ("2018-04-01", "2018-05-31"), - "test": ("2018-06-01", "2018-09-10"), - }, - }, -} - -record_config = [ - { - "class": "SignalRecord", - "module_path": "qlib.workflow.record_temp", - }, - { - "class": "SigAnaRecord", - "module_path": "qlib.workflow.record_temp", - }, -] - -# use lgb model -task_lgb_config = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - }, - "dataset": dataset_config, - "record": record_config, -} - -# use xgboost model -task_xgboost_config = { - "model": { - "class": "XGBModel", - "module_path": "qlib.contrib.model.xgboost", - }, - "dataset": dataset_config, - "record": record_config, -} +from qlib.tests.config import CSI100_RECORD_LGB_TASK_CONFIG, CSI100_RECORD_XGBOOST_TASK_CONFIG class OnlineSimulationExample: @@ -84,7 +28,7 @@ def __init__( rolling_step=80, start_time="2018-09-10", end_time="2018-10-31", - tasks=[task_xgboost_config, task_lgb_config], + tasks=None, ): """ Init OnlineManagerExample. @@ -101,6 +45,8 @@ def __init__( end_time (str, optional): the end time of simulating. Defaults to "2018-10-31". tasks (dict or list[dict]): a set of the task config waiting for rolling and training """ + if tasks is None: + tasks = [CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG] self.exp_name = exp_name self.task_pool = task_pool self.start_time = start_time diff --git a/examples/online_srv/rolling_online_management.py b/examples/online_srv/rolling_online_management.py index 25b8b2a0c0..592f1f866c 100644 --- a/examples/online_srv/rolling_online_management.py +++ b/examples/online_srv/rolling_online_management.py @@ -17,62 +17,7 @@ from qlib.workflow.online.strategy import RollingStrategy from qlib.workflow.task.gen import RollingGen from qlib.workflow.online.manager import OnlineManager - -data_handler_config = { - "start_time": "2013-01-01", - "end_time": "2020-09-25", - "fit_start_time": "2013-01-01", - "fit_end_time": "2014-12-31", - "instruments": "csi100", -} - -dataset_config = { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2013-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2015-12-31"), - "test": ("2016-01-01", "2020-07-10"), - }, - }, -} - -record_config = [ - { - "class": "SignalRecord", - "module_path": "qlib.workflow.record_temp", - }, - { - "class": "SigAnaRecord", - "module_path": "qlib.workflow.record_temp", - }, -] - -# use lgb model -task_lgb_config = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - }, - "dataset": dataset_config, - "record": record_config, -} - -# use xgboost model -task_xgboost_config = { - "model": { - "class": "XGBModel", - "module_path": "qlib.contrib.model.xgboost", - }, - "dataset": dataset_config, - "record": record_config, -} +from qlib.tests.config import CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG class RollingOnlineExample: @@ -83,9 +28,13 @@ def __init__( task_url="mongodb://10.0.0.4:27017/", task_db_name="rolling_db", rolling_step=550, - tasks=[task_xgboost_config], - add_tasks=[task_lgb_config], + tasks=None, + add_tasks=None, ): + if add_tasks is None: + 
add_tasks = [CSI100_RECORD_LGB_TASK_CONFIG] + if tasks is None: + tasks = [CSI100_RECORD_XGBOOST_TASK_CONFIG] mongo_conf = { "task_url": task_url, # your MongoDB url "task_db_name": task_db_name, # database name diff --git a/examples/online_srv/update_online_pred.py b/examples/online_srv/update_online_pred.py index 228bc0dacb..8afc665538 100644 --- a/examples/online_srv/update_online_pred.py +++ b/examples/online_srv/update_online_pred.py @@ -7,56 +7,19 @@ Firstly, we will finish the training and set the trained models to the `online` models. Next, we will finish updating online predictions. """ +import copy import fire import qlib from qlib.config import REG_CN from qlib.model.trainer import task_train from qlib.workflow.online.utils import OnlineToolR +from qlib.tests.config import CSI300_GBDT_TASK -data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": "csi100", -} +task = copy.deepcopy(CSI300_GBDT_TASK) -task = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - "kwargs": { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - }, - "record": { - "class": "SignalRecord", - "module_path": "qlib.workflow.record_temp", - }, +task["record"] = { + "class": "SignalRecord", + "module_path": "qlib.workflow.record_temp", } diff --git a/examples/rolling_process_data/workflow.py b/examples/rolling_process_data/workflow.py index 5757aaa876..387d5cde70 100644 --- a/examples/rolling_process_data/workflow.py +++ b/examples/rolling_process_data/workflow.py @@ -4,13 +4,11 @@ import qlib import fire import pickle -import pandas as pd from datetime import datetime from qlib.config import REG_CN from qlib.data.dataset.handler import DataHandlerLP -from qlib.contrib.data.handler import Alpha158 -from qlib.utils import exists_qlib_data, init_instance_by_config +from qlib.utils import init_instance_by_config from qlib.tests.data import GetData @@ -25,9 +23,7 @@ def _init_qlib(self): """initialize qlib""" # use yahoo_cn_1min data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - GetData().qlib_data(target_dir=provider_uri, region=REG_CN) + GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region=REG_CN) def _dump_pre_handler(self, path): diff --git a/examples/run_all_model.py b/examples/run_all_model.py index d587eff155..c79fee004d 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -5,13 +5,11 @@ import sys import fire import time -import venv import glob import shutil import signal import inspect import tempfile -import traceback import functools import statistics import subprocess @@ -23,8 +21,7 @@ import qlib from qlib.config import REG_CN from qlib.workflow import R -from qlib.workflow.cli import workflow -from qlib.utils import exists_qlib_data +from 
qlib.tests.data import GetData # init qlib @@ -39,12 +36,8 @@ "default_exp_name": "Experiment", }, } -if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) - from get_data import GetData - GetData().qlib_data(target_dir=provider_uri, region=REG_CN) +GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region=REG_CN, exp_manager=exp_manager) # decorator to check the arguments diff --git a/examples/workflow_by_code.py b/examples/workflow_by_code.py index d5dab89178..1cdf2ac80f 100644 --- a/examples/workflow_by_code.py +++ b/examples/workflow_by_code.py @@ -1,82 +1,22 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sys -from pathlib import Path - import qlib -import pandas as pd from qlib.config import REG_CN -from qlib.contrib.model.gbdt import LGBModel -from qlib.contrib.data.handler import Alpha158 -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict +from qlib.utils import init_instance_by_config, flatten_dict from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord, PortAnaRecord from qlib.tests.data import GetData +from qlib.tests.config import CSI300_BENCH, CSI300_GBDT_TASK + if __name__ == "__main__": # use default data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - GetData().qlib_data(target_dir=provider_uri, region=REG_CN) - + GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region=REG_CN) - market = "csi300" - benchmark = "SH000300" - - ################################### - # train model - ################################### - data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": market, - } - - task = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - "kwargs": { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - }, - } - port_analysis_config = { "strategy": { "class": "TopkDropoutStrategy", @@ -90,7 +30,7 @@ "verbose": False, "limit_threshold": 0.095, "account": 100000000, - "benchmark": benchmark, + "benchmark": CSI300_BENCH, "deal_price": "close", "open_cost": 0.0005, "close_cost": 0.0015, @@ -100,8 +40,8 @@ } # model initialization - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) + model = init_instance_by_config(CSI300_GBDT_TASK["model"]) + dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) # NOTE: This line is optional # It demonstrates that the dataset can be used 
standalone. @@ -110,7 +50,7 @@ # start exp with R.start(experiment_name="workflow"): - R.log_params(**flatten_dict(task)) + R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) R.save_objects(**{"params.pkl": model}) diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index 98b9b9c2df..5138e0e6f0 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -10,9 +10,10 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import FeatureInt -class CatBoostModel(Model): +class CatBoostModel(Model, FeatureInt): """CatBoost Model""" def __init__(self, loss="RMSE", **kwargs): @@ -69,6 +70,18 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(x_test.values), index=x_test.index) + def get_feature_importance(self, *args, **kwargs) -> pd.Series: + """get feature importance + + Notes + ----- + parameters references: + https://catboost.ai/docs/concepts/python-reference_catboost_get_feature_importance.html#python-reference_catboost_get_feature_importance + """ + return pd.Series( + data=self.model.get_feature_importance(*args, **kwargs), index=self.model.feature_names_ + ).sort_values(ascending=False) + if __name__ == "__main__": cat = CatBoostModel() diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 4b267a2b00..d3ca898f87 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -1,251 +1,265 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import lightgbm as lgb -import numpy as np -import pandas as pd -from typing import Text, Union -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from ...log import get_module_logger - - -class DEnsembleModel(Model): - """Double Ensemble Model""" - - def __init__( - self, - base_model="gbm", - loss="mse", - num_models=6, - enable_sr=True, - enable_fs=True, - alpha1=1.0, - alpha2=1.0, - bins_sr=10, - bins_fs=5, - decay=None, - sample_ratios=None, - sub_weights=None, - epochs=100, - **kwargs - ): - self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm" - self.num_models = num_models # the number of sub-models - self.enable_sr = enable_sr - self.enable_fs = enable_fs - self.alpha1 = alpha1 - self.alpha2 = alpha2 - self.bins_sr = bins_sr - self.bins_fs = bins_fs - self.decay = decay - if sample_ratios is None: # the default values for sample_ratios - sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4] - if sub_weights is None: # the default values for sub_weights - sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2] - if not len(sample_ratios) == bins_fs: - raise ValueError("The length of sample_ratios should be equal to bins_fs.") - self.sample_ratios = sample_ratios - if not len(sub_weights) == num_models: - raise ValueError("The length of sub_weights should be equal to num_models.") - self.sub_weights = sub_weights - self.epochs = epochs - self.logger = get_module_logger("DEnsembleModel") - self.logger.info("Double Ensemble Model...") - self.ensemble = [] # the current ensemble model, a list contains all the sub-models - self.sub_features = [] # the features for each sub model in the form of pandas.Index - self.params = {"objective": loss} - self.params.update(kwargs) - self.loss = loss - - def fit(self, dataset: DatasetH): - df_train, df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L - ) - x_train, y_train = df_train["feature"], df_train["label"] - # initialize the sample weights - N, F = x_train.shape - weights = pd.Series(np.ones(N, dtype=float)) - # initialize the features - features = x_train.columns - pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index) - # train sub-models - for k in range(self.num_models): - self.sub_features.append(features) - self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models)) - model_k = self.train_submodel(df_train, df_valid, weights, features) - self.ensemble.append(model_k) - # no further sample re-weight and feature selection needed for the last sub-model - if k + 1 == self.num_models: - break - - self.logger.info("Retrieving loss curve and loss values...") - loss_curve = self.retrieve_loss_curve(model_k, df_train, features) - pred_k = self.predict_sub(model_k, df_train, features) - pred_sub.iloc[:, k] = pred_k - pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1) - loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values)) - - if self.enable_sr: - self.logger.info("Sample re-weighting...") - weights = self.sample_reweight(loss_curve, loss_values, k + 1) - - if self.enable_fs: - self.logger.info("Feature selection...") - features = self.feature_selection(df_train, loss_values) - - def train_submodel(self, df_train, df_valid, weights, features): - dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features) - evals_result = dict() - model = lgb.train( - self.params, - dtrain, - num_boost_round=self.epochs, 
- valid_sets=[dtrain, dvalid], - valid_names=["train", "valid"], - verbose_eval=20, - evals_result=evals_result, - ) - evals_result["train"] = list(evals_result["train"].values())[0] - evals_result["valid"] = list(evals_result["valid"].values())[0] - return model - - def _prepare_data_gbm(self, df_train, df_valid, weights, features): - x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] - x_valid, y_valid = df_valid["feature"].loc[:, features], df_valid["label"] - - # Lightgbm need 1D array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values) - else: - raise ValueError("LightGBM doesn't support multi-label training") - - dtrain = lgb.Dataset(x_train.values, label=y_train, weight=weights) - dvalid = lgb.Dataset(x_valid.values, label=y_valid) - return dtrain, dvalid - - def sample_reweight(self, loss_curve, loss_values, k_th): - """ - the SR module of Double Ensemble - :param loss_curve: the shape is NxT - the loss curve for the previous sub-model, where the element (i, t) if the error on the i-th sample - after the t-th iteration in the training of the previous sub-model. - :param loss_values: the shape is N - the loss of the current ensemble on the i-th sample. - :param k_th: the index of the current sub-model, starting from 1 - :return: weights - the weights for all the samples. - """ - # normalize loss_curve and loss_values with ranking - loss_curve_norm = loss_curve.rank(axis=0, pct=True) - loss_values_norm = (-loss_values).rank(pct=True) - - # calculate l_start and l_end from loss_curve - N, T = loss_curve.shape - part = np.maximum(int(T * 0.1), 1) - l_start = loss_curve_norm.iloc[:, :part].mean(axis=1) - l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1) - - # calculate h-value for each sample - h1 = loss_values_norm - h2 = (l_end / l_start).rank(pct=True) - h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2}) - - # calculate weights - h["bins"] = pd.cut(h["h_value"], self.bins_sr) - h_avg = h.groupby("bins")["h_value"].mean() - weights = pd.Series(np.zeros(N, dtype=float)) - for i_b, b in enumerate(h_avg.index): - weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1) - return weights - - def feature_selection(self, df_train, loss_values): - """ - the FS module of Double Ensemble - :param df_train: the shape is NxF - :param loss_values: the shape is N - the loss of the current ensemble on the i-th sample. 
- :return: res_feat: in the form of pandas.Index - - """ - x_train, y_train = df_train["feature"], df_train["label"] - features = x_train.columns - N, F = x_train.shape - g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)}) - M = len(self.ensemble) - - # shuffle specific columns and calculate g-value for each feature - x_train_tmp = x_train.copy() - for i_f, feat in enumerate(features): - x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values) - pred = pd.Series(np.zeros(N), index=x_train_tmp.index) - for i_s, submodel in enumerate(self.ensemble): - pred += ( - pd.Series( - submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index - ) - / M - ) - loss_feat = self.get_loss(y_train.values.squeeze(), pred.values) - g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / (np.std(loss_feat - loss_values) + 1e-7) - x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy() - - # one column in train features is all-nan # if g['g_value'].isna().any() - g["g_value"].replace(np.nan, 0, inplace=True) - - # divide features into bins_fs bins - g["bins"] = pd.cut(g["g_value"], self.bins_fs) - - # randomly sample features from bins to construct the new features - res_feat = [] - sorted_bins = sorted(g["bins"].unique(), reverse=True) - for i_b, b in enumerate(sorted_bins): - b_feat = features[g["bins"] == b] - num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat))) - res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist() - return pd.Index(res_feat) - - def get_loss(self, label, pred): - if self.loss == "mse": - return (label - pred) ** 2 - else: - raise ValueError("not implemented yet") - - def retrieve_loss_curve(self, model, df_train, features): - if self.base_model == "gbm": - num_trees = model.num_trees() - x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] - # Lightgbm need 1D array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train = np.squeeze(y_train.values) - else: - raise ValueError("LightGBM doesn't support multi-label training") - - N = x_train.shape[0] - loss_curve = pd.DataFrame(np.zeros((N, num_trees))) - pred_tree = np.zeros(N, dtype=float) - for i_tree in range(num_trees): - pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1) - loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree) - else: - raise ValueError("not implemented yet") - return loss_curve - - def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): - if self.ensemble is None: - raise ValueError("model is not fitted yet!") - x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) - pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index) - for i_sub, submodel in enumerate(self.ensemble): - feat_sub = self.sub_features[i_sub] - pred += ( - pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) - * self.sub_weights[i_sub] - ) - return pred - - def predict_sub(self, submodel, df_data, features): - x_data, y_data = df_data["feature"].loc[:, features], df_data["label"] - pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index) - return pred_sub +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import lightgbm as lgb +import numpy as np +import pandas as pd +from typing import Text, Union +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import FeatureInt +from ...log import get_module_logger + + +class DEnsembleModel(Model, FeatureInt): + """Double Ensemble Model""" + + def __init__( + self, + base_model="gbm", + loss="mse", + num_models=6, + enable_sr=True, + enable_fs=True, + alpha1=1.0, + alpha2=1.0, + bins_sr=10, + bins_fs=5, + decay=None, + sample_ratios=None, + sub_weights=None, + epochs=100, + **kwargs + ): + self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm" + self.num_models = num_models # the number of sub-models + self.enable_sr = enable_sr + self.enable_fs = enable_fs + self.alpha1 = alpha1 + self.alpha2 = alpha2 + self.bins_sr = bins_sr + self.bins_fs = bins_fs + self.decay = decay + if sample_ratios is None: # the default values for sample_ratios + sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4] + if sub_weights is None: # the default values for sub_weights + sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2] + if not len(sample_ratios) == bins_fs: + raise ValueError("The length of sample_ratios should be equal to bins_fs.") + self.sample_ratios = sample_ratios + if not len(sub_weights) == num_models: + raise ValueError("The length of sub_weights should be equal to num_models.") + self.sub_weights = sub_weights + self.epochs = epochs + self.logger = get_module_logger("DEnsembleModel") + self.logger.info("Double Ensemble Model...") + self.ensemble = [] # the current ensemble model, a list contains all the sub-models + self.sub_features = [] # the features for each sub model in the form of pandas.Index + self.params = {"objective": loss} + self.params.update(kwargs) + self.loss = loss + + def fit(self, dataset: DatasetH): + df_train, df_valid = dataset.prepare( + ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ) + x_train, y_train = df_train["feature"], df_train["label"] + # initialize the sample weights + N, F = x_train.shape + weights = pd.Series(np.ones(N, dtype=float)) + # initialize the features + features = x_train.columns + pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index) + # train sub-models + for k in range(self.num_models): + self.sub_features.append(features) + self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models)) + model_k = self.train_submodel(df_train, df_valid, weights, features) + self.ensemble.append(model_k) + # no further sample re-weight and feature selection needed for the last sub-model + if k + 1 == self.num_models: + break + + self.logger.info("Retrieving loss curve and loss values...") + loss_curve = self.retrieve_loss_curve(model_k, df_train, features) + pred_k = self.predict_sub(model_k, df_train, features) + pred_sub.iloc[:, k] = pred_k + pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1) + loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values)) + + if self.enable_sr: + self.logger.info("Sample re-weighting...") + weights = self.sample_reweight(loss_curve, loss_values, k + 1) + + if self.enable_fs: + self.logger.info("Feature selection...") + features = self.feature_selection(df_train, loss_values) + + def train_submodel(self, df_train, df_valid, weights, features): + dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features) + evals_result = dict() + model = 
lgb.train( + self.params, + dtrain, + num_boost_round=self.epochs, + valid_sets=[dtrain, dvalid], + valid_names=["train", "valid"], + verbose_eval=20, + evals_result=evals_result, + ) + evals_result["train"] = list(evals_result["train"].values())[0] + evals_result["valid"] = list(evals_result["valid"].values())[0] + return model + + def _prepare_data_gbm(self, df_train, df_valid, weights, features): + x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] + x_valid, y_valid = df_valid["feature"].loc[:, features], df_valid["label"] + + # Lightgbm need 1D array as its label + if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: + y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values) + else: + raise ValueError("LightGBM doesn't support multi-label training") + + dtrain = lgb.Dataset(x_train, label=y_train, weight=weights) + dvalid = lgb.Dataset(x_valid, label=y_valid) + return dtrain, dvalid + + def sample_reweight(self, loss_curve, loss_values, k_th): + """ + the SR module of Double Ensemble + :param loss_curve: the shape is NxT + the loss curve for the previous sub-model, where the element (i, t) if the error on the i-th sample + after the t-th iteration in the training of the previous sub-model. + :param loss_values: the shape is N + the loss of the current ensemble on the i-th sample. + :param k_th: the index of the current sub-model, starting from 1 + :return: weights + the weights for all the samples. + """ + # normalize loss_curve and loss_values with ranking + loss_curve_norm = loss_curve.rank(axis=0, pct=True) + loss_values_norm = (-loss_values).rank(pct=True) + + # calculate l_start and l_end from loss_curve + N, T = loss_curve.shape + part = np.maximum(int(T * 0.1), 1) + l_start = loss_curve_norm.iloc[:, :part].mean(axis=1) + l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1) + + # calculate h-value for each sample + h1 = loss_values_norm + h2 = (l_end / l_start).rank(pct=True) + h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2}) + + # calculate weights + h["bins"] = pd.cut(h["h_value"], self.bins_sr) + h_avg = h.groupby("bins")["h_value"].mean() + weights = pd.Series(np.zeros(N, dtype=float)) + for i_b, b in enumerate(h_avg.index): + weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1) + return weights + + def feature_selection(self, df_train, loss_values): + """ + the FS module of Double Ensemble + :param df_train: the shape is NxF + :param loss_values: the shape is N + the loss of the current ensemble on the i-th sample. 
+ :return: res_feat: in the form of pandas.Index + + """ + x_train, y_train = df_train["feature"], df_train["label"] + features = x_train.columns + N, F = x_train.shape + g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)}) + M = len(self.ensemble) + + # shuffle specific columns and calculate g-value for each feature + x_train_tmp = x_train.copy() + for i_f, feat in enumerate(features): + x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values) + pred = pd.Series(np.zeros(N), index=x_train_tmp.index) + for i_s, submodel in enumerate(self.ensemble): + pred += ( + pd.Series( + submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index + ) + / M + ) + loss_feat = self.get_loss(y_train.values.squeeze(), pred.values) + g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / (np.std(loss_feat - loss_values) + 1e-7) + x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy() + + # one column in train features is all-nan # if g['g_value'].isna().any() + g["g_value"].replace(np.nan, 0, inplace=True) + + # divide features into bins_fs bins + g["bins"] = pd.cut(g["g_value"], self.bins_fs) + + # randomly sample features from bins to construct the new features + res_feat = [] + sorted_bins = sorted(g["bins"].unique(), reverse=True) + for i_b, b in enumerate(sorted_bins): + b_feat = features[g["bins"] == b] + num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat))) + res_feat = res_feat + np.random.choice(b_feat, size=num_feat, replace=False).tolist() + return pd.Index(set(res_feat)) + + def get_loss(self, label, pred): + if self.loss == "mse": + return (label - pred) ** 2 + else: + raise ValueError("not implemented yet") + + def retrieve_loss_curve(self, model, df_train, features): + if self.base_model == "gbm": + num_trees = model.num_trees() + x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] + # Lightgbm need 1D array as its label + if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: + y_train = np.squeeze(y_train.values) + else: + raise ValueError("LightGBM doesn't support multi-label training") + + N = x_train.shape[0] + loss_curve = pd.DataFrame(np.zeros((N, num_trees))) + pred_tree = np.zeros(N, dtype=float) + for i_tree in range(num_trees): + pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1) + loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree) + else: + raise ValueError("not implemented yet") + return loss_curve + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if self.ensemble is None: + raise ValueError("model is not fitted yet!") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index) + for i_sub, submodel in enumerate(self.ensemble): + feat_sub = self.sub_features[i_sub] + pred += ( + pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) + * self.sub_weights[i_sub] + ) + return pred + + def predict_sub(self, submodel, df_data, features): + x_data, y_data = df_data["feature"].loc[:, features], df_data["label"] + pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index) + return pred_sub + + def get_feature_importance(self, *args, **kwargs) -> pd.Series: + """get feature importance + + Notes + ----- + parameters reference: + https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html?highlight=feature_importance#lightgbm.Booster.feature_importance + """ + res = [] + 
for _model, _weight in zip(self.ensemble, self.sub_weights): + res.append(pd.Series(_model.feature_importance(*args, **kwargs), index=_model.feature_name()) * _weight) + return pd.concat(res, axis=1, sort=False).sum(axis=1).sort_values(ascending=False) diff --git a/qlib/contrib/model/gbdt.py b/qlib/contrib/model/gbdt.py index 463cf8f4fa..1a7cf7fba3 100644 --- a/qlib/contrib/model/gbdt.py +++ b/qlib/contrib/model/gbdt.py @@ -8,9 +8,10 @@ from ...model.base import ModelFT from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import LightGBMFInt -class LGBModel(ModelFT): +class LGBModel(ModelFT, LightGBMFInt): """LightGBM Model""" def __init__(self, loss="mse", **kwargs): @@ -33,8 +34,8 @@ def _prepare_data(self, dataset: DatasetH): else: raise ValueError("LightGBM doesn't support multi-label training") - dtrain = lgb.Dataset(x_train.values, label=y_train) - dvalid = lgb.Dataset(x_valid.values, label=y_valid) + dtrain = lgb.Dataset(x_train, label=y_train) + dvalid = lgb.Dataset(x_valid, label=y_valid) return dtrain, dvalid def fit( diff --git a/qlib/contrib/model/highfreq_gdbt_model.py b/qlib/contrib/model/highfreq_gdbt_model.py index 5a2eeb50a9..04d6ab9d58 100644 --- a/qlib/contrib/model/highfreq_gdbt_model.py +++ b/qlib/contrib/model/highfreq_gdbt_model.py @@ -1,17 +1,18 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import warnings import numpy as np import pandas as pd import lightgbm as lgb -from qlib.model.base import ModelFT -from qlib.data.dataset import DatasetH -from qlib.data.dataset.handler import DataHandlerLP -import warnings +from ...model.base import ModelFT +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import LightGBMFInt -class HFLGBModel(ModelFT): +class HFLGBModel(ModelFT, LightGBMFInt): """LightGBM Model for high frequency prediction""" def __init__(self, loss="mse", **kwargs): @@ -97,8 +98,8 @@ def _prepare_data(self, dataset: DatasetH): else: raise ValueError("LightGBM doesn't support multi-label training") - dtrain = lgb.Dataset(x_train.values, label=y_train) - dvalid = lgb.Dataset(x_valid.values, label=y_valid) + dtrain = lgb.Dataset(x_train, label=y_train) + dvalid = lgb.Dataset(x_valid, label=y_valid) return dtrain, dvalid def fit( diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index cbba146782..2a38f4fe19 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -8,9 +8,10 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import FeatureInt -class XGBModel(Model): +class XGBModel(Model, FeatureInt): """XGBModel Model""" def __init__(self, **kwargs): @@ -42,8 +43,8 @@ def fit( else: raise ValueError("XGBoost doesn't support multi-label training") - dtrain = xgb.DMatrix(x_train.values, label=y_train_1d) - dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d) + dtrain = xgb.DMatrix(x_train, label=y_train_1d) + dvalid = xgb.DMatrix(x_valid, label=y_valid_1d) self.model = xgb.train( self._params, dtrain=dtrain, @@ -62,3 +63,13 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): raise ValueError("model is not fitted yet!") x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index) + + def get_feature_importance(self, *args, 
**kwargs) -> pd.Series: + """get feature importance + + Notes + ------- + parameters reference: + https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.get_score + """ + return pd.Series(self.model.get_score(*args, **kwargs)).sort_values(ascending=False) diff --git a/qlib/model/interpret/__init__.py b/qlib/model/interpret/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qlib/model/interpret/base.py b/qlib/model/interpret/base.py new file mode 100644 index 0000000000..57cc7929a9 --- /dev/null +++ b/qlib/model/interpret/base.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Interfaces to interpret models +""" + +import pandas as pd +from abc import abstractmethod + + +class FeatureInt: + """Feature (Int)erpreter""" + + @abstractmethod + def get_feature_importance(self) -> pd.Series: + """get feature importance + + Returns + ------- + The index is the feature name. + + The greater the value, the higher importance. + """ + + +class LightGBMFInt(FeatureInt): + """LightGBM (F)eature (Int)erpreter""" + + def get_feature_importance(self, *args, **kwargs) -> pd.Series: + """get feature importance + + Notes + ----- + parameters reference: + https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html?highlight=feature_importance#lightgbm.Booster.feature_importance + """ + return pd.Series(self.model.feature_importance(*args, **kwargs), index=self.model.feature_name()).sort_values( + ascending=False + ) diff --git a/qlib/tests/__init__.py b/qlib/tests/__init__.py index 8b53bc53a5..7f43cd99ac 100644 --- a/qlib/tests/__init__.py +++ b/qlib/tests/__init__.py @@ -1,6 +1,4 @@ -import sys import unittest -from ..utils import exists_qlib_data from .data import GetData from .. import init from ..config import REG_CN @@ -14,14 +12,13 @@ class TestAutoData(unittest.TestCase): @classmethod def setUpClass(cls) -> None: # use default data - if not exists_qlib_data(cls.provider_uri): - print(f"Qlib data is not found in {cls.provider_uri}") - GetData().qlib_data( - name="qlib_data_simple", - region="cn", - interval="1d", - target_dir=cls.provider_uri, - delete_old=False, - ) + GetData().qlib_data( + name="qlib_data_simple", + region=REG_CN, + interval="1d", + target_dir=cls.provider_uri, + delete_old=False, + exists_skip=True, + ) init(provider_uri=cls.provider_uri, region=REG_CN, **cls._setup_kwargs) diff --git a/qlib/tests/config.py b/qlib/tests/config.py new file mode 100644 index 0000000000..80461f6f9b --- /dev/null +++ b/qlib/tests/config.py @@ -0,0 +1,108 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +CSI300_MARKET = "csi300" +CSI100_MARKET = "csi100" + +CSI300_BENCH = "SH000300" + +DATASET_ALPHA158_CLASS = "Alpha158" +DATASET_ALPHA360_CLASS = "Alpha360" + +################################### +# config +################################### + + +GBDT_MODEL = { + "class": "LGBModel", + "module_path": "qlib.contrib.model.gbdt", + "kwargs": { + "loss": "mse", + "colsample_bytree": 0.8879, + "learning_rate": 0.0421, + "subsample": 0.8789, + "lambda_l1": 205.6999, + "lambda_l2": 580.9768, + "max_depth": 8, + "num_leaves": 210, + "num_threads": 20, + }, +} + + +RECORD_CONFIG = [ + { + "class": "SignalRecord", + "module_path": "qlib.workflow.record_temp", + }, + { + "class": "SigAnaRecord", + "module_path": "qlib.workflow.record_temp", + }, +] + + +def get_data_handler_config(market=CSI300_MARKET): + return { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": market, + } + + +def get_dataset_config(market=CSI300_MARKET, dataset_class=DATASET_ALPHA158_CLASS): + return { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": dataset_class, + "module_path": "qlib.contrib.data.handler", + "kwargs": get_data_handler_config(market), + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + } + + +def get_gbdt_task(market=CSI300_MARKET): + return { + "model": GBDT_MODEL, + "dataset": get_dataset_config(market), + } + + +def get_record_lgb_config(market=CSI300_MARKET): + return { + "model": { + "class": "LGBModel", + "module_path": "qlib.contrib.model.gbdt", + }, + "dataset": get_dataset_config(market), + "record": RECORD_CONFIG, + } + + +def get_record_xgboost_config(market=CSI300_MARKET): + return { + "model": { + "class": "XGBModel", + "module_path": "qlib.contrib.model.xgboost", + }, + "dataset": get_dataset_config(market), + "record": RECORD_CONFIG, + } + + +CSI300_DATASET_CONFIG = get_dataset_config(market=CSI300_MARKET) +CSI300_GBDT_TASK = get_gbdt_task(market=CSI300_MARKET) + +CSI100_RECORD_XGBOOST_TASK_CONFIG = get_record_xgboost_config(market=CSI100_MARKET) +CSI100_RECORD_LGB_TASK_CONFIG = get_record_lgb_config(market=CSI100_MARKET) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 3bf6a2c969..2bfe435906 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -10,6 +10,7 @@ from tqdm import tqdm from pathlib import Path from loguru import logger +from qlib.utils import exists_qlib_data class GetData: @@ -112,6 +113,7 @@ def qlib_data( interval="1d", region="cn", delete_old=True, + exists_skip=False, ): """download cn qlib data from remote @@ -129,6 +131,8 @@ def qlib_data( data region, value from [cn, us], by default cn delete_old: bool delete an existing directory, by default True + exists_skip: bool + exists skip, by default False Examples --------- @@ -140,6 +144,13 @@ def qlib_data( ------- """ + if exists_skip and exists_qlib_data(target_dir): + logger.warning( + f"Data already exists: {target_dir}, the data download will be skipped\n" + f"\tIf downloading is required: `exists_skip=False` or `change target_dir`" + ) + return + qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__)) def _get_file_name(v): diff --git a/tests/dataset_tests/test_datalayer.py b/tests/dataset_tests/test_datalayer.py index 9d282b1672..bdd0d915bf 100644 --- a/tests/dataset_tests/test_datalayer.py +++ b/tests/dataset_tests/test_datalayer.py @@ -1,26 +1,10 @@ 
-import sys -from pathlib import Path -import qlib -from qlib.data import D -from qlib.config import REG_CN import unittest import numpy as np -from qlib.utils import exists_qlib_data - - -class TestDataset(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - # use default data - provider_uri = "~/.qlib/qlib_data/cn_data_simple" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.parent.joinpath("scripts"))) - from get_data import GetData +from qlib.data import D +from qlib.tests import TestAutoData - GetData().qlib_data(name="qlib_data_simple", target_dir=provider_uri) - qlib.init(provider_uri=provider_uri, region=REG_CN) +class TestDataset(TestAutoData): def testCSI300(self): close_p = D.features(D.instruments("csi300"), ["$close"]) size = close_p.groupby("datetime").size() diff --git a/tests/test_all_pipeline.py b/tests/test_all_pipeline.py index d34c1773ad..4c20405fa7 100644 --- a/tests/test_all_pipeline.py +++ b/tests/test_all_pipeline.py @@ -12,55 +12,7 @@ from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord, SigAnaRecord, PortAnaRecord from qlib.tests import TestAutoData - - -market = "csi300" -benchmark = "SH000300" - -################################### -# train model -################################### -data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": market, -} - -task = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - "kwargs": { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - }, -} +from qlib.tests.config import CSI300_GBDT_TASK, CSI300_BENCH port_analysis_config = { "strategy": { @@ -75,7 +27,7 @@ "verbose": False, "limit_threshold": 0.095, "account": 100000000, - "benchmark": benchmark, + "benchmark": CSI300_BENCH, "deal_price": "close", "open_cost": 0.0005, "close_cost": 0.0015, @@ -96,15 +48,15 @@ def train(): """ # model initiaiton - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) + model = init_instance_by_config(CSI300_GBDT_TASK["model"]) + dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) # To test __repr__ print(dataset) print(R) # start exp with R.start(experiment_name="workflow"): - R.log_params(**flatten_dict(task)) + R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) # prediction @@ -137,12 +89,12 @@ def train_with_sigana(): performance: dict model performance """ - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) + model = init_instance_by_config(CSI300_GBDT_TASK["model"]) + dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) # start exp with R.start(experiment_name="workflow_with_sigana"): - R.log_params(**flatten_dict(task)) + R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) # 
predict and calculate ic and ric @@ -171,7 +123,7 @@ def fake_experiment(): default_uri = R.get_uri() current_uri = "file:./temp-test-exp-mag" with R.start(experiment_name="fake_workflow_for_expm", uri=current_uri): - R.log_params(**flatten_dict(task)) + R.log_params(**flatten_dict(CSI300_GBDT_TASK)) current_uri_to_check = R.get_uri() default_uri_to_check = R.get_uri() diff --git a/tests/test_contrib_workflow.py b/tests/test_contrib_workflow.py index ccd3c6a901..9b1edbd4eb 100644 --- a/tests/test_contrib_workflow.py +++ b/tests/test_contrib_workflow.py @@ -1,73 +1,22 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sys import shutil import unittest from pathlib import Path -import qlib -from qlib.config import C from qlib.contrib.workflow import MultiSegRecord, SignalMseRecord from qlib.utils import init_instance_by_config, flatten_dict from qlib.workflow import R from qlib.tests import TestAutoData - - -market = "csi300" -benchmark = "SH000300" - -################################### -# train model -################################### -data_handler_config = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": market, -} - -task = { - "model": { - "class": "LGBModel", - "module_path": "qlib.contrib.model.gbdt", - "kwargs": { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": data_handler_config, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - }, -} +from qlib.tests.config import CSI300_GBDT_TASK def train_multiseg(): - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) + model = init_instance_by_config(CSI300_GBDT_TASK["model"]) + dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) with R.start(experiment_name="workflow"): - R.log_params(**flatten_dict(task)) + R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) recorder = R.get_recorder() sr = MultiSegRecord(model, dataset, recorder) @@ -77,10 +26,10 @@ def train_multiseg(): def train_mse(): - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) + model = init_instance_by_config(CSI300_GBDT_TASK["model"]) + dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) with R.start(experiment_name="workflow"): - R.log_params(**flatten_dict(task)) + R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) recorder = R.get_recorder() sr = SignalMseRecord(recorder, model=model, dataset=dataset) diff --git a/tests/test_get_data.py b/tests/test_get_data.py index c511d1b910..93a852f554 100644 --- a/tests/test_get_data.py +++ b/tests/test_get_data.py @@ -1,16 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import sys import shutil import unittest from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) -from get_data import GetData - import qlib from qlib.data import D +from qlib.tests.data import GetData DATA_DIR = Path(__file__).parent.joinpath("test_get_data") SOURCE_DIR = DATA_DIR.joinpath("source") @@ -37,7 +34,9 @@ def tearDownClass(cls) -> None: def test_0_qlib_data(self): - GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False) + GetData().qlib_data( + name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False, exists_skip=True + ) df = D.features(D.instruments("csi300"), self.FIELDS) self.assertListEqual(list(df.columns), self.FIELDS, "get qlib data failed") self.assertFalse(df.dropna().empty, "get qlib data failed") diff --git a/tests/test_register_ops.py b/tests/test_register_ops.py index 7d3322ddcc..ac86be59ce 100644 --- a/tests/test_register_ops.py +++ b/tests/test_register_ops.py @@ -1,17 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sys import unittest import numpy as np -import qlib from qlib.data import D from qlib.data.ops import ElemOperator, PairOperator -from qlib.config import REG_CN -from qlib.utils import exists_qlib_data from qlib.tests import TestAutoData -from qlib.tests.data import GetData class Diff(ElemOperator):
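# --- Usage sketch (not part of the patch above) ---
# A minimal, hedged example of how the FeatureInt interface added in
# qlib/model/interpret/base.py can be exercised after training, mirroring the
# new examples/model_interpreter/feature.py in this diff. The `importance_type`
# argument is simply forwarded by LightGBMFInt to
# lightgbm.Booster.feature_importance; using "gain" here is an illustrative
# choice, not something the patch requires.
import qlib
from qlib.config import REG_CN
from qlib.utils import init_instance_by_config
from qlib.tests.data import GetData
from qlib.tests.config import CSI300_GBDT_TASK

if __name__ == "__main__":
    provider_uri = "~/.qlib/qlib_data/cn_data"
    # download the default data only if it is not already present (new exists_skip flag)
    GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
    qlib.init(provider_uri=provider_uri, region=REG_CN)

    # LGBModel now mixes in LightGBMFInt, so it exposes get_feature_importance()
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    model.fit(dataset)

    # split-based importance (LightGBM default) and gain-based importance,
    # both returned as a pandas Series sorted in descending order
    print(model.get_feature_importance().head(10))
    print(model.get_feature_importance(importance_type="gain").head(10))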