<a href="https://colab.research.google.com/github/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.

In [1]:
import sys, site
from pathlib import Path

################################# NOTE #################################
#  Please be aware that if colab installs the latest numpy and pyqlib  #
#  in this cell, users should RESTART the runtime in order to run the  #
#  following cells successfully.                                       #
########################################################################

try:
    import qlib
except ImportError:
    # install qlib
    ! pip install --upgrade numpy
    ! pip install pyqlib
    if "google.colab" in sys.modules:
        # The Google colab environment is a little outdated. We have to downgrade the pyyaml to make it compatible with other packages
        ! pip install pyyaml==5.4.1
    # reload
    site.main()

scripts_dir = Path.cwd().parent.joinpath("scripts")
if not scripts_dir.joinpath("get_data.py").exists():
    # download get_data.py script
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests

    with requests.get("https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py", timeout=10) as resp:
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

Collecting numpy
  Downloading numpy-2.3.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading numpy-2.3.0-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.61.2 requires numpy<2.3,>=1.24, but you have numpy 2.3.0 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.3.0 which is incompatible.
langchain 0.0.226 requires numpy<2,>=1, but you have numpy 2.3.0 which is incompatible.
langchain 0.0.226 requires pydantic<2,>=1, but you have py

In [2]:
import qlib
import pandas as pd
from qlib.constant import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict

In [3]:
# Use the default CN dataset location (this is the `target_dir` of the download).
# NOTE: the data can also be fetched manually with:
#   python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
provider_uri = "~/.qlib/qlib_data/cn_data"

data_missing = not exists_qlib_data(provider_uri)
if data_missing:
    print(f"Qlib data is not found in {provider_uri}")
    # `scripts_dir` was resolved by the setup cell; put it on the import path
    # so that get_data.py can be imported as a module.
    sys.path.append(str(scripts_dir))
    from get_data import GetData

    downloader = GetData()
    downloader.qlib_data(target_dir=provider_uri, region=REG_CN)

# Initialise qlib against the (now present) dataset.
qlib.init(provider_uri=provider_uri, region=REG_CN)

Qlib data is not found in ~/.qlib/qlib_data/cn_data


[32m2025-06-14 01:41:38.883[0m | [1mINFO    [0m | [36mqlib.tests.data[0m:[36mdownload[0m:[36m65[0m - [1m20250614014136_qlib_data_cn_1d_latest.zip downloading......[0m
  0%|          | 0/196549189 [00:00<?, ?it/s][32m2025-06-14 01:41:38.883[0m | [1mINFO    [0m | [36mqlib.tests.data[0m:[36mdownload[0m:[36m65[0m - [1m20250614014136_qlib_data_cn_1d_latest.zip downloading......[0m
196549632it [01:06, 2963909.55it/s]                               
[32m2025-06-14 01:42:45.212[0m | [1mINFO    [0m | [36mqlib.tests.data[0m:[36m_unzip[0m:[36m128[0m - [1m/Users/fan/.qlib/qlib_data/cn_data/20250614014136_qlib_data_cn_1d_latest.zip unzipping......[0m
100%|██████████| 31008/31008 [00:05<00:00, 5785.69it/s]
[35627:MainThread](2025-06-14 01:42:50,739) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[35627:MainThread](2025-06-14 01:42:50,741) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[3

In [4]:
# `market` is passed to the data handler as its instrument universe;
# `benchmark` is the reference used by the backtest configuration.
market, benchmark = "csi300", "SH000300"

# train model

In [5]:
###################################
# train model
###################################
# Time window for the feature handler. The fit_* range is identical to the
# "train" segment declared further down.
data_handler_config = {
    "start_time": "2008-01-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2008-01-01",
    "fit_end_time": "2014-12-31",
    "instruments": market,
}

# Model section: `LGBModel` from qlib's gbdt module with its hyper-parameters.
lgb_model_config = {
    "class": "LGBModel",
    "module_path": "qlib.contrib.model.gbdt",
    "kwargs": {
        "loss": "mse",
        "colsample_bytree": 0.8879,
        "learning_rate": 0.0421,
        "subsample": 0.8789,
        "lambda_l1": 205.6999,
        "lambda_l2": 580.9768,
        "max_depth": 8,
        "num_leaves": 210,
        "num_threads": 20,
    },
}

# Feature handler: `Alpha158`, configured with the time window above
# (the very same dict object is referenced, not a copy).
alpha158_handler_config = {
    "class": "Alpha158",
    "module_path": "qlib.contrib.data.handler",
    "kwargs": data_handler_config,
}

# Dataset section: the handler plus the train / valid / test date ranges.
dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": alpha158_handler_config,
        "segments": {
            "train": ("2008-01-01", "2014-12-31"),
            "valid": ("2015-01-01", "2016-12-31"),
            "test": ("2017-01-01", "2020-08-01"),
        },
    },
}

# Complete task description consumed by `init_instance_by_config` and logged as run parameters.
task = {
    "model": lgb_model_config,
    "dataset": dataset_config,
}

# Build the model and the dataset objects from their config sections.
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# Train inside a recorded experiment so parameters and the fitted model are persisted.
with R.start(experiment_name="train_model"):
    flat_task_params = flatten_dict(task)
    R.log_params(**flat_task_params)

    model.fit(dataset)
    R.save_objects(trained_model=model)

    # Keep the recorder id: the backtest cell uses it to load "trained_model" again.
    train_run_recorder = R.get_recorder()
    rid = train_run_recorder.id

ModuleNotFoundError. CatBoostModel are skipped. (optional: maybe installing CatBoostModel can fix it.)
ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).
ModuleNotFoundError.  PyTorch models are skipped (optional: maybe installing pytorch can fix it).


[33472:MainThread](2025-05-30 23:43:00,252) INFO - qlib.timer - [log.py:127] - Time cost: 65.258s | Loading data Done
[33472:MainThread](2025-05-30 23:43:02,030) INFO - qlib.timer - [log.py:127] - Time cost: 0.558s | DropnaLabel Done
[33472:MainThread](2025-05-30 23:43:04,959) INFO - qlib.timer - [log.py:127] - Time cost: 2.928s | CSZScoreNorm Done
[33472:MainThread](2025-05-30 23:43:05,005) INFO - qlib.timer - [log.py:127] - Time cost: 4.751s | fit & process data Done
[33472:MainThread](2025-05-30 23:43:05,006) INFO - qlib.timer - [log.py:127] - Time cost: 70.012s | Init data Done
[33472:MainThread](2025-05-30 23:43:05,030) INFO - qlib.workflow - [exp.py:258] - Experiment 273733318884080872 starts running ...
[33472:MainThread](2025-05-30 23:43:05,221) INFO - qlib.workflow - [recorder.py:345] - Recorder 25fe78515c914ae39cc00aca189b12b6 starts running under Experiment 273733318884080872 ...


Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.990585	valid's l2: 0.994309
[40]	train's l2: 0.986931	valid's l2: 0.993693
[60]	train's l2: 0.984352	valid's l2: 0.99349
[80]	train's l2: 0.982319	valid's l2: 0.993382
[100]	train's l2: 0.980442	valid's l2: 0.99331
[120]	train's l2: 0.97871	valid's l2: 0.993247
[140]	train's l2: 0.976987	valid's l2: 0.993334
[160]	train's l2: 0.97536	valid's l2: 0.993338
Early stopping, best iteration is:
[122]	train's l2: 0.978519	valid's l2: 0.993238


[33472:MainThread](2025-05-30 23:43:27,818) INFO - qlib.timer - [log.py:127] - Time cost: 0.373s | waiting `async_log` Done


# prediction, backtest & analysis

In [6]:
###################################
# prediction, backtest & analysis
###################################
# Executor section: `SimulatorExecutor` stepping once per day and producing portfolio metrics.
executor_config = {
    "class": "SimulatorExecutor",
    "module_path": "qlib.backtest.executor",
    "kwargs": {
        "time_per_step": "day",
        "generate_portfolio_metrics": True,
    },
}

# Strategy section: `TopkDropoutStrategy` driven by the current in-memory model and dataset.
strategy_config = {
    "class": "TopkDropoutStrategy",
    "module_path": "qlib.contrib.strategy.signal_strategy",
    "kwargs": {
        "model": model,
        "dataset": dataset,
        "topk": 50,
        "n_drop": 5,
    },
}

# Exchange settings forwarded to the backtest.
# NOTE(review): cost/threshold values look like fractional rates - confirm against the qlib docs.
exchange_config = {
    "freq": "day",
    "limit_threshold": 0.095,
    "deal_price": "close",
    "open_cost": 0.0005,
    "close_cost": 0.0015,
    "min_cost": 5,
}

# Backtest section: the date range equals the dataset's "test" segment.
backtest_config = {
    "start_time": "2017-01-01",
    "end_time": "2020-08-01",
    "account": 100000000,
    "benchmark": benchmark,
    "exchange_kwargs": exchange_config,
}

port_analysis_config = {
    "executor": executor_config,
    "strategy": strategy_config,
    "backtest": backtest_config,
}

# Run prediction, backtest and analysis inside their own recorded experiment.
with R.start(experiment_name="backtest_analysis"):
    # Load the model that the "train_model" run saved, addressed by its recorder id.
    # Note that `port_analysis_config` was built earlier and still holds the
    # `model` object that existed at that time.
    train_recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = train_recorder.load_object("trained_model")

    # Recorder of the run that was just started; its id is kept in `ba_rid`
    # so that the artifacts can be looked up again later.
    recorder = R.get_recorder()
    ba_rid = recorder.id

    # Prediction step.
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # Backtest and portfolio-analysis step at "day" frequency.
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[33472:MainThread](2025-05-30 23:43:58,792) INFO - qlib.workflow - [exp.py:258] - Experiment 737731714663440462 starts running ...
[33472:MainThread](2025-05-30 23:43:58,818) INFO - qlib.workflow - [recorder.py:345] - Recorder c73b9c93875347d99b9014d22559f1e5 starts running under Experiment 737731714663440462 ...
[33472:MainThread](2025-05-30 23:44:00,203) INFO - qlib.workflow - [record_temp.py:198] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 737731714663440462
[33472:MainThread](2025-05-30 23:44:00,279) INFO - qlib.backtest caller - [__init__.py:93] - Create new exchange


'The following are prediction results of the LGBModel model.'
                          score
datetime   instrument          
2017-01-03 SH600000   -0.042865
           SH600008    0.005925
           SH600009    0.030596
           SH600010   -0.013973
           SH600015   -0.141758




backtest loop:   0%|          | 0/871 [00:00<?, ?it/s]

  return np.nanmean(self.data)
  return np.nanmean(self.data)
  return np.nanmean(self.data)
[33472:MainThread](2025-05-30 23:45:03,067) INFO - qlib.workflow - [record_temp.py:515] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 737731714663440462
[33472:MainThread](2025-05-30 23:45:03,074) INFO - qlib.workflow - [record_temp.py:540] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 737731714663440462


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000477
std                0.012295
annualized_return  0.113561
information_ratio  0.598699
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000730
std                0.005705
annualized_return  0.173708
information_ratio  1.973639
max_drawdown      -0.057265
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000535
std                0.005703
annualized_return  0.127230
information_ratio  1.446068
max_drawdown      -0.066180
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


[33472:MainThread](2025-05-30 23:45:04,023) INFO - qlib.timer - [log.py:127] - Time cost: 0.000s | waiting `async_log` Done


# analyze graphs

In [7]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D

# Fetch the backtest run again through the recorder id stored in `ba_rid`.
recorder = R.get_recorder(experiment_name="backtest_analysis", recorder_id=ba_rid)
print(recorder)

# Load the run's artifacts; the paths are listed in the same order as the names they are unpacked into.
pred_df, report_normal_df, positions, analysis_df = [
    recorder.load_object(artifact_path)
    for artifact_path in (
        "pred.pkl",
        "portfolio_analysis/report_normal_1day.pkl",
        "portfolio_analysis/positions_normal_1day.pkl",
        "portfolio_analysis/port_analysis_1day.pkl",
    )
]

ModuleNotFoundError: No module named 'plotly'

## analysis position

### report

In [None]:
# Draw the report graph from `report_normal_df`, i.e. the
# "portfolio_analysis/report_normal_1day.pkl" artifact of the backtest run.
analysis_position.report_graph(report_normal_df)

### risk analysis

In [None]:
# Draw the risk-analysis graph from the two backtest artifacts:
# `analysis_df` ("port_analysis_1day.pkl") and `report_normal_df` ("report_normal_1day.pkl").
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

## analysis model

In [None]:
# Pull the label column set for the "test" segment out of the dataset.
label_df = dataset.prepare("test", col_set="label")
# Rename to a single flat column called "label" (assumes exactly one label column - TODO confirm).
# NOTE(review): this assigns on the object returned by `prepare`; whether that object is
# shared with the dataset's internal state cannot be told from here.
label_df.columns = ["label"]

### score IC

In [None]:
# Place labels and predictions side by side (column-wise, index sorted) ...
label_and_pred = pd.concat([label_df, pred_df], axis=1, sort=True)
# ... then align the rows to the label index before plotting.
pred_label = label_and_pred.reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

### model performance

In [None]:
# Draw the model-performance graphs from `pred_label`, the joined label/prediction
# frame built in the score IC cell.
analysis_model.model_performance_graph(pred_label)