Skip to content

Commit

Permalink
Fix high-freq data (microsoft#702)
Browse files Browse the repository at this point in the history
* fix the collector.py yahoo 1min factor calculation

* fix HFSignalRecord
  • Loading branch information
zhupr committed Nov 20, 2021
1 parent 1547355 commit 680d8cb
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 22 deletions.
3 changes: 2 additions & 1 deletion examples/highfreq/README.md
Expand Up @@ -30,6 +30,7 @@ Run the example by running the following command:
## Benchmarks Performance
### Signal Test
Here are the results of the signal test for benchmark models. We will keep updating benchmark models in the future.

| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Long precision| Short Precision | Long-Short Average Return | Long-Short Average Sharpe |
|---|---|---|---|---|---|---|---|---|---|
| LightGBM | Alpha158 | 0.3042±0.00 | 1.5372±0.00| 0.3117±0.00 | 1.6258±0.00 | 0.6720±0.00 | 0.6870±0.00 | 0.000769±0.00 | 1.0190±0.00 |
| LightGBM | Alpha158 | 0.0349±0.00 | 0.3805±0.00| 0.0435±0.00 | 0.4724±0.00 | 0.5111±0.00 | 0.5428±0.00 | 0.000074±0.00 | 0.2677±0.00 |
9 changes: 2 additions & 7 deletions qlib/workflow/record_temp.py
@@ -1,27 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from qlib.backtest import executor
import re
import logging
import warnings
import pandas as pd
from pathlib import Path
from pprint import pprint
from typing import Union, List, Optional
from collections import defaultdict

from qlib.utils.exceptions import LoadObjectError
from ..contrib.evaluate import indicator_analysis, risk_analysis, indicator_analysis
from ..contrib.evaluate import risk_analysis, indicator_analysis

from ..data.dataset import DatasetH
from ..data.dataset.handler import DataHandlerLP
from ..backtest import backtest as normal_backtest
from ..utils import init_instance_by_config, get_module_by_module_path
from ..log import get_module_logger
from ..utils import flatten_dict, class_casting
from ..utils.time import Freq
from ..strategy.base import BaseStrategy
from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec


Expand Down Expand Up @@ -215,6 +209,7 @@ class HFSignalRecord(SignalRecord):
"""

artifact_path = "hg_sig_analysis"
depend_cls = SignalRecord

def __init__(self, recorder, **kwargs):
super().__init__(recorder=recorder)
Expand Down
32 changes: 30 additions & 2 deletions scripts/data_collector/cn_index/collector.py
Expand Up @@ -96,6 +96,27 @@ def html_table_index(self) -> int:
"""
raise NotImplementedError()

def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite the instrument validity dates with intraday session times.

    For any non-daily frequency the plain dates are not precise enough:
    the start date is shifted to 09:30 and the end date to 15:00
    (presumably the exchange open/close — confirm against the data
    convention), then rendered as ``"%Y-%m-%d %H:%M:%S"`` strings.
    Daily-frequency data is returned unchanged.

    Parameters
    ----------
    inst_df : pd.DataFrame
        columns include self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD,
        self.END_DATE_FIELD; the date columns are mutated in place.

    Returns
    -------
    pd.DataFrame
        the same DataFrame, with reformatted date columns when
        ``self.freq`` is not ``"day"``
    """
    if self.freq == "day":
        return inst_df

    def _session_str(value, hour, minute):
        # anchor the raw date at the given intraday time-of-day
        stamped = pd.Timestamp(value) + pd.Timedelta(hours=hour, minutes=minute)
        return stamped.strftime("%Y-%m-%d %H:%M:%S")

    inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
        lambda value: _session_str(value, 9, 30)
    )
    inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
        lambda value: _session_str(value, 15, 0)
    )
    return inst_df

def get_changes(self) -> pd.DataFrame:
"""get companies changes
Expand Down Expand Up @@ -284,7 +305,12 @@ def html_table_index(self):


def get_instruments(
qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3
qlib_dir: str,
index_name: str,
method: str = "parse_instruments",
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
"""
Expand All @@ -296,6 +322,8 @@ def get_instruments(
index name, value from ["csi100", "csi300"]
method: str
method, value from ["parse_instruments", "save_new_companies"]
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
Expand All @@ -312,7 +340,7 @@ def get_instruments(
"""
_cur_module = importlib.import_module("data_collector.cn_index.collector")
obj = getattr(_cur_module, f"{index_name.upper()}")(
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)
getattr(obj, method)()

Expand Down
28 changes: 27 additions & 1 deletion scripts/data_collector/index.py
Expand Up @@ -26,7 +26,14 @@ class IndexBase:
ADD = "add"
INST_PREFIX = ""

def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3):
def __init__(
self,
index_name: str,
qlib_dir: [str, Path] = None,
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
"""
Parameters
Expand All @@ -35,6 +42,8 @@ def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry:
index name
qlib_dir: str
qlib directory, by default Path(__file__).resolve().parent.joinpath("qlib_data")
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
Expand All @@ -49,6 +58,7 @@ def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry:
self.cache_dir.mkdir(exist_ok=True, parents=True)
self._request_retry = request_retry
self._retry_sleep = retry_sleep
self.freq = freq

@property
@abc.abstractmethod
Expand Down Expand Up @@ -106,6 +116,21 @@ def get_changes(self) -> pd.DataFrame:
"""
raise NotImplementedError("rewrite get_changes")

@abc.abstractmethod
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
    """Format the datetime columns of an instruments DataFrame.

    Subclasses override this hook to adjust the start/end date fields,
    e.g. appending intraday session times when the collection frequency
    is finer than daily.

    Parameters
    ----------
    inst_df : pd.DataFrame
        columns include self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD,
        self.END_DATE_FIELD

    Returns
    -------
    pd.DataFrame
        the formatted instruments DataFrame

    Raises
    ------
    NotImplementedError
        always; subclasses are required to provide an implementation.
    """
    raise NotImplementedError("rewrite format_datetime")

def save_new_companies(self):
"""save new companies
Expand Down Expand Up @@ -206,6 +231,7 @@ def parse_instruments(self):
_inst_prefix = self.INST_PREFIX.strip()
if _inst_prefix:
inst_df["save_inst"] = inst_df[self.SYMBOL_FIELD_NAME].apply(lambda x: f"{_inst_prefix}{x}")
inst_df = self.format_datetime(inst_df)
inst_df.to_csv(
self.instruments_dir.joinpath(f"{self.index_name.lower()}.txt"), sep="\t", index=False, header=None
)
Expand Down
44 changes: 38 additions & 6 deletions scripts/data_collector/us_index/collector.py
Expand Up @@ -37,9 +37,16 @@ class WIKIIndex(IndexBase):
# https://superuser.com/questions/613313/why-cant-we-make-con-prn-null-folder-in-windows
INST_PREFIX = ""

def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3):
def __init__(
self,
index_name: str,
qlib_dir: [str, Path] = None,
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
super(WIKIIndex, self).__init__(
index_name=index_name, qlib_dir=qlib_dir, request_retry=request_retry, retry_sleep=retry_sleep
index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)

self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
Expand Down Expand Up @@ -71,6 +78,24 @@ def get_changes(self) -> pd.DataFrame:
"""
raise NotImplementedError("rewrite get_changes")

def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
    """Extend instrument end dates to end-of-day for intraday frequencies.

    When the frequency is not daily, each end date is pushed to 23:59 of
    the same day and rendered as a ``"%Y-%m-%d %H:%M:%S"`` string, so the
    validity window covers the whole trading day. Start dates and
    daily-frequency data are left untouched.

    Parameters
    ----------
    inst_df : pd.DataFrame
        columns include self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD,
        self.END_DATE_FIELD; the end-date column is mutated in place.

    Returns
    -------
    pd.DataFrame
        the same DataFrame, with the end-date column reformatted when
        ``self.freq`` is not ``"day"``
    """
    if self.freq == "day":
        return inst_df
    end_col = self.END_DATE_FIELD
    inst_df[end_col] = inst_df[end_col].apply(
        lambda value: (pd.Timestamp(value) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
    )
    return inst_df

@property
def calendar_list(self) -> List[pd.Timestamp]:
"""get history trading date
Expand Down Expand Up @@ -245,7 +270,12 @@ def parse_instruments(self):


def get_instruments(
qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3
qlib_dir: str,
index_name: str,
method: str = "parse_instruments",
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
"""
Expand All @@ -257,6 +287,8 @@ def get_instruments(
index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"]
method: str
method, value from ["parse_instruments", "save_new_companies"]
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
Expand All @@ -265,15 +297,15 @@ def get_instruments(
Examples
-------
# parse instruments
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
# parse new companies
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies
"""
_cur_module = importlib.import_module("data_collector.us_index.collector")
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)
getattr(obj, method)()

Expand Down
18 changes: 13 additions & 5 deletions scripts/data_collector/yahoo/collector.py
Expand Up @@ -601,11 +601,19 @@ def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
# - Close price adjusted for splits. Adjusted close price adjusted for both dividends and splits.
# - data_1d.adjclose: Adjusted close price adjusted for both dividends and splits.
# - data_1d.close: `data_1d.adjclose / (close for the first trading day that is not np.nan)`
df["date_tmp"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
df.set_index("date_tmp", inplace=True)
df.loc[:, "factor"] = data_1d["close"] / df["close"]
df.loc[:, "paused"] = data_1d["paused"]
df.reset_index("date_tmp", drop=True, inplace=True)
def _calc_factor(df_1d: pd.DataFrame):
    """Attach daily `factor` and `paused` columns to one day's intraday rows.

    Called via groupby-apply below, so `df_1d` holds all intraday rows of a
    single calendar day. `data_1d` and `self._date_field_name` come from the
    enclosing scope; `data_1d` appears to be a daily frame indexed by date
    with `close` and `paused` columns — confirm against the caller.
    """
    try:
        # normalize the first row's timestamp to midnight so it matches data_1d's index
        _date = pd.Timestamp(pd.Timestamp(df_1d[self._date_field_name].iloc[0]).date())
        # factor = daily close / last non-NaN intraday close of the day
        df_1d["factor"] = (
            data_1d.loc[_date]["close"] / df_1d.loc[df_1d["close"].last_valid_index()]["close"]
        )
        df_1d["paused"] = data_1d.loc[_date]["paused"]
    # NOTE(review): broad except — any failure (missing date in data_1d,
    # all-NaN closes) silently degrades to NaN factor/paused; consider
    # narrowing to KeyError/TypeError and logging.
    except Exception:
        df_1d["factor"] = np.nan
        df_1d["paused"] = np.nan
    return df_1d

# apply per calendar day so each intraday chunk gets its own daily factor
df = df.groupby([df[self._date_field_name].dt.date]).apply(_calc_factor)

if self.CONSISTENT_1d:
# the date sequence is consistent with 1d
Expand Down

0 comments on commit 680d8cb

Please sign in to comment.