## 定义挖掘过程中会使用的相关类和方法

  - 如要修改相关参数可直接修改 Config 类中的配置项

In [4]:
import inspect
import multiprocessing
import operator
import random
import time

import bottleneck as bn
import numpy as np
import pandas as pd
from deap import base, creator, gp, tools
from numpy.lib.stride_tricks import as_strided
import empyrical as em

# Platform-provided logger object, used for log output in this file.
log = T.BigLogger()

class Config:
    """Configuration for data loading, factor analysis and factor mining.

    Change the class attributes below to adjust behaviour; several of them are
    read by the data-loading, fitness and variation code in this file.
    """
    # Date range read for the short-interval data set
    start_date = '2017-01-01'
    end_date = '2018-11-01'
    # Date range read for the full-interval data set
    all_start_date = '2016-10-01'  # also used by the factor-analysis module
    all_end_date = '2020-11-05'  # also used by the factor-analysis module
    # Field of table bar1d_wap_CN_STOCK_A_adj used to compute returns;
    # defaults to the full-day VWAP buy price
    return_field = 'wap_3_vwap_buy'
    # Name of the Barra data table to read (the internal deployment uses the barra_factor_cne5 table)
    barra_table_name = 'factor_data_CN_STOCK_A'   # 'barra_factor_cne5'
    # Return horizon (factor mining also uses this value as the return period)
    rebalance_period = 1  # also used by the factor-analysis module
    # Number of CPU cores used for multiprocessing
    pool_processes_limit = 12
    # =========== factor-analysis module parameters ==========
    # Stock universe; one of 全市场 (whole market) / 沪深 300 (CSI 300) / 中证 500 (CSI 500) / 中证 800 (CSI 800)
    stock_pool = '全市场'  # only used by the factor-analysis module
    # Number of quantile groups
    quantile_count = 5  # only used by the factor-analysis module
    # Commission and slippage
    commission_rate = 0.0016  # only used by the factor-analysis module
    # Drop stocks at their price limit
    drop_price_limit_stocks = True  # only used by the factor-analysis module
    # Drop ST stocks
    drop_st_stocks = True  # only used by the factor-analysis module
    # Drop newly listed stocks
    drop_new_stocks = True  # only used by the factor-analysis module
    # Metrics to compute; only used by the factor-analysis module
    metrics = ['因子表现概览', '因子分布', '因子行业分布', '因子市值分布', 'IC分析', '买入信号重合分析', '因子估值分析', '因子拥挤度分析', '因子值最大/最小股票', '多因子相关性分析']
    # =========== factor-analysis module parameters ==========

    # =========== factor-mining parameters ==========
    # Number of factor-mining rounds
    mtime = 3
    # Train : test split ratio (the attribute name keeps its historical spelling "retio")
    train_test_data_retio = 3/4
    # Train : validation split ratio (applied to the training part produced by the split above)
    train_validate_data_ratio = 3/4
    # Initial population size
    init_ind_num = 20
    # Number of generations
    ngen = 2
    # Fitness threshold for short-interval training
    train_fitness = 5
    # Fitness threshold for full-interval validation
    test_fitness = 3
    # Fitness function; one of "icir", "mutual_info", "long_sharpe", "longshort_vol", "long_vol",
    # "longshort_sharpe", "long_return", "longshort_return".
    # When changing this value the two thresholds above must be adjusted as well.
    fitness = "icir"
    # How the IR is computed
    # TODO: expose rank_ir as a fitness option of its own
    ir_type = 'ir'  # 'ir' or 'rank_ir'; only effective when fitness is "icir"
    # Crossover probability
    cxpb = 0.8
    # Subtree-mutation probability
    mutpb = 0.6
    # Shrink (hoist) mutation probability
    mutspb = 0.6
    # Point (node-replacement) mutation probability
    mutnrpb = 0.6
    # Integer constants available to expressions
    constant_nums = list(range(1, 11))
    # =========== factor-mining parameters ==========


class StopWatch:
    """Stopwatch built on ``time.perf_counter``.

    The watch is running right after construction. ``_end`` is ``None`` while
    running and holds the stop timestamp once stopped.
    """

    def __init__(self):
        self._start = time.perf_counter()
        self._end = None

    @property
    def duration(self):
        """Elapsed seconds: frozen when stopped, measured up to now when running."""
        end = time.perf_counter() if self._end is None else self._end
        return end - self._start

    @property
    def running(self):
        """True while the watch has not been stopped."""
        return self._end is None

    def restart(self):
        """Zero the elapsed time and start running."""
        self._start = time.perf_counter()
        self._end = None

    def reset(self):
        """Zero the elapsed time and leave the watch stopped."""
        self._start = time.perf_counter()
        self._end = self._start

    def start(self):
        """Resume a stopped watch, keeping the time accumulated so far."""
        if not self.running:
            # Move the start point so that ``duration`` continues from its frozen value.
            self._start = time.perf_counter() - self.duration
            self._end = None

    def stop(self):
        """Freeze the elapsed time; no effect if already stopped."""
        if self.running:
            self._end = time.perf_counter()

    def __str__(self):
        """Format the elapsed time in s, ms or μs with four decimals."""
        # Named ``millis`` so the ``time`` module is not shadowed.
        millis = self.duration * 1000
        if millis >= 1000:
            return f'{millis / 1000:.4f}s'
        if millis >= 1:
            return f'{millis:.4f}ms'
        # The f-prefix was missing here, so the template text itself was returned.
        return f'{millis * 1000:.4f}μs'


class Functions:
    """ 设定在因子挖掘中会用到的所有函数
    要添加相关函数实现可继续在此类中以静态方法的方式添加具体实现（'_'下划线开头的方法会被忽略，不会加入到遗传算法原语集中）
    注意一定要加上对应的输入输出类型注解，否则 deap 包无法正常 compile 进行因子值的计算
    """
    @staticmethod
    def _rolling_window(data, window_size):
        shape = (data.shape[0] - window_size + 1, window_size) + data.shape[1:]
        strides = (data.strides[0],) + data.strides
        return as_strided(data, shape=shape, strides=strides)

    @staticmethod
    def prod(x : np.ndarray, n : int) -> np.ndarray:
        res = np.full(x.shape, np.nan)
        rolling_data = Functions._rolling_window(x, window_size=n)
        rolling_res = np.prod(rolling_data, axis=1)
        res[n - 1:] = rolling_res
        return res

    @staticmethod
    def rank(x: np.ndarray) -> np.ndarray:
        """Rank the values within each row (axis 1) using ``bn.nanrankdata``."""
        return bn.nanrankdata(x, axis=1)

    @staticmethod
    def max(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = np.full(x.shape, np.nan)
        bool_ = x >= y
        res[bool_] = x[bool_]
        res[~bool_] = y[~bool_]
        return res

    @staticmethod
    def min(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = np.full(x.shape, np.nan)
        bool_ = x >= y
        res[~bool_] = x[~bool_]
        res[bool_] = y[bool_]
        return res

    @staticmethod
    def delay(x: np.ndarray, n: int) -> np.ndarray:
        res = np.full(x.shape, np.nan)
        res[n:] = x[:-n]
        return res

    @staticmethod
    def stddev(x: np.ndarray, n: int) -> np.ndarray:
        """Moving standard deviation over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_std(x, n, min_count=required, axis=0)

    @staticmethod
    def corr(x: np.ndarray,y: np.ndarray, n: int) -> np.ndarray:
        """Moving Pearson correlation of ``x`` and ``y`` over ``n`` rows (axis 0).

        Only observations where ``x * y`` is not NaN take part. The moving sums
        need at least ``max(1, n // 2)`` such observations, otherwise the
        result is NaN.
        """
        min_count = max(1, n // 2)

        c = x * y
        unpaired = np.isnan(c)
        # Blank observations whose partner is missing. Previously the sums of
        # x, y, x*x and y*y used each input's own non-NaN rows while the count
        # used the joint ones, which gave wrong values when NaN patterns differed.
        x = np.where(unpaired, np.nan, x)
        y = np.where(unpaired, np.nan, y)

        # Number of paired observations in each window (NaN when there is none).
        d_count = np.ones(c.shape)
        d_count[unpaired] = np.nan
        d_count = bn.move_sum(d_count, window=n, min_count=1, axis=0)

        ab_sum = bn.move_sum(c, window=n, min_count=min_count, axis=0)
        a_sum = bn.move_sum(x, window=n, min_count=min_count, axis=0)
        b_sum = bn.move_sum(y, window=n, min_count=min_count, axis=0)
        aa_sum = bn.move_sum(x * x, window=n, min_count=min_count, axis=0)
        bb_sum = bn.move_sum(y * y, window=n, min_count=min_count, axis=0)

        # corr = (N*Sxy - Sx*Sy) / (sqrt(N*Sxx - Sx^2) * sqrt(N*Syy - Sy^2))
        res = (ab_sum * d_count - a_sum * b_sum) / (np.sqrt(d_count * aa_sum - a_sum ** 2) * np.sqrt(d_count * bb_sum - b_sum ** 2))
        return res

    @staticmethod
    def delta(x: np.ndarray, n: int) -> np.ndarray:
        res = x - Functions.delay(x, n)
        return res

    @staticmethod
    def vadd(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = x + y
        return res

    @staticmethod
    def vsub(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = x - y
        return res

    @staticmethod
    def vmul(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = x * y
        return res

    @staticmethod
    def vdiv(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = x / y
        res[np.isinf(res)] = np.nan
        return res

    @staticmethod
    def log(x: np.ndarray) -> np.ndarray:
        res = np.log(x + min(0,np.nanmin(x)) + 1e-20)
        return res

    @staticmethod
    def abs(x: np.ndarray) -> np.ndarray:
        res = np.abs(x)
        return res

    @staticmethod
    def vneg(x: np.ndarray) -> np.ndarray:
        res = x * -1
        return res

    @staticmethod
    def sign(x: np.ndarray) -> np.ndarray:
        res = np.sign(x)
        return res
    @staticmethod
    # 反正切函数
    def arctan(x : np.ndarray) -> np.ndarray:
        res = np.arctan(x)
        return res
    
#     @staticmethod
#     # 工具函数，把数据转化为3d数据
#     def rolling_to_3d(mat, window, chunk_num=1):
#         s0, s1 = mat.strides
#         r, c = mat.shape
#         max_chunk_num = int(np.floor(r/window))
#         chunk_num = min(max_chunk_num, chunk_num)

#         def rolling(m):
#             shape0 = m.shape[0]-window+1
#             if shape0 <= 0:
#                 shape0 = m.shape[0]
#             return as_strided(m, shape=(shape0, window, c), strides=(s0, s0, s1), writeable=False)

#         if chunk_num == 1:
#             yield rolling(mat)
#         else:
#             chunk_size = r // chunk_num
#             first_chunk = mat[:chunk_size]
#             yield rolling(first_chunk)

#             chunks = as_strided(
#                             mat[chunk_size-window+1:], 
#                             shape=(chunk_num-1, chunk_size+window-1, c), 
#                             strides=(s0*chunk_size, s0, s1), 
#                             writeable=False
#                             )
#             for sub_mat in chunks:
#                 yield rolling(sub_mat)

#             left_rows = r % chunk_num
#             if left_rows > 0:
#                 sub_mat = mat[-(left_rows+window-1):]
#                 yield rolling(sub_mat)
    
    
    #@staticmethod
    #  半衰加权平均，alpha参数为n/m 原函数wma含义不明确，改成半衰加权
    #def wma(x : np.ndarray, n : int, m : int) -> np.ndarray:
    #    res = np.full(x.shape, np.nan)
    #    temp = np.array([[(1/2)**(i/m) for i in range(n - 1, -1, -1)]] * (x.shape[1] + x.shape[0] - 1))
    #    temp3d = Functions.rolling_to_3d(temp, window = x.shape[0], chunk_num = 1)
    #    temp = next(temp3d)
    #    temp = temp.swapaxes(1,2)
    #    temp = temp.swapaxes(0,2)
    #    x3d = Functions.rolling_to_3d(x, window = n, chunk_num = max(1, n//10))
    #    iv = []
    #    while True:
    #        try:
    #            resid = next(x3d)
    #            iv.append(resid)
    #        except StopIteration:
    #            break
    #    iv = np.concatenate(iv, axis=0)
    #    res[n - 1:] = np.sum(iv * temp[n - 1:], axis = 1)
    #    return res

    @staticmethod
    def decay_linear(x: np.ndarray, n: int) -> np.ndarray:
        """Moving mean of ``x`` over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``.

        NOTE(review): despite the name, no linearly decaying weights are
        applied here; the computation is the same call as in ``sma``. Confirm
        whether a weighted average was intended.
        """
        res = bn.move_mean(x, window=n, min_count=max(1, n // 2), axis=0)
        return res

    @staticmethod
    def cov(x: np.ndarray, y: np.ndarray, n: int) -> np.ndarray:
        """Moving sample covariance of ``x`` and ``y`` over ``n`` rows (axis 0).

        Only observations where ``x * y`` is not NaN take part. The moving sums
        need at least ``max(1, n // 2)`` such observations, otherwise the
        result is NaN.
        """
        min_count = max(1, n // 2)
        c = x * y
        unpaired = np.isnan(c)
        # Blank observations whose partner is missing so that the sums of x and
        # y cover exactly the observations counted below. Previously each sum
        # used its own input's non-NaN rows.
        a = np.where(unpaired, np.nan, x)
        b = np.where(unpaired, np.nan, y)
        # Number of paired observations in each window (NaN when there is none).
        d_count = np.ones(c.shape)
        d_count[unpaired] = np.nan
        d_count = bn.move_sum(d_count, window=n, min_count=1, axis=0)
        ab_sum = bn.move_sum(c, window=n, min_count=min_count, axis=0)
        a_sum = bn.move_sum(a, window=n, min_count=min_count, axis=0)
        b_sum = bn.move_sum(b, window=n, min_count=min_count, axis=0)
        # cov = (N*Sxy - Sx*Sy) / (N*(N-1))
        res = (ab_sum * d_count - a_sum * b_sum) / ((d_count-1) * d_count)
        return res

    @staticmethod
    def ts_sum(x: np.ndarray, n: int) -> np.ndarray:
        """Moving sum over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_sum(x, window=n, min_count=required, axis=0)

    @staticmethod
    def sma(x: np.ndarray, n: int) -> np.ndarray:
        """Moving mean over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_mean(x, window=n, min_count=required, axis=0)

    @staticmethod
    def ts_rank(x: np.ndarray, n: int) -> np.ndarray:
        """Moving rank (``bn.move_rank``) over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_rank(x, window=n, min_count=required, axis=0)

    @staticmethod
    def ts_min(x: np.ndarray, n: int) -> np.ndarray:
        """Moving minimum over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_min(x, window=n, min_count=required, axis=0)

    @staticmethod
    def ts_max(x: np.ndarray, n: int) -> np.ndarray:
        """Moving maximum over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_max(x, window=n, min_count=required, axis=0)

    @staticmethod
    def mean2(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        res = (x + y) / 2
        return res

    @staticmethod
    def mean3(x: np.ndarray, y: np.ndarray, z: np.ndarray) -> np.ndarray:
        res = (x + y + z) / 3
        return res

    @staticmethod
    def argmax(x: np.ndarray, n: int) -> np.ndarray:
        """Moving ``bn.move_argmax`` over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_argmax(x, window=n, min_count=required, axis=0)

    @staticmethod
    def argmin(x: np.ndarray, n: int) -> np.ndarray:
        """Moving ``bn.move_argmin`` over ``n`` rows (axis 0) with ``min_count = max(1, n // 2)``."""
        required = max(1, n // 2)
        return bn.move_argmin(x, window=n, min_count=required, axis=0)
    #@staticmethod
    #def clear_by_cond(x: np.ndarray, y: np.ndarray, z: np.ndarray) -> np.ndarray:
    #    res = np.full(x.shape, np.nan)
    #    bool_ = x < y
    #    x_10 = np.ceil(np.log10(x))
    #    y_10 = np.ceil(np.log10(y))
    #    x_10[np.isinf(x_10)] = np.nan
    #    y_10[np.isinf(y_10)] = np.nan
    #    if abs(bn.nanmean(x_10) - bn.nanmean(y_10)) > 2 or np.isnan(bool_).sum() == (bool_.shape[0] * bool_.shape[1]):
    #        return res
    #    res[bool_] = 0
    #    res[~bool_] = z[~bool_]
    #    return res

    @staticmethod
    def power(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Raise row-wise rescaled ``x`` to the power of row-wise rescaled ``y``.

        Each row of ``y`` is min-max scaled to [0, 1] and used as the exponent;
        each row of ``x`` is min-max scaled and shifted to [1, 2] and used as
        the base.
        """
        y_low, y_high = bn.nanmin(y, axis=1), bn.nanmax(y, axis=1)
        exponent = ((y.T - y_low) / (y_high - y_low)).T
        x_low, x_high = bn.nanmin(x, axis=1), bn.nanmax(x, axis=1)
        base = ((x.T - x_low) / (x_high - x_low)).T + 1
        return base ** exponent

    #@staticmethod
    #def if_then_else(x: np.ndarray, y: np.ndarray, z: np.ndarray, r : np.ndarray) -> np.ndarray:
    #    res = np.full(x.shape, np.nan)
    #    # TODO：讨论关于函数归一化的操作，是直接对基础因子进行归一化，还是在可能出现该情况的函数中进行归一化
    #    bool_ = x < y
    #    x_10 = np.ceil(np.log10(x))
    #    y_10 = np.ceil(np.log10(y))
    #    x_10[np.isinf(x_10)] = np.nan
    #    y_10[np.isinf(y_10)] = np.nan
    #    if abs(bn.nanmean(x_10) - bn.nanmean(y_10)) > 2 or np.isnan(bool_).sum() == (bool_.shape[0] * bool_.shape[1]):
    #        return res
    #    res[bool_] = z[bool_]
    #    res[~bool_] = r[~bool_]
    #    return res
    
    @staticmethod
    def constant(type_int : int) -> int:
        return type_int

    @staticmethod
    def standardation(x: np.ndarray) -> np.ndarray:
        """Row-wise z-score: subtract the row's NaN-aware mean, divide by its NaN-aware std (ddof=1)."""
        row_mean = bn.nanmean(x, axis=1).reshape(-1, 1)
        row_std = bn.nanstd(x, axis=1, ddof=1).reshape(-1, 1)
        with np.errstate(invalid='ignore'):
            return (x - row_mean) / row_std

    @staticmethod
    def normalization(x: np.ndarray) -> np.ndarray:
        """Row-wise min-max scaling using the NaN-aware minimum and maximum of each row."""
        row_min = bn.nanmin(x, axis=1).reshape(-1, 1)
        row_max = bn.nanmax(x, axis=1).reshape(-1, 1)
        with np.errstate(invalid='ignore'):
            return (x - row_min) / (row_max - row_min)
    
#     @staticmethod
#     def delay(x: np.ndarray, n: int) -> np.ndarray:
#         res = np.full(x.shape, np.nan)
#         res[n:] = x[:-n]
#         return res
    
#     @staticmethod
#     def delta(x: np.ndarray, n: int) -> np.ndarray:
#         res = x - Functions.delay(x, n)
#         return res
    
    # @staticmethod
    # 过去n天变化率
    # def pctchange_ts(n : int, x : np.ndarray) -> np.ndarray:
    #     res = Functions.delta(n, x) / x
    #     return res
    
    @staticmethod
    # 过去n天回归系数
    def regbeta_ts(n : int, x : np.ndarray, y : np.ndarray) -> np.ndarray:
        """Moving regression slope of ``y`` on ``x`` over ``n`` rows (axis 0).

        The moving sums use bottleneck's default ``min_count`` (passed as
        ``None``). Infinite slopes are replaced by NaN.
        """
        xy = x * y
        # Number of observations with a non-NaN product in each window.
        pair_count = np.ones(xy.shape)
        pair_count[np.isnan(xy)] = np.nan
        pair_count = bn.move_sum(pair_count, window=n, min_count=1, axis=0)

        sum_xy = bn.move_sum(xy, window=n, min_count=None, axis=0)
        sum_x = bn.move_sum(x, window=n, min_count=None, axis=0)
        sum_y = bn.move_sum(y, window=n, min_count=None, axis=0)
        sum_xx = bn.move_sum(x * x, window=n, min_count=None, axis=0)

        # slope = (N*Sxy - Sx*Sy) / (N*Sxx - Sx^2)
        numerator = sum_xy * pair_count - sum_x * sum_y
        denominator = pair_count * sum_xx - sum_x ** 2
        beta = numerator / denominator
        beta[np.isinf(beta)] = np.nan
        return beta

    # @staticmethod
    # 过去n天回归残差
    # def regresid_ts(n : int, x : np.ndarray, y : np.ndarray) -> np.ndarray:
    #     beta = y - x * Functions.regbeta_ts(n, x, y)
    #     return beta
    


class DataProcessor:
    """数据加载及处理类"""
    def __init__(self, config, data_type):
        """Load data for the configured date range and split it into train / validation / test.

        Args:
            config: object providing the date ranges, split ratios and table
                settings read below and in ``_load_data``.
            data_type: ``'all'`` selects ``all_start_date`` / ``all_end_date``;
                any other value selects ``start_date`` / ``end_date``.
        """
        self.config = config
        # Date range to read
        if data_type == 'all':
            self.start_date, self.end_date = config.all_start_date, config.all_end_date
        else:
            self.start_date, self.end_date = config.start_date, config.end_date
        # Split ratios
        self.train_test_data_retio = config.train_test_data_retio
        self.train_validate_data_ratio = config.train_validate_data_ratio
        # Base factor data keyed by field name
        self.data = {}
        # Index / columns every stored frame is aligned to
        self.data_index = None
        self.data_cols = None
        # Barra data keyed by column name
        self.barra = {}
        # Barra keys seen when the neutralised returns were last computed
        # (empty at this point: nothing has been loaded yet)
        self._barra_cols = []
        # Raw returns as a DataFrame
        self.raw_ret = pd.DataFrame()
        # Cached neutralised returns as an ndarray
        self._ret = None
        self._load_data()
        # Split the loaded rows according to the configured ratios
        self._split_train_and_test(
            self.raw_ret.shape[0],
            self.train_test_data_retio,
            self.train_validate_data_ratio,
        )

    def _split_train_and_test(self, length, train_test_ratio, train_validate_ratio):
        full_list = list(range(length))
        offset_train_test = int(length * train_test_ratio)
        offset_train_validate = int(offset_train_test * train_validate_ratio)
        if length == 0 or offset_train_test < 1:
            return [], full_list
        self.train_series = full_list[:offset_train_validate]
        self.val_series = full_list[offset_train_validate:offset_train_test]
        self.test_series = full_list[offset_train_test:]
        self.full_series = full_list

    @property
    def industy_filter(self):
        return (self.barra_values.sum(axis=2) != 0).T

    @property
    def train_data_values(self):
        return np.array([value.values for key, value in self.data.items()]).astype(np.float64)

    @property
    def ret_values(self):
        now_barra_cols = list(self.barra.keys())
        if len(now_barra_cols) != len(self._barra_cols) or self._ret is None:
            self._ret = self.neutralization(self.raw_ret.values.astype(np.float64))
            self._barra_cols = now_barra_cols
        return self._ret

    @property
    def barra_values(self):
        return np.array([value.values for key, value in self.barra.items()]).astype(np.float64)

    @property
    def train_ret(self):
        return self.ret_values[self.train_series]

    @property
    def train_barra(self):
        return self.barra_values[:, self.train_series, :]

    @property
    def train_industy_filter(self):
        return self.industy_filter[self.train_series]

    @property
    def validate_ret(self):
        return self.ret_values[self.val_series]

    @property
    def validate_barra(self):
        return self.barra_values[:, self.val_series, :]

    @property
    def validate_industy_filter(self):
        return self.industy_filter[self.val_series]

    @property
    def test_ret(self):
        return self.ret_values[self.test_series]

    @property
    def test_barra(self):
        return self.barra_values[:, self.test_series, :]

    @property
    def test_industy_filter(self):
        return self.industy_filter[self.test_series]

    @property
    def ret_values_for_count_nan(self):
        now_barra_cols = list(self.barra.keys())
        if len(now_barra_cols) != len(self._barra_cols) or self._ret is None:
            self._barra_cols = now_barra_cols
        return self._ret
    
    @property
    def not_nan_num(self):
        return pd.DataFrame(self.ret_values_for_count_nan).count(axis=1).values

    def add_barra(self, new_barras):
        # 加入传入的 barra_dict
        if isinstance(new_barras, dict):
            for name, data in new_barras.items():
                if name in list(self.barra.keys()):
                    continue
                self.barra[name] = data
        # 数据对齐
        for name in list(self.barra.keys()):
            self.barra[name] = self.barra[name].reindex(index=self.data_index, columns=self.data_cols)

    # Reads the bar, status and Barra tables and builds the base-factor, Barra and return matrices.
    def _load_data(self):
        """Read market, status and Barra tables and fill ``data``, ``barra`` and ``raw_ret``.

        Steps:
        1. Read daily bars and the adjusted VWAP table and merge them, together
           with the stock-status table, on (date, instrument).
        2. Compute a forward return per instrument from ``config.return_field``.
        3. Blank the return of rows failing the status filter, pivot to a
           date x instrument matrix and forward-fill it along the date axis.
        4. Store the base factors and every Barra column whose name does not
           start with ``Rank_``, reindexed to the return matrix.
        """
        log.info(f'loading data from {self.start_date} to {self.end_date}...')
        # Base factor data
        bar1d_data_cols = ['close', 'high', 'low', 'open', 'volume', 'turn']
        bar1d_data = DataSource('bar1d_CN_STOCK_A').read(start_date=self.start_date, end_date=self.end_date, fields=bar1d_data_cols)
        # Adjusted time/volume-weighted average price table. wap_3_vwap_buy is
        # always read because it is a base factor; the return field is only
        # added when it is a different column, so no field is requested twice.
        return_field = self.config.return_field
        wap_fields = ['wap_3_vwap_buy']
        if return_field not in wap_fields:
            wap_fields.append(return_field)
        bar1d_wap_adj_data = DataSource('bar1d_wap_CN_STOCK_A_adj').read(start_date=self.start_date, end_date=self.end_date, fields=wap_fields)
        # Merge with the frame holding the return field as the left side
        merged_data = pd.merge(bar1d_wap_adj_data, bar1d_data, how='left', on=['date', 'instrument'])
        # Status columns used to filter ST, suspended and limit-locked rows
        status_data = DataSource('stock_status_CN_STOCK_A').read(start_date=self.start_date, end_date=self.end_date, fields=['price_limit_status', 'st_status', 'suspended'])
        merged_data = pd.merge(merged_data, status_data, how='left', on=['date', 'instrument'])
        # Forward return: percentage change over ``rebalance_period`` rows,
        # shifted back by ``rebalance_period + 1`` rows within each instrument.
        # It is computed from the configured return field (the column name was
        # hard-coded before, so ``return_field`` had no effect), and
        # ``transform`` keeps the result aligned with the original row index.
        period = self.config.rebalance_period
        merged_data['return'] = merged_data.groupby('instrument')[return_field].transform(lambda x: x.pct_change(period).shift(-1 * (period + 1)))
        # TODO: rows with missing status values are dropped by hand for now
        merged_data.dropna(subset=['price_limit_status', 'st_status', 'suspended'], inplace=True)
        merged_data['suspended'] = merged_data['suspended'].astype('bool')
        # NOTE(review): presumably price_limit_status == 2 means "not at a
        # price limit" and st_status == 0 means "not ST" - confirm the codes.
        condition = (merged_data['price_limit_status'] == 2) & (merged_data['st_status'] == 0) & (merged_data['suspended'] == False) & ~(merged_data['high'] == merged_data['low'])
        merged_data.loc[~condition, 'return'] = np.nan
        merged_data_pivot_table = merged_data.pivot_table(index='date', columns='instrument')
        raw_ret = merged_data_pivot_table['return']
        self.raw_ret = raw_ret.ffill(axis=0)

        # The return matrix defines the index / columns all other frames are
        # aligned to, unless they were set before.
        if self.data_index is None or self.data_cols is None:
            self.data_index = self.raw_ret.index
            self.data_cols = self.raw_ret.columns
        # Store the base factors
        data_cols = [*bar1d_data_cols, 'wap_3_vwap_buy']
        for col in data_cols:
            self.data[col] = merged_data_pivot_table[col]
            self.data[col] = self.data[col].reindex(index=self.data_index, columns=self.data_cols)
        # Barra data
        barra_data = DataSource(self.config.barra_table_name).read(start_date=self.start_date, end_date=self.end_date)
        barra_cols = [i for i in barra_data.columns if not i.startswith('Rank_')]
        barra_df = barra_data[barra_cols]
        barra_cols_set = set(barra_cols) - {'date', 'instrument'}
        for col in barra_cols_set:
            one_barra = barra_df[['date', 'instrument', col]].pivot_table(index='date', columns='instrument', values=col)
            one_barra = one_barra.reindex(index=self.data_index, columns=self.data_cols)
            self.barra[col] = one_barra
        log.info(f'loaded data from {self.start_date} to {self.end_date} successfully')

    def neutralization(self, data, data_type='all'):
        
        def matrixMul(A, B):
            try:
                nshape = B.shape[1]
            except IndexError as e:
                B = B.reshape(len(B),1)
    
            if len(A[0]) == len(B):
                res = [[0] * len(B[0]) for i in range(len(A))]
                for i in range(len(A)):
                    for j in range(len(B[0])):
                        for k in range(len(B)):
                            res[i][j] += A[i][k] * B[k][j]
                return np.array(res).reshape(len(res))
            return ('输入矩阵有误！')
        
        """对传入数据进行中性化"""
        barra_values = self.barra_values
        industy_filter = self.industy_filter
        if data_type == 'train':
            barra_values = self.train_barra
            industy_filter = self.train_industy_filter
        elif data_type == 'val':
            barra_values = self.validate_barra
            industy_filter = self.validate_industy_filter
        elif data_type == 'test':
            barra_values = self.test_barra
            industy_filter = self.test_industy_filter
            
        ret = np.full(data.shape, np.nan)
        tmp = np.append(barra_values, data)
        tmp = tmp.reshape(barra_values.shape[0] + 1, barra_values.shape[1], barra_values.shape[2]).swapaxes(0,1)
        bool_ = np.isnan(tmp).sum(axis=1) == 0
        time_dot = 0
        time_lstsq = 0
        
        for i in range(data.shape[0]):
            try:
                Y = data[i, bool_[i]]
                X = barra_values[industy_filter[i], i, :].T
                
#                 time1 = stop_watch.duration
#                 x2 = np.linalg.lstsq(X[bool_[i], :], Y)[0]
#                 time2 = stop_watch.duration
#                 x1 = np.dot(X[bool_[i], :], x2)
#                 time3 = stop_watch.duration
#                 time_dot += time3 - time2
#                 time_lstsq += time2 - time1
#                 ret[i, bool_[i]] = Y - x1
                
                ret[i, bool_[i]] = Y - matrixMul(X[bool_[i], :], np.linalg.lstsq(X[bool_[i], :], Y)[0])
            except:
                pass
        # print(f"time_dot: {time_dot},  time_lstsq: {time_lstsq}")
        return ret

    def outlier_limit(self, data, n_extremum=5):
        """Clip each row to ``median ± n_extremum * MAD`` (NaN-aware row median and median absolute deviation)."""
        center = bn.nanmedian(data, axis=1).reshape(-1, 1)
        mad = bn.nanmedian(np.abs(data - center), axis=1).reshape(-1, 1)
        upper_bound = center + n_extremum * mad
        lower_bound = center - n_extremum * mad
        with np.errstate(invalid='ignore'):
            return np.clip(data, lower_bound, upper_bound)

    def normalize(self, data):
        """Row-wise z-score using the NaN-aware mean and standard deviation (ddof=1) of each row."""
        row_mean = bn.nanmean(data, axis=1).reshape(-1, 1)
        row_std = bn.nanstd(data, axis=1, ddof=1).reshape(-1, 1)
        with np.errstate(invalid='ignore'):
            return (data - row_mean) / row_std

    def check_nan_num(self, df):
        """对传入的 dataframe 进行 NaN 值检查，NaN 值大于一定比例则返回 False"""
        pos_num = np.argwhere(np.product(np.isnan(df.values), axis=1) == 0)  # 得到存在 nan 的时间截面的 index
        if len(pos_num) == 0 or pos_num[0][0] > 200:   # if pos_num[0][0] > 200: 
            return False
        not_na_df = df.count(axis = 1).values
        percent = (1 - not_na_df / self.not_nan_num)[pos_num[0][0]:]
        if 1 in percent:
            return False
        if (percent> 0.2).sum() / len(self.not_nan_num[pos_num[0][0]:]) > 0.05:
            return False
        return True


class Fitnesses(object):
    """适应度函数类"""
    _methods = ["icir", "mutual_info", "long_sharpe", "long_vol", "longshort_vol", "longshort_sharpe", "long_return","longshort_return"]

    def __init__(self, config):
        # 适应度函数选取
        self.fitness = config.fitness
        if self.fitness not in self._methods:
            raise Exception("请输入正确的适应度函数类型")
        self.ir_type = config.ir_type

    def _nan_drop(self, x, y):
        """删除缺失值的处理方法"""
        merged = np.vstack((x, y)).T
        merged = merged[~np.isnan(merged).any(1)].T
        if merged.size == 0:
            return None
        return merged

    def _nan_fill(self, arr):
        """向前填充的处理方法"""
        # TODO: 关于填充这块还要再作改进，1、这里是前向填充，如果开头为nan的是填充不到的，2、存在一列全为nan的情况，还是应该进行删除操作
        arr = arr.T
        mask = np.isnan(arr)
        idx = np.where(~mask, np.arange(mask.shape[1]), 0)
        return arr[np.arange(idx.shape[0])[:, None], idx].T

    def _pct_change(self, x):
        b = np.roll(x, 1)
        try:
            b[0] = np.nan
        except:
            return []        
        return (x-b)/b

    def _calculate_longshort_return(self, factor, ret_values):
        """计算每天的多空收益，返回列表"""
        merged = self._nan_drop(factor, ret_values)
        if merged is None:
            return np.nan
        merged = merged.T
        sort = merged[merged[:,0].argsort()]  # 升序排列
        amount = len(sort)
        groups = int(np.ceil(amount/10)) # 10组
        first_group = sort[-groups:]
        last_group = sort[:groups]
        longshort_ret = np.mean(first_group[:, 1]) - np.mean(last_group[:, 1])
        return longshort_ret

    def _calculate_long_return(self, factor, ret_values):
        """计算每天的多头收益，返回列表"""
        merged = self._nan_drop(factor, ret_values)
        if merged is None:
            return np.nan
        merged = merged.T
        sort = merged[merged[:,0].argsort()]
        amount = len(sort)
        groups = int(np.ceil(amount/10))
        first_group = sort[-groups:]
        long_ret = np.mean(first_group[:, 1])
        return long_ret

    def calculate_longshort_index(self, factor, longshort=True, eval_type='all'):
        """Sharpe ratio, volatility or total return of the row-by-row long-short / long-only returns.

        Which statistic is returned depends on the suffix of ``self.fitness``
        ("sharpe", "vol" or "return"). Returns NaN when no row return is valid.
        """
        # TODO: pass the data object in as a parameter instead of selecting it in every method
        data = all_data_dp
        ret_values = data.ret_values
        split_attr = {'train': 'train_ret', 'val': 'validate_ret', 'test': 'test_ret'}.get(eval_type)
        if split_attr is not None:
            data = data_dp
            ret_values = getattr(data, split_attr)
        per_row = self._calculate_longshort_return if longshort else self._calculate_long_return
        returns = np.array([per_row(f_row, r_row) for f_row, r_row in zip(factor, ret_values)])
        returns = returns[~np.isnan(returns)]
        if returns.size == 0:
            return np.nan
        if self.fitness.endswith("sharpe"):
            fit = em.sharpe_ratio(returns, 0.035/252)
        elif self.fitness.endswith("vol"):
            fit = em.annual_volatility(returns)
        elif self.fitness.endswith("return"):
            fit = np.cumsum(returns)[-1]
        return fit

    def calculate_factor_ir(self, factor, eval_type='all'):
        """Absolute annualised IR (mean IC / std IC * sqrt(250)) of ``factor``; 0 when it is NaN or cannot be computed."""
        data = all_data_dp
        ret_values = data.ret_values
        split_attr = {'train': 'train_ret', 'val': 'validate_ret', 'test': 'test_ret'}.get(eval_type)
        if split_attr is not None:
            data = data_dp
            ret_values = getattr(data, split_attr)
        # Row-by-row IC: mean product of the demeaned factor and returns,
        # divided by the product of their row standard deviations (ddof=1).
        factor_dev = factor - bn.nanmean(factor, axis=1).reshape(-1, 1)
        ret_dev = ret_values - bn.nanmean(ret_values, axis=1).reshape(-1, 1)
        scale = bn.nanstd(factor, axis=1, ddof=1) * bn.nanstd(ret_values, axis=1, ddof=1)
        IC = bn.nanmean(factor_dev * ret_dev, axis=1) / scale
        IR = 0
        try:
            IR = np.abs(bn.nanmean(IC) / bn.nanstd(IC) * np.sqrt(250))
            if np.isnan(IR):
                IR = 0
        except:
            pass

        return IR

    def _entropy(self, c):
        c_normalized = c / float(np.sum(c))
        c_normalized = c_normalized[np.nonzero(c_normalized)]
        entropy = -sum(c_normalized * np.log2(c_normalized))  
        return entropy

    def _cal_mutual_info(self, x, y):
        merged = self._nan_drop(x, y)
        if merged is None:
            return np.nan
        x = merged[0]
        y = merged[1]

        c_xy = np.histogram2d(x, y, 10)[0]
        c_x = np.histogram(x, 10)[0]
        c_y = np.histogram(y, 10)[0]

        h_x = self._entropy(c_x)
        h_y = self._entropy(c_y)
        h_xy = self._entropy(c_xy)

        mutual_info = h_x + h_y - h_xy
        mutual_info = 2 * (mutual_info / (h_x + h_y))
        return mutual_info

    def calculate_factor_mutual_info(self, factor, eval_type='all'):
        """NaN-aware mean over rows of the mutual information between ``factor`` and the returns."""
        data = all_data_dp
        ret_values = data.ret_values
        split_attr = {'train': 'train_ret', 'val': 'validate_ret', 'test': 'test_ret'}.get(eval_type)
        if split_attr is not None:
            data = data_dp
            ret_values = getattr(data, split_attr)

        per_row = [self._cal_mutual_info(f_row, r_row) for f_row, r_row in zip(factor, ret_values)]
        return bn.nanmean(per_row)

    def fitness_choose(self, factor, eval_type='all'):
        if self.fitness == "icir":
            fit = self.calculate_factor_ir(factor=factor, eval_type=eval_type)
        elif self.fitness == "mutual_info":
            fit = self.calculate_factor_mutual_info(factor=factor, eval_type=eval_type)
        elif self.fitness.startswith("longshort"):
            fit = self.calculate_longshort_index(factor=factor, longshort=True, eval_type=eval_type)
        elif self.fitness.startswith("long"):
            fit = self.calculate_longshort_index(factor=factor, longshort=False, eval_type=eval_type)
        else:
            raise Exception("适应度函数名错误")
        return fit

    def evaluate_fitness(self, individual, eval_type='all'):
        """计算传入个体的因子值并计算 IR 值"""
        factor_result = self.evaluate_factor(individual=individual, eval_type=eval_type)
        factor = factor_result[0]
        if self.ir_type == 'rank_ir' and self.fitness == "icir":
            factor = factor_result[1]
        if isinstance(factor, float):
            if factor != factor:
                return factor, individual
        factor = self._nan_fill(factor)
        # 根据self.fitness的类型选择对应的适应度函数
        fit = self.fitness_choose(factor, eval_type=eval_type)
        return fit, individual
    
    def evaluate_factor(self, individual, eval_type='all'):
        """Compute the processed factor values of ``individual``.

        The expression is compiled, evaluated on the base data, restricted to
        the rows of the requested split, standardised (with outlier clipping
        when it has at least 10000 distinct values) and neutralised.

        Returns:
            ``(factor, rank_factor)``. ``rank_factor`` holds row-wise ranks
            (NaN where the factor is NaN) when rank IR is configured,
            otherwise it is ``factor`` itself. ``(nan, None)`` is returned
            when every factor value is NaN.
        """
        data = all_data_dp
        index_series = data.full_series
        if eval_type == 'train':
            data = data_dp
            index_series = data.train_series
        elif eval_type == "val":
            data = data_dp
            index_series = data.val_series
        elif eval_type == 'test':
            data = data_dp
            index_series = data.test_series
        # Turn the individual into a callable expression
        func = toolbox.compile(expr=individual)
        # The parameter names of the expression are the data fields it needs
        func_names = list(inspect.signature(func).parameters.keys())
        # Inputs are converted to float64 before evaluation
        param = {i: data.data[i].values.astype(np.float64) for i in func_names}
        factor = func(**param)[index_series]
        # Treat infinite results as missing
        factor[np.isinf(factor)] = np.nan
        if np.isnan(factor).all():
            return np.nan, None
        if len(np.unique(factor[~np.isnan(factor)])) < 10000:
            factor = data.normalize(factor)
        else:
            factor = data.normalize(data.outlier_limit(factor))
        factor = data.neutralization(factor, data_type=eval_type)
        rank_factor = factor
        if self.ir_type == 'rank_ir' and self.fitness == "icir":
            # argsort places NaN last, so NaN entries used to receive the
            # highest ranks; mask them back to NaN. Ranks of valid entries
            # are unchanged.
            ordinal = (factor.argsort(1)).argsort(1) + 1
            rank_factor = np.where(np.isnan(factor), np.nan, ordinal)
        return factor, rank_factor

    def compare_fitness(self, Threshold, fit):
        """比较阈值和适应度值"""
        if self.fitness.endswith("vol"):
            if Threshold > fit:
                return True
        elif Threshold < fit:
                return True
        else:
            return False
    
def varAnd(population):
    """Clone ``population`` and apply crossover followed by three mutation passes.

    The passes run in this order over the cloned individuals: crossover on
    adjacent pairs, uniform (subtree) mutation, shrink mutation, and node
    replacement. Each operator fires with the probability read from
    ``config`` (``cxpb``, ``mutpb``, ``mutspb``, ``mutnrpb``). Any individual
    touched by an operator has its fitness values deleted.

    Args:
        population: the individuals to vary; they are cloned, not modified.

    Returns:
        list: the varied clones, same length as ``population``.
    """
    children = [toolbox.clone(parent) for parent in population]
    size = len(children)

    # Crossover on neighbouring pairs (0, 1), (2, 3), ...
    for right in range(1, size, 2):
        left = right - 1
        if random.random() < config.cxpb:
            children[left], children[right] = toolbox.mate(children[left], children[right])
            del children[left].fitness.values, children[right].fitness.values

    def _mutation_pass(rate_attr, operator_attr):
        # One sweep over all children with a single mutation operator; the
        # operators return a one-element tuple, hence the unpacking comma.
        for idx in range(size):
            if random.random() < getattr(config, rate_attr):
                children[idx], = getattr(toolbox, operator_attr)(children[idx])
                del children[idx].fitness.values

    # Uniform mutation
    _mutation_pass("mutpb", "mutate")
    # Shrink mutation
    _mutation_pass("mutspb", "mutate_shrink")
    # Node-replacement (point) mutation
    _mutation_pass("mutnrpb", "mutate_NodeReplacement")
    return children

def drop_duplicates(individuals):
    """Remove individuals whose generated expression string is a duplicate.

    Two individuals are considered the same when ``str()`` of them is equal.
    The first individual seen for each expression is kept, and the result
    preserves first-seen order.

    Args:
        individuals: iterable of individuals.

    Returns:
        list: one individual per distinct expression string.
    """
    unique_by_expr = {}
    for ind in individuals:
        # setdefault keeps the first individual and is a single O(1) lookup,
        # instead of rebuilding a key list for every membership test.
        unique_by_expr.setdefault(str(ind), ind)
    return list(unique_by_expr.values())

def check_average_fitness(average_fitness):
    """Check whether the per-generation average fitness has clearly converged.

    The most recent quarter of the history (rounded up) is the comparison
    window. The series counts as converged when every later value in that
    window differs from the window's first value by less than 5% of that
    first value's magnitude.

    Args:
        average_fitness: sequence of average fitness values, oldest first.

    Returns:
        bool: ``True`` when converged. ``False`` when the window holds fewer
        than two points (fewer than five generations recorded), when any
        value involved is NaN, or when the window's first value is zero
        (relative change is undefined there).
    """
    # TODO: the convergence criterion still needs thought: maybe remove noise
    #       from the per-generation values first, or normalise them and
    #       threshold the standard deviation of the last few values.
    length = len(average_fitness)
    cal_length = (length + 3) // 4  # ceil(length / 4) without floating point
    # A single-point window would only compare a value with itself, which
    # would report convergence after the very first generation.
    if cal_length < 2:
        return False
    compared = average_fitness[-cal_length]
    tolerance = 0.05 * abs(compared)
    for i in range(1, cal_length):
        # `not` rather than `~`: on a plain Python bool `~` is integer
        # inversion (-1 / -2, both truthy), which made the old check always
        # report "not converged". The absolute, division-free form also keeps
        # a sharp drop or a negative baseline from counting as converged.
        if not abs(average_fitness[-i] - compared) < tolerance:
            return False
    return True

# def run_factorlens(factor):
#     """ 根据传入的因子名（形如alpha000X）运行因子分析模块"""
#     log.info(f'开始运行因子分析：{factor}')
#     factor_df = saved_factor_values[factor].stack().reset_index(name=factor)
#     factor_ds = DataSource.write_pickle(factor_df)
#     factorlens_module = M.factorlens.v1(
#         features=[factor],
#         user_factor_data=factor_ds,
#         title='因子分析: {factor_name}',
#         start_date=all_data_dp.start_date,
#         end_date=all_data_dp.end_date,
#         rebalance_period=config.rebalance_period,
#         stock_pool=config.stock_pool,
#         quantile_count=config.quantile_count,
#         commission_rate=config.commission_rate,
#         drop_price_limit_stocks=config.drop_price_limit_stocks,
#         drop_st_stocks=config.drop_st_stocks,
#         drop_new_stocks=config.drop_new_stocks,
#         # 标准化和中性化默认不启用，因为挖掘到的因子已经在计算因子值的过程中做了相应处理了
#         normalization=False,
#         neutralization=[],
#         metrics=config.metrics
#     )


## 初始化相关类实例，定义遗传算法结构

  - 实例化 Config 配置类
  - 实例化数据读取处理类，加载短区间和全区间数据
  - 定义遗传算法结构及相关工具方法

In [5]:
# Initialise the shared instances and define the genetic-algorithm structure
# Seed the random number generator
random.seed(101)
# Config instance; the parameters used while the genetic algorithm runs are all controlled through it
config = Config()
# Data processor for the short interval
data_dp = DataProcessor(config, data_type='short')
# Data processor for the full interval
all_data_dp = DataProcessor(config, data_type='all')
# Stopwatch instance used for timing
stop_watch = StopWatch()
# Fitness-function class
fitness = Fitnesses(config)
# Create the individual type
# and choose how fitness is judged: a fitness name ending in "vol" gets weight -1.0 (minimised),
# every other fitness gets weight 1.0 (maximised)
if config.fitness.endswith("vol"):
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
    creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)
else:
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)
# Build the primitive set
# Inject the operator functions, the constant terminals and the base factors
# (one np.ndarray argument per key of data_dp.data) into the pset object
pset = gp.PrimitiveSetTyped("MAIN", (np.ndarray,) * len(list(data_dp.data.keys())), np.ndarray)
funcs = inspect.getmembers(Functions)
for name, func in funcs:
    # Skip private / dunder members of Functions
    if name.startswith('_'):
        continue
    sig = inspect.signature(func)
    pa = [p.annotation for p in sig.parameters.values()]
    # A typed primitive set needs a type for every argument, so members with
    # any unannotated parameter are not registered. Parameter.empty is the
    # public name of the "no annotation" sentinel.
    if any(annotation is inspect.Parameter.empty for annotation in pa):
        continue
    pset.addPrimitive(func, pa, sig.return_annotation)
# Integer constants become terminals named after their value
for i in config.constant_nums:
    pset.addTerminal(i, int, str(i))
# Rename ARG0, ARG1, ... to the corresponding data keys
args_dict = {f'ARG{index}': key_name for index, key_name in enumerate(list(data_dp.data.keys()))}
pset.renameArguments(**args_dict)
# Build the toolbox
# Register the helper routines the genetic algorithm uses so later code can
# call them through the toolbox
toolbox = base.Toolbox()

# Expression / individual / population construction and compilation
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

# Selection, crossover and the three mutation operators
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)
toolbox.register("mutate_shrink", gp.mutShrink)
toolbox.register("mutate_NodeReplacement", gp.mutNodeReplacement, pset = pset)

# Apply a height limit of 10 to every variation operator
for _alias in ("mate", "mutate", "mutate_shrink", "mutate_NodeReplacement"):
    toolbox.decorate(_alias, gp.staticLimit(key=operator.attrgetter("height"), max_value=10))

# Fitness evaluation on each split of the short-interval data ...
for _split in ('train', 'val', 'test'):
    toolbox.register(f"evaluate_{_split}", fitness.evaluate_fitness, eval_type=_split)
# ... and factor-value computation on the full interval
toolbox.register("evaluate_factor", fitness.evaluate_factor, eval_type='all')

# Module-level multiprocessing pool (left disabled, as in the original)
# pool = multiprocessing.Pool(processes=config.pool_processes_limit)
# toolbox.register("map", pool.map)




[2021-05-14 11:08:58.812465] INFO: : loading data from 2017-01-01 to 2018-11-01...

[2021-05-14 11:09:37.040329] INFO: : loaded data from 2017-01-01 to 2018-11-01 successfully

[2021-05-14 11:09:37.171658] INFO: : loading data from 2016-10-01 to 2020-11-05...

[2021-05-14 11:10:56.016226] INFO: : loaded data from 2016-10-01 to 2020-11-05 successfully

## 开始使用定义的遗传算法进行因子挖掘

In [6]:
# Run factor mining with the genetic algorithm defined above
# Expressions of the factors that passed every check, keyed by alpha name
saved_factor_exprs = {}
# Factor values for each saved expression (already neutralised: neutralisation is done while the
# expression is evaluated); they are handed to the data processors for later mining iterations
saved_factor_values = {}
# Stopwatch instance (disabled here; the stop_watch created during initialisation is used)
# stop_watch = StopWatch()
for mt in range(1, config.mtime + 1):
    mt_duration = stop_watch.duration
    log.info(f"== 开始第「{mt}」次因子挖掘 ==")

    # Initialise the population
    population = toolbox.population(n=config.init_ind_num)

    # Average fitness recorded for every generation of this mining round
    average_fitness = []

    # Statistics to collect over the individuals' fitness values
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", bn.nanmean)
    stats.register("std", bn.nanstd)
    stats.register("min", bn.nanmin)
    stats.register("max", bn.nanmax)

    # Logbook, used together with the statistics so the metrics can be printed conveniently
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    # Begin the generational process
    for gen in range(1, config.ngen + 1):
        # Halls of fame (keep up to the given maximum number of best individuals, ordered
        # best-first according to the fitness objective); both are re-created every generation
        pass_hall_of_fame = tools.HallOfFame(config.init_ind_num)
        final_hall_of_fame = tools.HallOfFame(int(config.init_ind_num * 0.1))
        log.info(f"-- 开始第「{mt}」次循环第「{gen}」代挖掘 --")
        print(f"-- 开始第「{mt}」次循环第「{gen}」代挖掘 --")
        # Evaluate the individuals with an invalid fitness
        log.info('-- 开始使用训练数据计算表达式适应度··· --')
        train_duration = stop_watch.duration
        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        # De-duplicate identical expressions before evaluating to avoid repeated work
        print(f"去重前的个体数{len(invalid_ind)}")
        invalid_ind = drop_duplicates(invalid_ind)
        print(f"去重后的个体数{len(invalid_ind)}")
        # Compute the fitness of each factor on the short interval
        # NOTE(review): the three bare attribute reads below have no visible effect here —
        # presumably they force lazily computed return arrays to exist before the worker
        # processes are created; confirm against DataProcessor
        data_dp.train_ret
        data_dp.validate_ret
        data_dp.test_ret
        # A fresh pool is created for each evaluation pass; each result is a (fitness, individual) pair
        with multiprocessing.Pool(processes=config.pool_processes_limit) as pool:
            fitnesses = pool.map(toolbox.evaluate_train, invalid_ind)

        with multiprocessing.Pool(processes=config.pool_processes_limit) as pool:
            fitnesses_val = pool.map(toolbox.evaluate_val, invalid_ind)

        # The per-generation average is taken over the validation fitness, ignoring NaN
        average_fitness.append(bn.nanmean([fit for fit, ind in fitnesses_val]))
        print(f"每代的平均适应度：{average_fitness}")

        pass_population = []
        record_population = []
        # Keep the individuals whose training fitness passes config.train_fitness in pass_population;
        # record_population keeps every evaluated individual
        for fit, ind in fitnesses:
            ind.fitness.values = (fit,)
            record_population.append(ind)
            # print(f"因子{ind}在训练集适应度值为{fit}")
            if fitness.compare_fitness(config.train_fitness, fit):
                pass_population.append(ind)
        log.info(f'因子适应度在训练数据上检测完成，共耗时 [{stop_watch.duration - train_duration}s]')
        log.info(f'共「{len(pass_population)}」个表达式：{[str(pop) for pop in pass_population]}通过训练数据检测')
        log.info('-- 使用训练数据计算表达式适应度完成 --')

        # Update the pass_hall_of_fame with the generated individuals
        pass_hall_of_fame.update(pass_population)

        log.info('-- 开始使用测试数据计算表达式适应度··· --')
        test_duration = stop_watch.duration
        # Evaluate the individuals that passed the training check on the test split
        with multiprocessing.Pool(processes=config.pool_processes_limit) as pool:
            pass_fitnesses = pool.map(toolbox.evaluate_test, pass_population)
        final_population = []
        # Keep the individuals whose test fitness passes config.test_fitness in final_population
        for fit, ind in pass_fitnesses:
            ind.fitness.values = (fit,)
            print(f"因子{ind}在测试集适应度值为{fit}")
            if fitness.compare_fitness(config.test_fitness, fit):
                final_population.append(ind)
        log.info(f'因子适应度在测试数据上检测完成，共耗时 [{stop_watch.duration - test_duration}s]')
        log.info(f'共「{len(final_population)}」个表达式：{[str(pop) for pop in final_population]}通过测试数据检测')
        log.info('-- 使用测试数据计算表达式适应度完成 --')

        # Update the final_hall_of_fame with the generated individuals
        final_hall_of_fame.update(final_population)

        log.info('-- 开始使用全区间数据计算表达式适应度 --')
        all_duration = stop_watch.duration
        # Whether a factor was saved in this generation
        save_flag = False
        # Walk the final hall of fame, computing full-interval factor values and fitness for each
        # entry; the first entry that qualifies is stored in saved_factor_values and
        # saved_factor_exprs and the walk stops.
        # If no entry in the hall of fame qualifies, mining simply moves on to the next generation
        for final_pop in final_hall_of_fame.items:
            alpha_name = f"alpha{len(saved_factor_exprs) + 1:0>4}"
            factor_expr = str(final_pop)
            # Skip expressions that have already been saved
            if factor_expr in saved_factor_exprs.values():
                log.info(f'表达式[{factor_expr}]已在已保存表达式列表中，跳过该表达式...')
                continue
            # evaluate_factor's result is indexed as [0] = factor values, [1] = rank-transformed values
            fianl_factor_result = toolbox.evaluate_factor(final_pop)
            fianl_factor = fianl_factor_result[0]
            fianl_factor = pd.DataFrame(fianl_factor, index=all_data_dp.data_index, columns=all_data_dp.data_cols)
            # A falsy result is treated as "too many missing values" (see the else branch below)
            save_condition = all_data_dp.check_nan_num(fianl_factor)
            if save_condition:
                # Rank IC/IR uses the rank-transformed factor; every other fitness uses the raw values
                if fitness.ir_type == 'rank_ir' and fitness.fitness == "icir":
                    fit_all = fitness.calculate_factor_ir(factor=fianl_factor_result[1], eval_type='all')
                else:
                    fit_all = fitness.fitness_choose(factor=fianl_factor.values.astype(np.float64), eval_type='all')
                if fitness.compare_fitness(config.test_fitness, fit_all):
                    saved_factor_values[alpha_name] = fianl_factor
                    saved_factor_exprs[alpha_name] = factor_expr
                    save_flag = True
                    log.info(f'添加表达式[{factor_expr}]至表达式保存列表中')
                    # At most one expression is saved per generation
                    break
            else:
                log.info(f'表达式[{factor_expr}]因子值缺失值过多，跳过该因子...')
        log.info(f'因子适应度在全区间数据上检测完成，共耗时 [{stop_watch.duration - all_duration}s]')
        log.info('-- 使用全区间数据计算表达式适应度完成 --')
        if not save_flag:
            log.info('未找到适应度满足条件的表达式，故本代挖掘未添加任何表达式至表达式保存列表中')

        # Hand all saved factor values to both data processors
        # NOTE(review): presumably so later evaluations neutralise against the factors already
        # found — confirm in DataProcessor.add_barra
        data_dp.add_barra(saved_factor_values)
        all_data_dp.add_barra(saved_factor_values)

        # Replace the current population by the pass_population
        # i.e. the individuals that passed the training-fitness check replace the previous ones
        population = pass_population
        # If nothing passed, fall back to every individual evaluated in this generation
        # so crossover and mutation still have parents to work on
        if not population:
            population = record_population
        # If the average fitness has clearly converged, leave this mining round
        if check_average_fitness(average_fitness):
            log.info('种群平均适应度已明显收敛，跳出该次挖掘，进入下一次挖掘')
            break
        print(f"pass:{len(pass_population)}, record:{len(record_population)}, population: {len(population)}")
        # Append the current generation statistics to the logbook
        record = stats.compile(population) if stats else {}
        logbook.record(gen=gen, nevals=len(invalid_ind), **record)
        log.info(f'因子挖掘过程中相关指标：{repr(logbook)}')

        log.info('-- 开始进行下一代因子挖掘 --')
        # Select the next generation individuals
        offspring = toolbox.select(population, config.init_ind_num)
        # Vary the pool of individuals
        offspring = varAnd(offspring)
        # Replace the current population by the offspring
        population[:] = offspring
        print(f"下一代挖掘的个体数：{len(offspring)}")
    # Plot the per-generation statistics as a line chart (the bare string below is a
    # no-op expression statement that served as the original note)
    """绘制每代的统计指标折线图"""
    df = pd.DataFrame(logbook)
    df.set_index("gen", inplace=True)
    T.plot(df)
    log.info(f'== 第「{mt}」次因子挖掘循环结束，共耗时 [{stop_watch.duration - mt_duration}s] ==')
log.info(f'所有通过检验的因子共「{len(saved_factor_exprs)}」个：{saved_factor_exprs}')
log.info(f'本次因子挖掘共循环「{config.mtime}」次，每次进行种群数量为「{config.init_ind_num}」个的挖掘共「{config.ngen}」代，共耗时 [{str(stop_watch)}]')


[2021-05-14 11:10:56.697633] INFO: : == 开始第「1」次因子挖掘 ==

[2021-05-14 11:10:56.699643] INFO: : -- 开始第「1」次循环第「1」代挖掘 --

-- 开始第「1」次循环第「1」代挖掘 --


[2021-05-14 11:10:56.701789] INFO: : -- 开始使用训练数据计算表达式适应度··· --

去重前的个体数20
去重后的个体数19


每代的平均适应度：[3.939780349249654]


[2021-05-14 11:23:21.557794] INFO: : 因子适应度在训练数据上检测完成，共耗时 [744.8537080967799s]

[2021-05-14 11:23:21.562934] INFO: : 共「6」个表达式：['standardation(turn)', 'ts_rank(turn, 6)', 'delta(mean3(turn, volume, open), constant(2))', 'standardation(argmin(volume, 9))', 'argmin(mean3(low, turn, wap_3_vwap_buy), constant(2))', 'argmax(standardation(high), constant(8))']通过训练数据检测

[2021-05-14 11:23:21.564774] INFO: : -- 使用训练数据计算表达式适应度完成 --

[2021-05-14 11:23:21.566611] INFO: : -- 开始使用测试数据计算表达式适应度··· --

因子standardation(turn)在测试集适应度值为6.054938369502235
因子ts_rank(turn, 6)在测试集适应度值为11.991338865535779
因子delta(mean3(turn, volume, open), constant(2))在测试集适应度值为12.12722293507771
因子standardation(argmin(volume, 9))在测试集适应度值为4.837533137447563
因子argmin(mean3(low, turn, wap_3_vwap_buy), constant(2))在测试集适应度值为18.81557066210864
因子argmax(standardation(high), constant(8))在测试集适应度值为8.444582103616625


[2021-05-14 11:26:32.050317] INFO: : 因子适应度在测试数据上检测完成，共耗时 [190.48139811865985s]

[2021-05-14 11:26:32.054828] INFO: : 共「6」个表达式：['standardation(turn)', 'ts_rank(turn, 6)', 'delta(mean3(turn, volume, open), constant(2))', 'standardation(argmin(volume, 9))', 'argmin(mean3(low, turn, wap_3_vwap_buy), constant(2))', 'argmax(standardation(high), constant(8))']通过测试数据检测

[2021-05-14 11:26:32.057135] INFO: : -- 使用测试数据计算表达式适应度完成 --

[2021-05-14 11:26:32.059620] INFO: : -- 开始使用全区间数据计算表达式适应度 --

[2021-05-14 11:29:04.319750] INFO: : 添加表达式[argmin(mean3(low, turn, wap_3_vwap_buy), constant(2))]至表达式保存列表中

[2021-05-14 11:29:04.321139] INFO: : 因子适应度在全区间数据上检测完成，共耗时 [152.25981116481125s]

[2021-05-14 11:29:04.322002] INFO: : -- 使用全区间数据计算表达式适应度完成 --

pass:6, record:19, population: 6


[2021-05-14 11:29:04.530303] INFO: : 因子挖掘过程中相关指标：[{'gen': 1, 'nevals': 19, 'avg': 8.427380498335753, 'std': 2.724287768979169, 'min': 5.618257041631649, 'max': 12.720923746924477}]

[2021-05-14 11:29:04.540162] INFO: : -- 开始进行下一代因子挖掘 --

下一代挖掘的个体数：20


[2021-05-14 11:29:04.546369] INFO: : -- 开始第「1」次循环第「2」代挖掘 --

-- 开始第「1」次循环第「2」代挖掘 --


[2021-05-14 11:29:04.550543] INFO: : -- 开始使用训练数据计算表达式适应度··· --

去重前的个体数20
去重后的个体数18


每代的平均适应度：[3.939780349249654, 8.234642500767869]


[2021-05-14 11:41:27.346737] INFO: : 因子适应度在训练数据上检测完成，共耗时 [742.7921564597636s]

[2021-05-14 11:41:27.351308] INFO: : 共「14」个表达式：['delta(mean3(turn, volume, low), constant(2))', 'delta(mean3(mean3(turn, volume, open), volume, open), constant(constant(constant(3))))', 'delta(normalization(decay_linear(volume, 9)), constant(2))', 'delta(turn, constant(2))', 'standardation(turn)', 'ts_rank(turn, 6)', 'delta(mean3(turn, volume, high), constant(2))', 'ts_rank(turn, constant(constant(5)))', 'delta(high, 6)', 'ts_rank(vdiv(open, high), 2)', 'ts_rank(volume, 6)', 'delta(mean3(turn, turn, low), 2)', 'delta(mean3(mean3(turn, volume, open), volume, open), 2)', 'delta(volume, 10)']通过训练数据检测

[2021-05-14 11:41:27.353369] INFO: : -- 使用训练数据计算表达式适应度完成 --

[2021-05-14 11:41:27.356446] INFO: : -- 开始使用测试数据计算表达式适应度··· --

因子delta(mean3(turn, volume, low), constant(2))在测试集适应度值为10.567466732289335
因子delta(mean3(mean3(turn, volume, open), volume, open), constant(constant(constant(3))))在测试集适应度值为10.598379214626004
因子delta(normalization(decay_linear(volume, 9)), constant(2))在测试集适应度值为9.131351231794627
因子delta(turn, constant(2))在测试集适应度值为11.49694341323391
因子standardation(turn)在测试集适应度值为7.649560496710479
因子ts_rank(turn, 6)在测试集适应度值为10.229477196796596
因子delta(mean3(turn, volume, high), constant(2))在测试集适应度值为10.56746510116417
因子ts_rank(turn, constant(constant(5)))在测试集适应度值为10.127618209103764
因子delta(high, 6)在测试集适应度值为12.221136927790834
因子ts_rank(vdiv(open, high), 2)在测试集适应度值为10.952556517195317
因子ts_rank(volume, 6)在测试集适应度值为10.076842836170739
因子delta(mean3(turn, turn, low), 2)在测试集适应度值为15.113188619417809
因子delta(mean3(mean3(turn, volume, open), volume, open), 2)在测试集适应度值为10.567460901276593
因子delta(volume, 10)在测试集适应度值为10.923261511247638


[2021-05-14 11:46:49.092850] INFO: : 因子适应度在测试数据上检测完成，共耗时 [321.73382049798965s]

[2021-05-14 11:46:49.095887] INFO: : 共「14」个表达式：['delta(mean3(turn, volume, low), constant(2))', 'delta(mean3(mean3(turn, volume, open), volume, open), constant(constant(constant(3))))', 'delta(normalization(decay_linear(volume, 9)), constant(2))', 'delta(turn, constant(2))', 'standardation(turn)', 'ts_rank(turn, 6)', 'delta(mean3(turn, volume, high), constant(2))', 'ts_rank(turn, constant(constant(5)))', 'delta(high, 6)', 'ts_rank(vdiv(open, high), 2)', 'ts_rank(volume, 6)', 'delta(mean3(turn, turn, low), 2)', 'delta(mean3(mean3(turn, volume, open), volume, open), 2)', 'delta(volume, 10)']通过测试数据检测

[2021-05-14 11:46:49.097191] INFO: : -- 使用测试数据计算表达式适应度完成 --

[2021-05-14 11:46:49.098533] INFO: : -- 开始使用全区间数据计算表达式适应度 --

[2021-05-14 11:49:25.906404] INFO: : 添加表达式[delta(mean3(turn, turn, low), 2)]至表达式保存列表中

[2021-05-14 11:49:25.908350] INFO: : 因子适应度在全区间数据上检测完成，共耗时 [156.80876073613763s]

[2021-05-14 11:49:25.909345] INFO: : -- 使用全区间数据计算表达式适应度完成 --

pass:14, record:18, population: 14


[2021-05-14 11:49:26.074006] INFO: : 因子挖掘过程中相关指标：[{'gen': 1, 'nevals': 19, 'avg': 8.427380498335753, 'std': 2.724287768979169, 'min': 5.618257041631649, 'max': 12.720923746924477}, {'gen': 2, 'nevals': 18, 'avg': 10.066762359692985, 'std': 2.478534520459629, 'min': 5.264795447027606, 'max': 12.501465657378377}]

[2021-05-14 11:49:26.076288] INFO: : -- 开始进行下一代因子挖掘 --

下一代挖掘的个体数：20


[2021-05-14 11:49:26.192562] INFO: : == 第「1」次因子挖掘循环结束，共耗时 [2309.4949275748804s] ==

[2021-05-14 11:49:26.193839] INFO: : == 开始第「2」次因子挖掘 ==

[2021-05-14 11:49:26.195420] INFO: : -- 开始第「2」次循环第「1」代挖掘 --

-- 开始第「2」次循环第「1」代挖掘 --


[2021-05-14 11:49:26.196693] INFO: : -- 开始使用训练数据计算表达式适应度··· --

去重前的个体数20
去重后的个体数20


每代的平均适应度：[3.050069923488978]


[2021-05-14 12:02:23.355457] INFO: : 因子适应度在训练数据上检测完成，共耗时 [777.1567080309615s]

[2021-05-14 12:02:23.361395] INFO: : 共「1」个表达式：['corr(close, high, 10)']通过训练数据检测

[2021-05-14 12:02:23.364598] INFO: : -- 使用训练数据计算表达式适应度完成 --

[2021-05-14 12:02:23.366467] INFO: : -- 开始使用测试数据计算表达式适应度··· --

因子corr(close, high, 10)在测试集适应度值为7.7327944594710765


[2021-05-14 12:04:05.661593] INFO: : 因子适应度在测试数据上检测完成，共耗时 [102.29391523078084s]

[2021-05-14 12:04:05.665456] INFO: : 共「1」个表达式：['corr(close, high, 10)']通过测试数据检测

[2021-05-14 12:04:05.668426] INFO: : -- 使用测试数据计算表达式适应度完成 --

[2021-05-14 12:04:05.671413] INFO: : -- 开始使用全区间数据计算表达式适应度 --

[2021-05-14 12:06:59.381744] INFO: : 添加表达式[corr(close, high, 10)]至表达式保存列表中

[2021-05-14 12:06:59.383767] INFO: : 因子适应度在全区间数据上检测完成，共耗时 [173.7095200046897s]

[2021-05-14 12:06:59.384873] INFO: : -- 使用全区间数据计算表达式适应度完成 --

pass:1, record:20, population: 1


[2021-05-14 12:06:59.557403] INFO: : 因子挖掘过程中相关指标：[{'gen': 1, 'nevals': 20, 'avg': 5.5361709763315625, 'std': 0.0, 'min': 5.5361709763315625, 'max': 5.5361709763315625}]

[2021-05-14 12:06:59.560698] INFO: : -- 开始进行下一代因子挖掘 --

下一代挖掘的个体数：20


[2021-05-14 12:06:59.567635] INFO: : -- 开始第「2」次循环第「2」代挖掘 --

-- 开始第「2」次循环第「2」代挖掘 --


[2021-05-14 12:06:59.569950] INFO: : -- 开始使用训练数据计算表达式适应度··· --

去重前的个体数20
去重后的个体数16


每代的平均适应度：[3.050069923488978, 2.489128389393789]


[2021-05-14 12:19:58.847405] INFO: : 因子适应度在训练数据上检测完成，共耗时 [779.2747653769329s]

[2021-05-14 12:19:58.852629] INFO: : 共「1」个表达式：['corr(close, wap_3_vwap_buy, 10)']通过训练数据检测

[2021-05-14 12:19:58.854891] INFO: : -- 使用训练数据计算表达式适应度完成 --

[2021-05-14 12:19:58.857346] INFO: : -- 开始使用测试数据计算表达式适应度··· --

因子corr(close, wap_3_vwap_buy, 10)在测试集适应度值为11.36495769484284


[2021-05-14 12:21:36.948224] INFO: : 因子适应度在测试数据上检测完成，共耗时 [98.08741973526776s]

[2021-05-14 12:21:36.953626] INFO: : 共「1」个表达式：['corr(close, wap_3_vwap_buy, 10)']通过测试数据检测

[2021-05-14 12:21:36.956294] INFO: : -- 使用测试数据计算表达式适应度完成 --

[2021-05-14 12:21:36.958621] INFO: : -- 开始使用全区间数据计算表达式适应度 --

[2021-05-14 12:24:26.484456] INFO: : 表达式[corr(close, wap_3_vwap_buy, 10)]因子值缺失值过多，跳过该因子...

[2021-05-14 12:24:26.486680] INFO: : 因子适应度在全区间数据上检测完成，共耗时 [169.52576338034123s]

[2021-05-14 12:24:26.487993] INFO: : -- 使用全区间数据计算表达式适应度完成 --

[2021-05-14 12:24:26.489017] INFO: : 未找到适应度满足条件的表达式，故本代挖掘未添加任何表达式至表达式保存列表中

pass:1, record:16, population: 1


[2021-05-14 12:24:26.648842] INFO: : 因子挖掘过程中相关指标：[{'gen': 1, 'nevals': 20, 'avg': 5.5361709763315625, 'std': 0.0, 'min': 5.5361709763315625, 'max': 5.5361709763315625}, {'gen': 2, 'nevals': 16, 'avg': 5.716971812006682, 'std': 0.0, 'min': 5.716971812006682, 'max': 5.716971812006682}]

[2021-05-14 12:24:26.652599] INFO: : -- 开始进行下一代因子挖掘 --

下一代挖掘的个体数：20


[2021-05-14 12:24:26.668015] INFO: : == 第「2」次因子挖掘循环结束，共耗时 [2100.474160777405s] ==

[2021-05-14 12:24:26.669372] INFO: : == 开始第「3」次因子挖掘 ==

[2021-05-14 12:24:26.671019] INFO: : -- 开始第「3」次循环第「1」代挖掘 --

-- 开始第「3」次循环第「1」代挖掘 --


[2021-05-14 12:24:26.672144] INFO: : -- 开始使用训练数据计算表达式适应度··· --

去重前的个体数20
去重后的个体数20


每代的平均适应度：[3.7372960967246414]


[2021-05-14 12:27:06.894788] INFO: : 因子适应度在训练数据上检测完成，共耗时 [160.22106436546892s]

[2021-05-14 12:27:06.898223] INFO: : 共「3」个表达式：['argmax(vmul(open, turn), constant(5))', 'argmax(volume, 3)', 'vneg(ts_rank(turn, 6))']通过训练数据检测

[2021-05-14 12:27:06.900102] INFO: : -- 使用训练数据计算表达式适应度完成 --

[2021-05-14 12:27:06.901399] INFO: : -- 开始使用测试数据计算表达式适应度··· --

因子argmax(vmul(open, turn), constant(5))在测试集适应度值为6.443659948237765
因子argmax(volume, 3)在测试集适应度值为3.4821561307868993
因子vneg(ts_rank(turn, 6))在测试集适应度值为6.229283147695572


[2021-05-14 12:27:22.314170] INFO: : 因子适应度在测试数据上检测完成，共耗时 [15.411682927981019s]

[2021-05-14 12:27:22.318358] INFO: : 共「3」个表达式：['argmax(vmul(open, turn), constant(5))', 'argmax(volume, 3)', 'vneg(ts_rank(turn, 6))']通过测试数据检测

[2021-05-14 12:27:22.320412] INFO: : -- 使用测试数据计算表达式适应度完成 --

[2021-05-14 12:27:22.322635] INFO: : -- 开始使用全区间数据计算表达式适应度 --

[2021-05-14 12:28:51.752818] INFO: : 添加表达式[argmax(vmul(open, turn), constant(5))]至表达式保存列表中

[2021-05-14 12:28:51.755819] INFO: : 因子适应度在全区间数据上检测完成，共耗时 [89.43127163592726s]

[2021-05-14 12:28:51.757827] INFO: : -- 使用全区间数据计算表达式适应度完成 --

pass:3, record:20, population: 3


[2021-05-14 12:28:51.969559] INFO: : 因子挖掘过程中相关指标：[{'gen': 1, 'nevals': 20, 'avg': 6.13449974001107, 'std': 0.8918504377500125, 'min': 5.40729277150386, 'max': 7.390556765737983}]

[2021-05-14 12:28:51.973429] INFO: : -- 开始进行下一代因子挖掘 --

下一代挖掘的个体数：20


[2021-05-14 12:28:51.979821] INFO: : -- 开始第「3」次循环第「2」代挖掘 --

-- 开始第「3」次循环第「2」代挖掘 --


[2021-05-14 12:28:51.981966] INFO: : -- 开始使用训练数据计算表达式适应度··· --

去重前的个体数20
去重后的个体数18


每代的平均适应度：[3.7372960967246414, 2.735008714793727]


[2021-05-14 12:44:44.761925] INFO: : 因子适应度在训练数据上检测完成，共耗时 [952.7778451805934s]

[2021-05-14 12:44:44.765748] INFO: : 共「1」个表达式：['vneg(ts_rank(turn, 2))']通过训练数据检测

[2021-05-14 12:44:44.766812] INFO: : -- 使用训练数据计算表达式适应度完成 --

[2021-05-14 12:44:44.769962] INFO: : -- 开始使用测试数据计算表达式适应度··· --

因子vneg(ts_rank(turn, 2))在测试集适应度值为5.365488489614403


[2021-05-14 12:46:33.310612] INFO: : 因子适应度在测试数据上检测完成，共耗时 [108.53898783121258s]

[2021-05-14 12:46:33.313925] INFO: : 共「1」个表达式：['vneg(ts_rank(turn, 2))']通过测试数据检测

[2021-05-14 12:46:33.315182] INFO: : -- 使用测试数据计算表达式适应度完成 --

[2021-05-14 12:46:33.316229] INFO: : -- 开始使用全区间数据计算表达式适应度 --

[2021-05-14 12:49:45.660246] INFO: : 添加表达式[vneg(ts_rank(turn, 2))]至表达式保存列表中

[2021-05-14 12:49:45.663260] INFO: : 因子适应度在全区间数据上检测完成，共耗时 [192.34616725426167s]

[2021-05-14 12:49:45.665393] INFO: : -- 使用全区间数据计算表达式适应度完成 --

pass:1, record:18, population: 1


[2021-05-14 12:49:45.895580] INFO: : 因子挖掘过程中相关指标：[{'gen': 1, 'nevals': 20, 'avg': 6.13449974001107, 'std': 0.8918504377500125, 'min': 5.40729277150386, 'max': 7.390556765737983}, {'gen': 2, 'nevals': 18, 'avg': 7.903748204572364, 'std': 0.0, 'min': 7.903748204572364, 'max': 7.903748204572364}]

[2021-05-14 12:49:45.897826] INFO: : -- 开始进行下一代因子挖掘 --

下一代挖掘的个体数：20


[2021-05-14 12:49:45.908280] INFO: : == 第「3」次因子挖掘循环结束，共耗时 [1519.2388978376985s] ==

[2021-05-14 12:49:45.909602] INFO: : 所有通过检验的因子共「5」个：{'alpha0001': 'argmin(mean3(low, turn, wap_3_vwap_buy), constant(2))', 'alpha0002': 'delta(mean3(turn, turn, low), 2)', 'alpha0003': 'corr(close, high, 10)', 'alpha0004': 'argmax(vmul(open, turn), constant(5))', 'alpha0005': 'vneg(ts_rank(turn, 2))'}

[2021-05-14 12:49:45.910660] INFO: : 本次因子挖掘共循环「3」次，每次进行种群数量为「20」个的挖掘共「2」代，共耗时 [5929.5319s]

In [7]:
# Write every saved factor to "<alpha name>+<expression>.csv" in long format:
# the wide table is stacked and the value column is named after the alpha
for key, values in saved_factor_values.items():
    saved_factor_values_T = values.stack().reset_index(name=key)
    saved_factor_values_T.to_csv(f"{key}+{saved_factor_exprs[key]}.csv")

In [8]:
def run_factorlens(factor, coverage=0.2):
    """Run the factor-analysis module for one saved factor.

    Args:
        factor: key into ``saved_factor_values`` (a name such as alpha000X).
        coverage: value forwarded to the module as ``factor_coverage``.

    Returns:
        None. The module's result object is not returned.
    """
    log.info(f'开始运行因子分析：{factor}')
    # Stack the wide factor table into long format, with the value column
    # named after the factor, and hand it over as a pickled data source.
    long_table = saved_factor_values[factor].stack().reset_index(name=factor)
    pickled_source = DataSource.write_pickle(long_table)
    M.factorlens.v1(
        features=[factor],
        user_factor_data=pickled_source,
        # Plain string, not an f-string: the literal "{factor_name}" is passed
        # through — presumably filled in by the module; TODO confirm.
        title='因子分析: {factor_name}',
        start_date=all_data_dp.start_date,
        end_date=all_data_dp.end_date,
        rebalance_period=config.rebalance_period,
        stock_pool=config.stock_pool,
        quantile_count=config.quantile_count,
        commission_rate=config.commission_rate,
        drop_price_limit_stocks=config.drop_price_limit_stocks,
        drop_st_stocks=config.drop_st_stocks,
        drop_new_stocks=config.drop_new_stocks,
        # Standardisation and neutralisation stay off: mined factors already
        # went through that processing when their values were computed.
        normalization=False,
        neutralization=[],
        metrics=config.metrics,
        factor_coverage=coverage,
    )

# Pick the factor to analyse and run the factor analysis. The second argument is the coverage
# passed to the module; factors mined by the genetic algorithm are structured multi-factors,
# so this value should not be set too high (default 0.2)
run_factorlens('alpha0001',0.2)

[2021-05-14 12:53:24.411355] INFO: : 开始运行因子分析：alpha0001

[2021-05-14 12:53:26.051039] INFO: moduleinvoker: factorlens.v1 开始运行..

[2021-05-14 12:53:28.271911] INFO: 因子分析: batch_process start

[2021-05-14 12:53:28.274717] INFO: 因子分析: load_instruments 2016-10-01, 2020-11-05

[2021-05-14 12:53:35.447801] INFO: 因子分析: load_instruments, 4113 rows.

[2021-05-14 12:53:35.450582] INFO: 因子分析: load_benchmark_data 2016-10-01, 2020-11-05

[2021-05-14 12:53:35.789321] INFO: 因子分析: load_benchmark_data, 2982 rows.

[2021-05-14 12:53:35.790729] INFO: 因子分析: StockPool.before_load_general_feature_data

[2021-05-14 12:53:35.791775] INFO: 因子分析: UserDataMerge.before_load_general_feature_data

[2021-05-14 12:53:35.792907] INFO: 因子分析: DropSTStocks.before_load_general_feature_data

[2021-05-14 12:53:35.793848] INFO: 因子分析: DropNewStocks.before_load_general_feature_data

[2021-05-14 12:53:35.794638] INFO: 因子分析: Neutralization.before_load_general_feature_data

[2021-05-14 12:53:35.795538] INFO: 因子分析: DelayRebalanceDays.before_load_general_feature_data

[2021-05-14 12:53:35.796497] INFO: 因子分析: RebalancePeriod.before_load_general_feature_data

[2021-05-14 12:53:35.798059] INFO: 因子分析: RebalancePrice.before_load_general_feature_data

[2021-05-14 12:53:35.799036] INFO: 因子分析: FactorCoverage.before_load_general_feature_data

[2021-05-14 12:53:35.799899] INFO: 因子分析: Industry.before_load_general_feature_data

[2021-05-14 12:53:35.801280] INFO: 因子分析: PBRatio.before_load_general_feature_data

[2021-05-14 12:53:35.802136] INFO: 因子分析: Turnover.before_load_general_feature_data

[2021-05-14 12:53:35.802988] INFO: 因子分析: MarketCap.before_load_general_feature_data

[2021-05-14 12:53:35.803896] INFO: 因子分析: load_general_feature_data, load data

[2021-05-14 12:54:41.161715] INFO: 因子分析: RebalancePeriod.after_load_general_feature_data

[2021-05-14 12:54:41.397253] INFO: 因子分析: RebalancePeriodsReturns.after_load_general_feature_data

[2021-05-14 12:55:49.056904] INFO: 因子分析: RebalancePrice.after_load_general_feature_data

[2021-05-14 12:55:49.059300] INFO: 因子分析: load_general_feature_data, 4017980 rows.

[2021-05-14 12:55:49.062792] INFO: 因子分析: load_derived_feature_data, 4017980 rows, 20 columns.

[2021-05-14 12:55:49.064959] INFO: 因子分析: process, alpha0001

[2021-05-14 12:55:49.066449] INFO: 因子分析: calculate_factor, alpha0001

[2021-05-14 12:55:50.198852] INFO: 因子分析: calculate_factor, done

[2021-05-14 12:55:50.768405] INFO: 因子分析: QuantileReturns.before_process

[2021-05-14 12:55:50.770347] INFO: 因子分析: IC.before_process

[2021-05-14 12:55:50.773210] INFO: 因子分析: BasicDescription.before_process

[2021-05-14 12:55:50.774964] INFO: 因子分析: Industry.before_process

[2021-05-14 12:55:50.776731] INFO: 因子分析: RebalanceOverlap.before_process

[2021-05-14 12:55:50.778237] INFO: 因子分析: PBRatio.before_process

[2021-05-14 12:55:50.779336] INFO: 因子分析: Turnover.before_process

[2021-05-14 12:55:50.780500] INFO: 因子分析: Stocks.before_process

[2021-05-14 12:55:50.782095] INFO: 因子分析: MarketCap.before_process

[2021-05-14 12:55:50.783163] INFO: 因子分析: FactorPairwiseCorrelationMerged.before_process

[2021-05-14 12:55:51.082931] INFO: 因子分析: process metrics, start ..

[2021-05-14 12:55:52.131705] INFO: 因子分析: process, 3210889/3393687 rows ..

[2021-05-14 12:55:52.134310] INFO: 因子分析: BacktestInterval.process, 0.000s

[2021-05-14 12:55:52.136905] INFO: 因子分析: Benchmark.process, 0.000s

[2021-05-14 12:55:52.139275] INFO: 因子分析: StockPool.process, 0.000s

[2021-05-14 12:55:52.141229] INFO: 因子分析: UserDataMerge.process, 0.000s

[2021-05-14 12:55:52.143082] INFO: 因子分析: DropSTStocks.process, 0.000s

[2021-05-14 12:55:52.145183] INFO: 因子分析: DropPriceLimitStocks.process, 0.000s

[2021-05-14 12:55:52.147560] INFO: 因子分析: DropNewStocks.process, 0.000s

[2021-05-14 12:55:52.149657] INFO: 因子分析: DropSuspendedStocks.process, 0.000s

[2021-05-14 12:55:52.151203] INFO: 因子分析: QuantileCount.process, 0.000s

[2021-05-14 12:55:52.153039] INFO: 因子分析: CommissionRates.process, 0.000s

[2021-05-14 12:55:52.154602] INFO: 因子分析: Normalization.process, 0.000s

[2021-05-14 12:55:52.156001] INFO: 因子分析: Neutralization.process, 0.000s

[2021-05-14 12:55:52.157559] INFO: 因子分析: DelayRebalanceDays.process, 0.000s

[2021-05-14 12:57:03.323161] INFO: 因子分析: RebalancePeriod.process, 71.164s

[2021-05-14 12:57:03.326126] INFO: 因子分析: RebalancePeriodsReturns.process, 0.000s

[2021-05-14 12:57:03.327517] INFO: 因子分析: RebalancePrice.process, 0.000s

[2021-05-14 12:57:03.329221] INFO: 因子分析: ReturnsCalculationMethod.process, 0.000s

[2021-05-14 12:57:03.331587] INFO: 因子分析: FactorCoverage.process, 0.000s

[2021-05-14 12:57:03.553702] INFO: 因子分析: QuantileReturns.process, 0.220s

[2021-05-14 12:57:27.865370] INFO: 因子分析: IC.process, 24.309s

[2021-05-14 12:57:28.778781] INFO: 因子分析: BasicDescription.process, 0.911s

[2021-05-14 12:57:30.164541] INFO: 因子分析: Industry.process, 1.384s

[2021-05-14 12:57:34.146014] INFO: 因子分析: RebalanceOverlap.process, 3.980s

[2021-05-14 12:57:34.475657] INFO: 因子分析: PBRatio.process, 0.327s

[2021-05-14 12:57:34.670180] INFO: 因子分析: Turnover.process, 0.193s

[2021-05-14 12:57:34.715758] INFO: 因子分析: Stocks.process, 0.044s

[2021-05-14 12:57:37.633993] INFO: 因子分析: MarketCap.process, 2.916s

[2021-05-14 12:57:38.562642] INFO: 因子分析: process metrics, 107.480s

[2021-05-14 12:57:38.686085] INFO: 因子分析: QuantileReturns.after_process

[2021-05-14 12:57:38.688348] INFO: 因子分析: IC.after_process

[2021-05-14 12:57:38.689906] INFO: 因子分析: BasicDescription.after_process

[2021-05-14 12:57:38.691428] INFO: 因子分析: Industry.after_process

[2021-05-14 12:57:38.692690] INFO: 因子分析: RebalanceOverlap.after_process

[2021-05-14 12:57:38.693786] INFO: 因子分析: PBRatio.after_process

[2021-05-14 12:57:38.695164] INFO: 因子分析: Turnover.after_process

[2021-05-14 12:57:38.696248] INFO: 因子分析: Stocks.after_process

[2021-05-14 12:57:38.697220] INFO: 因子分析: MarketCap.after_process

[2021-05-14 12:57:38.698663] INFO: 因子分析: FactorPairwiseCorrelationMerged.after_process

[2021-05-14 12:57:38.700147] INFO: 因子分析: QuantileReturns.before_merged_process

[2021-05-14 12:57:38.701378] INFO: 因子分析: IC.before_merged_process

[2021-05-14 12:57:38.702601] INFO: 因子分析: BasicDescription.before_merged_process

[2021-05-14 12:57:38.703510] INFO: 因子分析: Industry.before_merged_process

[2021-05-14 12:57:38.704367] INFO: 因子分析: RebalanceOverlap.before_merged_process

[2021-05-14 12:57:38.705468] INFO: 因子分析: PBRatio.before_merged_process

[2021-05-14 12:57:38.706592] INFO: 因子分析: Turnover.before_merged_process

[2021-05-14 12:57:38.707468] INFO: 因子分析: Stocks.before_merged_process

[2021-05-14 12:57:38.708267] INFO: 因子分析: MarketCap.before_merged_process

[2021-05-14 12:57:38.709287] INFO: 因子分析: FactorPairwiseCorrelationMerged.before_merged_process

[2021-05-14 12:57:38.710348] INFO: 因子分析: QuantileReturns.after_merged_process

[2021-05-14 12:57:38.711189] INFO: 因子分析: IC.after_merged_process

[2021-05-14 12:57:38.712057] INFO: 因子分析: BasicDescription.after_merged_process

[2021-05-14 12:57:38.713287] INFO: 因子分析: Industry.after_merged_process

[2021-05-14 12:57:38.714134] INFO: 因子分析: RebalanceOverlap.after_merged_process

[2021-05-14 12:57:38.714966] INFO: 因子分析: PBRatio.after_merged_process

[2021-05-14 12:57:38.716166] INFO: 因子分析: Turnover.after_merged_process

[2021-05-14 12:57:38.717069] INFO: 因子分析: Stocks.after_merged_process

[2021-05-14 12:57:38.717901] INFO: 因子分析: MarketCap.after_merged_process

[2021-05-14 12:57:38.718829] INFO: 因子分析: FactorPairwiseCorrelationMerged.after_merged_process

[2021-05-14 12:57:38.825083] INFO: 因子分析: batch_process ended, 250.553s

Unnamed: 0,累计收益,近1年收益,近3月收益,近1月收益,近1周收益,昨日收益,最大回撤,盈亏比,胜率,夏普比率,收益波动率
最小分位,-89.92%,-32.85%,-13.19%,-4.07%,-2.56%,-0.56%,90.07%,0.8,0.44,-2.72,21.83%
最大分位,-96.47%,-48.25%,-17.92%,-5.07%,-3.30%,-0.46%,96.50%,0.77,0.4,-3.88,22.13%
多空组合,68.24%,13.73%,2.85%,0.54%,0.38%,-0.05%,2.38%,1.28,0.62,3.13,3.12%

股票名称,股票代码,因子值
中国人寿,601628.SHA,-1.8929
中国银河,601881.SHA,-1.8423
山西证券,002500.SZA,-1.8358
浙商证券,601878.SHA,-1.8185
东吴证券,601555.SHA,-1.8008
第一创业,002797.SZA,-1.7959
同花顺,300033.SZA,-1.795
石基信息,002153.SZA,-1.7844
长城证券,002939.SZA,-1.7518
索通发展,603612.SHA,-1.7435

股票名称,股票代码,因子值
国城矿业,000688.SZA,1.3419
聚灿光电,300708.SZA,1.3656
测绘股份,300826.SZA,1.3771
中天火箭,003009.SZA,1.3856
米奥会展,300795.SZA,1.3987
卡倍亿,300863.SZA,1.4183
华夏航空,002928.SZA,1.4277
硕世生物,688399.SHA,1.4344
汇金科技,300561.SZA,1.4494
润禾材料,300727.SZA,1.4643


[2021-05-14 12:57:39.288907] INFO: moduleinvoker: factorlens.v1 运行完成[253.237846s].

In [9]:
from datetime import datetime
import os

# Build a timestamped result directory path.
# The variable is deliberately not called `str`: rebinding that name at module
# level shadows the builtin and breaks every later `str(...)` call.
# NOTE(review): the base path is hard-coded and must be writable by the
# current user, otherwise os.makedirs raises PermissionError.
result_dir = '/home/dltest/caffe/examples/sgg_datas/images/result_test/zutest/' + datetime.now().strftime("%Y%m%d_%H%M%S")

# If the path already exists, keep appending a timestamp until it is unused
while os.path.exists(result_dir):
    result_dir = result_dir + datetime.now().strftime("%Y%m%d_%H%M%S")

os.makedirs(result_dir)  # create the directory (including missing parents)


PermissionError: [Errno 13] Permission denied: '/home/dltest'

In [None]:
# Sample expression strings (the last two are identical) for trying out de-duplication
al = ['ts_sum(mean2(close, open), 8)', 'vmul(sma(wap_3_vwap_buy, 5), max(turn, turn))', 'rank(vsub(volume, low))', 'mean3(ts_sum(turn, 2), abs(low), vdiv(low, open))', 'arctan(close)', 'min(wap_3_vwap_buy, wap_3_vwap_buy)', 'corr(low, volume, 6)', 'max(high, high)', 'argmax(sma(low, 2), 6)', 'mean2(open, open)', 'vneg(close)', 'ts_min(vadd(open, high), constant(8))', 'mean2(open, turn)', 'mean2(volume, low)', 'ts_min(volume, constant(1))', 'delta(wap_3_vwap_buy, 7)', 'normalization(close)', 'power(decay_linear(close, 6), standardation(turn))', 'cov(high, wap_3_vwap_buy, 1)', 'vadd(turn, close)', 'vadd(turn, close)']

def drop_duplicates(individuals):
    """Return the distinct expression strings of ``individuals`` in first-seen order.

    NOTE(review): this scratch definition rebinds the name of the earlier
    ``drop_duplicates`` used by the mining loop, and unlike that one it
    returns strings rather than the individuals themselves. Running this
    cell before mining again would break the loop.

    Args:
        individuals: iterable of objects; each is converted with ``str()``.

    Returns:
        list[str]: the unique expression strings.
    """
    individuals = [str(ind) for ind in individuals]
    print(individuals)
    # dict.fromkeys keeps first-seen order; set() ordering is arbitrary and
    # can differ between runs.
    return list(dict.fromkeys(individuals))

al2= drop_duplicates(al)

print(len(al),len(al2))