# 因子构造
输入原始数据，要求：1.所有记录周期相同(日、分钟、周)；2.列名称已修改得符合我们的规范(high, close, volume)；3.缺失值已处理

# 使用指南
1. 选择第二或第三个cell
2. 点击菜单栏Cell->Run All Below
3. 回到第一个cell，根据需要修改factorInfo和labelInfo字典，格式参考示例(第二个cell)
4. run一次第一个cell即可

In [13]:
# Factors to build: each key is a factor name, each value is that factor's
# parameter dict.
factorInfo = {
    'close': {'window': 20},
    'atr': {'time period': 14, 'window': 6},
    'rsi': {'time period': 14, 'window': 6},
}

# Labels (dependent variables) to build, in the same name -> parameters layout.
labelInfo = {
    'future return': {'future period': [3, 5]},
}

# Run the whole pipeline with the configuration above.
factor_construct(
    dataPath='C:/Users/ShaunMarx/实习_量化坊/02甲醇期货技术指标/IF.csv',
    outPath='C:/Users/ShaunMarx/实习_量化坊/03因子检验程序',
    factorInfo=factorInfo,
    labelInfo=labelInfo,
)

author: Siltka (Shi Yaoen)
update time: 2022/2/18

calling factor_construct

calling load_data
data path is: C:/Users/ShaunMarx/实习_量化坊/02甲醇期货技术指标/IF.csv
calling factor_hub
constructing No.1 factor:
calling close
{'window': 20}
calling factor_window
factor name: Index(['close'], dtype='object')
window: 20
constructing No.2 factor:
calling atr
{'time period': 14, 'window': 6}
calling factor_window
factor name: Index(['atr14'], dtype='object')
window: 6
constructing No.3 factor:
calling rsi
{'time period': 14, 'window': 6}
calling factor_window
factor name: Index(['rsi14'], dtype='object')
window: 6

calling label_hub
constructing No.1 label:
calling future_return
{'future period': [3, 5]}

concating label and factor
future_return3
future_return5


# factorInfo与labelInfo格式示例
示范字典格式所用，真实要跑的参数不要在这里改

In [None]:
# Example layout only: factor name -> parameter dict.
factorInfo = dict([
    ('close', {'window': 20}),
    ('atr', {'time period': 14, 'window': 6}),
    # TODO: switch 'time period' to the same list format as 'future period'
    ('rsi', {'time period': 14, 'window': 6}),
])

# Example layout only: label name -> parameter dict.
labelInfo = dict([
    ('future return', {'future period': [3, 5]}),
])

# import必要的库

In [1]:
import pandas as pd 
import numpy as np
import time
import talib

import warnings
warnings.filterwarnings('ignore')

# define外部所调用的接口函数

In [12]:
def factor_construct(dataPath:str, outPath:str, factorInfo:dict, labelInfo:dict):
    """Build factor/label datasets from raw data and write one CSV per label.

    Args:
        dataPath: Path of the raw data file; handed to ``load_data``.
        outPath: Directory that receives the ``ready_data<timestamp>.csv`` files.
        factorInfo: Mapping of factor name -> parameter dict; handed to
            ``factor_hub``.
        labelInfo: Mapping of label name -> parameter dict; handed to
            ``label_hub``.

    Returns:
        A list holding one DataFrame per label column (that label followed by
        all factor columns, with rows containing NaN dropped), or ``None``
        when the data could not be loaded or contains missing values.
    """
    print('author: Siltka (Shi Yaoen)')
    print('update time: 2022/2/18')
    print()
    print('calling factor_construct')
    print()

    # Load the raw data.
    data = load_data(dataPath)

    # load_data prints its own error and returns None for an unsupported path;
    # stop here instead of failing on ``None.isna()`` below.
    if data is None:
        return None

    # Refuse to continue when the data still has missing values.
    if data.isna().any().any():
        print('ERROR: data has missing value(s), please fill it.')
        return None

    # Build the factors.
    factor_df = factor_hub(data, factorInfo)

    print()

    # Build the labels (dependent variables).
    label_df = label_hub(data, labelInfo)

    print()

    # Join every label with the full factor table and persist each result.
    print('concating label and factor:')
    ready_data_list = []
    for label in label_df.columns:
        print(label)
        ready_data = pd.concat([label_df[label], factor_df], axis=1).dropna()
        # The current time in microseconds is used as the file-name suffix.
        file_name = 'ready_data' + str(round(time.time() * 1e6)) + '.csv'
        ready_data.to_csv(outPath + '/' + file_name)
        ready_data_list.append(ready_data)

    # Hand the assembled datasets back so callers can use them directly.
    return ready_data_list

# define功能函数

In [3]:
# Load the data and convert its index to a pandas.DatetimeIndex
def load_data(dataPath:str):
    """Read a CSV or Excel file and return it with a DatetimeIndex.

    The first column of the file is used as the index.

    Args:
        dataPath: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file. The
            extension is matched case-insensitively.

    Returns:
        The loaded DataFrame, or ``None`` (after printing an error message)
        when the extension is not supported.
    """
    print('calling load_data')
    print('data path is:', dataPath)

    # Compare on a lower-cased copy so that e.g. 'IF.CSV' is accepted too.
    lowered = dataPath.lower()
    if lowered.endswith('.csv'):
        data = pd.read_csv(dataPath, index_col=[0])
    elif lowered.endswith(('.xlsx', '.xls')):
        data = pd.read_excel(dataPath, index_col=[0])
    else:
        print('ERROR: please enter correct file path.')
        return None

    data.index = pd.DatetimeIndex(data.index)

    return data

In [4]:
# Expand each factor with a look-back window of n periods of history
def factor_window(factor:pd.DataFrame, window:int):
    """Append lagged copies of every factor column.

    For each column ``c`` the result contains ``c`` itself followed by
    ``c+1`` ... ``c+(window-1)``, where ``c+j`` is ``c`` shifted down by
    ``j`` rows (i.e. the value from ``j`` rows earlier).

    Args:
        factor: DataFrame whose columns are the factors to expand.
        window: Total number of columns produced per factor. Values <= 1
            return ``factor`` unchanged.

    Returns:
        A DataFrame with ``window`` columns per input column, or ``factor``
        itself when ``window <= 1``.
    """
    print('calling factor_window')
    print('factor name:', factor.columns)
    print('window:', str(window))

    if window <= 1:
        return factor

    window_list = []
    # Select by position so duplicate column labels are still handled one by one.
    for position, factorName in enumerate(factor.columns):
        base_col = factor.iloc[:, position]
        window_list.append(base_col)
        for lag in range(1, window):
            shift_col = base_col.shift(lag)
            # f-string formatting also works for non-string column labels,
            # where plain ``+`` concatenation would raise TypeError.
            shift_col.name = f'{factorName}+{lag}'
            window_list.append(shift_col)

    return pd.concat(window_list, axis=1)

# define因子枢纽函数

In [5]:
def factor_hub(data:pd.DataFrame, factorInfo:dict):
    """Build every factor requested in ``factorInfo`` and join them column-wise.

    Args:
        data: Raw data handed unchanged to each factor builder.
        factorInfo: Mapping of factor name -> parameter dict. Supported
            names are 'close', 'atr' and 'rsi'.

    Returns:
        A DataFrame containing the columns of all constructed factors. When
        no factor was constructed, an empty DataFrame carrying ``data``'s
        index is returned.
    """
    print('calling factor_hub')

    # Registry of available factor builders; register new factors here.
    builders = {
        'close': close,
        'atr': atr,
        'rsi': rsi,
    }

    factorList = []
    for factorNumber, (factorName, params) in enumerate(factorInfo.items(), start=1):
        print('constructing No.'+str(factorNumber)+' factor:')

        builder = builders.get(factorName)
        if builder is None:
            # Report the unknown name instead of skipping it silently.
            print('ERROR: unknown factor name:', factorName)
            continue
        factorList.append(builder(data, params))

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not factorList:
        return pd.DataFrame(index=data.index)

    return pd.concat(factorList, axis=1)

# define各类因子构建函数
新建好一个factor后就到factor_hub去新建一个选择分支

In [6]:
def close(data:pd.DataFrame, params:dict):
    """Build the close-price factor expanded over a look-back window.

    Args:
        data: DataFrame with a 'close' column.
        params: Parameter dict; 'window' is forwarded to ``factor_window``.

    Returns:
        Whatever ``factor_window`` returns for the one-column close frame.
    """
    print('calling close')
    print(params)

    # Selecting with a list keeps the result a one-column DataFrame rather
    # than a Series, so every factor is handled uniformly as a DataFrame.
    price_frame = data.loc[:, ['close']]
    lookback = params['window']

    return factor_window(price_frame, lookback)

In [7]:
def atr(data:pd.DataFrame, params:dict):
    """Build ``talib.ATR`` factor column(s) expanded over a look-back window.

    Args:
        data: DataFrame with 'high', 'low' and 'close' columns.
        params: Parameter dict with
            'time period' - a single period or a list of periods passed to
            ``talib.ATR``; one ``atr<period>`` column is built per period;
            'window' - forwarded to ``factor_window``.

    Returns:
        Whatever ``factor_window`` returns for the ATR frame.
    """
    print('calling atr')
    print(params)

    high = data.loc[:, 'high']
    low = data.loc[:, 'low']
    close_price = data.loc[:, 'close']

    # Accept either one period (original behaviour) or a list of periods,
    # mirroring the list format used by 'future period'.
    periods = params['time period']
    try:
        periods = list(periods)
    except TypeError:
        periods = [periods]

    atr_frame = pd.DataFrame(
        {'atr' + str(period): talib.ATR(high, low, close_price, period)
         for period in periods},
        index=data.index,
    )

    return factor_window(atr_frame, params['window'])

In [8]:
def rsi(data:pd.DataFrame, params:dict):
    """Build ``talib.RSI`` factor column(s) expanded over a look-back window.

    Args:
        data: DataFrame with a 'close' column.
        params: Parameter dict with
            'time period' - a single period or a list of periods passed to
            ``talib.RSI``; one ``rsi<period>`` column is built per period;
            'window' - forwarded to ``factor_window``.

    Returns:
        Whatever ``factor_window`` returns for the RSI frame.
    """
    print('calling rsi')
    print(params)

    close_price = data.loc[:, 'close']

    # Accept either one period (original behaviour) or a list of periods,
    # mirroring the list format used by 'future period'.
    periods = params['time period']
    try:
        periods = list(periods)
    except TypeError:
        periods = [periods]

    rsi_frame = pd.DataFrame(
        {'rsi' + str(period): talib.RSI(close_price, period)
         for period in periods},
        index=data.index,
    )

    return factor_window(rsi_frame, params['window'])

# define标签枢纽函数

In [9]:
def label_hub(data:pd.DataFrame, labelInfo:dict):
    """Build every label requested in ``labelInfo`` and join them column-wise.

    Args:
        data: Raw data handed unchanged to each label builder.
        labelInfo: Mapping of label name -> parameter dict. The only
            supported name is 'future return'.

    Returns:
        A DataFrame containing the columns of all constructed labels. When
        no label was constructed, an empty DataFrame carrying ``data``'s
        index is returned.
    """
    print('calling label_hub')

    # Registry of available label builders; register new label types here.
    builders = {
        'future return': future_return,
    }

    labelList = []
    for labelNumber, (labelName, params) in enumerate(labelInfo.items(), start=1):
        print('constructing No.'+str(labelNumber)+' label:')

        builder = builders.get(labelName)
        if builder is None:
            # Report the unknown name instead of skipping it silently.
            print('ERROR: unknown label name:', labelName)
            continue
        labelList.append(builder(data, params))

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not labelList:
        return pd.DataFrame(index=data.index)

    return pd.concat(labelList, axis=1)

# define各类可选择的标签类型

In [10]:
def future_return(data:pd.DataFrame, params:dict):
    """Compute forward returns of the close price over one or more horizons.

    For a horizon ``fp`` the value at row ``t`` is
    ``close[t + fp] / close[t] - 1``; the last ``fp`` rows are NaN.

    Args:
        data: DataFrame with a 'close' column.
        params: Parameter dict; 'future period' is a single horizon or a
            list of horizons (in rows).

    Returns:
        A DataFrame with one ``future_return<fp>`` column per horizon,
        indexed like ``data``. An empty horizon list yields an empty
        DataFrame carrying ``data``'s index.
    """
    print('calling future_return')
    print(params)

    # Selecting with a list keeps the result a one-column DataFrame rather
    # than a Series, so everything is handled uniformly as a DataFrame.
    close_frame = data.loc[:, ['close']]

    # Accept a bare number as well as a list of horizons.
    horizons = params['future period']
    try:
        horizons = list(horizons)
    except TypeError:
        horizons = [horizons]

    # pct_change(fp) looks fp rows back; shifting by -fp re-aligns that
    # return with the row where the holding period starts.
    return_frames = [
        close_frame.pct_change(fp).shift(-fp).rename(
            columns={'close': 'future_return' + str(fp)})
        for fp in horizons
    ]

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not return_frames:
        return pd.DataFrame(index=data.index)

    return pd.concat(return_frames, axis=1)