## 截面回归：选择变量

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from utils import *
import os
import joblib

In [4]:
# Sample window used when X and Y are sliced later on (both bounds are
# compared with >= / <=, i.e. inclusive).
start_date, end_date = '2012-04-30', '2018-04-30'

In [5]:
# Load the long-format panel; `asset` is forced to str so the codes are not
# parsed as integers.
df_all = pd.read_csv('data/df_all.csv', dtype={'asset': str})
df_all['date'] = pd.to_datetime(df_all['date'])

# Optional date filter, currently disabled:
# df_all = df_all[(df_all['date'] >= start_date) & (df_all['date'] <= end_date)]

# Remove the columns that are not wanted in the wide panel, then reshape to
# one row per date with (field, asset) MultiIndex columns.
# NOTE(review): 'Unnamed: 0' is presumably a stale index column written by an
# earlier to_csv call — confirm.
df_all = df_all.drop(columns=['Unnamed: 0', 'Rmrf', 'Smb', 'Hml'])
df_all_pivot = df_all.pivot(index='date', columns='asset')

In [6]:
# Read the returns table (first CSV column becomes the index) and convert that
# index to datetimes; the converted index is also kept as `all_time_index`.
returns = pd.read_csv('data/returns.csv', index_col=0)
all_time_index = pd.to_datetime(returns.index)
returns.index = all_time_index
all_time_index

DatetimeIndex(['2011-05-03', '2011-05-04', '2011-05-05', '2011-05-06',
               '2011-05-09', '2011-05-10', '2011-05-11', '2011-05-12',
               '2011-05-13', '2011-05-16',
               ...
               '2023-04-17', '2023-04-18', '2023-04-19', '2023-04-20',
               '2023-04-21', '2023-04-24', '2023-04-25', '2023-04-26',
               '2023-04-27', '2023-04-28'],
              dtype='datetime64[ns]', name='date', length=2918, freq=None)

In [7]:
# Bare expression: it is not the last statement of the cell, so nothing is
# displayed for it.
returns.columns
# Keep the column labels of `returns` under the name `assets`.
assets = returns.columns

In [8]:
# Read every factor ("alpha") CSV of the chosen set and append it to the
# wide panel `df_all_pivot`.
alphaset = 'ourAlphas'
subset = '20110430'

from MyBacktrader import get_alpha_list

alpha_names = get_alpha_list(f'alphas/{alphaset}/{subset}')
# alpha_names = ['alpha_financial_health']

# Collect the per-factor frames first and concatenate once at the end:
# calling pd.concat inside the loop re-copies the ever-growing wide frame on
# every iteration.
alpha_frames = []
for alpha_name in alpha_names:
    alpha = pd.read_csv(f'alphas/{alphaset}/{subset}/{alpha_name}.csv')

    # Index the factor by date (parsed to datetimes) so it aligns with the panel.
    alpha = alpha.set_index('date')
    alpha.index = pd.to_datetime(alpha.index)

    # Optional date filter, currently disabled:
    # alpha = alpha[(alpha.index >= start_date) & (alpha.index <= end_date)]

    # Give the factor two-level columns: (factor name, original column label).
    alpha.columns = pd.MultiIndex.from_product([[f'{alpha_name}'], alpha.columns], names=['', 'asset'])
    alpha_frames.append(alpha)

# Append all factors to df_all_pivot column-wise in a single pass.
df_all_pivot = pd.concat([df_all_pivot, *alpha_frames], axis=1)

In [9]:
# Print a header line followed by the distinct top-level column labels of the
# panel.
print("已经有的因子：", df_all_pivot.columns.get_level_values(0).unique(), sep="\n")

已经有的因子：
Index(['open', 'close', 'high', 'low', 'volume', 'amount', 'vwap', 'pctChg',
       'turnover', 'benchmark_open', 'benchmark_close', 'benchmark_high',
       'benchmark_low', 'benchmark_vol', 'roe', 'roa', 'cvd', 'epq', 'emq',
       'sgq', 'alaq', 'pmq', 'cta', 'size', 'rf', 'bm', 'ep', 'alpha001',
       'alpha_ALAQ', 'alpha_bm', 'alpha_CH3', 'alpha_CH3_Size',
       'alpha_CH3_Value', 'alpha_CTA', 'alpha_CVD', 'alpha_EMQ', 'alpha_EP',
       'alpha_EPQ', 'alpha_Fama_French', 'alpha_financial_health',
       'alpha_High_Low', 'alpha_Hml', 'alpha_liquidity', 'alpha_market_alpha',
       'alpha_momentum', 'alpha_momentum_60', 'alpha_multi', 'alpha_PMQ',
       'alpha_PMQ_momentum', 'alpha_Price_VWAP', 'alpha_reversal',
       'alpha_Rmrf', 'alpha_ROA_momentum', 'alpha_ROE', 'alpha_ROE_ROA',
       'alpha_rsi', 'alpha_sentiment', 'alpha_SGQ', 'alpha_SGQ_volatility',
       'alpha_size', 'alpha_Smb', 'alpha_trade_density', 'alpha_turnover',
       'alpha_turnover_10', 'alpha_turnover_month', 'alpha_Volume_Close',
       'alpha_vol_skew'],
      dtype='object')

In [10]:
# Dependent variable: `returns` minus the 'rf' block of the panel.
# `.values` strips the labels, so the subtraction is purely positional.
# NOTE(review): this assumes `returns` and df_all_pivot['rf'] share the same
# shape and row/column order — confirm. 'rf' is presumably the risk-free rate.
Y = returns - df_all_pivot['rf'].values
# print(Y.head())

# Restrict Y to the sample window (inclusive bounds).
Y = Y.loc[(Y.index >= start_date) & (Y.index <= end_date)]

# Regressors: everything in the panel except the top-level groups listed
# below (each label is listed once).
X = df_all_pivot.drop(
    columns=[
        'rf', 'alpha_Fama_French',
        'open', 'close', 'high', 'low', 'volume', 'amount', 'pctChg',
        'cta', 'roe', 'vwap', 'turnover',
        'benchmark_open', 'benchmark_close', 'benchmark_high',
        'benchmark_low', 'benchmark_vol',
        'roa', 'cvd', 'epq', 'emq', 'sgq', 'alaq', 'pmq',
        'size', 'bm', 'ep',
        'alpha_CH3', 'alpha_Smb', 'alpha_Hml',
    ],
    level=0,
)

# Same sample window for X.
X = X.loc[(X.index >= start_date) & (X.index <= end_date)]

X.columns.get_level_values(0).unique()

Index(['alpha001', 'alpha_ALAQ', 'alpha_bm', 'alpha_CH3_Size',
       'alpha_CH3_Value', 'alpha_CTA', 'alpha_CVD', 'alpha_EMQ', 'alpha_EP',
       'alpha_EPQ', 'alpha_financial_health', 'alpha_High_Low',
       'alpha_liquidity', 'alpha_market_alpha', 'alpha_momentum',
       'alpha_momentum_60', 'alpha_multi', 'alpha_PMQ', 'alpha_PMQ_momentum',
       'alpha_Price_VWAP', 'alpha_reversal', 'alpha_Rmrf',
       'alpha_ROA_momentum', 'alpha_ROE', 'alpha_ROE_ROA', 'alpha_rsi',
       'alpha_sentiment', 'alpha_SGQ', 'alpha_SGQ_volatility', 'alpha_size',
       'alpha_trade_density', 'alpha_turnover', 'alpha_turnover_10',
       'alpha_turnover_month', 'alpha_Volume_Close', 'alpha_vol_skew'],
      dtype='object')

In [11]:
# Display the two-level column index of X (first level: factor name,
# second level named 'asset').
X.columns

MultiIndex([(      'alpha001', '000001'),
            (      'alpha001', '000002'),
            (      'alpha001', '000009'),
            (      'alpha001', '000012'),
            (      'alpha001', '000021'),
            (      'alpha001', '000027'),
            (      'alpha001', '000031'),
            (      'alpha001', '000039'),
            (      'alpha001', '000046'),
            (      'alpha001', '000059'),
            ...
            ('alpha_vol_skew', '601898'),
            ('alpha_vol_skew', '601899'),
            ('alpha_vol_skew', '601918'),
            ('alpha_vol_skew', '601919'),
            ('alpha_vol_skew', '601939'),
            ('alpha_vol_skew', '601958'),
            ('alpha_vol_skew', '601988'),
            ('alpha_vol_skew', '601989'),
            ('alpha_vol_skew', '601991'),
            ('alpha_vol_skew', '601998')],
           names=[None, 'asset'], length=10368)

## 通过横截面回归（逐步回归）选择变量

In [12]:
# Count how many entries of Y are missing (total over all rows and columns).
Y.isnull().values.sum()

np.int64(15180)

In [15]:
# Stepwise regression
def stepwise_regression(y, X, direction='both', threshold_in=0.05, threshold_out=0.10):
    """
    Select regressors by AIC-driven stepwise OLS.

    Each pass optionally tries to add the single candidate column that lowers
    the best AIC seen so far the most (forward step) and/or to drop the single
    selected column whose removal lowers it the most (backward step). The
    search stops when a full pass changes nothing. Every accepted change
    strictly lowers the AIC, so the loop terminates.

    Parameters
    ----------
    y : dependent variable passed to ``sm.OLS``.
    X : DataFrame of candidate regressors; a constant is added to every fit
        via ``sm.add_constant``.
    direction : 'forward', 'backward' or 'both'. 'backward' starts from the
        model with all columns; the other two start from an empty selection.
        Any other value performs no step and returns ``(None, [])``.
    threshold_in, threshold_out : currently unused (selection is based on AIC
        only); kept so existing callers keep working.

    Returns
    -------
    (best_model, selected_vars) : the fitted OLS results with the lowest AIC
        and the list of column labels it was fitted on. ``best_model`` is
        ``None`` when no model was ever fitted (e.g. X has no columns).

    Notes
    -----
    Relies on the name ``sm`` being in scope.
    NOTE(review): it is not imported explicitly in this notebook — presumably
    it is statsmodels.api provided by ``from utils import *``; confirm.
    """
    initial_vars = X.columns.tolist()
    best_aic = float('inf')
    best_model = None

    if direction == 'backward':
        # Pure backward elimination has to start from the full model;
        # starting empty would leave nothing to eliminate.
        selected_vars = initial_vars.copy()
        remaining_vars = []
        if selected_vars:
            best_model = sm.OLS(y, sm.add_constant(X[selected_vars])).fit()
            best_aic = best_model.aic
    else:
        selected_vars = []
        remaining_vars = initial_vars.copy()

    while True:
        changed = False

        if direction in ['both', 'forward']:  # forward selection
            # Track the winner of *this* step only, so that the backward step
            # below can never act on a stale forward result.
            best_addition = None
            for var in remaining_vars:
                model = sm.OLS(y, sm.add_constant(X[selected_vars + [var]])).fit()
                aic = model.aic
                print(f'forward: {var}, aic: {aic}')

                if aic < best_aic:
                    best_aic = aic
                    best_model = model
                    best_addition = var

            if best_addition is not None:
                selected_vars.append(best_addition)
                remaining_vars.remove(best_addition)
                changed = True

        if direction in ['both', 'backward']:  # backward elimination
            best_removal = None
            for var in selected_vars:
                reduced_vars = selected_vars.copy()
                reduced_vars.remove(var)
                model = sm.OLS(y, sm.add_constant(X[reduced_vars])).fit()
                aic = model.aic
                if aic < best_aic:
                    best_aic = aic
                    best_model = model
                    best_removal = var

            if best_removal is not None:
                selected_vars.remove(best_removal)
                # A dropped variable goes back to the candidate pool so a
                # later forward step may reconsider it.
                remaining_vars.append(best_removal)
                changed = True

        if not changed:
            break

    return best_model, selected_vars

In [18]:
# Per-date cross-sectional OLS: fit one model per time point on all factors
# and save it under models/cut/<YYYY-MM-DD>.pkl.

# Dictionary of selected factors per time point. With the stepwise call below
# disabled, only time points that have no usable data are recorded (as []).
selected_factors = {}

# Create the output directory once, up front, instead of re-checking it on
# every iteration.
os.makedirs('models/cut', exist_ok=True)

# Iterate over every time point
for time_point in tqdm(X.index):
    # Cross-section at this date: rows = assets, columns = factors.
    X_time = X.loc[time_point].unstack(level=0)
    Y_time = Y.loc[time_point]

    # Keep only assets with a complete factor row and a non-missing Y value,
    # and make X and Y share exactly the same asset index.
    X_time = X_time.dropna()
    Y_time = Y_time.loc[X_time.index].dropna()
    X_time = X_time.loc[Y_time.index]

    # Nothing to fit for this date.
    if X_time.empty or Y_time.empty:
        selected_factors[time_point] = []
        continue

    # Stepwise alternative, currently disabled:
    # model, selected_vars = stepwise_regression(Y_time, X_time)
    # selected_factors[time_point] = selected_vars

    model = sm.OLS(Y_time, sm.add_constant(X_time)).fit()

    # Save the fitted model. The date string is named `date_str` rather than
    # `time` to avoid shadowing the standard-library module name.
    date_str = time_point.strftime('%Y-%m-%d')
    joblib.dump(model, f'models/cut/{date_str}.pkl')

# Collect the recorded selections into a DataFrame indexed by date.
selected_factors_df = pd.DataFrame.from_dict(selected_factors, orient='index')
selected_factors_df.index = pd.to_datetime(selected_factors_df.index)  # make sure the index is datetime
# selected_factors_df.to_csv('selected_factors.csv')

100%|██████████| 1460/1460 [00:21<00:00, 69.40it/s]
