In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table
import matplotlib.pyplot as plt

import warnings
# NOTE(review): with no category argument this silences every warning for the
# rest of the session, including any pandas warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Load the cleaned data sets that are used to derive the factor returns:
# `rf` is read from clean_rf.csv and `y_true` from clean_true.csv.
rf, y_true = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../task3/clean_data/clean_rf.csv",
        "../task3/clean_data/clean_true.csv",
    )
)

In [3]:
# Orthogonalisation showed that risk_factor_30 is highly collinear with the
# preceding factors risk_factor_0 .. risk_factor_29.
# Likely reason: risk_factor_0 .. risk_factor_30 are all industry factors, so once
# n-1 of them are fixed the remaining one is fully determined.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run after the column has already been removed.
rf = rf.drop(columns='risk_factor_30', errors='ignore')
# Everything after the first two columns is treated as a factor column.
factor_name = list(rf.columns)[2:]
# Sorted distinct trade dates; later cells iterate over these one cross-section at a time.
date_idx = np.sort(rf['trade_date'].unique())

In [4]:
# Per-date standardisation and orthogonalisation of the factor columns of `rf`.
# For every trade date the factor block is z-scored column by column and then
# orthogonalised with Gram-Schmidt in the order given by `factor_name`; the result
# is written back into `rf`. `factor_min_length` collects, per date, the smallest
# Euclidean norm among the orthogonalised factors.

# Cast once up front so the float results can be stored without relying on
# implicit dtype upcasting of (possibly integer) factor columns.
rf[factor_name] = rf[factor_name].astype(float)

factor_min_length = []
for date in date_idx:
    date_mask = rf['trade_date'] == date
    # Work on an explicit copy: mutating a boolean-filtered slice directly is a
    # chained assignment whose effect on `rf` is not guaranteed.
    exposures = rf.loc[date_mask, factor_name].copy()

    # z-score standardisation
    stds = exposures.std()
    degenerate = list(stds.index[~(stds > 0)])
    if degenerate:
        # A zero/undefined std would turn the factor into NaN and, through the
        # projections below, every later factor as well - fail loudly instead.
        raise ValueError(
            f"cannot standardise factors with zero or undefined std on {date}: {degenerate}"
        )
    exposures = (exposures - exposures.mean()) / stds

    # Gram-Schmidt orthogonalisation: each factor has its projections onto all
    # previously processed (already orthogonal) factors removed. The projections
    # are all computed from the factor's values before any subtraction.
    processed = []
    for factor in factor_name:
        current = exposures[factor]
        projection = 0
        for basis_name in processed:
            basis = exposures[basis_name]
            projection += (current * basis).sum() / (basis * basis).sum() * basis
        exposures[factor] = current - projection
        processed.append(factor)

    # Record the smallest factor norm after orthogonalisation; we hope it is not
    # small (a tiny norm would indicate near-collinearity).
    factor_min_length.append(
        min(np.power((exposures[factor] * exposures[factor]).sum(), 1/2) for factor in factor_name)
    )
    # Write back only the factor columns; identifier columns are left untouched.
    rf.loc[date_mask, factor_name] = exposures

In [5]:
# Smallest post-orthogonalisation factor norm observed over all dates.
smallest_norm = min(factor_min_length)
print(smallest_norm)

13.880552843037849


In [6]:
# Verify the orthogonalisation: for every date, take the dot product of each
# distinct pair of factor columns and keep the largest absolute value.
max_dot_product = []
for date in date_idx:
    day_slice = rf[rf['trade_date'] == date]
    pair_products = [
        (day_slice[left] * day_slice[right]).sum()
        for pos, left in enumerate(factor_name)
        for right in factor_name[pos + 1:]
    ]
    max_dot_product.append(max(np.abs(pair_products)))

In [7]:
# Largest absolute pairwise dot product over all dates (expected to be ~0).
largest_dot = max(max_dot_product)
print(largest_dot)

4.121147867408581e-13


In [8]:
# Check the pairwise correlation between the factor columns, computed over all
# rows of `rf` at once.
rf.loc[:, factor_name].corr()

Unnamed: 0,risk_factor_0,risk_factor_1,risk_factor_2,risk_factor_3,risk_factor_4,risk_factor_7,risk_factor_9,risk_factor_10,risk_factor_11,risk_factor_12,...,risk_factor_28,risk_factor_29,risk_factor_31,risk_factor_32,risk_factor_33,risk_factor_34,risk_factor_35,risk_factor_36,risk_factor_37,risk_factor_38
risk_factor_0,1.0,1.268447e-18,-4.444412e-19,-5.59273e-18,9.997892e-18,-2.8019279999999995e-19,-9.674181999999999e-19,4.370119e-18,2.309728e-18,-6.909947999999999e-19,...,8.332927e-18,1.588469e-18,2.1358219999999997e-19,-1.0675860000000001e-18,-1.343331e-18,-5.067556e-18,1.6763959999999998e-19,2.588917e-19,-4.3282840000000004e-18,-7.2621129999999995e-19
risk_factor_1,1.268447e-18,1.0,-1.849043e-18,7.304521999999999e-20,6.113904999999999e-19,7.461771e-18,2.359314e-18,-6.974354999999999e-19,-5.028193e-18,3.131638e-18,...,-2.245397e-18,-1.1308920000000001e-17,-5.033288e-18,-7.021937999999999e-19,3.3973769999999996e-19,-3.7251e-18,-1.860847e-18,1.496559e-18,-9.993322e-18,-1.5440659999999998e-19
risk_factor_2,-4.444412e-19,-1.849043e-18,1.0,-6.818309999999999e-19,-2.701832e-18,-9.106262e-18,-2.163448e-18,-1.3678230000000001e-17,-8.115726e-18,-1.2062390000000002e-17,...,-3.135512e-17,-5.2031950000000005e-17,3.4402280000000002e-18,1.189627e-18,2.8649620000000002e-18,4.6835870000000004e-18,2.575502e-18,4.23608e-18,-6.612546e-18,4.630854999999999e-19
risk_factor_3,-5.59273e-18,7.304521999999999e-20,-6.818309999999999e-19,1.0,5.8556130000000005e-18,9.805904e-18,7.095535e-18,3.2376009999999997e-19,3.080808e-18,1.577551e-18,...,3.3351150000000005e-17,5.1944890000000006e-17,9.854082e-18,2.0317729999999998e-19,1.987235e-18,1.642058e-17,-1.156916e-18,-1.2263330000000001e-17,3.1846570000000004e-17,8.170144e-18
risk_factor_4,9.997892e-18,6.113904999999999e-19,-2.701832e-18,5.8556130000000005e-18,1.0,5.90508e-18,-2.9964309999999996e-19,3.3397229999999995e-19,-5.588089e-19,-4.401315e-19,...,3.1692770000000002e-18,-1.5471620000000003e-17,-1.7706490000000002e-17,7.139549999999999e-20,-9.067677e-19,-5.78956e-19,9.005462e-18,-3.036137e-18,-1.77554e-18,1.692789e-18
risk_factor_7,-2.8019279999999995e-19,7.461771e-18,-9.106262e-18,9.805904e-18,5.90508e-18,1.0,1.439774e-18,-3.765736e-18,-4.73881e-18,5.279903e-18,...,-8.262669e-18,-1.994703e-17,-7.132598e-18,-5.862429e-18,-5.028667e-18,-4.765571e-18,2.393176e-18,-8.761126e-18,5.8875710000000005e-18,1.184146e-18
risk_factor_9,-9.674181999999999e-19,2.359314e-18,-2.163448e-18,7.095535e-18,-2.9964309999999996e-19,1.439774e-18,1.0,-5.427357e-18,-4.460796e-18,-6.556534e-18,...,-3.067158e-17,-4.6694530000000004e-17,1.6288040000000001e-18,-1.568636e-18,1.118865e-18,-8.086467e-18,-4.830353e-18,1.145995e-17,5.553821e-18,-7.835095e-18
risk_factor_10,4.370119e-18,-6.974354999999999e-19,-1.3678230000000001e-17,3.2376009999999997e-19,3.3397229999999995e-19,-3.765736e-18,-5.427357e-18,1.0,8.194928e-18,8.735207e-18,...,4.4930650000000004e-17,7.909542e-17,5.161138e-18,-1.010823e-18,-4.881654e-18,-3.326192e-18,3.301561e-18,-3.4268300000000004e-17,-3.818156e-17,-4.307948e-18
risk_factor_11,2.309728e-18,-5.028193e-18,-8.115726e-18,3.080808e-18,-5.588089e-19,-4.73881e-18,-4.460796e-18,8.194928e-18,1.0,-3.07726e-18,...,-1.6519120000000003e-17,-3.0530700000000003e-17,-8.182500999999999e-19,-6.722809999999999e-19,1.911838e-18,-3.185609e-18,-6.291139e-18,-4.423183e-18,-3.315756e-18,2.898884e-19
risk_factor_12,-6.909947999999999e-19,3.131638e-18,-1.2062390000000002e-17,1.577551e-18,-4.401315e-19,5.279903e-18,-6.556534e-18,8.735207e-18,-3.07726e-18,1.0,...,3.4513189999999995e-19,-7.752923000000001e-18,-2.265651e-18,-1.557599e-18,-2.423729e-18,-4.535404e-18,-1.734616e-18,-9.427472999999999e-19,-1.738009e-18,1.3260889999999999e-20


In [9]:
# Cross-sectional regression to obtain factor returns: for every trade date,
# regress `stock_return` on that date's factor columns of `rf` (plus an
# intercept) and store the slope coefficients as one row of `factor_ret`.
date_idx = np.sort(rf['trade_date'].unique())
factor_cols = list(rf.columns[2:])

daily_params = []
for date in date_idx:
    # Non-inplace set_index avoids mutating boolean-filtered slices.
    returns = y_true.loc[y_true['trade_date'] == date].set_index('stock_code')['stock_return']
    exposures = rf.loc[rf['trade_date'] == date].set_index('stock_code')[factor_cols]
    # Inner join keeps only stocks present in both the returns and the exposures.
    df = pd.concat([returns, exposures], axis=1, join="inner")
    model = sm.OLS(df['stock_return'], sm.add_constant(df[factor_cols])).fit()
    # Keep the factor slopes only; the intercept ('const') is intentionally not
    # stored, matching the existing factor-only output columns.
    daily_params.append(model.params.reindex(factor_cols).to_numpy(dtype=float))

# Build the frame once (float dtype, unnamed date index) instead of filling an
# object-typed frame row by row.
factor_ret = pd.DataFrame(daily_params, index=date_idx, columns=factor_cols, dtype=float)

In [10]:
# Move the date index of factor_ret into a regular 'trade_date' column.
# The guard makes the cell idempotent: re-running it would otherwise insert the
# fresh integer index as a second, bogus 'trade_date' column.
if 'trade_date' not in factor_ret.columns:
    factor_ret = factor_ret.rename_axis('trade_date').reset_index()

In [11]:
# Persist the processed factor table and the estimated factor returns as CSV
# files, without writing the row index.
for frame, csv_path in (
    (rf, '../task3/clean_data/clean_ortho_rf.csv'),
    (factor_ret, '../task3/clean_data/clean_factor_ret.csv'),
):
    frame.to_csv(csv_path, index=False)