# Fama-French三因子模型
该notebook用于计算A股市场的三因子模型
参考https://zhuanlan.zhihu.com/p/55071842?utm_source=ZHShareTargetIDMore&utm_medium=social&utm_oi=48092694970368

In [69]:
from typing import Iterable, Tuple, Dict
import pandas as pd
import pendulum as pdl
from myhelpers.tushare_pro import pro
from myhelpers.pandas_extensions import get_head_and_tail
from myhelpers.mytypings import DataFrame
import toolz as tlz
import toolz.curried as c
from toolz import pipe as pp


In [90]:
# 计算最近三个月的三因子
today = pdl.today()   # 获取今天日期

# 获取三个月前日期
start_date = today.subtract(months=3).format("YYYYMMDD")
end_date = today.format("YYYYMMDD")

# 获取股市交易日日历
trading_cal = pro.trade_cal(exchange="SSE", 
                            start_date=start_date, 
                            end_date=end_date,
                            is_open="1")

# 看一眼日历长什么样
get_head_and_tail(trading_cal)


Unnamed: 0,exchange,cal_date,is_open
0,SSE,20190513,1
1,SSE,20190514,1
2,SSE,20190515,1
3,SSE,20190516,1
4,SSE,20190517,1
59,SSE,20190805,1
60,SSE,20190806,1
61,SSE,20190807,1
62,SSE,20190808,1
63,SSE,20190809,1


In [71]:
def get_daily_returns_and_basic(cal: DataFrame) -> Iterable[DataFrame]:
    for date in cal["cal_date"]:
        daily_returns = pro.daily(trade_date=date)
        daily_basic = pro.daily_basic(trade_date=date)
        df_merge = daily_returns.merge(daily_basic, how="inner", on="ts_code")
        yield df_merge

df = pp(trading_cal,
        get_daily_returns_and_basic,
        tlz.curry(pd.concat, axis=0))
get_head_and_tail(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return self._partial(*args, **kwargs)


Unnamed: 0,amount,change,circ_mv,close_x,close_y,float_share,free_share,high,low,open,...,ps_ttm,total_mv,total_share,trade_date_x,trade_date_y,ts_code,turnover_rate,turnover_rate_f,vol,volume_ratio
0,42400.76,0.26,200495.0,9.94,9.94,20170.5187,13309.1796,9.95,9.63,9.76,...,6.7411,200495.0,20170.52,20190510,20190510,000014.SZ,2.1367,3.2382,43098.0,1.22
1,78448.317,0.09,712072.1,4.46,4.46,159657.43,107282.7368,4.49,4.28,4.39,...,0.2186,1073944.0,240794.5,20190510,20190510,000016.SZ,1.1115,1.6541,177451.92,1.02
2,26430.674,0.15,136341.5,4.5,4.5,30298.1008,23947.2261,4.53,4.31,4.38,...,21.1825,248106.6,55134.79,20190510,20190510,000017.SZ,1.9601,2.4799,59387.58,1.44
3,8329.51,-0.09,177214.1,1.78,1.78,99558.4713,70698.4695,1.78,1.78,1.78,...,1.8253,302287.6,169824.5,20190510,20190510,000018.SZ,0.47,0.6619,46795.0,0.43
4,77545.322,0.34,315301.2,7.58,7.58,41596.4578,27046.4598,7.73,7.05,7.2,...,0.6582,873621.7,115253.5,20190510,20190510,000019.SZ,2.5252,3.8836,105037.84,1.29
3647,697605.312,-0.36,26512700.0,27.29,27.29,971517.0043,554628.878,27.85,27.16,27.85,...,0.9785,30843550.0,1130214.0,20190809,20190809,000002.SZ,0.2622,0.4593,254721.49,0.65
3648,5839.596,0.01,150441.6,18.14,18.14,8293.3609,5065.9161,18.42,18.13,18.35,...,3.8254,152333.7,8397.668,20190809,20190809,000004.SZ,0.3857,0.6315,3199.1,0.53
3649,21055.174,-0.01,219044.2,7.09,7.09,30894.8044,23394.7917,7.12,6.99,7.12,...,59.9333,245631.7,34644.8,20190809,20190809,000007.SZ,0.9649,1.2742,29808.88,0.6
3650,24054.283,-0.07,597118.8,3.74,3.74,159657.43,107282.7368,3.84,3.7,3.83,...,0.1651,900571.6,240794.5,20190809,20190809,000016.SZ,0.4003,0.5957,63910.48,0.45
3651,8893.623,0.01,116483.4,1.17,1.17,99558.4713,70698.4695,1.19,1.16,1.16,...,1.1998,198694.7,169824.5,20190809,20190809,000018.SZ,0.7612,1.0719,75783.4,0.77


In [72]:
# 保存一下数据
df.to_csv("../data/returns_last_3months.csv", encoding="utf8")

In [61]:
# 划分大小市值企业
def calc_smb_hml(df: DataFrame) -> Tuple[float, float]:
    df["SB"] = df["circ_mv"].map(lambda x: "B" if x >= df["circ_mv"].median() else "S")
    
    # 账面市值比：1/PB
    df["BM"] = 1/df["pb"]
    
    # 划分中高低账面市值比公司
    border_down, border_up = df['BM'].quantile([0.3, 0.7])
    def make_hml(x: float) -> str:
        if x >= border_up: return "H"
        elif x <= border_down: return "L"
        else: return "M"
    df["HML"] = df['BM'].map(make_hml)
    
    # 划分组合
    def iter_groups(x):
        for s in ["S", "B"]:
            for h in ["H", "M", "L"]:
                field = s + h
                yield f"R_{field}", x.query(f"(SB == '{s}') & (HML == '{h}')")
    groups = pp(df,
                iter_groups)
    # 计算各组的收益率
    def calc_weighted_returns(x: DataFrame) -> float:
        return (x["pct_chg"] / 100 * x["circ_mv"]).sum() / x["circ_mv"].sum()
    return_groups = pp(groups,
                       c.map(lambda x: (x[0], calc_weighted_returns(x[1]))),
                       dict)
    
    def cal_smb_hml(x: Dict[str, float]) -> Tuple[float, float]:
        smb = (x["R_SL"] + x["R_SM"] + x["R_SL"] - x["R_BL"] - x["R_BM"] - x["R_BH"])
        hml = (x["R_SH"] + x["R_BH"] - x["R_SL"] - x["R_BL"])
        return smb, hml
    
    return cal_smb_hml(return_groups)

In [101]:
def make_smb_hml_df(df: DataFrame, cal: DataFrame) -> DataFrame:
    data = []
    for date in cal["cal_date"]:
        smb, hml = calc_smb_hml(df.query(f"trade_date_x == '{date}'"))
        print(date, smb, hml)
        data.append([date, smb, hml])
    new_df = pd.DataFrame(data, columns=["trade_date", "SMB", "HML"])
    new_df.loc[:,"trade_date"] = pd.to_datetime(new_df["trade_date"])
    return new_df.set_index("trade_date")

In [102]:
df_tfm = make_smb_hml_df(df, trading_cal)
get_head_and_tail(df_tfm)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


20190513 0.026746434980633147 -0.008759188922084271
20190514 -0.0007118817538149689 -0.006293457101630741
20190515 -0.0011341314383377776 -0.017826050207859195
20190516 -0.00012923510230732765 0.007579961127716245
20190517 -0.0226666732551641 -0.005671666835794155
20190520 0.006681734421730306 0.006502040344651898
20190521 0.0017827343879996335 -0.006627224280665998
20190522 0.006005243730412722 -0.01758689156120544
20190523 -0.015946222443800666 0.015300495171665814
20190524 -0.01903397331092538 0.0037726983904570422
20190527 0.039101678510268176 -0.016359200722340424
20190528 -0.012478804675381192 -0.014113162265171314
20190529 0.022992140576490475 -0.014982767521121765
20190530 0.0005283161468115653 -0.002637832041712418
20190531 0.021071989999012147 -0.012194346038605657
20190603 -0.037965408069861796 -0.0017740080653751702
20190604 -0.01792217808850102 0.01022187163670392
20190605 0.01450247371913994 0.0034444979234285796
20190606 -0.04780641915328372 0.013587482215670861
20190610

Unnamed: 0_level_0,SMB,HML
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-13,0.026746,-0.008759
2019-05-14,-0.000712,-0.006293
2019-05-15,-0.001134,-0.017826
2019-05-16,-0.000129,0.00758
2019-05-17,-0.022667,-0.005672
2019-08-05,0.032664,-0.011424
2019-08-06,-0.045372,-0.023082
2019-08-07,0.002437,-0.001327
2019-08-08,-0.021382,-0.01032
2019-08-09,-0.008314,0.007009


In [103]:
# 保存一下数据
df_tfm.to_csv("../data/smb_html.csv", encoding="utf8")


In [107]:
# 合并所有数据
whole_data = pd.merge(df.loc[:, ["trade_date_x", "pct_chg"]].set_index("trade_date_x"), 
                      df_tfm, how="inner",
                      left_index=True, right_index=True)
get_head_and_tail(whole_data)

Unnamed: 0,pct_chg,SMB,HML
2019-05-13,-1.2081,0.026746,-0.008759
2019-05-13,-1.0373,0.026746,-0.008759
2019-05-13,-1.3258,0.026746,-0.008759
2019-05-13,-1.7833,0.026746,-0.008759
2019-05-13,-2.4142,0.026746,-0.008759
2019-08-09,-1.302,-0.008314,0.007009
2019-08-09,0.0552,-0.008314,0.007009
2019-08-09,-0.1408,-0.008314,0.007009
2019-08-09,-1.8373,-0.008314,0.007009
2019-08-09,0.8621,-0.008314,0.007009


In [110]:
# 获取大盘收益
# 使用上证综指
df_rm = (
    pro.index_daily(ts_code="000001.sh", start_date=start_date, end_date=end_date)
       .rename(columns={"pct_chg": "pct_chg_rm"})
       .set_index("trade_date")
)
get_head_and_tail(df_rm)


Unnamed: 0_level_0,ts_code,close,open,high,low,pre_close,change,pct_chg_rm,vol,amount
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20190809,000001.SH,2774.7532,2805.5856,2808.3289,2770.4777,2794.5523,-19.7991,-0.7085,148240293.0,171140338.8
20190808,000001.SH,2794.5523,2784.1835,2799.6859,2782.2447,2768.6795,25.8728,0.9345,149539564.0,178156148.4
20190807,000001.SH,2768.6795,2789.0187,2792.6934,2768.6795,2777.5559,-8.8764,-0.3196,157716752.0,176475728.4
20190806,000001.SH,2777.5559,2776.9889,2787.4185,2733.9242,2821.4957,-43.9398,-1.5573,231123727.0,255209231.2
20190805,000001.SH,2821.4957,2854.578,2863.6891,2821.4957,2867.8376,-46.3419,-1.6159,167419423.0,196891483.1
20190517,000001.SH,2882.2962,2955.7669,2956.7832,2873.7998,2955.711,-73.4148,-2.4838,266301643.0,247875177.7
20190516,000001.SH,2955.711,2933.4954,2956.171,2929.0645,2938.6781,17.0329,0.5796,248385701.0,226203068.9
20190515,000001.SH,2938.6781,2902.6423,2945.3946,2902.6423,2883.6107,55.0674,1.9097,230519701.0,226262785.6
20190514,000001.SH,2883.6107,2872.8336,2909.2041,2872.8336,2903.7131,-20.1024,-0.6923,208345353.0,201223315.4
20190513,000001.SH,2903.7131,2905.067,2921.4103,2892.1745,2939.2106,-35.4975,-1.2077,212029882.0,202624125.0


In [111]:
# 使用shibor隔夜拆借利率
df_rf = (
    pro.shibor(start_date=start_date, end_date=end_date)
       .set_index("date")
)
get_head_and_tail(df_rf)


Unnamed: 0_level_0,on,1w,2w,1m,3m,6m,9m,1y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20190809,2.596,2.644,2.512,2.571,2.618,2.706,3.002,3.0785
20190808,2.5,2.578,2.438,2.562,2.618,2.701,3.004,3.0805
20190807,2.22,2.52,2.433,2.564,2.623,2.701,3.005,3.083
20190806,2.261,2.494,2.486,2.584,2.635,2.703,3.01,3.087
20190805,2.544,2.619,2.517,2.594,2.646,2.706,3.015,3.094
20190517,2.134,2.589,2.494,2.705,2.9,2.95,3.05,3.162
20190516,2.281,2.607,2.44,2.708,2.896,2.947,3.051,3.157
20190515,2.5334,2.641,2.462,2.713,2.893,2.948,3.052,3.153
20190514,2.4123,2.65,2.449,2.717,2.891,2.948,3.05,3.153
20190513,2.156,2.579,2.405,2.715,2.889,2.948,3.05,3.152


In [120]:
# 数据融合
df_all = (
    df.set_index("trade_date_x").merge(df_tfm, how="inner", left_index=True, right_index=True )
    .merge(df_rm, how="inner", left_index=True, right_index=True)
    .merge(df_rf, how="inner", left_index=True, right_index=True)
)
df_all = (df_all.assign(premium = df_all["pct_chg_rm"] / 100 - df_all["on"].map(lambda x: (1 + x/100)**(1/360) - 1))
          .assign(adj_return = df_all["pct_chg"] / 100 - df_all["on"].map(lambda x: (1 + x/100)**(1/360) - 1))
          .loc[:, ["adj_return", "premium", "SMB", "HML"]])
get_head_and_tail(df_all)

Unnamed: 0,adj_return,premium,SMB,HML
2019-05-13,-0.01214,-0.012136,0.026746,-0.008759
2019-05-13,-0.010432,-0.012136,0.026746,-0.008759
2019-05-13,-0.013317,-0.012136,0.026746,-0.008759
2019-05-13,-0.017892,-0.012136,0.026746,-0.008759
2019-05-13,-0.024201,-0.012136,0.026746,-0.008759
2019-08-09,-0.013091,-0.007156,-0.008314,0.007009
2019-08-09,0.000481,-0.007156,-0.008314,0.007009
2019-08-09,-0.001479,-0.007156,-0.008314,0.007009
2019-08-09,-0.018444,-0.007156,-0.008314,0.007009
2019-08-09,0.00855,-0.007156,-0.008314,0.007009


In [137]:
import numpy as np
get_head_and_tail(df.loc[:,["trade_date_x", "pct_chg"]])

Unnamed: 0,trade_date_x,pct_chg
0,20190510,2.686
1,20190510,2.0595
2,20190510,3.4483
3,20190510,-4.8128
4,20190510,4.6961
3647,20190809,-1.302
3648,20190809,0.0552
3649,20190809,-0.1408
3650,20190809,-1.8373
3651,20190809,0.8621


In [122]:
# 保存一下数据
df_all.to_csv("../data/df_all.csv", encoding="utf8")

In [121]:
# 跑线性回归
import statsmodels.formula.api as smf

# 拟合OLS回归
results = smf.ols("adj_return ~ premium + SMB + HML", data = df_all).fit()

# 打印结果
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             adj_return   R-squared:                       0.175
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                 1.637e+04
Date:                Sun, 11 Aug 2019   Prob (F-statistic):               0.00
Time:                        22:46:04   Log-Likelihood:             4.8385e+05
No. Observations:              231091   AIC:                        -9.677e+05
Df Residuals:                  231087   BIC:                        -9.676e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0005   7.04e-05      6.994      0.0