In [1]:
# packages
import numpy as np
import pandas as pd

In [2]:
# hide warning messages
# NOTE: "ignore" with no category or module argument silences all warnings,
# from every library, for the cells that follow.
import warnings
warnings.filterwarnings("ignore")

### Daterange

In [5]:
# getting the daterange used to build the dataframes below: it is the index of the
# returns dataset stored at ../../input/returns/daily.parquet
returns_path = '../../input/returns/daily.parquet'
returns = pd.read_parquet(returns_path)
daterange = returns.index
# last expression of the cell: displays the index
daterange

DatetimeIndex(['2005-01-03', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07', '2005-01-10', '2005-01-11', '2005-01-12',
               '2005-01-13', '2005-01-14',
               ...
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', length=3773, freq=None)

### Functions

correlation_matrix function: receives two parameters, df1 and df2.

* df1 is the deciles portfolio dataframe for any factor.
* df2 is the deciles portfolio dataframe for any factor.

This function returns the correlation matrix between the two dataframes. The rows refer to the decile portfolios of df1 and the columns refer to the decile portfolios of df2.

In [6]:
def correlation_matrix(df1, df2):
    """Correlation between every column of ``df1`` and every column of ``df2``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Its column labels become the row labels of the result.
    df2 : pandas.DataFrame
        Its column labels become the column labels of the result.

    Returns
    -------
    pandas.DataFrame
        float64 frame whose entry ``[i, j]`` is ``df1[i].corr(df2[j])``
        (``Series.corr`` with its default arguments).
    """
    # pre-allocated as float64 so the result is numeric rather than object dtype
    corr_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns, dtype='float64')

    # .loc[row, col] writes into corr_matrix itself; the chained form
    # corr_matrix[col][row] = value may assign to a temporary object and be lost.
    for df1_decile in df1.columns:
        for df2_decile in df2.columns:
            corr_matrix.loc[df1_decile, df2_decile] = df1[df1_decile].corr(df2[df2_decile])
    return corr_matrix

value_weight function: receives one parameter, kozak_factor.

* kozak_factor is the total market cap by portfolio dataframe of any factor.

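This function returns the value-weight dataframe of the decile portfolios, i.e. each portfolio's market cap divided by the sum of the market caps of all portfolios on the same row.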

In [7]:
def value_weight(kozak_factor):
    """Convert per-portfolio total market caps into value weights.

    Parameters
    ----------
    kozak_factor : pandas.DataFrame
        Total market cap by portfolio (one column per portfolio).

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns, where every value is
        divided by the sum of its row. The input frame is left unmodified.
    """
    # computing the sum of totalme for each row (each day)
    sum_totalme = kozak_factor.sum(axis=1)

    # row-wise division builds a new frame instead of overwriting the caller's columns
    return kozak_factor.div(sum_totalme, axis=0)

### Our Factors

In [7]:
# column labels of the frame holding our factors (55 labels, one per factor)
factor_labels = [
    'Size Factor', 'Value Factor', 'Prof Factor', 'Dur Factor', 'Valprof Factor',
    'Fscore Factor', 'Debtiss Factor', 'Repurch Factor', 'Nissa Factor', 'Accruals Factor',
    'Growth Factor', 'Aturnover Factor', 'Gmargins Factor', 'Divp Factor', 'Ep Factor',
    'Cfp Factor', 'Noa Factor', 'Inv Factor', 'Invcap Factor', 'Igrowth Factor',
    'Sgrowth Factor', 'Lev Factor', 'Roaa Factor', 'Roea Factor', 'Sp Factor',
    'Gltnoa Factor', 'Divg Factor', 'Invaci Factor', 'Mom Factor', 'Indmom Factor',
    'Valmom Factor', 'Valmomprof Factor', 'Shortint Factor', 'Mom12 Factor', 'Momrev Factor',
    'Lrrev Factor', 'Valuem Factor', 'Nissm Factor', 'Sue Factor', 'Roe Factor',
    'Rome Factor', 'Roa Factor', 'Strev Factor', 'Ivol Factor', 'Betaarb Factor',
    'Season Factor', 'Indrrev Factor', 'Indrrevlv Factor', 'Indmomrev Factor', 'Ciss Factor',
    'Price Factor', 'Age Factor', 'Shvol Factor', 'Exchsw Factor', 'Ipo Factor',
]

# empty frame: one row per date in daterange, one column per factor
df = pd.DataFrame(columns=factor_labels, index=daterange)

In [8]:
# Fill df with our factors: one CSV per day, each factor's value is the column sum of that file.
for date in daterange:
    # file names use the YYYYMMDD form of the date
    day = date.strftime('%Y%m%d')
    path = f'../../output/data/median_breakpoint/value_weighted/{day}.csv'

    try:
        factors = pd.read_csv(path, index_col=0)
    except FileNotFoundError:
        # no file for this date: the row stays empty (all-NaN rows are removed by the
        # dropna cell that follows). Any other error is no longer silenced.
        continue

    # column sums are computed once per file instead of once per factor
    factor_sums = factors.sum(axis=0)

    # single .loc write for the whole row (no chained df[factor][date] assignment);
    # a factor absent from the file is left as NaN
    df.loc[date, df.columns] = factor_sums.reindex(df.columns).to_numpy()

In [9]:
# drop the dates (rows) in which every factor is missing
df = df.dropna(axis=0, how='all')

In [10]:
# cast every column to float64
df = df.astype('float64')

In [11]:
# display our factors
df

Unnamed: 0,Size Factor,Value Factor,Prof Factor,Dur Factor,Valprof Factor,Fscore Factor,Debtiss Factor,Repurch Factor,Nissa Factor,Accruals Factor,...,Season Factor,Indrrev Factor,Indrrevlv Factor,Indmomrev Factor,Ciss Factor,Price Factor,Age Factor,Shvol Factor,Exchsw Factor,Ipo Factor
2005-01-03,-0.007250,-0.002220,0.002848,-0.001811,0.000193,-0.012732,0.011951,-0.012224,0.001617,-0.000937,...,0.011178,0.001106,-0.000284,-0.000747,0.012023,0.003686,0.011284,0.007006,-0.012305,-0.013585
2005-01-04,-0.005812,0.002406,0.000131,-0.000443,0.000407,-0.001371,-0.003775,0.003288,0.002553,0.002176,...,-0.004686,0.003534,0.001273,0.003758,0.000592,0.005505,-0.005436,0.007938,-0.008761,-0.003746
2005-01-05,-0.007585,-0.000680,0.002531,0.000031,0.000745,0.001034,-0.000153,0.002128,0.001562,0.002140,...,-0.000166,0.000463,0.001775,0.001068,-0.000544,0.003578,-0.000205,0.000880,0.012336,-0.003184
2005-01-06,-0.001500,0.003033,-0.004679,0.001966,-0.000633,0.002182,-0.003401,0.002203,0.002811,-0.003232,...,-0.004207,0.000332,-0.000388,-0.000581,-0.000139,0.004242,-0.002267,0.003488,0.001692,-0.001950
2005-01-07,-0.009869,-0.001528,0.002647,-0.000651,-0.001777,-0.003123,0.003930,-0.003404,0.001788,0.000977,...,0.003326,-0.001220,-0.001444,-0.002579,0.005095,0.002966,0.003752,0.001062,-0.004641,-0.011289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,0.003048,0.000384,0.000576,0.000618,0.000507,-0.000646,-0.000189,-0.000553,-0.001697,0.000552,...,0.000907,-0.000191,0.000143,0.000013,0.000056,-0.001663,0.001521,-0.001252,-0.001661,0.007473
2019-12-26,-0.005158,-0.003743,0.003832,-0.003950,-0.002449,0.002755,0.003314,-0.005126,-0.001421,0.004557,...,-0.004316,0.001382,-0.003574,-0.000144,-0.001606,0.003872,-0.001577,0.004359,-0.000160,-0.004569
2019-12-27,-0.006018,-0.001082,-0.000313,-0.001617,-0.001192,0.001125,-0.002494,0.002886,0.001035,0.000894,...,0.002781,0.000730,0.000919,-0.000309,0.001758,0.005325,-0.002257,0.003977,0.003155,-0.008384
2019-12-30,0.001967,0.003428,-0.001280,0.002699,0.000514,0.000726,-0.002369,0.003128,0.001010,0.000867,...,-0.000751,0.000204,0.000569,0.001024,0.002712,-0.000794,-0.003138,0.001271,-0.007165,-0.006296


### Kozak's Factors

File naming convention:

* ret10_ANOM.csv: value-weighted returns on each of the p1-p10 portfolios for an anomaly ANOM
* n10_ANOM.csv: number of firms in each portfolio
* bmc10_ANOM.csv: book-to-market ratios of portfolios
* totalme10_ANOM.csv: total market capitalization of each portfolio

p1-p10: corresponding variables for each of the p1 (short) -- p10 (long) portfolios

#### Equal-Weighted

In [12]:
# column labels of the frame holding Kozak's equal-weighted factors (55 labels)
factor_labels = [
    'Size Factor', 'Value Factor', 'Prof Factor', 'Dur Factor', 'Valprof Factor',
    'Fscore Factor', 'Debtiss Factor', 'Repurch Factor', 'Nissa Factor', 'Accruals Factor',
    'Growth Factor', 'Aturnover Factor', 'Gmargins Factor', 'Divp Factor', 'Ep Factor',
    'Cfp Factor', 'Noa Factor', 'Inv Factor', 'Invcap Factor', 'Igrowth Factor',
    'Sgrowth Factor', 'Lev Factor', 'Roaa Factor', 'Roea Factor', 'Sp Factor',
    'Gltnoa Factor', 'Divg Factor', 'Invaci Factor', 'Mom Factor', 'Indmom Factor',
    'Valmom Factor', 'Valmomprof Factor', 'Shortint Factor', 'Mom12 Factor', 'Momrev Factor',
    'Lrrev Factor', 'Valuem Factor', 'Nissm Factor', 'Sue Factor', 'Roe Factor',
    'Rome Factor', 'Roa Factor', 'Strev Factor', 'Ivol Factor', 'Betaarb Factor',
    'Season Factor', 'Indrrev Factor', 'Indrrevlv Factor', 'Indmomrev Factor', 'Ciss Factor',
    'Price Factor', 'Age Factor', 'Shvol Factor', 'Exchsw Factor', 'Ipo Factor',
]

# empty frame: one row per date in daterange, one column per factor
kozak = pd.DataFrame(columns=factor_labels, index=daterange)

In [13]:
# the two sides of the median breakpoint: first five and last five portfolios
lower_half = ['p1', 'p2', 'p3', 'p4', 'p5']
upper_half = ['p6', 'p7', 'p8', 'p9', 'p10']

for col in kozak.columns:
    # file-name stem: first letter lower-cased, trailing ' Factor' (7 characters) removed
    factor = col[0].lower() + col[1:-7]

    # per-portfolio returns of this factor, with the first CSV column as index
    kozak_ret_path = f'../../input/kozak/daily/ret10_{factor}.csv'
    kozak_ret = pd.read_csv(kozak_ret_path, index_col=0)

    # datetime index, then restricted to exactly the dates in daterange
    kozak_ret.index = pd.to_datetime(kozak_ret.index)
    kozak_ret = kozak_ret.loc[daterange]

    # simple (unweighted) average of each half, then lower half minus upper half.
    # NOTE(review): the naming notes in this notebook describe p1 as the short and p10 as
    # the long portfolio, so this is short side minus long side - confirm the sign is intended.
    lower_mean = kozak_ret[lower_half].mean(axis=1)
    upper_mean = kozak_ret[upper_half].mean(axis=1)
    kozak[col] = lower_mean - upper_mean

In [14]:
# keep only the dates that are present in df (our factors)
kozak = kozak.loc[df.index]

In [15]:
# display Kozak's equal-weighted factors
kozak

Unnamed: 0,Size Factor,Value Factor,Prof Factor,Dur Factor,Valprof Factor,Fscore Factor,Debtiss Factor,Repurch Factor,Nissa Factor,Accruals Factor,...,Season Factor,Indrrev Factor,Indrrevlv Factor,Indmomrev Factor,Ciss Factor,Price Factor,Age Factor,Shvol Factor,Exchsw Factor,Ipo Factor
2005-01-03,0.003017,0.001967,-0.004997,0.002332,-0.001337,0.001250,0.001198,-0.001154,-0.003264,-0.000624,...,0.000828,-0.001286,-0.001099,0.001678,0.003214,-0.002772,0.003131,-0.005649,-0.021917,-0.001009
2005-01-04,0.003241,-0.001805,-0.000923,0.002048,0.000321,0.000388,0.004035,-0.003979,-0.003792,-0.001230,...,0.004314,-0.004558,-0.002306,-0.003902,-0.000062,-0.007721,0.006356,-0.007465,0.005341,0.005980
2005-01-05,0.007884,0.001024,-0.002466,0.001025,0.000402,-0.001304,0.001833,-0.003625,-0.003024,-0.002534,...,0.001205,-0.000317,-0.001297,-0.000505,0.000693,-0.005925,0.002007,-0.002391,-0.011479,0.006103
2005-01-06,0.000882,-0.001051,0.002765,0.000109,0.000860,-0.000809,0.003517,-0.000394,-0.001734,0.001853,...,0.003968,0.000456,0.000466,0.000275,-0.002741,-0.002763,0.000992,-0.001393,0.007771,0.003997
2005-01-07,0.006705,0.001156,-0.002587,-0.000141,0.003571,-0.003855,0.000728,-0.002508,-0.000641,-0.001589,...,0.000711,0.001001,0.001908,0.002245,-0.000143,-0.000830,0.001805,0.000960,-0.006750,0.007169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.004843,-0.000915,-0.000198,-0.001045,-0.001587,0.000154,0.000433,0.000192,0.001356,-0.000240,...,-0.000575,0.000369,0.000184,-0.000469,0.000562,0.003800,-0.000925,0.002645,0.003471,-0.008171
2019-12-26,0.002803,0.001744,-0.002839,0.002644,0.002903,-0.002382,-0.004092,0.006067,0.000537,-0.004113,...,0.003416,-0.000430,0.003470,-0.000283,0.001992,-0.002371,0.002883,-0.002993,0.002608,0.003686
2019-12-27,0.004837,0.003296,-0.000874,0.003254,0.002320,-0.001941,0.001813,-0.002442,-0.000989,-0.001373,...,-0.002945,-0.000757,-0.000254,0.000579,-0.001426,-0.006920,0.002946,-0.004531,-0.005378,0.006888
2019-12-30,-0.001686,-0.003132,0.001840,-0.002968,-0.002126,-0.001552,0.002924,-0.003604,-0.001063,-0.000100,...,0.001185,-0.000643,0.000039,-0.001451,-0.002672,0.001844,0.003648,-0.002496,0.013216,0.007592


#### Value-Weighted

In [16]:
# column labels of the frame holding Kozak's value-weighted factors (55 labels)
factor_labels = [
    'Size Factor', 'Value Factor', 'Prof Factor', 'Dur Factor', 'Valprof Factor',
    'Fscore Factor', 'Debtiss Factor', 'Repurch Factor', 'Nissa Factor', 'Accruals Factor',
    'Growth Factor', 'Aturnover Factor', 'Gmargins Factor', 'Divp Factor', 'Ep Factor',
    'Cfp Factor', 'Noa Factor', 'Inv Factor', 'Invcap Factor', 'Igrowth Factor',
    'Sgrowth Factor', 'Lev Factor', 'Roaa Factor', 'Roea Factor', 'Sp Factor',
    'Gltnoa Factor', 'Divg Factor', 'Invaci Factor', 'Mom Factor', 'Indmom Factor',
    'Valmom Factor', 'Valmomprof Factor', 'Shortint Factor', 'Mom12 Factor', 'Momrev Factor',
    'Lrrev Factor', 'Valuem Factor', 'Nissm Factor', 'Sue Factor', 'Roe Factor',
    'Rome Factor', 'Roa Factor', 'Strev Factor', 'Ivol Factor', 'Betaarb Factor',
    'Season Factor', 'Indrrev Factor', 'Indrrevlv Factor', 'Indmomrev Factor', 'Ciss Factor',
    'Price Factor', 'Age Factor', 'Shvol Factor', 'Exchsw Factor', 'Ipo Factor',
]

# empty frame: one row per date in daterange, one column per factor
kozak_value = pd.DataFrame(columns=factor_labels, index=daterange)

In [21]:
# the two sides of the median breakpoint: first five and last five portfolios
lower_half = ['p1', 'p2', 'p3', 'p4', 'p5']
upper_half = ['p6', 'p7', 'p8', 'p9', 'p10']

for col in kozak_value.columns:
    # file-name stem: first letter lower-cased, trailing ' Factor' (7 characters) removed
    factor = col[0].lower() + col[1:-7]

    # locations of the per-portfolio returns and total market caps of this factor
    kozak_ret_path = f'../../input/kozak/daily/ret10_{factor}.csv'
    kozak_totalme_path = f'../../input/kozak/daily/totalme10_{factor}.csv'

    # returns: datetime index, restricted to exactly the dates in daterange
    kozak_ret = pd.read_csv(kozak_ret_path, index_col=0)
    kozak_ret.index = pd.to_datetime(kozak_ret.index)
    kozak_ret = kozak_ret.loc[daterange]

    # total market caps: same treatment as the returns
    kozak_totalme = pd.read_csv(kozak_totalme_path, index_col=0)
    kozak_totalme.index = pd.to_datetime(kozak_totalme.index)
    kozak_totalme = kozak_totalme.loc[daterange]

    # each portfolio's share of the row's total market cap
    value_weight_df = value_weight(kozak_totalme)

    # element-wise product: every portfolio return scaled by its weight
    temp = kozak_ret * value_weight_df

    # weighted sum of each half, then lower half minus upper half.
    # NOTE(review): value_weight divides by the total over all ten portfolios, so the weights
    # inside each half do not add up to one - confirm this is the intended weighting.
    lower_sum = temp[lower_half].sum(axis=1)
    upper_sum = temp[upper_half].sum(axis=1)
    kozak_value[col] = lower_sum - upper_sum

In [22]:
# keep only the dates that are present in df (our factors)
kozak_value = kozak_value.loc[df.index]

In [23]:
# display Kozak's value-weighted factors
kozak_value

Unnamed: 0,Size Factor,Value Factor,Prof Factor,Dur Factor,Valprof Factor,Fscore Factor,Debtiss Factor,Repurch Factor,Nissa Factor,Accruals Factor,...,Season Factor,Indrrev Factor,Indrrevlv Factor,Indmomrev Factor,Ciss Factor,Price Factor,Age Factor,Shvol Factor,Exchsw Factor,Ipo Factor
2005-01-03,-0.006485,-0.004554,-0.000933,-0.004839,-0.005239,0.002458,-0.002079,0.000705,0.000530,0.001881,...,-0.000132,-0.001705,-0.002298,-0.000846,0.001350,0.003541,-0.001633,-0.001461,0.009418,-0.009267
2005-01-04,-0.009199,-0.007721,0.001661,-0.007083,-0.005837,0.002690,-0.001634,-0.000259,0.000648,0.002439,...,0.001405,-0.003253,-0.003548,-0.004192,-0.000319,0.003695,-0.001265,-0.001484,0.012906,-0.012283
2005-01-05,-0.001957,-0.002403,-0.000221,-0.002701,-0.002615,0.000340,-0.000509,-0.001117,-0.000498,-0.000203,...,0.000568,-0.001134,-0.001895,-0.001160,0.000208,0.000033,-0.000903,-0.000670,0.004930,-0.004624
2005-01-06,0.003013,0.000948,0.001147,0.001405,0.001430,-0.001058,0.002611,-0.000680,-0.001645,0.000261,...,0.002234,0.000236,0.000714,0.001167,-0.001359,-0.002674,0.001648,-0.001209,-0.003397,0.003465
2005-01-07,-0.000156,0.000138,-0.001204,-0.000212,0.000646,-0.001449,-0.000214,-0.000960,-0.000022,-0.000679,...,0.000371,0.000378,0.000029,0.000590,-0.000060,0.000589,-0.000010,0.000778,0.002050,-0.001776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.000253,-0.000051,-0.000206,-0.000214,-0.000359,-0.000016,0.000256,-0.000065,0.000479,-0.000122,...,-0.000210,0.000336,0.000025,-0.000194,-0.000026,0.000201,-0.000262,0.000667,-0.000190,0.000004
2019-12-26,0.004963,0.004099,-0.003013,0.004107,0.003605,-0.003009,0.000188,-0.001450,-0.001266,-0.003003,...,0.000424,-0.000098,0.003120,0.000068,-0.000093,-0.004608,0.002210,-0.004117,-0.004902,0.004880
2019-12-27,-0.000141,0.000419,-0.000364,0.000428,0.000268,-0.000443,0.000431,-0.000142,-0.000059,-0.000391,...,-0.000637,-0.000317,0.000112,0.000395,-0.000613,-0.000303,0.000790,-0.000910,0.000873,-0.000702
2019-12-30,-0.005345,-0.004726,0.002216,-0.004654,-0.003569,0.001695,-0.000972,0.002645,0.001768,0.000468,...,0.002467,-0.000711,-0.001135,-0.000923,-0.000134,0.004835,-0.000613,0.002563,0.005665,-0.005357


### Ours vs Kozak's Factors Correlation

In [29]:
# correlation between our 'Size Factor' column and the 'Size Factor' column of kozak_value;
# double brackets keep each selection a one-column dataframe, as correlation_matrix iterates over .columns
correlation_matrix(df[['Size Factor']], kozak_value[['Size Factor']])

Unnamed: 0,Size Factor
Size Factor,0.177012
