In [13]:
# packages
import numpy as np
import pandas as pd

In [14]:
# hide warning messages
import warnings
warnings.filterwarnings("ignore")

### Daterange

In [15]:
# Daily date range used to build the factor dataframes below: it is taken from the
# index of the daily returns dataset (input/returns/daily.parquet).
returns_path = '../../../input/returns/daily.parquet'
returns = pd.read_parquet(returns_path)
daterange = returns.index
daterange

DatetimeIndex(['2005-01-03', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07', '2005-01-10', '2005-01-11', '2005-01-12',
               '2005-01-13', '2005-01-14',
               ...
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', length=3773, freq=None)

### Functions

fill_df function: receives two parameters, factor and df_factor.

* factor is the financial factor ('size', 'value', ..., 'ipo').
* df_factor is the factor dataframe in daily frequency which will be filled.

This function returns the df_factor filled with the returns in daily frequency.

In [16]:
def fill_df(factor, df_factor):
    """Fill a daily decile-return frame from the per-day portfolio parquet files.

    For every date in ``df_factor.index`` the file
    ``../../../output/data/double_check/{factor}/{YYYYMMDD}.parquet`` is read and
    each decile column of ``df_factor`` receives the column-wise sum of that
    day's file.

    Parameters
    ----------
    factor : str
        Factor name, used as the sub-directory of the per-day files
        (e.g. 'size', 'value').
    df_factor : pandas.DataFrame
        Frame indexed by date whose columns are the decile labels to fill.
        It is not modified; a filled copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_factor`` cast to float64 (so correlations can be computed).
        Days that have no file, and deciles absent from a day's file, keep the
        value they had in ``df_factor`` (NaN for a freshly created frame).
    """
    filled = df_factor.copy()

    for date in filled.index:
        day = pd.Timestamp(date).strftime('%Y%m%d')
        path = f'../../../output/data/double_check/{factor}/{day}.parquet'
        try:
            # deciles portfolio dataframe for this day
            deciles_portfolio = pd.read_parquet(path)
        except FileNotFoundError:
            # some days have no portfolio file: leave the row untouched instead
            # of hiding every possible error behind a bare `except`
            continue

        present = [decile for decile in filled.columns
                   if decile in deciles_portfolio.columns]
        if present:
            # one sum per day (not one per decile), written with .loc so the
            # assignment does not rely on chained indexing
            filled.loc[date, present] = deciles_portfolio[present].sum(axis=0).to_numpy()

    # converting to float type (to compute correlation)
    return filled.astype('float64')

read_kozak function: receives one parameter, factor.

* factor is the financial factor data you want to read.

This function returns the portfolio-by-deciles returns in dataframe format (with the whole necessary treatment).

In [17]:
def read_kozak(factor):
    """Load Kozak's daily decile-portfolio returns for one factor.

    Reads ``ret10_{factor}.csv`` with its first column as the index, parses
    that index as datetimes and keeps only the rows labelled by the
    notebook-level ``daterange``.
    """
    decile_returns = pd.read_csv(
        f'../../../input/kozak/daily/ret10_{factor}.csv',
        index_col=0,
    )
    # string labels -> datetimes, so the frame can be sliced by `daterange`
    decile_returns.index = pd.to_datetime(decile_returns.index)
    return decile_returns.loc[daterange]

## Intradaily Factors

In [18]:
# Empty frame (rows: daterange, columns: deciles p1..p10), then filled by fill_df
# from the per-day 'size' parquet files; the last line displays the result.
size = pd.DataFrame(index=daterange, columns=['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10'])
size = fill_df('size', size)
size

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
2005-01-03,-0.010109,-0.014020,-0.016132,-0.015536,-0.017383,-0.017261,-0.019373,-0.022418,-0.021244,-0.015023
2005-01-04,-0.013028,-0.015371,-0.017263,-0.017974,-0.019939,-0.020209,-0.019879,-0.022224,-0.021613,-0.015979
2005-01-05,-0.002791,-0.005667,-0.007622,-0.007419,-0.009610,-0.011414,-0.013768,-0.012893,-0.013079,-0.006647
2005-01-06,0.001774,0.002710,0.002593,0.002261,0.000916,0.001457,0.000238,0.000014,-0.000670,0.000907
2005-01-07,-0.003371,-0.002109,-0.005479,-0.005572,-0.007488,-0.013478,-0.014525,-0.015509,-0.016021,-0.007450
...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.000679,0.000305,-0.000546,-0.001128,-0.001207,0.000436,0.002240,0.003909,0.005084,0.006440
2019-12-26,0.004725,0.000493,0.000106,0.000992,-0.001072,-0.001296,-0.003418,-0.004265,0.001529,0.003057
2019-12-27,-0.002526,-0.003274,-0.004280,-0.005046,-0.007554,-0.006757,-0.009006,-0.011778,-0.013119,-0.007349
2019-12-30,-0.006588,-0.004699,-0.004705,-0.002762,-0.004505,-0.004506,-0.003058,-0.005136,-0.004270,-0.001719


In [19]:
# Empty frame (rows: daterange, columns: deciles p1..p10), then filled by fill_df
# from the per-day 'value' parquet files; the last line displays the result.
value = pd.DataFrame(index=daterange, columns=['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10'])
value = fill_df('value', value)
value

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
2005-01-03,-0.012373,-0.010467,-0.014667,-0.014894,-0.019096,-0.014457,-0.019404,-0.010784,-0.016662,-0.016814
2005-01-04,-0.017645,-0.015512,-0.014567,-0.014889,-0.019023,-0.014759,-0.017945,-0.013756,-0.010957,-0.012115
2005-01-05,-0.003260,-0.005663,-0.007161,-0.006781,-0.006425,-0.005041,-0.007681,-0.003716,-0.005137,-0.009465
2005-01-06,-0.000552,-0.001850,0.001541,0.003403,0.007615,0.001885,0.001601,0.005978,0.003544,0.006540
2005-01-07,-0.001252,-0.005031,-0.006712,-0.004440,-0.005695,-0.005054,-0.006589,-0.005810,-0.004857,-0.003677
...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.000227,-0.000908,-0.001413,0.000741,0.000062,0.001514,-0.000685,-0.000774,-0.000459,0.001132
2019-12-26,0.006030,0.005049,0.001308,-0.000157,0.001472,0.000455,-0.000252,0.000553,0.000048,0.000489
2019-12-27,-0.002977,-0.003638,-0.002873,-0.001522,-0.002470,-0.001537,-0.003092,-0.006428,-0.004066,-0.012573
2019-12-30,-0.008963,-0.002158,-0.008056,-0.005421,-0.002728,-0.000045,-0.003658,-0.003228,-0.004027,-0.002650


## Daily Rosetta Factors

In [20]:
# Load the decile returns stored in p10_rosetta.parquet for the 'size' factor and display them.
path = '../../../output/data/double_check/size/p10_rosetta.parquet'
size_rosetta = pd.read_parquet(path)
size_rosetta

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
2005-01-03,-0.006461,-0.013004,-0.014212,-0.015129,-0.015346,-0.015396,-0.016530,-0.018684,-0.016748,-0.009899
2005-01-04,-0.010839,-0.012740,-0.015369,-0.016596,-0.018734,-0.018767,-0.018031,-0.019579,-0.018941,-0.012705
2005-01-05,-0.002044,-0.006150,-0.007701,-0.008693,-0.010607,-0.013912,-0.016784,-0.016244,-0.016085,-0.009574
2005-01-06,0.003818,0.003397,0.004016,0.004879,0.004353,0.004172,0.004549,0.003077,0.002788,0.003101
2005-01-07,-0.000843,-0.000009,-0.002249,-0.003933,-0.004849,-0.010226,-0.009951,-0.009818,-0.010162,-0.003428
...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.000239,0.001368,0.000335,0.000019,-0.000066,0.002068,0.003589,0.006206,0.007439,0.009490
2019-12-26,0.006453,0.002117,0.001926,0.002921,0.000384,0.000290,-0.001764,-0.003470,0.003230,0.005434
2019-12-27,0.000221,-0.001218,-0.001908,-0.002747,-0.005519,-0.004455,-0.006052,-0.008690,-0.009618,-0.004634
2019-12-30,-0.006376,-0.004406,-0.004865,-0.002052,-0.003104,-0.003450,-0.001584,-0.002843,-0.002831,0.002925


In [21]:
# Load the decile returns stored in p10_rosetta.parquet for the 'value' factor and display them.
# NOTE: `path` is reused from the previous cell and now points at the 'value' file.
path = '../../../output/data/double_check/value/p10_rosetta.parquet'
value_rosetta = pd.read_parquet(path)
value_rosetta

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
2005-01-03,-0.008093,-0.007180,-0.009779,-0.015246,-0.017949,-0.011507,-0.016649,-0.007953,-0.015070,-0.017031
2005-01-04,-0.014547,-0.012016,-0.014011,-0.012241,-0.017469,-0.011827,-0.014629,-0.011201,-0.012534,-0.010852
2005-01-05,-0.002834,-0.006662,-0.006986,-0.005531,-0.006003,-0.006396,-0.008289,-0.003233,-0.005986,-0.010047
2005-01-06,0.002313,-0.000191,0.004227,0.004815,0.008678,0.002043,0.004695,0.007244,0.004633,0.006604
2005-01-07,0.002147,-0.002234,-0.002724,-0.001807,-0.003142,-0.003585,-0.002696,-0.003984,-0.001652,-0.002047
...,...,...,...,...,...,...,...,...,...,...
2019-12-24,0.000021,-0.000158,-0.000638,0.001482,0.001669,0.002482,-0.000426,0.000881,0.000078,0.004266
2019-12-26,0.008284,0.006569,0.002446,0.000660,0.003094,0.002936,0.001000,0.002001,0.001189,0.005395
2019-12-27,0.000571,-0.000735,-0.000101,0.000635,-0.000561,0.000199,-0.002267,-0.003771,-0.001722,-0.008976
2019-12-30,-0.009238,-0.002294,-0.008398,-0.005188,-0.002500,-0.000342,-0.004120,-0.003462,-0.003489,0.000145


## Kozak Factors

In [22]:
# Kozak's daily decile returns for 'size', restricted to daterange by read_kozak.
kozak_size = read_kozak('size')
kozak_size

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
2005-01-03,-0.006628,-0.013151,-0.014393,-0.015365,-0.015594,-0.015541,-0.016897,-0.019342,-0.017297,-0.011137
2005-01-04,-0.010999,-0.012918,-0.015597,-0.016811,-0.019032,-0.019290,-0.018520,-0.020069,-0.019577,-0.014108
2005-01-05,-0.002090,-0.006287,-0.007795,-0.008968,-0.010919,-0.014113,-0.017090,-0.016599,-0.016892,-0.010783
2005-01-06,0.003687,0.003149,0.003750,0.004514,0.004040,0.003846,0.003914,0.002597,0.002339,0.002032
2005-01-07,-0.000834,-0.000198,-0.002372,-0.004236,-0.005150,-0.010726,-0.010339,-0.010229,-0.010556,-0.004465
...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.000321,0.001343,0.000300,-0.000053,-0.000103,0.001935,0.003392,0.005824,0.006321,0.007907
2019-12-26,0.006444,0.002105,0.001888,0.002851,0.000310,0.000140,-0.002035,-0.004853,0.002493,0.003839
2019-12-27,0.000065,-0.001251,-0.001972,-0.002798,-0.005732,-0.004607,-0.006413,-0.008806,-0.009984,-0.006062
2019-12-30,-0.006451,-0.004474,-0.004991,-0.002140,-0.003201,-0.003586,-0.001801,-0.003122,-0.003745,-0.000570


In [23]:
# Kozak's daily decile returns for 'value', restricted to daterange by read_kozak.
kozak_value = read_kozak('value')
kozak_value

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
2005-01-03,-0.008290,-0.007450,-0.010014,-0.015414,-0.018242,-0.011825,-0.016862,-0.008045,-0.015170,-0.017344
2005-01-04,-0.014779,-0.012262,-0.014210,-0.012630,-0.017696,-0.012072,-0.014881,-0.011228,-0.013234,-0.011139
2005-01-05,-0.002986,-0.006862,-0.007153,-0.005690,-0.006257,-0.006601,-0.008520,-0.003211,-0.005493,-0.010243
2005-01-06,0.002033,-0.000453,0.004085,0.004639,0.008382,0.001817,0.004464,0.007057,0.004230,0.006370
2005-01-07,0.001977,-0.002347,-0.002828,-0.001961,-0.003428,-0.004033,-0.002841,-0.004093,-0.001261,-0.002140
...,...,...,...,...,...,...,...,...,...,...
2019-12-24,-0.000013,-0.000181,-0.000672,0.001366,0.001623,0.002418,-0.000484,0.000838,0.000054,0.003871
2019-12-26,0.008073,0.006448,0.002309,0.000593,0.003062,0.002871,0.000947,0.001941,0.001060,0.004945
2019-12-27,0.000547,-0.000771,-0.000147,0.000601,-0.000588,0.000158,-0.002314,-0.003742,-0.001817,-0.009122
2019-12-30,-0.009293,-0.002327,-0.008447,-0.005341,-0.002572,-0.000409,-0.004172,-0.003572,-0.003601,-0.000568


### Daily Rosetta vs Kozak

### Correlation Matrix

correlation_matrix function: receives two parameters, df1 and df2.

* df1 is the deciles portfolio dataframe for any factor.
* df2 is the deciles portfolio dataframe for any factor.

This function returns the correlation matrix between the two dataframes. The rows refer to the decile portfolios of df1 and the columns refer to the decile portfolios of df2.

In [24]:
def correlation_matrix(df1, df2):
    """Cross-correlation matrix between the columns of two dataframes.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Decile-portfolio return frames. Each pair of columns is correlated
        with ``Series.corr``.

    Returns
    -------
    pandas.DataFrame
        float64 frame whose rows are the columns of ``df1`` and whose columns
        are the columns of ``df2``; entry ``[r, c]`` is
        ``df1[r].corr(df2[c])``.
    """
    # float frame (NaN-initialised) so the result is numeric, not object dtype
    corr_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns, dtype='float64')

    for df1_decile in df1.columns:
        for df2_decile in df2.columns:
            # single .loc write: chained `frame[col][row] = x` may assign to a
            # temporary copy and leave the matrix unfilled
            corr_matrix.loc[df1_decile, df2_decile] = df1[df1_decile].corr(df2[df2_decile])
    return corr_matrix

In [25]:
# rows: size_rosetta deciles, columns: kozak_size deciles
correlation_matrix(size_rosetta, kozak_size)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
p1,0.999602,0.969705,0.952876,0.93676,0.922144,0.909675,0.894061,0.876542,0.862228,0.832602
p2,0.969833,0.999567,0.982542,0.971445,0.958997,0.947732,0.92925,0.911661,0.893236,0.866701
p3,0.953122,0.982636,0.999471,0.983648,0.974016,0.962351,0.945028,0.92768,0.907046,0.885923
p4,0.936418,0.971239,0.983382,0.999457,0.982819,0.972204,0.956862,0.941662,0.919916,0.90415
p5,0.921591,0.958226,0.973251,0.982385,0.999454,0.979987,0.970799,0.959416,0.941032,0.920214
p6,0.909019,0.946679,0.961077,0.971278,0.979343,0.999323,0.985232,0.977657,0.964358,0.933492
p7,0.892155,0.926788,0.942572,0.954667,0.968595,0.983765,0.998604,0.985524,0.975639,0.941528
p8,0.873612,0.908292,0.924308,0.938485,0.956446,0.975304,0.984483,0.998225,0.983025,0.948727
p9,0.85544,0.886079,0.900018,0.913595,0.934507,0.958394,0.971703,0.97964,0.996073,0.953373
p10,0.824867,0.857764,0.876796,0.895387,0.911526,0.925333,0.935206,0.943477,0.952047,0.991737


In [26]:
# rows: value_rosetta deciles, columns: kozak_value deciles
correlation_matrix(value_rosetta, kozak_value)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10
p1,0.999667,0.94512,0.924479,0.900221,0.896844,0.887516,0.863011,0.856456,0.825683,0.779264
p2,0.945654,0.99976,0.94263,0.91992,0.925828,0.913716,0.892956,0.885347,0.855966,0.805009
p3,0.923962,0.941821,0.999843,0.920079,0.932493,0.926772,0.906843,0.899839,0.871591,0.823373
p4,0.900129,0.91913,0.920018,0.999653,0.9302,0.92379,0.916043,0.913759,0.88819,0.845554
p5,0.896103,0.924531,0.932252,0.929669,0.999702,0.945002,0.925589,0.924333,0.897281,0.841041
p6,0.886585,0.912214,0.926418,0.923237,0.944823,0.999693,0.9323,0.937364,0.906922,0.860501
p7,0.862813,0.892137,0.907135,0.916692,0.926216,0.932881,0.999527,0.942002,0.912492,0.870178
p8,0.856083,0.884331,0.900073,0.914131,0.924723,0.937998,0.942129,0.999633,0.915314,0.888855
p9,0.824727,0.85476,0.87142,0.88843,0.898056,0.907272,0.912088,0.915172,0.999139,0.886772
p10,0.777108,0.802732,0.821366,0.845232,0.841132,0.860497,0.870359,0.888919,0.887432,0.997488
