In [1]:
import helper 

## Load and clean data 

In [2]:
# Load the log-return table from the parquet file through the project helper.
# In the saved run the helper prints the shape before/after cleaning
# ((16155, 1012) -> (8538, 50)) and reports that no NaN values remain.
df = helper.load_clean_data('us_equities_logreturns.parquet')
# Preview the first rows; the columns shown are ticker symbols (^GSPC, GE, IBM, ...).
df.head()

shape before: (16155, 1012)
shape after: (8538, 50)
NaN value present ?  False


Unnamed: 0,^GSPC,GE,IBM,DIS,BA,CAT,AA,HPQ,DD,KO,...,CL,UIS,GD,WY,AXP,FRM,ASA,EXC,UNP,EIX
7595,0.009363,0.0,-0.006645,0.029853,0.015504,-0.013889,0.0,-0.019231,-0.012739,0.039221,...,0.0,-0.005364,0.0,0.013986,0.034289,0.037673,0.0,0.030305,-0.006803,0.019048
7596,-0.00279,0.0,-0.015114,-0.029853,-0.003854,-0.003503,-0.00692,-0.019608,0.00639,-0.039221,...,0.0,-0.020161,0.0,-0.028171,0.011173,0.0,-0.021053,0.014815,-0.010292,0.0
7597,-0.009195,0.0,-0.008496,0.015038,-0.015565,-0.014135,-0.006969,-0.030153,-0.012821,0.0,...,0.0,-0.016786,0.0,0.0,-0.022473,-0.050552,-0.043485,0.0,-0.013889,-0.019048
7600,-0.004837,0.0,0.003534,0.0,0.0,0.003724,0.014085,0.010811,0.006515,0.0,...,0.0,0.003889,-0.04652,-0.028988,0.0113,-0.005634,0.0,0.028573,-0.05446,0.0
7601,-0.00496,-0.029853,0.0,-0.015748,-0.034635,-0.003724,-0.014085,-0.010811,0.0,-0.039221,...,0.0,0.011704,0.0,0.0,0.022223,0.011236,0.021979,0.0,-0.007491,-0.018692


## The standard methodology
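- For each pair $(i, j)$ of assets, compute the correlation $\rho_{ij}$ between their log-returns.
- Convert the correlation coefficients $\rho_{ij}$ into distances $d_{ij}$ (the values shown below are consistent with $d_{ij} = \sqrt{2\,(1 - \rho_{ij})}$).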

### - Get correlation: 

In [3]:
# Compute the asset-by-asset correlation table from the log-returns.
# NOTE(review): the positional arguments 0 and 1000 presumably delimit the
# row window used for the estimate -- confirm against helper.get_correlation.
corr_df = helper.get_correlation(df, 0, 1000)
# Preview: the displayed table is indexed by ticker with 1.0 on the diagonal.
corr_df.head()

Unnamed: 0,^GSPC,GE,IBM,DIS,BA,CAT,AA,HPQ,DD,KO,...,CL,UIS,GD,WY,AXP,FRM,ASA,EXC,UNP,EIX
^GSPC,1.0,0.612887,0.749117,0.407287,0.517845,0.497068,0.488535,0.65094,0.610228,0.408714,...,0.303239,0.565772,0.442715,0.521328,0.602849,0.439942,0.356585,0.236877,0.633755,0.370892
GE,0.612887,1.0,0.532123,0.275035,0.359982,0.37197,0.310485,0.447893,0.347675,0.303557,...,0.239984,0.378983,0.260943,0.36013,0.386556,0.22303,0.21477,0.12564,0.326711,0.245536
IBM,0.749117,0.532123,1.0,0.306293,0.427801,0.352504,0.347561,0.5373,0.50302,0.333913,...,0.255915,0.504402,0.356618,0.360305,0.479152,0.234717,0.292678,0.216648,0.388773,0.256702
DIS,0.407287,0.275035,0.306293,1.0,0.210496,0.21374,0.240819,0.34442,0.278113,0.215018,...,0.068031,0.279239,0.263439,0.230504,0.267235,0.145102,0.061127,0.129066,0.204873,0.140365
BA,0.517845,0.359982,0.427801,0.210496,1.0,0.265206,0.294141,0.389155,0.332674,0.230304,...,0.202585,0.31887,0.33124,0.306071,0.34719,0.167866,0.236843,0.09391,0.274464,0.219071


### - Get distance: 

In [4]:
# Turn the correlation table into a distance table.
# The displayed values match d = sqrt(2 * (1 - rho)) for the correlations
# shown earlier (e.g. rho = 0.612887 -> d = 0.879901), with 0.0 on the diagonal.
dist_df = helper.get_distance(corr_df)
# Preview the first rows of the distance table.
dist_df.head()

Unnamed: 0,^GSPC,GE,IBM,DIS,BA,CAT,AA,HPQ,DD,KO,...,CL,UIS,GD,WY,AXP,FRM,ASA,EXC,UNP,EIX
^GSPC,0.0,0.879901,0.708354,1.088772,0.981993,1.002927,1.0114,0.835536,0.882918,1.087462,...,1.180475,0.931909,1.055732,0.978439,0.891237,1.058355,1.134386,1.235413,0.855856,1.121702
GE,0.879901,0.0,0.967343,1.20413,1.131387,1.120741,1.174321,1.050816,1.142213,1.180206,...,1.232896,1.114466,1.215777,1.131256,1.10765,1.246571,1.25318,1.322392,1.160422,1.228384
IBM,0.708354,0.967343,0.0,1.177886,1.069765,1.137977,1.142313,0.961977,0.996976,1.154198,...,1.219905,0.995588,1.134356,1.131102,1.020635,1.23716,1.189388,1.25168,1.105646,1.219261
DIS,1.088772,1.20413,1.177886,0.0,1.256586,1.254002,1.232218,1.145059,1.201571,1.252982,...,1.365261,1.200634,1.213722,1.240561,1.210591,1.307592,1.370309,1.319799,1.261053,1.31121
BA,0.981993,1.131387,1.069765,1.256586,0.0,1.212265,1.188157,1.105301,1.155271,1.240723,...,1.262866,1.167159,1.156512,1.178073,1.142637,1.290065,1.235441,1.346172,1.204605,1.249743


### - Get distance threshold:

In [5]:
# Print the distance cut-off the helper derives from dist_df for the value 0.6.
# NOTE(review): 0.6 is the same value later passed as percentage_keep_, so it
# is presumably the share of edges to keep -- confirm in helper.
print(helper.get_distance_threshold(dist_df, 0.6))

1.261142293484561


### - Generate graph dictionary:

In [6]:
# Build the graph dictionary from the distance table. The result is indexed
# below by the keys 'nodes' and 'edges', each holding a list of entries.
# NOTE(review): percentage_keep_=0.6 presumably controls how many edges
# survive the distance threshold -- confirm in helper.generate_dict.
g_dict = helper.generate_dict(dist_df, percentage_keep_=0.6)

# Show a heading followed by the first five entries of each list; sep='\n'
# puts the heading and the list on separate lines.
print('Nodes:', g_dict['nodes'][:5], sep='\n')
print('Edges:', g_dict['edges'][:5], sep='\n')

Nodes:
[{'data': {'id': '0', 'label': '^GSPC'}}, {'data': {'id': '1', 'label': 'GE'}}, {'data': {'id': '2', 'label': 'IBM'}}, {'data': {'id': '3', 'label': 'DIS'}}, {'data': {'id': '4', 'label': 'BA'}}]
Edges:
[{'data': {'id': 'link_0', 'source': '0', 'target': '1', 'value': 0.8799008558378557}}, {'data': {'id': 'link_1', 'source': '0', 'target': '2', 'value': 0.7083544126957806}}, {'data': {'id': 'link_2', 'source': '0', 'target': '3', 'value': 1.0887724690206568}}, {'data': {'id': 'link_3', 'source': '0', 'target': '4', 'value': 0.9819925457514264}}, {'data': {'id': 'link_4', 'source': '0', 'target': '5', 'value': 1.002927444801489}}]


### - Save a list of JSON graphs using a given rolling window (and step size):

In [7]:
# Run the rolling-window export over the log-returns and keep the returned
# list (one entry per window position, judging by the indexing below).
# NOTE(review): 'test' is presumably the output name/prefix used when saving,
# and end_=None presumably means "until the end of the data" -- confirm in helper.
list_dict = helper.save_json_rolling(
    df,
    'test',
    start_=1000,
    end_=None,
    rolling_window_=1000,
    step_=100,
    percentage_keep_=0.6,
)

# Inspect only the first element of the list (a single graph dictionary,
# despite the plural variable name).
g_dicts = list_dict[0]
print('Nodes:', g_dicts['nodes'][:5], sep='\n')
print('Edges:', g_dicts['edges'][:5], sep='\n')

Nodes:
[{'data': {'id': '0', 'label': '^GSPC'}}, {'data': {'id': '1', 'label': 'GE'}}, {'data': {'id': '2', 'label': 'IBM'}}, {'data': {'id': '3', 'label': 'DIS'}}, {'data': {'id': '4', 'label': 'BA'}}]
Edges:
[{'data': {'id': 'link_0', 'source': '0', 'target': '1', 'value': 0.8799008558378557}}, {'data': {'id': 'link_1', 'source': '0', 'target': '2', 'value': 0.7083544126957806}}, {'data': {'id': 'link_2', 'source': '0', 'target': '3', 'value': 1.0887724690206568}}, {'data': {'id': 'link_3', 'source': '0', 'target': '4', 'value': 0.9819925457514264}}, {'data': {'id': 'link_4', 'source': '0', 'target': '5', 'value': 1.002927444801489}}]
