In [1]:
import pandas as pd
import numpy as np
from sklearn.covariance import ledoit_wolf

In [2]:
# Number of stocks in each portfolio
PORTFOLIO_SIZE = 300
# Number of portfolios to generate
NUM_PORTFOLIOS = 50
# S3 URI prefix (ends with '/') that file names are appended to when reading/writing parquet files
# NOTE(review): no random seed is set in the visible cells, so the randomly drawn
# portfolios will differ from run to run — confirm whether that is intended.
S3_REFINED_URI = 's3://proyecto-integrador-20212-pregrado/datasets/refined/'

# Entrenamiento

In [3]:
# Load the training matrix from the Refined zone and display it.
# (The file and variable names refer to returns, not prices.)
train_returns_uri = S3_REFINED_URI + 'matriz_retornos_filtrada_train.parquet'
df_returns_train = pd.read_parquet(train_returns_uri)
df_returns_train

Unnamed: 0_level_0,A,AA,AAN,AAP,AAPL,ABBV,ABT,ACN,ADBE,ADM,...,WY,WYND,WYNN,XEL,XOM,XRAY,XYL,YUM,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-03,1.850863,0.285036,0.748045,3.485500,-1.429295,1.256524,1.323722,0.264806,0.016866,0.861868,...,0.736236,-1.151932,-1.204273,0.000000,0.150633,0.730536,0.000000,0.558734,0.691866,-0.619387
2014-01-04,-0.524201,0.142113,-0.269997,-0.212164,-0.704820,-0.152730,-0.038805,-0.134936,-0.236088,-0.254042,...,-0.381316,-0.027582,-0.035702,-0.181488,-0.220596,-0.145048,0.000000,-0.039688,0.083794,-0.124649
2014-01-06,-0.342526,0.000000,-0.135364,-0.921332,0.862324,-3.269598,1.423395,-0.687876,-1.183232,0.347303,...,-1.084530,0.041385,0.219388,-0.581818,0.221083,-0.674414,0.000000,0.463208,-0.569324,0.031201
2014-01-07,1.172116,-0.047304,-1.660454,0.947783,-0.970411,-0.553469,-0.612401,0.841064,0.598700,-0.946008,...,0.225734,-0.799779,2.433437,1.627652,1.263411,1.472684,0.000000,1.027533,-0.235770,0.218341
2014-01-08,1.533101,2.082347,1.550655,-0.017715,0.584806,0.298151,0.333761,0.711395,-0.119027,-1.374330,...,-0.868726,0.778426,1.570498,-0.125967,-0.445589,0.144328,0.000000,0.078237,1.080351,-1.027077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-11,-2.167985,-0.442282,-2.279000,-0.051154,-0.664468,-0.669830,0.398360,1.544240,0.182387,-0.203301,...,-1.975684,1.594331,0.299309,0.049060,0.064508,-0.882019,0.983076,0.468637,1.325967,-0.205142
2019-07-12,-0.153310,0.310973,2.469335,1.452242,0.703597,-1.390840,-2.065585,0.806617,0.568921,0.443379,...,-1.007752,1.198779,2.440891,-0.604773,0.077359,0.264317,1.651263,-0.403660,1.155943,-0.918474
2019-07-15,-1.549414,2.635075,0.960781,0.195485,0.954537,0.128223,-0.131077,-0.443402,-0.216583,-0.357910,...,-0.039154,0.043076,2.972811,0.592008,-0.695697,0.351494,-0.715238,0.810592,-2.177663,0.220712
2019-07-16,-0.772721,0.647249,0.187207,1.894392,-0.346038,-1.607854,-0.829257,-0.527286,-0.942724,-2.227011,...,0.822562,1.420883,1.066299,-0.286088,-1.498443,-1.085814,0.854701,-0.348432,0.308574,0.044045


In [4]:
# Array holding the column labels (stock names) of the training frame
stock_names = df_returns_train.columns.to_numpy(copy=True)

In [5]:
# Draw a random portfolio of stock names, without repetition
def select_random_stocks(stock_names, n_stocks=None):
    """Pick `n_stocks` distinct entries of `stock_names` at random.

    Parameters
    ----------
    stock_names : array-like
        Candidate stock names to sample from.
    n_stocks : int, optional
        How many names to draw. When omitted, the module-level
        PORTFOLIO_SIZE is used, looked up at call time.

    Returns
    -------
    numpy.ndarray
        The sampled names, drawn without replacement via `np.random.choice`.

    Raises
    ------
    ValueError
        Raised by `np.random.choice` when `n_stocks` exceeds the number of
        candidates (sampling is without replacement).
    """
    # Previously the body always used PORTFOLIO_SIZE and silently ignored the
    # `n_stocks` argument; honour the argument and fall back to the constant.
    if n_stocks is None:
        n_stocks = PORTFOLIO_SIZE
    return np.random.choice(stock_names, size=n_stocks, replace=False)

In [6]:
# Generate NUM_PORTFOLIOS portfolios, each one a column subset of the training
# returns chosen by select_random_stocks (one random draw per portfolio, in order)
random_selections = [select_random_stocks(stock_names) for _ in range(NUM_PORTFOLIOS)]
portfolios = [df_returns_train[selection] for selection in random_selections]

In [7]:
# Compute the covariance matrix of every portfolio twice: with the usual
# sample estimator (np.cov) and with Ledoit & Wolf shrinkage.
# The lists are built by appending, so their length always matches
# `portfolios` and they never contain placeholder values.
cov_matrices = []
cov_matrices_lw = []

for portfolio in portfolios:
    tickers = portfolio.columns
    # np.cov treats each row as a variable, hence the transpose (stocks -> rows)
    cov_matrices.append(
        pd.DataFrame(np.cov(portfolio.T), index=tickers, columns=tickers)
    )
    # ledoit_wolf returns a tuple; its first element is the covariance estimate
    cov_matrices_lw.append(
        pd.DataFrame(ledoit_wolf(portfolio)[0], index=tickers, columns=tickers)
    )

In [8]:
# Write every training portfolio and its two covariance matrices to the Refined zone,
# printing the portfolio index as a progress indicator
for i, (returns_train, cov, cov_lw) in enumerate(zip(portfolios, cov_matrices, cov_matrices_lw)):
    print(i, end=', ')
    returns_train.to_parquet(f'{S3_REFINED_URI}portfolio_{i}_returns_train.parquet')
    cov.to_parquet(f'{S3_REFINED_URI}portfolio_{i}_cov.parquet')
    cov_lw.to_parquet(f'{S3_REFINED_URI}portfolio_{i}_cov_lw.parquet')

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 

# Validación

In [9]:
# Load the validation (test) matrix from the Refined zone and display it.
# (The file and variable names refer to returns, not prices.)
test_returns_uri = S3_REFINED_URI + 'matriz_retornos_filtrada_test.parquet'
df_returns_test = pd.read_parquet(test_returns_uri)
df_returns_test

Unnamed: 0_level_0,A,AA,AAN,AAP,AAPL,ABBV,ABT,ACN,ADBE,ADM,...,WY,WYND,WYNN,XEL,XOM,XRAY,XYL,YUM,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-18,0.761494,1.123110,0.728886,0.202917,1.156041,-0.182963,2.296840,0.510783,0.148550,-0.372116,...,-0.829712,-0.323415,-0.603518,1.093520,-0.860927,0.096584,-1.076526,1.090844,1.447984,1.126067
2019-07-19,-2.053330,-1.238787,-0.503382,-0.677129,-1.483247,0.432583,-0.273535,-0.431189,-1.022185,0.199203,...,0.000000,-0.886870,-0.303591,-1.872780,0.227121,-0.561404,0.590759,-0.468778,1.229688,-0.782949
2019-07-22,0.567768,-0.583910,-0.363636,1.236062,2.285517,-0.686232,0.662857,0.262927,0.521258,0.372763,...,0.159363,-0.240070,-0.631313,-0.082264,0.079979,-0.811574,-0.667656,0.684262,-2.060738,1.648400
2019-07-23,1.172554,-0.174027,0.476039,-0.704890,0.786642,-0.499853,0.760672,-0.020568,-0.249554,0.383758,...,0.875099,1.487639,3.341057,-0.526922,0.386255,0.498043,1.717700,-0.344219,-5.227021,-0.034504
2019-07-24,-0.457862,2.680323,2.605812,0.392977,-0.100555,-0.280733,-0.191549,-0.005143,0.796023,0.086324,...,0.630915,0.021556,1.887748,0.066214,-0.013268,0.194690,0.770925,-0.212559,1.682636,-0.396928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-19,2.489157,3.217012,2.183833,-1.302998,0.542373,1.033225,1.266053,1.455987,1.431555,-0.689935,...,0.850990,-0.347544,3.367849,-1.574468,0.026874,-0.204207,-0.778783,0.915058,-1.386037,1.743119
2020-11-20,2.023919,-0.475436,0.526316,-1.224434,0.000000,1.122920,-0.224861,0.000000,-0.793617,-0.674295,...,-0.981574,-2.464543,-3.521198,-0.086468,0.000000,1.064047,0.190921,-0.294341,-0.364394,-0.108206
2020-11-23,1.190261,5.175159,2.808187,0.831025,0.000000,1.358318,-1.505454,0.000000,-0.330797,0.432010,...,1.321739,5.101311,0.157315,-1.399106,0.000000,2.166430,0.285835,0.009523,3.526646,-1.035085
2020-11-24,2.299055,5.122382,2.407407,2.918956,-2.916386,1.721608,-1.501007,2.783832,0.329725,3.543630,...,0.343289,3.674303,7.602094,-0.321826,0.000000,2.120491,3.008551,2.109122,5.601817,-2.122226


In [10]:
# Array holding the column labels (stock names) of the test frame
stock_names = df_returns_test.columns.to_numpy(copy=True)

In [11]:
# Draw a random portfolio of stock names, without repetition
def select_random_stocks(stock_names, n_stocks=None):
    """Pick `n_stocks` distinct entries of `stock_names` at random.

    Parameters
    ----------
    stock_names : array-like
        Candidate stock names to sample from.
    n_stocks : int, optional
        How many names to draw. When omitted, the module-level
        PORTFOLIO_SIZE is used, looked up at call time.

    Returns
    -------
    numpy.ndarray
        The sampled names, drawn without replacement via `np.random.choice`.

    Raises
    ------
    ValueError
        Raised by `np.random.choice` when `n_stocks` exceeds the number of
        candidates (sampling is without replacement).
    """
    # Previously the body always used PORTFOLIO_SIZE and silently ignored the
    # `n_stocks` argument; honour the argument and fall back to the constant.
    if n_stocks is None:
        n_stocks = PORTFOLIO_SIZE
    return np.random.choice(stock_names, size=n_stocks, replace=False)

In [12]:
# Build the validation portfolios: one per training portfolio, holding the SAME
# stocks but taken from the test-period returns.
# A fresh random draw here would make portfolio i of the test set contain different
# stocks than portfolio i of the training set (and than the covariance matrices
# saved under the same index), so the train/test files would not be comparable.
# At this point `portfolios` still holds the training portfolios; re-running the
# cell is harmless because the column sets do not change.
# Raises KeyError if a training stock is missing from df_returns_test.
portfolios = [df_returns_test[train_portfolio.columns] for train_portfolio in portfolios]

In [13]:
# Write the returns of every validation (test) portfolio to the Refined zone,
# printing the portfolio index as a progress indicator
for i, returns_test in enumerate(portfolios):
    print(i, end=', ')
    returns_test.to_parquet(f'{S3_REFINED_URI}portfolio_{i}_returns_test.parquet')

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 