In [1]:
# pacotes
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

In [2]:
# oculta mensagens de avisos
import warnings
warnings.filterwarnings("ignore")

In [3]:
f_lasso = pd.read_csv('../../output/data/20030102_f_lasso.csv', index_col=0)

In [4]:
f_bmk = pd.read_csv('../../output/data/20030102_f_bmk.csv', index_col=0)

In [5]:
y = pd.read_csv('../../output/data/20030102_y.csv', index_col=0)


$$
r_{n, t+1}=\bar{a}_n+\bar{b}_n \cdot\left(\frac{f_{n, t}^{\text {LASSO }}-\bar{m}_n^{\text {LASSO }}}{s_n^{\text {LASSO }}}\right)+e_{n, t+1}
$$

In [6]:
# padronização
f_lasso = (f_lasso - f_lasso.mean()) / f_lasso.std()

In [7]:
# não conseguimos estimar todos ativos, então vamos deixar apenas os que tiveram suas previsões estimadas
f_lasso = f_lasso.dropna(axis=1)

In [8]:
# mesma coisa para as ações
y = y[f_lasso.columns]

In [9]:
# também para o índice que começa a partir das 10:04 e termina em 15:59
y = y.loc[100400:155900]

In [10]:
lasso = pd.DataFrame(columns=['Adj. R-Squared', 'a', 'b'], index=y.columns)

In [11]:
ols = pd.DataFrame()

In [12]:
def adjusted_Rsquared(result):
    r2 = result.rsquared
    n = len(y)
    k = len(result.params) - 1
    ar2 = 1 - ( 1 - r2) * ( ( n - 1) / ( n - k ) )
    return ar2

In [13]:
for col in f_lasso.columns:
    ols['y'] = y[col]
    ols['x'] = f_lasso[col]
    result = sm.ols(formula="y ~ x", data=ols).fit()
    lasso.at[col, 'a'] = result.params[0]
    lasso.at[col, 'b'] = result.params[1]
    lasso.at[col, 'Adj. R-Squared'] = adjusted_Rsquared(result)

In [14]:
lasso

Unnamed: 0,Adj. R-Squared,a,b
FITB(t),0.005758,2.9e-05,5.6e-05
AGN(t),0.000409,3.2e-05,1.1e-05
ZBRA(t),3.7e-05,4e-05,-6e-06
ADBE(t),0.002273,4.8e-05,7.2e-05
CKFR(t),0.008107,0.000149,-0.00017
MEDI(t),0.003949,6.7e-05,7.8e-05
TXT(t),0.016412,7.9e-05,0.000113
CMCSA(t),0.001863,3.8e-05,-5.6e-05
HON(t),0.015476,6.2e-05,0.000187
SCH(t),0.0115,0.000107,-0.000145


In [15]:
lasso['Adj. R-Squared'].mean()

0.006785797817207614

$$
r_{n, t+1}=\bar{a}_n+\bar{c}_n \cdot\left(\frac{f_{n, t}^{\mathrm{Bmk}}-\bar{m}_n^{\mathrm{Bmk}}}{\bar{s}_n^{\mathrm{Bmk}}}\right)+e_{n, t+1} 
$$

In [16]:
# padronização
f_bmk = ( f_bmk - f_bmk.mean() ) / f_bmk.std()

In [17]:
# mesma coisa para as ações
f_bmk = f_bmk[f_lasso.columns]

In [18]:
bmk = pd.DataFrame(columns=['Adj. R-Squared', 'a', 'c'], index=y.columns)

In [19]:
for col in f_bmk.columns:
    ols['y'] = y[col]
    ols['x'] = f_bmk[col]
    result = sm.ols(formula="y ~ x", data=ols).fit()
    bmk.at[col, 'a'] = result.params[0]
    bmk.at[col, 'c'] = result.params[1]
    bmk.at[col, 'Adj. R-Squared'] = adjusted_Rsquared(result)

In [20]:
bmk

Unnamed: 0,Adj. R-Squared,a,c
FITB(t),0.004352,2.9e-05,4.9e-05
AGN(t),0.003372,3.2e-05,3.3e-05
ZBRA(t),0.001261,4e-05,-3.4e-05
ADBE(t),0.001346,4.8e-05,-5.6e-05
CKFR(t),0.00432,0.000149,0.000124
MEDI(t),0.011862,6.7e-05,0.000134
TXT(t),0.013154,7.9e-05,-0.000101
CMCSA(t),0.001102,3.8e-05,4.3e-05
HON(t),0.002527,6.2e-05,7.5e-05
SCH(t),0.003255,0.000107,7.7e-05


In [21]:
bmk['Adj. R-Squared'].mean()

0.005912546663266493

$$
r_{n, t+1}=\bar{a}_n+\bar{b}_n \cdot\left(\frac{f_{n, t}^{\mathrm{L} A S S O}-\bar{m}_n^{\mathrm{LASSO}}}{\bar{s}_n^{\mathrm{LASSO}}}\right)+\bar{c}_n \cdot\left(\frac{f_{n, t}^{\mathrm{Bmk}}-\bar{m}_n^{\mathrm{Bmkk}}}{\bar{B}_n^{\mathrm{Bmk}}}\right)+e_{n, t+1}
$$

In [22]:
both = pd.DataFrame(columns=['Adj. R-Squared', 'a', 'b', 'c'], index=y.columns)

In [23]:
for col in f_bmk.columns:
    ols['y'] = y[col]
    ols['x1'] = f_lasso[col]
    ols['x2'] = f_bmk[col]
    result = sm.ols(formula="y ~ x1 + x2", data=ols).fit()
    both.at[col, 'a'] = result.params[0]
    both.at[col, 'b'] = result.params[1]
    both.at[col, 'c'] = result.params[2]
    both.at[col, 'Adj. R-Squared'] = adjusted_Rsquared(result)

In [24]:
both

Unnamed: 0,Adj. R-Squared,a,b,c
FITB(t),0.005931,2.9e-05,4.9e-05,4.1e-05
AGN(t),0.001445,3.2e-05,1.7e-05,3.6e-05
ZBRA(t),-0.001548,4e-05,-3e-06,-3.3e-05
ADBE(t),0.001792,4.8e-05,8.9e-05,-7.5e-05
CKFR(t),0.011933,0.000149,-0.000195,0.000155
MEDI(t),0.012405,6.7e-05,7.1e-05,0.000131
TXT(t),0.026412,7.9e-05,0.000111,-9.9e-05
CMCSA(t),0.000305,3.8e-05,-5.9e-05,4.6e-05
HON(t),0.015456,6.2e-05,0.000188,7.9e-05
SCH(t),0.012635,0.000107,-0.000149,8.4e-05


In [36]:
round(lasso['Adj. R-Squared'].mean(), 4)

0.0068

In [37]:
round(bmk['Adj. R-Squared'].mean(), 4)

0.0059

In [39]:
round(both['Adj. R-Squared'].mean(), 4)

0.01

O resultado que obtivemos para essa análise de apenas um dia é que o coeficiente de determinação ajustado no modelo apenas com o LASSO é de $\bar{R}^{2,\operatorname{LASSO}}_n = 0.0068$, enquanto no modelo Benchmark é $\bar{R}^{2,\operatorname{Bmk}}_n = 0.0059$.

Mas a questão que temos é se incluir a previsão do retorno de um minuto à frente do LASSO aumenta a estatística $\bar{R}^2_n$, isso é, estamos calculando $\Delta \bar{R}^2_n \stackrel{\text { def }}{=} \bar{R}^{2,\operatorname{Both}}_n - \bar{R}^{2,\operatorname{Bmk}}_n$. Em um extremo, se o LASSO e o modelo Benchmark estiverem usando conjuntos informacionais totalmente diferentes para criar suas respectivas previsões de retornos de um minuto à frente, então encontraremos $\Delta \bar{R}^2_n = \bar{R}^{2,\operatorname{LASSO}}_n = 0.0068$. No outro extremo, se o LASSO estiver simplesmente replicando as previsões de retorno de um minuto à frente do modelo Benchmark, então estimaremos $\Delta \bar{R}^2_n = 0$.

In [42]:
round(both['Adj. R-Squared'].mean() - bmk['Adj. R-Squared'].mean(), 4)

0.0041

O que encontramos foi $\Delta \bar{R}^2_n = 0.0041$.

### Adjusted R-Squared - Test

In [28]:
np.random.seed(123)

ols['x'] = np.random.normal(0, 1, 356)
ols['y'] = np.random.normal(0, 1, 356) + ols['x'] * 4

In [29]:
ols

Unnamed: 0_level_0,y,x,x1,x2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100400,-5.300104,-1.085631,-0.070119,1.337158
100500,6.045849,0.997345,-0.070119,-4.160602
100600,-0.756578,0.282978,1.308417,-1.064912
100700,-7.153509,-1.506295,-0.070119,0.145608
100800,-2.715815,-0.578600,-0.070119,-0.413734
...,...,...,...,...
155500,-1.250348,-0.188297,-1.788132,0.077631
155600,-2.908612,-0.900009,-0.236489,0.009532
155700,-1.780848,-0.931002,-1.643033,0.236525
155800,-6.007513,-1.222737,-0.080157,-0.164391


In [30]:
result = sm.ols(formula="y ~ x", data=ols).fit()

In [31]:
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.944
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     5918.
Date:                Thu, 23 Feb 2023   Prob (F-statistic):          4.72e-223
Time:                        14:27:06   Log-Likelihood:                -501.61
No. Observations:                 356   AIC:                             1007.
Df Residuals:                     354   BIC:                             1015.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0395      0.053      0.749      0.4

In [32]:
result.rsquared

0.9435576240797289

In [33]:
result.rsquared_adj

0.9433981823398411

In [34]:
adjusted_Rsquared(result)

0.9435576240797289