## LASSO Forecast


**Input**: 
* YYYYMMDD_y.csv - retornos de 250 ações escolhidas aleatóriamente.
* YYYYMMDD_x.csv - primeiras três defasagens de retornos das ações.

**Output**: 
* YYYYMMDD_f_lasso.csv - previsões de retornos 250 das ações escolhidas aleatóriamente.


Esse notebook tem como objetivo estimar a variável de previsão a partir do LASSO, $f_{n, t}^{\mathrm{LASSO}}$.

$$
f_{n, t}^{\mathrm{LASSO}} \stackrel{\text { def }}{=} \tilde{\alpha}_n+\sum_{n^{\prime}=1}^{3 \cdot N} \tilde{\beta}_{n, n^{\prime}} \cdot x_{n^{\prime}, t}
$$

onde $x_{n^{\prime}, t}$ é o retorno do ativo $n^{\prime}$ padronizado para ter média nula e variância unitária dentro da janela de estimação de 30 minutos.

In [21]:
# pacotes
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

In [22]:
# oculta mensagens de avisos
import warnings
warnings.filterwarnings("ignore")

In [23]:
# display all rows
# pd.set_option('display.max_rows', None)

In [24]:
x = pd.read_csv('../../output/data/20030102_x.csv', index_col=0)

In [25]:
x = x[3:-1]

In [26]:
x

Unnamed: 0_level_0,A(t-1),A(t-2),A(t-3),AA(t-1),AA(t-2),AA(t-3),AAAB(t-1),AAAB(t-2),AAAB(t-3),AAC(t-1),...,ZOOM(t-3),ZQK(t-1),ZQK(t-2),ZQK(t-3),ZRAN(t-1),ZRAN(t-2),ZRAN(t-3),ZTEL(t-1),ZTEL(t-2),ZTEL(t-3)
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93400,0.005464,0.001645,0.000000,0.002167,-0.001734,0.002602,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,-0.002073,-0.002068,0.016667,0.0,0.0,-0.140357
93500,-0.001091,0.005464,0.001645,0.000865,0.002167,-0.001734,0.0,0.0,0.0,0.0,...,0.0,0.000371,0.000000,0.000000,0.005724,-0.002073,-0.002068,0.0,0.0,0.000000
93600,0.000000,-0.001091,0.005464,0.000000,0.000865,0.002167,0.0,0.0,0.0,0.0,...,0.0,0.000370,0.000371,0.000000,-0.007800,0.005724,-0.002073,0.0,0.0,0.000000
93700,0.001635,0.000000,-0.001091,-0.002599,0.000000,0.000865,0.0,0.0,0.0,0.0,...,0.0,0.003697,0.000370,0.000371,-0.004863,-0.007800,0.005724,0.0,0.0,0.000000
93800,0.000545,0.001635,0.000000,-0.002171,-0.002599,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.003697,0.000370,-0.000871,-0.004863,-0.007800,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155500,0.000522,0.000000,0.000000,0.000000,0.000000,-0.000424,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.001923,0.000000,-0.000995,0.0,0.0,0.000000
155600,0.000000,0.000522,0.000000,0.001696,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,-0.000596,0.001923,0.000000,0.0,0.0,0.000000
155700,0.000000,0.000000,0.000522,-0.000424,0.001696,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.001096,0.000000,0.000000,0.001655,-0.000596,0.001923,0.0,0.0,0.000000
155800,-0.001043,0.000000,0.000000,0.000000,-0.000424,0.001696,0.0,0.0,0.0,0.0,...,0.0,0.000730,0.001096,0.000000,-0.001655,0.001655,-0.000596,0.0,0.0,0.000000


In [27]:
y = pd.read_csv('../../output/data/20030102_y.csv', index_col=0)

In [28]:
y = y[3:-1]

In [29]:
y

Unnamed: 0_level_0,FITB(t),AGN(t),ZBRA(t),ADBE(t),CKFR(t),MEDI(t),TXT(t),CMCSA(t),HON(t),SCH(t),...,CVX(t),JPM(t),DISH(t),CHS(t),CSCO(t),FRX(t),OSIP(t),SAFC(t),YUM(t),AT(t)
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93400,-0.000338,0.000000,0.000000,0.003945,0.013400,0.000182,0.000000,0.006667,0.000000,0.002706,...,0.000000,0.000000,0.003551,0.001564,0.000376,-0.000912,0.000000,0.000000,0.002452,-0.002142
93500,-0.000507,0.000000,-0.000156,-0.000394,-0.013338,-0.002364,0.002309,-0.006249,0.000000,0.003597,...,-0.001042,0.000000,0.001771,0.000521,-0.002636,0.000203,-0.003409,0.002725,0.000000,0.000000
93600,0.001688,0.000000,-0.001193,0.000394,0.009668,0.002183,0.000231,0.002087,0.000000,-0.002697,...,-0.001490,0.000000,0.001326,0.000000,-0.000754,0.000000,0.004646,-0.001577,0.000408,0.001753
93700,-0.000169,0.000000,-0.001593,-0.005131,-0.001134,-0.004370,0.000000,0.002499,-0.003984,-0.000900,...,-0.000895,0.000000,0.000883,0.013187,-0.001359,-0.000710,0.003085,-0.000287,0.001223,0.002139
93800,0.000337,0.000000,0.000364,-0.001981,-0.004273,-0.001826,0.000461,0.000042,-0.001998,0.000000,...,-0.001195,0.000000,-0.000883,-0.008513,-0.002800,-0.002845,0.007305,-0.005728,-0.005721,-0.000777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155500,0.001326,-0.000681,0.000000,0.001151,-0.000058,0.000000,-0.000667,0.001011,-0.000400,0.000875,...,-0.000590,-0.000395,-0.000644,0.000494,-0.001835,-0.000200,0.002382,-0.000844,0.000399,0.000188
155600,-0.000995,0.000000,0.000034,-0.003106,0.008088,0.001078,-0.000111,0.000000,-0.000801,0.001748,...,0.000148,0.002761,0.002145,-0.000494,0.000661,0.000100,0.000535,0.000000,0.000000,0.000941
155700,-0.000664,0.000341,-0.000377,0.002735,-0.006349,-0.000359,-0.000111,0.000404,-0.001604,0.000000,...,-0.000148,-0.000394,0.000000,0.000000,-0.001432,-0.000050,-0.000535,0.000000,0.000399,-0.000565
155800,0.001078,-0.001363,-0.001888,-0.001562,0.000579,0.000000,-0.000222,0.000000,0.002005,0.000000,...,-0.000295,0.000394,-0.000429,0.000000,-0.000404,-0.000050,0.000000,0.000422,-0.000399,-0.001696


In [30]:
f_lasso = pd.DataFrame(index = y.iloc[30:].index, columns = y.columns)

In [31]:
f_lasso

Unnamed: 0_level_0,FITB(t),AGN(t),ZBRA(t),ADBE(t),CKFR(t),MEDI(t),TXT(t),CMCSA(t),HON(t),SCH(t),...,CVX(t),JPM(t),DISH(t),CHS(t),CSCO(t),FRX(t),OSIP(t),SAFC(t),YUM(t),AT(t)
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100400,,,,,,,,,,,...,,,,,,,,,,
100500,,,,,,,,,,,...,,,,,,,,,,
100600,,,,,,,,,,,...,,,,,,,,,,
100700,,,,,,,,,,,...,,,,,,,,,,
100800,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155500,,,,,,,,,,,...,,,,,,,,,,
155600,,,,,,,,,,,...,,,,,,,,,,
155700,,,,,,,,,,,...,,,,,,,,,,
155800,,,,,,,,,,,...,,,,,,,,,,


In [32]:
def LASSO_forecast(i, t):
    """
    t: variável de corte da janela de estimação (t = {0,...,356})
    i: índice referente à ação que estamos prevendo
    """

    """
    primeiro guardaremos duas variáveis importantes: minuto da previsão e nome da ação
    """
    index = f_lasso.iloc[t].name
    stock = y.iloc[:,i].name

    """
    precisamos padronizar os candidatos a preditores e colocar ambas variáveis na devida janela de estimação.
    """
    x_temp = x[t:30+t]
    x_temp = ( x_temp - x_temp.mean() ) / x_temp.std()
    x_temp = x_temp.dropna(axis=1)
    y_temp = y[t:30+t]

    """
    primeiro modelamos o valor do parâmetro de regularização ótimo usando cross-validation com k = 10 folds
    """
    model = LassoCV(cv=10, random_state=0, max_iter=10000)
    model.fit(x_temp, y_temp.iloc[:,i])

    """
    fitamos o modelo com o parâmetro de regularização ótimo
    """
    reg = Lasso(alpha=model.alpha_)
    reg.fit(x_temp, y_temp.iloc[:,i])

    """
    criamos um dataframe com a primeira coluna sendo os parâmetros estimados e a segunda o nome da ação defasado
    eliminaremos todas linhas que tiver parâmetro nulo para eficiência computacional
    """
    df = pd.DataFrame(zip(reg.coef_, x_temp))
    df = df[df[0] != 0]

    """
    precisamos criar um loop para fazer as previsões:
    somente para os preditores cujo parâmetro não é nulo, faremos a previsão
    """
    f = 0
    for j in range(len(df)):
        beta_ = df.iloc[j][0]
        stock_ = df.iloc[j][1]
        f = f + beta_ * x[stock_][index]
    
    return index, stock, f

In [33]:
LASSO_forecast(0, 0)

(100400, 'FITB(t)', 2.6848809362642767e-07)

In [35]:
for i in range(250):
    for t in range(356):
        print(i)
        print(t)
        tupla = LASSO_forecast(i, t)
        f_lasso.at[tupla[0], tupla[1]] = tupla[2]

0
0
0
1
0
2
0
3
0
4
0
5
0
6
0
7
0
8
0
9
0
10
0
11
0
12
0
13
0
14
0
15
0
16
0
17
0
18
0
19
0
20
0
21
0
22
0
23
0
24
0
25
0
26
0
27
0
28
0
29
0
30
0
31
0
32
0
33
0
34
0
35
0
36
0
37
0
38
0
39
0
40
0
41
0
42
0
43
0
44
0
45
0
46
0
47
0
48
0
49
0
50
0
51
0
52
0
53
0
54
0
55
0
56
0
57
0
58
0
59
0
60
0
61
0
62
0
63
0
64
0
65
0
66
0
67
0
68
0
69
0
70
0
71
0
72
0
73
0
74
0
75
0
76
0
77
0
78
0
79
0
80
0
81
0
82
0
83
0
84
0
85
0
86
0
87
0
88
0
89
0
90
0
91
0
92
0
93
0
94
0
95
0
96
0
97
0
98
0
99
0
100
0
101
0
102
0
103
0
104
0
105
0
106
0
107
0
108
0
109
0
110
0
111
0
112
0
113
0
114
0
115
0
116
0
117
0
118
0
119
0
120
0
121
0
122
0
123
0
124
0
125
0
126
0
127
0
128
0
129
0
130
0
131
0
132
0
133
0
134
0
135
0
136
0
137
0
138
0
139
0
140
0
141
0
142
0
143
0
144
0
145
0
146
0
147
0
148
0
149
0
150
0
151
0
152
0
153
0
154
0
155
0
156
0
157
0
158
0
159
0
160
0
161
0
162
0
163
0
164
0
165
0
166
0
167
0
168
0
169
0
170
0
171
0
172
0
173
0
174
0
175
0
176
0
177
0
178
0
179
0
180
0
181
0
182
0
183
0
184


KeyboardInterrupt: 

In [36]:
f_lasso

Unnamed: 0_level_0,FITB(t),AGN(t),ZBRA(t),ADBE(t),CKFR(t),MEDI(t),TXT(t),CMCSA(t),HON(t),SCH(t),...,CVX(t),JPM(t),DISH(t),CHS(t),CSCO(t),FRX(t),OSIP(t),SAFC(t),YUM(t),AT(t)
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100400,0.0,-0.0,0.0,0.0,0.0,0.000009,0.000008,0.000003,0.0,-0.000001,...,,,,,,,,,,
100500,-0.000002,0,0,-0.000003,0.0,-0.0,-0.0,0,0,-0.000003,...,,,,,,,,,,
100600,0.000001,0.0,-0.000001,-0.000004,0.0,-0.000001,0.0,0.000011,0.000001,0.000008,...,,,,,,,,,,
100700,0.0,-0.000002,0.0,-0.000003,0.0,-0.0,-0.000011,-0.0,0.000001,0,...,,,,,,,,,,
100800,-0.000001,0,0.0,-0.000003,0.0,0.000002,0,0.0,0.000002,-0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155500,0,0,0.0,0.000001,-0.0,0.0,-0.000001,0.0,-0.000002,0.0,...,,,,,,,,,,
155600,-0.0,0.0,0.0,-0.000002,0.0,0.0,0.0,0,-0.0,0,...,,,,,,,,,,
155700,0.0,0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,...,,,,,,,,,,
155800,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.000001,-0.0,0,0.0,...,,,,,,,,,,


In [37]:
f_lasso.to_csv('..\..\output\data\\20030102_f_lasso.csv', sep=',', encoding='utf-8')