# Predicción de Stocks con LSTM Bayesiano (con PyTorch)

### Librerías

In [4]:
import pandas as pd
import numpy as np
#import mysql.connector
import plotly.graph_objects as go

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from sklearn.preprocessing import MinMaxScaler

### Arquitectura del Modelo Bayesiano LSTM

In [5]:
class BayesianLSTM(nn.Module):
    
    def __init__(self, n_features, output_length, batch_size, var1):

        super(BayesianLSTM, self).__init__()

        self.batch_size = batch_size # user-defined

        self.hidden_size_1 = 128 # number of encoder cells (from paper)
        self.hidden_size_2 = 32 # number of decoder cells (from paper)
        self.stacked_layers = 2 # number of (stacked) LSTM layers for each stage
        self.dropout_probability = var1 # arbitrary value (the paper suggests that performance is generally stable across all ranges)

        self.lstm1 = nn.LSTM(n_features, 
                             self.hidden_size_1, 
                             num_layers=self.stacked_layers,
                             batch_first=True)
        self.lstm2 = nn.LSTM(self.hidden_size_1,
                             self.hidden_size_2,
                             num_layers=self.stacked_layers,
                             batch_first=True)
        
        self.fc = nn.Linear(self.hidden_size_2, output_length)
        self.loss_fn = nn.MSELoss()
        
    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        hidden = self.init_hidden1(batch_size)
        output, _ = self.lstm1(x, hidden)
        output = F.dropout(output, p=self.dropout_probability, training=True)
        state = self.init_hidden2(batch_size)
        output, state = self.lstm2(output, state)
        output = F.dropout(output, p=self.dropout_probability, training=True)
        output = output[:, -1, :] # take the last decoder cell's outputs
        y_pred = self.fc(output)
        return y_pred
        
    def init_hidden1(self, batch_size):
        hidden_state = Variable(torch.zeros(self.stacked_layers, batch_size, self.hidden_size_1))
        cell_state = Variable(torch.zeros(self.stacked_layers, batch_size, self.hidden_size_1))
        return hidden_state, cell_state
    
    def init_hidden2(self, batch_size):
        hidden_state = Variable(torch.zeros(self.stacked_layers, batch_size, self.hidden_size_2))
        cell_state = Variable(torch.zeros(self.stacked_layers, batch_size, self.hidden_size_2))
        return hidden_state, cell_state
    
    def loss(self, pred, truth):
        return self.loss_fn(pred, truth)

    def predict(self, X):
        return self(torch.tensor(X, dtype=torch.float32)).view(-1).detach().numpy()

## Evaluación de los Stock

In [12]:
# Llamado de Datos
#conn = mysql.connector.connect(user='root', password='', host='localhost', database='stock_exchange')

stock= pd.DataFrame(pd.read_csv("n225.csv"))

#conn.close()
    
#Selección de variables
Selecccionadas=['Date', 'Open']
stock_2=stock[Selecccionadas]
stock_2=stock_2.loc[(stock_2["Date"] >= '2022-06-01')]
stock_2['Open']=np.log(stock_2['Open'])
    
    
# División en Entreanmiento y Prueba
def create_sliding_window(data, sequence_length, stride=1):
    X_list, y_list = [], []
    for i in range(len(data)):
        if (i + sequence_length) < len(data):
            X_list.append(data.iloc[i:i+sequence_length:stride, :].values)
            y_list.append(data.iloc[i+sequence_length, -1])
    return np.array(X_list), np.array(y_list)

train_split = 0.7
n_train = int(train_split * len(stock_2))
n_test = len(stock_2) - n_train

features = ['Open']
feature_array = stock_2[features].values

# Fit Scaler only on Training features
feature_scaler = MinMaxScaler()
feature_scaler.fit(feature_array[:n_train])
# Fit Scaler only on Training target values
target_scaler = MinMaxScaler()
target_scaler.fit(feature_array[:n_train, -1].reshape(-1, 1))

# Transfom on both Training and Test data
scaled_array = pd.DataFrame(feature_scaler.transform(feature_array), columns=features)

sequence_length = 10
X, y = create_sliding_window(scaled_array, sequence_length)

X_train = X[:n_train]
y_train = y[:n_train]

X_test = X[n_train:]
y_test = y[n_train:]
    
# Entrenamiento del modelo
n_features = scaled_array.shape[-1]
sequence_length = 10
output_length = 1

batch_size = 128
n_epochs = 150
learning_rate = 0.01

vari1=[0.1, 0.3, 0.5, 0.7, 0.9]
for x in vari1:
    bayesian_lstm = BayesianLSTM(n_features=n_features,
                            output_length=output_length,
                            batch_size = batch_size,
                            var1=x)

    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(bayesian_lstm.parameters(), lr=learning_rate)
    
    bayesian_lstm.train()

    for e in range(1, n_epochs+1):
        for b in range(0, len(X_train), batch_size):
            features = X_train[b:b+batch_size,:,:]
            target = y_train[b:b+batch_size]    

            X_batch = torch.tensor(features,dtype=torch.float32)    
            y_batch = torch.tensor(target,dtype=torch.float32)

            output = bayesian_lstm(X_batch)
            loss = criterion(output.view(-1), y_batch)  

            loss.backward()
            optimizer.step()        
            optimizer.zero_grad() 

        #if e % 10 == 0:
        #   print('epoch', e, 'loss: ', loss.item())
    
    # Evaluación del Rendimiento del Modelo
    offset = sequence_length

    def inverse_transform(y):
        return target_scaler.inverse_transform(y.reshape(-1, 1))

    training_df = pd.DataFrame()
    training_df['Date'] = stock_2['Date'].iloc[offset:n_train + offset:1] 
    training_predictions = bayesian_lstm.predict(X_train)
    training_df['Open'] = inverse_transform(training_predictions)
    training_df['source'] = 'Training Prediction'

    training_truth_df = pd.DataFrame()
    training_truth_df['Date'] = training_df['Date']
    training_truth_df['Open'] = stock_2['Open'].iloc[offset:n_train + offset:1] 
    training_truth_df['source'] = 'True Values'

    testing_df = pd.DataFrame()
    testing_df['Date'] = stock_2['Date'].iloc[n_train + offset::1] 
    testing_predictions = bayesian_lstm.predict(X_test)
    testing_df['Open'] = inverse_transform(testing_predictions)
    testing_df['source'] = 'Test Prediction'

    testing_truth_df = pd.DataFrame()
    testing_truth_df['Date'] = testing_df['Date']
    testing_truth_df['Open'] = stock_2['Open'].iloc[n_train + offset::1] 
    testing_truth_df['source'] = 'True Values'

    evaluation = pd.concat([training_df, testing_df, training_truth_df, testing_truth_df], axis=0)
    
    
## Cuantificación de la Incertidumbre
    n_experiments = 100

    test_uncertainty_df = pd.DataFrame()
    test_uncertainty_df['Date'] = testing_df['Date']

    for i in range(n_experiments):
        experiment_predictions = bayesian_lstm.predict(X_test)
        test_uncertainty_df['log_Open_{}'.format(i)] = inverse_transform(experiment_predictions)
    
    
    for i in [0.5, 1, 2, 3]:
        log_energy_consumption_df = test_uncertainty_df.filter(like='log_Open', axis=1)
        test_uncertainty_df['log_Open_mean'] = log_energy_consumption_df.mean(axis=1)
        test_uncertainty_df['log_Open_std'] = log_energy_consumption_df.std(axis=1)

        test_uncertainty_df = test_uncertainty_df[['Date', 'log_Open_mean', 'log_Open_std']]
    
        test_uncertainty_df['lower_bound'] = test_uncertainty_df['log_Open_mean'] - i*test_uncertainty_df['log_Open_std']
        test_uncertainty_df['upper_bound'] = test_uncertainty_df['log_Open_mean'] + 0.5*test_uncertainty_df['log_Open_std']
    
    

        test_uncertainty_plot_df = test_uncertainty_df.copy(deep=True)
        test_uncertainty_plot_df = test_uncertainty_plot_df.loc[test_uncertainty_plot_df['Date'].between('2022-06-01', '2023-03-24')]
        truth_uncertainty_plot_df = testing_truth_df.copy(deep=True)
        truth_uncertainty_plot_df = truth_uncertainty_plot_df.loc[testing_truth_df['Date'].between('2022-06-01', '2023-03-24')]

        upper_trace = go.Scatter(
            x=test_uncertainty_plot_df['Date'],
            y=test_uncertainty_plot_df['upper_bound'],
            mode='lines',
            fill=None,
            name='Límite superior de confianza al '+str(i)+' sigma'
            )
        lower_trace = go.Scatter(
            x=test_uncertainty_plot_df['Date'],
            y=test_uncertainty_plot_df['lower_bound'],
            mode='lines',
            fill='tonexty',
            fillcolor='rgba(255, 211, 0, 0.1)',
            name='Límite inferior de confianza al '+str(i)+' sigma'
            )
        real_trace = go.Scatter(
            x=truth_uncertainty_plot_df['Date'],
            y=truth_uncertainty_plot_df['Open'],
            mode='lines',
            fill=None,
            name='Valores reales'
            )

        data = [upper_trace, lower_trace, real_trace]

        fig = go.Figure(data=data)
        fig.update_layout(title='Cuantificación de incertidumbre para datos de prueba Para el valor Open Japon (N225), prob='+str(x),
                        xaxis_title='Tiempo',
                        yaxis_title='log_Open (USD)')

        fig.show()
    
### Evaluación de la Incertidumbre

# Using 99% confidence bounds
        bounds_df = pd.DataFrame()
        bounds_df['lower_bound'] = test_uncertainty_plot_df['lower_bound']
        bounds_df['prediction'] = test_uncertainty_plot_df['log_Open_mean']
        bounds_df['real_value'] = truth_uncertainty_plot_df['Open']
        bounds_df['upper_bound'] = test_uncertainty_plot_df['upper_bound']

        bounds_df['contained'] = ((bounds_df['real_value'] >= bounds_df['lower_bound']) & (bounds_df['real_value'] <= bounds_df['upper_bound']))
    

    
        print("Proporción of puntos contenidos con un intervalo de confianza del "+str(i)+" sigma:", bounds_df['contained'].mean())


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Proporción of puntos contenidos con un intervalo de confianza del 0.5 sigma: 0.14


Proporción of puntos contenidos con un intervalo de confianza del 1 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 2 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 3 sigma: 0.0



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Proporción of puntos contenidos con un intervalo de confianza del 0.5 sigma: 0.36


Proporción of puntos contenidos con un intervalo de confianza del 1 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 2 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 3 sigma: 0.0



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Proporción of puntos contenidos con un intervalo de confianza del 0.5 sigma: 0.26


Proporción of puntos contenidos con un intervalo de confianza del 1 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 2 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 3 sigma: 0.0



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Proporción of puntos contenidos con un intervalo de confianza del 0.5 sigma: 0.36


Proporción of puntos contenidos con un intervalo de confianza del 1 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 2 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 3 sigma: 0.0



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Proporción of puntos contenidos con un intervalo de confianza del 0.5 sigma: 0.54


Proporción of puntos contenidos con un intervalo de confianza del 1 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 2 sigma: 0.0


Proporción of puntos contenidos con un intervalo de confianza del 3 sigma: 0.0
