<a href="https://colab.research.google.com/github/rafaelfioretti/gru_thesis/blob/main/Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load all libraries & Random Initializers

In [1]:
import warnings
import sys
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from pandas import read_csv
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler
from keras.layers import GRU, Dense, Dropout
from keras.models import Sequential
from keras import Input
from keras import utils
from keras import initializers

In [2]:
# Print the interpreter and framework versions so a run can be tied to its environment.
print('Python: %s' % sys.version)
print('Keras: %s' % keras.__version__)
print('tensorflow: %s' % tf.__version__)
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/Keras — confirm that is intended.
warnings.filterwarnings("ignore")

# Initialize seeds for reproducibility.
seed = 12345
# One call that seeds the random number generators
# (Python, NumPy and TensorFlow according to the Keras documentation).
keras.utils.set_random_seed(seed)
# Seeded Glorot-uniform initializer; model_gen_fit passes this single instance
# as kernel_initializer to every layer it builds.
initializer_glorot_uniform = initializers.GlorotUniform(seed=seed)
# Ask TensorFlow to execute ops deterministically.
tf.config.experimental.enable_op_determinism()

Python: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
Keras: 2.15.0
tensorflow: 2.15.0


In [3]:
# Mount Google Drive under /content/gdrive; every CSV path used in this notebook lives below it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Dataset Load + Features Completeness


The first step is to install the TA-Lib library, which is used to calculate the technical indicators.

In [None]:
!curl -L http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz -O && tar xzvf ta-lib-0.4.0-src.tar.gz


In [None]:
!cd ta-lib && ./configure --prefix=/usr && make && make install && cd - && pip install ta-lib


In [None]:
# Load the raw dataset from Google Drive; the "Date" column is parsed into datetimes.
# NOTE(review): the converter targets a column named 'Price', but the df.info() output of the
# completed dataset further down lists no such column (only 'Close Price') — confirm it is needed.
df = pd.read_csv('/content/gdrive/MyDrive/Mestrado/DVK_Uppsats/Implementation/dataset_full.csv', header=0,parse_dates=["Date"],converters={'Price': float})

# Display the first rows of the DataFrame
print(df.head())


In [None]:
# Imported here rather than in the top import cell — presumably because the package
# only becomes available after the installation cells above have run.
import talib

# Calculate SMA, EMA and RSI from the closing price.
# No time period is passed, so each indicator uses TA-Lib's default look-back window.

df['SMA'] = talib.SMA(df['Close Price'])
df['EMA'] = talib.EMA(df['Close Price'])
df['RSI'] = talib.RSI(df['Close Price'])


In [None]:
# Drop the first 31 rows: the May values at the start of the series, which only
# served as warm-up data for the indicators calculated above.
# `DataFrame.drop(..., inplace=True)` returns None, so the former `df_dropped`
# variable was always None; rebinding `df` makes the result explicit while the
# following cells keep using the same name.
# NOTE: the drop is positional, so re-running this cell removes a further 31 rows.
df = df.drop(index=df.index[:31])
print(df.head())


In [None]:
print(df.head(60))


In [None]:
df.to_csv('/content/gdrive/MyDrive/Mestrado/DVK_Uppsats/Implementation/dataset_completed.csv', index=False)


# Dataset Preparation

In this section we prepare the dataset: splitting the data into train and test sets, scaling it, and shaping the input into a 3D array.

In [4]:
#df = pd.read_csv('/content/gdrive/MyDrive/Mestrado/DVK_Uppsats/Implementation/dataset_completed.csv', header=0,parse_dates=["Date"],converters={'Price': float})
# NOTE(review): the feature-completion section writes dataset_completed.csv to
# MyDrive/Mestrado/DVK_Uppsats/Implementation/, while this cell reads it from the Drive root —
# confirm both paths point to the same, up-to-date file.
df = pd.read_csv('/content/gdrive/MyDrive/dataset_completed.csv', header=0,parse_dates=["Date"],converters={'Price': float})

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           1826 non-null   datetime64[ns]
 1   Open           1826 non-null   float64       
 2   High           1826 non-null   float64       
 3   Low            1826 non-null   float64       
 4   Volume         1826 non-null   int64         
 5   Google Trends  1826 non-null   float64       
 6   GVZ            1826 non-null   float64       
 7   OFR FSI        1826 non-null   float64       
 8   Close Price    1826 non-null   float64       
 9   SMA            1826 non-null   float64       
 10  EMA            1826 non-null   float64       
 11  RSI            1826 non-null   float64       
dtypes: datetime64[ns](1), float64(10), int64(1)
memory usage: 171.3 KB


In [6]:
df.describe()

Unnamed: 0,Date,Open,High,Low,Volume,Google Trends,GVZ,OFR FSI,Close Price,SMA,EMA,RSI
count,1826,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0
mean,2019-11-30 12:00:00,18640.906363,19130.435088,18097.133164,23431370000.0,7.309863,15.392985,-2.162311,18654.533214,18431.579065,18416.596933,52.618795
min,2017-06-01 00:00:00,1932.619995,2058.77002,1843.030029,527856000.0,0.45,8.88,-4.364,1929.819946,1943.011645,1940.641106,9.920239
25%,2018-08-31 06:00:00,6631.572266,6768.785156,6528.540039,6107588000.0,2.38,11.46,-3.47575,6637.024902,6665.115019,6766.310376,42.513872
50%,2019-11-30 12:00:00,9481.151367,9668.195312,9276.264648,19979730000.0,4.08,14.79,-2.7305,9480.549317,9467.363688,9362.93997,51.642027
75%,2021-02-28 18:00:00,33789.308595,34790.409182,32267.986815,34043750000.0,7.5,17.99,-1.623638,33740.259767,34674.724431,34163.893221,62.52169
max,2022-05-31 00:00:00,67549.73438,68789.625,66382.0625,350967900000.0,100.0,48.98,10.266,67566.82813,62792.418881,61839.701401,94.302215
std,,17640.379603,18095.216951,17110.975615,20540210000.0,9.787737,5.062886,2.084358,17634.277719,17519.236712,17423.510183,14.302528


In [7]:
# Count the missing values in every column.
# The previous check compared each cell with the string "?", which can never match in the
# datetime/float/int columns reported by df.info(), so real gaps (NaN/NaT) would have gone
# unnoticed. isna() detects them directly and yields the same per-column Series of counts.
missing_values = df.isna().sum(axis=0)
print(missing_values)

Date             0
Open             0
High             0
Low              0
Volume           0
Google Trends    0
GVZ              0
OFR FSI          0
Close Price      0
SMA              0
EMA              0
RSI              0
dtype: int64


In [8]:
#Creating the main train DataFrames.
# Despite the "train" in their names, these hold the full series; create_datasets splits them later.
# Target: the closing price.
y_train_df = df['Close Price']
# Features: every column except the date (this still includes 'Close Price' itself).
x_train_df = df.drop(columns='Date')
dataset_dates = df['Date']

#Define the size of the training set = 80%
# Train - 01/06/2017,31/05/2021
# Test  - 01/06/2021,31/05/2022
# NOTE(review): with the 1826 rows reported by df.info(), int(1826 * 0.8) = 1460; assuming one row
# per day from 01/06/2017, the training part would end on 30/05/2021, one day earlier than
# stated above — confirm.
train_size = int(x_train_df.shape[0] * 0.8)

In [9]:
#Generic Method to Create the Keras vector Many to One
def generic_data_transform(x_data, y_data, num_steps=5):
    """Turn a series into sliding-window samples (many-to-one).

    Sample k is made of the num_steps consecutive rows x_data[k:k + num_steps];
    its target is y_data[k + num_steps], i.e. the value that directly follows
    the window.

    Returns a tuple (x_array, y_array) of NumPy arrays with one entry per window.
    """
    total_rows = x_data.shape[0]
    windows, targets = [], []

    window_start = 0
    # Slide forward until the target index would fall outside the data
    while window_start < total_rows and window_start + num_steps < total_rows:
        window_end = window_start + num_steps
        windows.append(x_data[window_start:window_end])
        targets.append(y_data[window_end])
        window_start += 1

    return np.array(windows), np.array(targets)

In [10]:
#Generic function to create the dataset:
# Include what exogenous variable should be removed and the # of Steps

def create_datasets(gtrends=True,gvz=True,ofr=True,label=True,num_steps=30):
  """Build the scaled, windowed train/test arrays for one simulation.

  Each flag states whether the matching column stays in the feature matrix:
  gtrends -> 'Google Trends', gvz -> 'GVZ', ofr -> 'OFR FSI',
  label -> 'Close Price'. The target always comes from y_train_df.

  Reads the module-level x_train_df, y_train_df and train_size.

  Returns (x_train, y_train, x_test, y_test) as produced by
  generic_data_transform with windows of num_steps rows.
  """
  # Columns whose flag is switched off are left out of a fresh copy of the features
  column_flags = (
      ('Google Trends', gtrends),
      ('GVZ', gvz),
      ('OFR FSI', ofr),
      ('Close Price', label),
  )
  excluded = [name for name, keep in column_flags if not keep]
  features = x_train_df.drop(columns=excluded)
  target = y_train_df.copy()

  # 1 - Split: the first train_size rows are for training, the rest for testing
  features_train, features_test = features[:train_size], features[train_size:]
  target_train, target_test = target[:train_size], target[train_size:]

  # 2 - Scale: both scalers are fitted on the training part only
  feature_scaler = StandardScaler()
  target_scaler = StandardScaler()

  x_train_sc = feature_scaler.fit_transform(features_train)
  x_test_sc = feature_scaler.transform(features_test)

  y_train_sc = target_scaler.fit_transform(target_train.to_numpy().reshape(-1, 1))
  y_test_sc = target_scaler.transform(target_test.to_numpy().reshape(-1, 1))

  # 3 - Window: train and test are windowed independently of each other
  x_train_trans, y_train_trans = generic_data_transform(x_train_sc, y_train_sc, num_steps)
  assert x_train_trans.shape[0] == y_train_trans.shape[0]

  x_test_trans, y_test_trans = generic_data_transform(x_test_sc, y_test_sc, num_steps)
  assert x_test_trans.shape[0] == y_test_trans.shape[0]

  # 4 - Report the resulting shapes
  datasets = (x_train_trans, y_train_trans, x_test_trans, y_test_trans)
  print('Dataset created', *(array.shape for array in datasets))

  return datasets

# Simulations Implementation


## Grid Search Generic Methods

In [11]:
#Create config File for Grid Search
def create_grid_search():
  """Enumerate every hyper-parameter combination of the grid search.

  Returns a list of configs, each one a list
  [batch_size, epochs, dropout_rate, neurons, dense_units, layers].
  batch_size varies slowest and layers fastest. The number of configs is
  printed as a side effect.
  """
  # For a quick smoke test, shrink every list below to a single value
  # (e.g. 64 / 10 / 0.15 / 240 / 15 / 1).
  batch_sizes = [64, 48, 32]
  epoch_counts = [10, 25, 50]
  dropout_rates = [0.15, 0.20, 0.25]
  neuron_counts = [320, 240, 180, 60]
  dense_units = [10, 15, 20, 30]
  layer_counts = [2, 1]

  configs = [
      [batch_size, epochs, dropout, neurons, dense, layers]
      for batch_size in batch_sizes
      for epochs in epoch_counts
      for dropout in dropout_rates
      for neurons in neuron_counts
      for dense in dense_units
      for layers in layer_counts
  ]

  print('Total configs: %d' % len(configs))

  return configs

In [12]:
#Create the Generic Model
def model_gen_fit(x_train_data, y_train_data, x_test_data,y_test_data, config):
  """Build, train and evaluate one GRU model for a grid-search config.

  config is [batch_size, epochs, dropout_rate, neurons, dense_units, layers]
  as produced by create_grid_search. layers == 1 builds a single GRU layer;
  any other value builds two stacked GRU layers. Each GRU layer is followed
  by Dropout, then a ReLU Dense layer and a single-unit output layer.

  Returns the result of model.evaluate on the test data; callers read
  index 0 (the MSE loss) and index 2 (RootMeanSquaredError).
  """
  batch_size, epochs, dropout_rate, neurons, dense_units, layers = config

  # Shape of one input window as taken from the training array
  window_shape = (x_train_data.shape[1], x_train_data.shape[2])
  # Every layer shares the seeded Glorot initializer and zero biases
  init_kwargs = dict(kernel_initializer=initializer_glorot_uniform, bias_initializer='Zeros')
  # The first GRU must emit the full sequence only when a second GRU follows it
  stacked = layers != 1

  model = Sequential()
  model.add(GRU(neurons, activation='tanh', input_shape=window_shape, return_sequences=stacked, **init_kwargs))
  model.add(Dropout(dropout_rate))
  if stacked:
    model.add(GRU(neurons, activation='tanh', return_sequences=False, **init_kwargs))
    model.add(Dropout(dropout_rate))

  model.add(Dense(units=dense_units, activation='relu', **init_kwargs))
  model.add(Dense(units=1, **init_kwargs))

  # Compile: Adam optimizer, MSE loss, MSE and RMSE reported as metrics
  compile_options = dict(
      optimizer='adam',
      loss='mean_squared_error',
      metrics=['MeanSquaredError', 'RootMeanSquaredError'],
      loss_weights=None,
      weighted_metrics=None,
      run_eagerly=None,
      steps_per_execution=None,
  )
  model.compile(**compile_options)

  # Fit: silent training; 20% of the training data is set aside for validation,
  # which is only run every 10th epoch
  fit_options = dict(
      batch_size=batch_size,
      epochs=epochs,
      verbose=0,
      callbacks=None,
      validation_split=0.2,
      shuffle=True,
      class_weight=None,
      sample_weight=None,
      initial_epoch=0,
      steps_per_epoch=None,
      validation_steps=None,
      validation_batch_size=None,
      validation_freq=10,
      max_queue_size=10,
      workers=1,
      use_multiprocessing=True,
  )
  model.fit(x_train_data, y_train_data, **fit_options)

  # Evaluate on the test data
  return model.evaluate(x_test_data, y_test_data, verbose=0)

## Simulations 1–5


In [None]:
#Create Grid Search
config = create_grid_search()
columns=['MSE', 'RMSE', 'Config']

def run_grid_simulation(sim_id, configs, gtrends, gvz, ofr, label=True, num_steps=30):
  """Run the whole grid search for one simulation and save its results.

  sim_id is the simulation number used in the log lines and in the CSV file name.
  configs is the list of hyper-parameter configs from create_grid_search.
  gtrends, gvz, ofr, label and num_steps are forwarded to create_datasets.

  Returns a DataFrame with one row per config and the columns MSE, RMSE and
  Config. The same table is written to
  /content/gdrive/MyDrive/grid_simulation<sim_id>.csv.
  """
  print("\n\n")
  print("------ Simulation %d Started ------" % sim_id)

  x_train, y_train, x_test, y_test = create_datasets(gtrends=gtrends,gvz=gvz,ofr=ofr,label=label,num_steps=num_steps)

  # Collect plain records and build the DataFrame once at the end; growing a
  # DataFrame with pd.concat on every iteration copies all earlier rows each time.
  records = []
  for cfg in configs:
    results = model_gen_fit(x_train, y_train, x_test, y_test, cfg)
    # model_gen_fit returns the evaluate() output: index 0 is the MSE loss, index 2 the RMSE
    records.append({'MSE': results[0], 'RMSE': results[2], 'Config': cfg})
    # Progress output: print the row that was just added, with its running index
    print(pd.DataFrame(records[-1:], index=[len(records) - 1], columns=columns))

  grid_results = pd.DataFrame(records, columns=columns)
  grid_results.to_csv('/content/gdrive/MyDrive/grid_simulation%d.csv' % sim_id, index=False)
  print("------ Simulation %d Completed ------" % sim_id)

  return grid_results

# Simulation 1 (no exogenous variable) is disabled in this run; uncomment to execute it.
# grid_results_sim1 = run_grid_simulation(1, config, gtrends=False, gvz=False, ofr=False)

# NOTE: the windowed arrays now live inside run_grid_simulation, so the former
# x_train_trans_simN / y_train_trans_simN globals are no longer created.

#Simulation 2 - Google Trends as the only exogenous variable
grid_results_sim2 = run_grid_simulation(2, config, gtrends=True, gvz=False, ofr=False)

#Simulation 3 - GVZ as the only exogenous variable
grid_results_sim3 = run_grid_simulation(3, config, gtrends=False, gvz=True, ofr=False)

#Simulation 4 - OFR FSI as the only exogenous variable
grid_results_sim4 = run_grid_simulation(4, config, gtrends=False, gvz=False, ofr=True)

#Simulation 5 - all three exogenous variables
grid_results_sim5 = run_grid_simulation(5, config, gtrends=True, gvz=True, ofr=True)

Total configs: 864



------ Simulation 2 Started ------
Dataset created (1430, 30, 9) (1430, 1) (336, 30, 9) (336, 1)
        MSE      RMSE                      Config
0  4.311199  2.076343  [64, 10, 0.15, 320, 10, 2]
        MSE     RMSE                      Config
1  2.406189  1.55119  [64, 10, 0.15, 320, 10, 1]
        MSE     RMSE                      Config
2  1.893267  1.37596  [64, 10, 0.15, 320, 15, 2]
        MSE     RMSE                      Config
3  0.043214  0.20788  [64, 10, 0.15, 320, 15, 1]
        MSE      RMSE                      Config
4  2.519587  1.587321  [64, 10, 0.15, 320, 20, 2]
        MSE      RMSE                      Config
5  0.897853  0.947551  [64, 10, 0.15, 320, 20, 1]
        MSE      RMSE                      Config
6  3.652198  1.911073  [64, 10, 0.15, 320, 30, 2]
        MSE      RMSE                      Config
7  1.060165  1.029643  [64, 10, 0.15, 320, 30, 1]
        MSE      RMSE                      Config
8  1.577866  1.256132  [64, 10, 0.15,

In [None]:
import sys
print(sys.version)

In [None]:
##### Step 6 - Use model to make predictions
#pred_scenario1_result = model.predict(x_test_simu1_trans)

In [None]:
# print("")
# print('-------------------- Model Summary --------------------')
# model.summary() # print model summary
# print("")
# #print('-------------------- Weights and Biases --------------------')
# #print("Too many parameters to print but you can use the code provided if needed")
# #print("")
# #for layer in model.layers:
# #    print(layer.name)
# #    for item in layer.get_weights():
# #        print("  ", item)
# #print("")

# # Print the last value in the evaluation metrics contained within history file
# print('-------------------- Evaluation on Training Data --------------------')
# for item in history.history:
#     print("Final", item, ":", history.history[item][-1])
# print("")

# # Evaluate the model on the test data using "evaluate"
# print('-------------------- Evaluation on Test Data --------------------')
# results = model.evaluate(x_test_simu1_trans, y_test_simu1_trans)
# print("")

In [None]:
# ####OUTPUT######
# #Invert Scaler
# y_scenario1_inv = scaler_y.inverse_transform(y_test_simu1_trans)
# result_pred_scenario1_inv = scaler_y.inverse_transform(pred_scenario1_result)

# #Plot Graph
# plt.plot(y_scenario1_inv, color = 'black', label = 'Bitcoin Price')
# plt.plot(result_pred_scenario1_inv, color = 'green', label = 'Predicted Bitcoin Price')
# plt.title('Scenario 1 - Bitcoin Price Prediction')
# plt.xlabel('Time')
# plt.ylabel('Bitcoin Price')
# plt.legend()
# plt.show()