In [None]:
import warnings
import numpy as np
import pandas as pd
import yfinance as yf
import plotly.offline as py
from keras.models import Sequential
from keras.backend import clear_session
from keras.callbacks import EarlyStopping
from keras_tuner.tuners import RandomSearch
from sklearn.preprocessing import RobustScaler
from utilities.evaluation import RegressionEvaluator
from utilities.time_series import create_lags_features, create_date_features
from keras.layers import SimpleRNN, LSTM, GRU, Dropout, Dense, Bidirectional, Conv1D, MaxPooling1D, Flatten, TimeDistributed

py.init_notebook_mode()
pd.options.plotting.backend = "plotly"
warnings.filterwarnings(action="ignore")
pd.set_option("float_format", "{:.2f}".format)

In [2]:
cop = yf.Ticker(ticker="COP=X").history(start="2016-01-01", end="2021-12-31")
cop.head(n=5).append(cop.tail(n=5))

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01,3138.0,3138.0,3136.7,3137.3,0,0,0
2016-01-04,3141.6,3223.0,3128.7,3154.8,0,0,0
2016-01-05,3215.0,3215.0,3153.7,3173.9,0,0,0
2016-01-06,3174.5,3256.0,3163.9,3254.0,0,0,0
2016-01-07,3163.8,3289.0,3163.8,3271.0,0,0,0
2021-12-27,3989.75,3996.25,3979.25,3989.75,0,0,0
2021-12-28,3995.08,4004.47,3956.04,3995.08,0,0,0
2021-12-29,4007.42,4024.5,4006.63,4007.42,0,0,0
2021-12-30,4036.12,4038.31,3990.58,4036.12,0,0,0
2021-12-31,4068.25,4069.83,4068.25,4068.25,0,0,0


In [3]:
idx = cop.query("Close < 100").index
cop.loc[idx, "Close"] = np.nan

In [4]:
date_range = pd.date_range(start="1/1/2016", end="12/31/2021")
cop.reset_index(inplace=True)
cop.columns = [col.lower() for col in cop.columns.tolist()]

data = pd.DataFrame(data={"date": date_range})
data = pd.merge(left=data, right=cop[["date", "close"]], on=["date"], how="left")
data.head().append(data.tail())

Unnamed: 0,date,close
0,2016-01-01,3137.3
1,2016-01-02,
2,2016-01-03,
3,2016-01-04,3154.8
4,2016-01-05,3173.9
2187,2021-12-27,3989.75
2188,2021-12-28,3995.08
2189,2021-12-29,4007.42
2190,2021-12-30,4036.12
2191,2021-12-31,4068.25


In [5]:
data.set_index(keys="date", inplace=True)

In [6]:
is_na = data.isna()

pd.DataFrame(data={"qty": is_na.sum(),
                   "perc": round(is_na.mean() * 100, 2)})

Unnamed: 0,qty,perc
close,628,28.65


In [7]:
data.interpolate(inplace=True)

data.plot(y="close")

In [8]:
data = create_lags_features(data=data, y="close", lags_min=1, lags_max=30)
data = create_date_features(data=data)

data.head(n=3).append(data.tail(n=3))

Unnamed: 0_level_0,close,close_1,close_2,close_3,close_4,close_5,close_6,close_7,close_8,close_9,...,close_55,close_56,close_57,close_58,close_59,close_60,month,day_of_week,is_month_start,is_month_end
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-03-01,3249.2,3297.6,3296.9,3296.2,3295.5,3297.7,3265.5,3324.0,3312.1,3304.53,...,3254.0,3173.9,3154.8,3148.97,3143.13,3137.3,3,1,True,False
2016-03-02,3205.0,3249.2,3297.6,3296.9,3296.2,3295.5,3297.7,3265.5,3324.0,3312.1,...,3271.0,3254.0,3173.9,3154.8,3148.97,3143.13,3,2,False,False
2016-03-03,3139.2,3205.0,3249.2,3297.6,3296.9,3296.2,3295.5,3297.7,3265.5,3324.0,...,3209.5,3271.0,3254.0,3173.9,3154.8,3148.97,3,3,False,False
2021-12-29,4007.42,3995.08,3989.75,3992.03,3994.3,3996.58,3998.24,4000.15,4004.75,4002.31,...,3830.35,3797.25,3758.25,3758.25,3764.75,3771.25,12,2,False,False
2021-12-30,4036.12,4007.42,3995.08,3989.75,3992.03,3994.3,3996.58,3998.24,4000.15,4004.75,...,3870.25,3830.35,3797.25,3758.25,3758.25,3764.75,12,3,False,False
2021-12-31,4068.25,4036.12,4007.42,3995.08,3989.75,3992.03,3994.3,3996.58,3998.24,4000.15,...,3870.42,3870.25,3830.35,3797.25,3758.25,3758.25,12,4,False,True


In [9]:
date_features = [ft for ft in data.columns if not "close" in ft]
numeric_features = [ft for ft in data.columns if ft not in date_features + ["close"]]

for dt in date_features:
    data[dt] = pd.Categorical(data[dt])

In [10]:
close = data.pop("close")

train, train_close = data.loc["2016-01-01": "2021-09-30", ], close.loc["2016-01-01": "2021-09-30"]
test, test_close = data.loc["2021-10-01":"2021-12-31", ], close.loc["2021-10-01":"2021-12-31"]

In [11]:
scaler = RobustScaler()

train.loc[:, numeric_features] = scaler.fit_transform(train[numeric_features])
test.loc[:, numeric_features] = scaler.transform(test[numeric_features])

train = pd.get_dummies(data=train, columns=date_features)
test = pd.get_dummies(data=test, columns=date_features)

In [12]:
train_data = train.values
test_data = test.values

train_data = train_data.reshape((train_data.shape[0], 1, train_data.shape[1]))
test_data = test_data.reshape((test_data.shape[0], 1, test_data.shape[1]))

In [13]:
early_stopping = EarlyStopping(monitor="val_mape", patience=10)

### SimpleRNN

In [14]:
clear_session()

rnn = Sequential()
rnn.add(layer=SimpleRNN(units=128, activation="selu", return_sequences=True, input_shape=(train_data.shape[1], train_data.shape[2])))
rnn.add(layer=Dropout(rate=0.25))
rnn.add(layer=SimpleRNN(units=64, activation="relu", return_sequences=True))
rnn.add(layer=Dropout(rate=0.25))
rnn.add(layer=SimpleRNN(units=32, activation="selu"))
rnn.add(layer=Dropout(rate=0.25))
rnn.add(layer=Dense(units=1, activation="linear"))

rnn.compile(optimizer="adam", loss="mse", metrics=["mse", "mape"])

rnn.fit(train_data, train_close,
        batch_size=32, epochs=1000, callbacks=[early_stopping], validation_split=0.3, verbose=0)

2022-10-20 10:39:00.451254: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<keras.callbacks.History at 0x137d71460>

### LSTM

In [15]:
clear_session()

lstm = Sequential()
lstm.add(layer=LSTM(units=128, activation="selu", return_sequences=True, input_shape=(train_data.shape[1], train_data.shape[2])))
lstm.add(layer=Dropout(rate=0.25))
lstm.add(layer=LSTM(units=64, activation="selu", return_sequences=True))
lstm.add(layer=Dropout(rate=0.25))
lstm.add(layer=LSTM(units=32, activation="selu"))
lstm.add(layer=Dropout(rate=0.25))
lstm.add(layer=Dense(units=1, activation="linear"))

lstm.compile(optimizer="adam", loss="mse", metrics=["mse", "mape"])

lstm.fit(train_data, train_close,
        batch_size=32, epochs=1000, callbacks=[early_stopping], validation_split=0.3, verbose=0)

<keras.callbacks.History at 0x137748f40>

### Bidirectional LSTM

In [16]:
early_stopping = EarlyStopping(monitor="val_mse", patience=10)

clear_session()

bidirectional_lstm = Sequential()
bidirectional_lstm.add(layer=Bidirectional(LSTM(units=128, activation="selu", return_sequences=True)))
bidirectional_lstm.add(layer=Dropout(rate=0.25))
bidirectional_lstm.add(layer=Bidirectional(LSTM(units=64, activation="selu", return_sequences=True)))
bidirectional_lstm.add(layer=Dropout(rate=0.25))
bidirectional_lstm.add(layer=Bidirectional(LSTM(units=32, activation="selu")))
bidirectional_lstm.add(layer=Dropout(rate=0.25))
bidirectional_lstm.add(layer=Dense(units=1, activation="linear"))

bidirectional_lstm.compile(optimizer="adam", loss="mse", metrics=["mse"])

bidirectional_lstm.fit(train_data, train_close,
                       batch_size=32, epochs=1000, callbacks=[early_stopping], validation_split=0.3, verbose=0)

<keras.callbacks.History at 0x138d0fdf0>

# Evaluation

In [17]:
models = [("RNN", rnn), ("LSTM", lstm), ("BidirectionalLSTM", bidirectional_lstm)]

for name, model in models:
    print(f"Las metricas para la arquitectura {name}:")
    evaluation = RegressionEvaluator(predicted=model.predict(test_data, verbose=0).ravel(),
                                     observed=test_close)
    evaluation.print_metrics()
    print(f"--" * 30)

Las metricas para la arquitectura RNN:
El RMSE es: 3099.9598356675474
El MAE es: 3090.3201718537703
El MAPE es: 0.7974164761860503
------------------------------------------------------------
Las metricas para la arquitectura LSTM:
El RMSE es: 462.9802474291146
El MAE es: 455.19275167713994
El MAPE es: 0.11699728175173373
------------------------------------------------------------
Las metricas para la arquitectura BidirectionalLSTM:
El RMSE es: 355.15623419393575
El MAE es: 345.8128768257473
El MAPE es: 0.0887840664156539
------------------------------------------------------------


In [18]:
train = pd.concat(objs=[pd.Series(model.predict(train_data, verbose=0).ravel(),
                                  name=f"{name}_prediction",
                                  index=train_close.index) for name, model in models], axis=1)

test = pd.concat(objs=[pd.Series(model.predict(test_data, verbose=0).ravel(),
                                 name=f"{name}_prediction",
                                 index=test_close.index) for name, model in models], axis=1)

for df, close in [(train, train_close), (test, test_close)]:
    df["close"] = close.values

In [19]:
test.plot()

In [20]:
pd.concat(objs=[train, test], axis=0).plot()