In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import datetime
import math, time
import itertools
from sklearn import preprocessing
import datetime
from operator import itemgetter
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.models import load_model
import keras
import h5py
import requests
import os


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## About this notebook

I am using the available data to predict Brent oil prices. Predicting stochastic (randomly determined) data is incredibly hard, but it can be optimized up to a point using neural networks. While the accuracy is not very high, it could be improved further through a deeper understanding of the variables that drive the price in this dataset.
 
### Steps taken towards the prediction:
    1. Load the dataset and check its data
        1.1. Verify any possible missing data and use the average price to fill it.
        1.2. Visual inspection of the data using matplotlib
        1.3. Data quality check
    2. Build of technical indicators
        2.1. MA - Moving Average
        2.2. EMA - Exponential Moving Average
        2.3. MACD - Moving Average convergence-divergence
        2.4. Bollinger Bands
    3. Machine Learning
        3.1. Data Normalization
        3.2. Split of the data into training and testing
        3.3. Recurrent Neural Network Model & training
        3.4. Visual inspection of the data & predicted data using matplotlib
        3.5. Score
    4. Conclusion

### 1. Load the dataset and check its data

In [None]:
# Read the raw CSV, parse the Date column into datetimes and promote it to the index.
df = pd.read_csv('/kaggle/input/brent-oil-prices/BrentOilPrices.csv')
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')
df.head()

#### 1.1. Verify any possible missing data and use the average price to fill it.
Most machine learning models do not handle missing values very well, so it is common practice to check for them before going any further.

In [None]:
# Number of missing entries in each column.
df.isna().sum()

#### 1.2. Visual inspection of the data using matplotlib
Visually analyzing the data to check for outliers that might skew the predictions.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns # using seaborn because the charts are more visually pleasing

sns.set_theme(style="darkgrid")

# One wide axes for the whole price history; "Date" is the name of the frame's index.
fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df, x="Date", y="Price", ax=ax)
ax.set_title('Daily historical Brent Oil Prices available on the U.S. Energy Information Admin', fontsize=14)
ax.set_ylabel('Dollars per Barrel')
plt.show()

#### 1.3. Data quality check
Checking the dataset as a whole using summary statistics.

In [None]:
# Summary statistics of the frame, used here as a quick sanity check on the value ranges.
df.describe()

### 2. Build of technical indicators
These indicators are a set of mathematical calculations based on our oil price; they will hopefully help us predict where the price is going.

##### 2.1. MA - Moving Average
A moving average, often called *Simple Moving Average* or SMA, is an average computed over a fixed time period. The formula is as follows:$$MA=\frac{A_{1}+A_{2}+A_{3}+...+A_{n}}{n}$$ 
Where: $A_{i}$ = price in period *i* and *n* = number of periods

***
##### 2.2. EMA - Exponential Moving Average
This indicator is called the Exponential Moving Average; what sets it apart is that the most recent days carry a higher weight in the formula. It is often used to predict buy/sell signals in the stock market.
$$EMA_{Today} = \begin{pmatrix}Value_{Today} * \begin{pmatrix}\frac{smoothing} {1+Days}\end{pmatrix}\end{pmatrix} 
                +EMA_{Yesterday} * \begin{pmatrix}1-\begin{pmatrix}\frac{smoothing} {1+Days}\end{pmatrix}\end{pmatrix}
                $$For the smoothing factor we will set its default value: 2.
***
#### 2.3. MACD - Moving Average convergence-divergence 
The MACD is the difference between two EMAs, a short-period (fast) EMA and a long-period (slow) EMA:$$MACD = EMA_{1} - EMA_{2}$$
The usual period for EMA 1 is 12 and for EMA 2 is 26.
***
#### 2.4. Bollinger Bands
These bands are useful for gauging the volatility of a given item, allowing us to see if anything goes outside what is considered "normal behavior".
There are two Bollinger bands, one in the upper part of the chart, called the *upper band*, and one in the lower part, called the *lower band*:$$\text{Upper band} = MA_{21\,days}(x) + 2\,\sigma_{21\,days}(x)$$
$$\text{Lower band} = MA_{21\,days}(x) - 2\,\sigma_{21\,days}(x)$$ Where: *x* is the variable you are using as a predictor and $\sigma_{21\,days}(x)$ is its rolling standard deviation over the same 21-day window.

In [None]:
# Shared 21-day rolling window: it feeds both the 21-day moving average and the
# Bollinger Bands, so it is computed once and reused.
window=21
no_std = 2
rolling_mean = df.Price.rolling(window).mean()
rolling_std = df.Price.rolling(window).std()

# Simple moving averages over 7 and 21 days (the warm-up rows stay NaN).
df['ma7'] = df.Price.rolling(window=7).mean()
df['ma21'] = rolling_mean

# Exponential moving averages and MACD.
# adjust=False selects pandas' recursive form
#   EMA_t = alpha * x_t + (1 - alpha) * EMA_{t-1},  alpha = 2 / (span + 1),
# which is the formula stated in the notebook text; the pandas default
# (adjust=True) normalises the weights and gives different early values.
# fillna(0) is kept from the original; it has no effect unless Price has gaps.
df['ema12'] = df.Price.ewm(span=12, adjust=False).mean().fillna(0)
df['ema26'] = df.Price.ewm(span=26, adjust=False).mean().fillna(0)
df['macd'] = df.ema12 - df.ema26

# Bollinger Bands: rolling mean -/+ `no_std` rolling standard deviations.
# NOTE(review): the first window-1 rows are filled with 0 here, whereas ma7/ma21
# leave them as NaN -- confirm a zero band is really what downstream code wants.
df['bollinger_low'] = (rolling_mean - (rolling_std * no_std)).fillna(0)
df['bollinger_high'] = (rolling_mean + (rolling_std * no_std)).fillna(0)

# Fast EMA (com=0.5 is equivalent to alpha = 2/3), same recursive form as above.
df['ema'] = df.Price.ewm(com=0.5, adjust=False).mean()

# NOTE(review): this is just the price shifted down by a constant 1, not a
# price-vs-earlier-price momentum. The intended definition (e.g. a lagged
# difference or a return) is not recoverable from here, so it is left as-is.
df['momentum'] =  df.Price - 1

df.head()

### 3. Machine learning
Now the actual prediction using machine learning begins to take shape. As stated earlier, machine learning models do not handle missing values very well, but they also have a problem with outliers. To address this we will use a scaler.

### 3.1. Data Normalization
A scaler, in machine learning, is a tool that takes a range and maps all the values you pass to it into that range.
The code below handles that.

In [None]:
# Fit the scaler on the training portion only, so that minima/maxima from the
# test period do not leak into the values the model is trained on; then apply
# that same scaling to the whole series.
# The 0.7 fraction must match the train/test split performed in the next step.
price_values = df.Price.values.reshape(-1, 1)
scaler_fit_rows = int(len(price_values) * 0.7)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(price_values[:scaler_fit_rows])
# Rows after the fitting window may land slightly outside [0, 1]; that is expected.
dataset = min_max_scaler.transform(price_values)
dataset[0:10]

#### 3.2. Split of the data into training and testing
In order to test the effectiveness of our model we need to test it on data that we already have.
To do so, one usually uses a *train_test_split* approach; here, the first 70% of our data is used for training and the remaining 30% for testing.

In [None]:
# split into train and test sets
# Chronological split: the first 70% of the scaled rows train the model, the rest test it.
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size
train = dataset[:train_size]
test = dataset[train_size:]
f'Dataset size: {len(df)} >> Train length: {len(train)} || Test Length: {len(test)}'

The function below handles the transformation of the training and test data so we can load it into our model, since LSTMs expect matrices.

In [None]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=15):
    """Turn a single-column series into supervised (window, next value) pairs.

    Parameters
    ----------
    dataset : 2-D array-like
        The series to window; only column 0 is read.
    look_back : int, default 15
        Number of consecutive values that make up one input window.

    Returns
    -------
    (dataX, dataY) : tuple of numpy arrays
        ``dataX`` has shape ``(n, look_back)`` and ``dataY`` has shape ``(n,)``,
        where ``dataY[i]`` is the value immediately following window
        ``dataX[i]``. ``n = max(len(dataset) - look_back - 1, 0)``.

    Notes
    -----
    * ``dataX`` is always 2-D. When the input is too short to produce any
      window it has shape ``(0, look_back)`` rather than ``(0,)``, so callers
      that read ``dataX.shape[1]`` keep working.
    * The sample count stops one window short of the end of the series. It is
      kept that way on purpose: the plotting cell's index arithmetic
      (``len(dataset)-1``) assumes exactly this count.
    """
    values = np.asarray(dataset)[:, 0]
    n_samples = max(len(values) - look_back - 1, 0)

    # Row i of the index grid is [i, i+1, ..., i+look_back-1].
    window_starts = np.arange(n_samples)[:, np.newaxis]
    window_offsets = np.arange(look_back)[np.newaxis, :]
    dataX = values[window_starts + window_offsets]

    # Target for window i is the element right after it; copy so the result
    # does not alias the caller's array.
    dataY = values[look_back:look_back + n_samples].copy()
    return dataX, dataY

In [None]:
# Window each split independently, using the same 15-step look-back for both.
(x_train, y_train), (x_test, y_test) = (
    create_dataset(split, look_back=15) for split in (train, test)
)
f'X_train: {x_train.shape} || \
y_train: {y_train.shape} || \
X_test: {x_test.shape} || \
y_test: {y_test.shape}'

We can now check our reshaped dataset.

In [None]:
# Insert a length-1 middle axis: (samples, look_back) -> (samples, 1, look_back),
# which is the (1, look_back) per-sample shape the model is declared with.
n_train, train_width = x_train.shape[:2]
n_test, test_width = x_test.shape[:2]
x_train = x_train.reshape((n_train, 1, train_width))
x_test = x_test.reshape((n_test, 1, test_width))
f'X_train: {x_train.shape} || \
y_train: {y_train.shape} || \
X_test: {x_test.shape} || \
y_test: {y_test.shape}'

# LSTM

In [None]:
# create and fit the LSTM network
# Build the network in one go: a 20-unit LSTM followed by a single-unit Dense output,
# then train it on the windowed training data with MSE loss and the Adam optimizer.
look_back = 15
model = Sequential([
    LSTM(20, input_shape=(1, look_back)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, epochs=20, batch_size=1, verbose=2)

In [None]:
# Predict on both splits and map the network outputs back to the original price scale.
trainPredict = min_max_scaler.inverse_transform(model.predict(x_train))
testPredict = min_max_scaler.inverse_transform(model.predict(x_test))

# Undo the scaling on the targets as well; each is passed as a single row, so the
# true values end up in row 0 of the result.
trainY = min_max_scaler.inverse_transform([y_train])
testY = min_max_scaler.inverse_transform([y_test])

# Root mean squared error between true and predicted values, per split.
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
print('Train Score: %.2f RMSE' % (trainScore))
print('Test Score: %.2f RMSE' % (testScore))

In [None]:
# shift train predictions for plotting
trainPredictPlot = np.empty_like(dataset)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict

# shift test predictions for plotting
testPredictPlot = np.empty_like(dataset)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict

# plot baseline and predictions
plt.figure(figsize=(20,5))
plt.plot(trainPredictPlot, color='black', label='Train data')
plt.plot(testPredictPlot, color='blue', label='Prediction',)
plt.plot(min_max_scaler.inverse_transform(dataset),label='baseline', alpha=0.4, linewidth=5)
plt.title('Daily historical Brent Oil Prices available on the U.S. Energy Information Admin', fontsize=14)
plt.ylabel('Dollars per Barrel')
plt.legend()
plt.show()