# Model for Predicting Gold Price

**Mean Absolute Percentage Error (MAPE)** of the model on the test data is about **3%**.

[Original dataset](https://www.kaggle.com/sid321axn/gold-price-prediction-dataset) in csv format contains 1718 rows and 80 columns in total, including stock indexes, currency exchange rates and stock prices for several precious metals. The goal is to predict future adjusted close price for the GOLD ETF.

Testing various sets of input features to predict the gold price showed that better accuracy is achieved when using only the prices of gold for previous periods and technical indicators based on these prices.

Functions for calculating technical indicators were copied (with modifications) from [this notebook](https://www.kaggle.com/sid321axn/gold-price-prediction-using-machine-learning). Instead of the sklearn regression models used by the author of the dataset in the original notebook, a **densely connected neural network** is applied to predict the next day's gold price.

In [None]:
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Plots display settings: default figure size and font size
# applied to every chart drawn in the notebook
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 14,
})

In [None]:
# Path to the source CSV file that is loaded with pd.read_csv()
FILE_PATH = '/kaggle/input/gold-price-prediction-dataset/FINAL_USO.csv'

In [None]:
# Name of the column to predict (column names are lower-cased after loading)
TARGET_COLUMN = 'adj close'

In [None]:
# Tensorflow settings
EPOCHS = 1000  # Upper bound on training epochs passed to model.fit()
PATIENCE = 5  # Patience value passed to the EarlyStopping callback
BATCH_SIZE = 64  # Batch size of the tf.data datasets

## Functions

In [None]:
def daily_returns(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Function adds day-over-day relative changes of selected parameter.
    The result is stored in 'df' as '<column>_returns'; the first row
    has no previous value and is therefore NaN.
    :param df: DataFrame with original values (modified in place)
    :param column: Name of the column with selected parameter
    :return: The same DataFrame with the returns column added
    """
    current = df[column]
    previous = current.shift(1)
    df[f'{column}_returns'] = current.div(previous).sub(1)
    return df


def calculate_macd(df: pd.DataFrame, column: str,
                   nslow: int = 26, nfast: int = 12,
                   nsignal: int = 9) -> pd.DataFrame:
    """Function computes moving average convergence-divergence (MACD)
    indicator for price values from selected 'column' in 'df'.
    Two columns are added to 'df' in place:
    'dif_<column>' - difference between the fast and the slow
    exponential moving averages (EMA);
    'macd_<column>' - EMA of that difference over 'nsignal' periods.
    Rows without enough history for the requested spans are NaN.
    :param df: DataFrame with original values (modified in place)
    :param column: Name of the column to use
    :param nslow: Larger time span
    :param nfast: Shorter time span
    :param nsignal: Time span used to smooth the difference
        (previously hard-coded to 9, which remains the default)
    :return: Updated DataFrame containing difference and MACD values
    """
    prices = df[column]

    # Difference between two exponential moving averages
    # to measure momentum in a security
    ema_slow = prices.ewm(
        span=nslow, min_periods=nslow, adjust=True, ignore_na=False
    ).mean()
    ema_fast = prices.ewm(
        span=nfast, min_periods=nfast, adjust=True, ignore_na=False
    ).mean()
    df[f'dif_{column}'] = ema_fast - ema_slow

    # Smoothed difference (9 periods by default)
    df[f'macd_{column}'] = df[f'dif_{column}'].ewm(
        span=nsignal, min_periods=nsignal, adjust=True, ignore_na=False
    ).mean()
    return df


def calculate_rsi(df: pd.DataFrame, column: str,
                  periods: int = 14) -> pd.DataFrame:
    """Function computes Relative Strength Index (RSI)
    for price values from selected 'column' in 'df'.
    Average gain and loss are smoothed exponentially with
    alpha = 1 / periods (Wilder's smoothing), so 'periods' is the
    effective look-back length. The result is stored in 'df'
    as 'rsi_<column>'; the first row is NaN because it has
    no previous price.
    :param df: DataFrame with original values (modified in place)
    :param column: Name of the column to use
    :param periods: Number of days
    :return: Updated DataFrame with RSI values
    :raises ValueError: If 'periods' is smaller than 1
    """
    if periods < 1:
        raise ValueError(f'periods must be >= 1, got {periods}')

    # Price difference with the previous day
    delta = df[column].diff()

    # Gain keeps only positive moves, loss keeps only negative ones
    gain = delta.clip(lower=0)
    loss = delta.clip(upper=0)

    # ewm(com=periods) would use alpha = 1 / (periods + 1), i.e. smooth
    # over one period more than requested, hence alpha is set explicitly.
    alpha = 1 / periods
    avg_gain = gain.ewm(alpha=alpha, adjust=False).mean()
    avg_loss = loss.ewm(alpha=alpha, adjust=False).mean().abs()

    # A zero average loss yields an infinite ratio and therefore RSI = 100
    df[f'rsi_{column}'] = 100 - 100 / (1 + avg_gain / avg_loss)
    return df


def calculate_sma(df: pd.DataFrame, column: str,
                  periods: int = 15) -> pd.Series:
    """Function returns Simple Moving Average (SMA) of 'column'.
    A value is produced only once a full window of 'periods'
    observations is available; earlier rows are NaN.
    :param df: DataFrame with original values (not modified)
    :param column: Name of the column to use
    :param periods: Number of days
    :return: Series with SMA values
    """
    window = df[column].rolling(
        window=periods,
        min_periods=periods,
        center=False,
    )
    return window.mean()


def calculate_bands(df: pd.DataFrame, column: str,
                    peroids: int = 15) -> pd.DataFrame:
    """Function calculates Bollinger Bands
    for price values from selected 'column' in 'df'.
    The bands are the simple moving average plus / minus two rolling
    standard deviations. Both statistics come from the same rolling
    window of 'peroids' observations, so the bands stay consistent
    for any window length. Two columns are added to 'df' in place:
    'upper_band_<column>' and 'lower_band_<column>'.
    :param df: DataFrame with original values (modified in place)
    :param column: Name of the column to use
    :param peroids: Number of days (the misspelled name is kept
        for backward compatibility with keyword callers)
    :return: Updated DataFrame containing upper and lower band values
    """
    # One shared window: previously the moving average always used
    # the default 15 periods regardless of 'peroids'.
    window = df[column].rolling(
        window=peroids, min_periods=peroids, center=False
    )
    sma = window.mean()
    std = window.std()

    df[f'upper_band_{column}'] = sma + (2 * std)
    df[f'lower_band_{column}'] = sma - (2 * std)
    return df


def plot_history(hist):
    """Function draws training and validation loss per epoch.
    :param hist: Tensorflow history object from model.fit();
        its 'history' dict must contain 'loss' and 'val_loss'
    """
    # Loss values recorded after every epoch
    train_loss = hist.history['loss']
    val_loss = hist.history['val_loss']

    # Epoch numbers (starting from 1) for the x axis
    epochs = range(1, len(train_loss) + 1)

    series = (
        (train_loss, 'bo', 'Training'),
        (val_loss, 'ro', 'Validation'),
    )
    for values, marker, label in series:
        plt.plot(epochs, values, marker, label=label)

    plt.title('Training and validation MSE')
    plt.xlabel('Epochs')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.tight_layout()
    plt.show()

## Data processing

In [None]:
# Original data, indexed by the parsed 'Date' column.
# 'infer_datetime_format' is not passed: it is deprecated in pandas 2.x
# and parse_dates=True already parses the index.
data = pd.read_csv(FILE_PATH,
                   index_col='Date',
                   parse_dates=True)
data.columns = data.columns.str.lower()

# Select only the features related to gold prices.
# .copy() makes the selection an independent DataFrame, so adding
# indicator columns later does not trigger SettingWithCopyWarning.
data = data[['open', 'high', 'low', 'close', 'adj close']].copy()
data.head()

In [None]:
# Show the (rows, columns) shape of the selected price data
print(f'Dataset size: {data.shape}')

In [None]:
# Price history
plt.plot(data[TARGET_COLUMN])
plt.title('Gold Price')
plt.show()

From the end of 2011 till the beginning of 2016 the gold price was mostly in a downward trend. In 2016-2018 the gold price was fluctuating in a range from approximately 105 to 130. Using this time series to train a model could cause an issue: most of the available data, which will be used for training, does not reflect the price trends observed in the latest periods. However, when predicting the price just one step ahead it could be sufficient.

In [None]:
# Calculate daily returns for gold adjusted close price
data = daily_returns(data, TARGET_COLUMN)

In [None]:
# Add technical indicators for adjusted gold price
data = calculate_rsi(data, TARGET_COLUMN)
data = calculate_bands(data, TARGET_COLUMN)
data = calculate_macd(data, TARGET_COLUMN)
data.tail()

In [None]:
# As a result of calculating rolling indicators and shifting
# DataFrame has some missing values at the head and tail.
data.dropna(inplace=True)

In [None]:
# Create iterables containing input features
# and corresponding next day's prices
input_features = data.iloc[:-1, :].values
targets = data.iloc[1:, :][TARGET_COLUMN].values.reshape(-1, 1)

In [None]:
# Scale down predicted values for better model convergence
scaler = MinMaxScaler()
targets = scaler.fit_transform(targets)

In [None]:
# Leave latest periods of the time series for test and validation purposes
train_data = input_features[:-120]
val_data = input_features[-120:-50]
test_data = input_features[-50:]

train_targets = targets[:-120]
val_targets = targets[-120:-50]
test_targets = targets[-50:]

print(f'Train data: {train_data.shape}')
print(f'Validation data: {val_data.shape}')
print(f'Test data: {test_data.shape}')

# Total number of input features
n_features = train_data.shape[1]

In [None]:
# Create tensorflow dataset objects
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_data, train_targets))\
    .shuffle(buffer_size=len(train_data))\
    .batch(BATCH_SIZE)

val_ds = tf.data.Dataset.from_tensor_slices(
    (val_data, val_targets)).batch(BATCH_SIZE)

test_ds = tf.data.Dataset.from_tensor_slices(
    (test_data, test_targets)).batch(BATCH_SIZE)

## Create and train a model

In [None]:
# Normalization layer to scale numeric data
normalizer = tf.keras.layers.experimental.preprocessing.Normalization(
    input_shape=(n_features,)
)
normalizer.adapt(train_data)

In [None]:
# Densely connected neural network
model = tf.keras.models.Sequential(
    [
        normalizer,
        tf.keras.layers.Dense(
            64, activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.001)),
        tf.keras.layers.Dense(
            32, activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.001)),
        tf.keras.layers.Dense(1)
    ]
)

model.compile(optimizer='adam', loss='mse',
              metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
model.summary()

In [None]:
# Train the model until validation accuracy stops improving
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=PATIENCE, restore_best_weights=True
)

history = model.fit(train_ds,
                    epochs=EPOCHS,
                    verbose=2,
                    validation_data=val_ds,
                    callbacks=[early_stop])

In [None]:
# Chart of training and validation loss per epoch
plot_history(history)

In [None]:
# Evaluate the model on the test set
test_loss, test_mape = model.evaluate(test_ds)
print(f'MSE loss on test data: {test_loss}\nMAPE: {test_mape}')

In [None]:
# Forecasts for validation and test periods
pred_val = model.predict(val_ds)
pred_val = scaler.inverse_transform(pred_val)
pred_test = model.predict(test_ds)
pred_test = scaler.inverse_transform(pred_test)

# Visualize forecast vs. actual prices
plt.plot(data[-150:][TARGET_COLUMN], label='Actual data')
plt.plot(data[-120:-50].index, pred_val.ravel(), label='Validation forecast')
plt.plot(data[-50:].index, pred_test.ravel(), label='Test forecast')
plt.title('Gold Price')
plt.legend()
plt.show()