# Gold Prices EDA and modelling
## Loading the data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    paths = (os.path.join(folder, name) for name in names)
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two gold-price tables (annual and monthly) from the Kaggle dataset.
annual = pd.read_csv('/kaggle/input/gold-prices/annual_csv.csv')
monthly = pd.read_csv('/kaggle/input/gold-prices/monthly_csv.csv')

In [None]:
# Preview the first rows of the annual table.
annual.head()

In [None]:
# Preview the first rows of the monthly table.
monthly.head()

In [None]:
# Summary statistics of the numeric columns in the annual table.
annual.describe()

In [None]:
# Summary statistics of the numeric columns in the monthly table.
monthly.describe()

## Data Cleaning

In [None]:
# Column dtypes and non-null counts of the annual table, before any cleaning.
annual.info()

In [None]:
# Column dtypes and non-null counts of the monthly table, before any cleaning.
monthly.info()

In [None]:
def set_ts_index(data, ts_column='Date'):
    """Turn ``ts_column`` into a datetime index of ``data`` and print its info.

    The frame is modified in place and also returned, so both
    ``set_ts_index(df)`` and ``df = set_ts_index(df)`` work.

    The function is safe to call again on a frame it has already processed
    (for example when a notebook cell is re-run): if ``ts_column`` is already
    the index, it is only (re)converted to datetime.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the timestamps either as a column or as its index.
    ts_column : str, default 'Date'
        Name of the timestamp column / index.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, indexed by the parsed timestamps.

    Raises
    ------
    KeyError
        If ``ts_column`` is neither a column nor the name of the index.
    """
    if ts_column in data.columns:
        data[ts_column] = pd.to_datetime(data[ts_column])
        data.set_index(ts_column, inplace=True)
    elif data.index.name == ts_column:
        # Already indexed by the timestamp column; just make sure it is datetime.
        data.index = pd.to_datetime(data.index)
    else:
        raise KeyError(ts_column)
    data.info()
    return data

In [None]:
# Parse the 'Date' column and use it as the index of the annual table
# (set_ts_index also prints the resulting frame info).
annual = set_ts_index(annual)

In [None]:
# Parse the 'Date' column and use it as the index of the monthly table
# (set_ts_index also prints the resulting frame info).
monthly = set_ts_index(monthly)

In [None]:
# Count missing values per column in the annual table.
annual.isna().sum()

In [None]:
# Count missing values per column in the monthly table.
monthly.isna().sum()

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use a larger default canvas for every figure, then apply seaborn's default theme.
plt.rcParams.update({'figure.figsize': (14, 8)})
sns.set()

### Plotting the time series

In [None]:
# 2x2 grid of line plots: the full annual series, the part up to 1975,
# the log of the full series and the log of the series from 1970 onwards.
fig, axes = plt.subplots(2, 2)
plots = [
    ('annual', annual),
    ('annual', annual[:'1975']),
    ('log of annual', np.log(annual)),
    ('log of annual', np.log(annual['1970':])),
]

# axes.flat walks the grid row by row, which matches the order of `plots`.
for ax, (title, data) in zip(axes.flat, plots):
    first_year = data.index.min().year
    last_year = data.index.max().year
    sns.lineplot(data=data, ax=ax)
    ax.set_title(
        "{} gold prices from \n {} to {}".format(
            title.capitalize(), first_year, last_year
        ),
        pad=-30,
    )
plt.show()

In [None]:
# 2x2 grid of line plots: the full monthly series, the part up to 1975,
# the log of the full series and the log of the series from 1970 onwards.
fig, axes = plt.subplots(2, 2)
plots = [
    ('monthly', monthly),
    ('monthly', monthly[:'1975']),
    ('log of monthly', np.log(monthly)),
    ('log of monthly', np.log(monthly['1970':])),
]

# axes.flat walks the grid row by row, which matches the order of `plots`.
for ax, (title, data) in zip(axes.flat, plots):
    first_year = data.index.min().year
    last_year = data.index.max().year
    sns.lineplot(data=data, ax=ax)
    ax.set_title(
        "{} gold prices from \n {} to {}".format(
            title.capitalize(), first_year, last_year
        ),
        pad=-30,
    )
plt.show()

Gold prices seem to be rising exponentially from 1970.

### Question

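Why is the price of gold from 1950 to 1968 constant?
This is due to the [Bretton Woods System](https://en.wikipedia.org/wiki/Bretton_Woods_system), under which the US dollar was convertible to gold at a fixed official price of 35 US dollars per troy ounce.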

### Visualizing the distribution of the data

In [None]:
# 2x2 grid of histograms: the full annual series, the part up to 1975,
# the log of the full series and the log of the series from 1970 onwards.
fig, axes = plt.subplots(2, 2)
plots = [
    ('annual', annual),
    ('annual', annual[:'1975']),
    ('log of annual', np.log(annual)),
    ('log of annual', np.log(annual['1970':])),
]

# axes.flat walks the grid row by row, which matches the order of `plots`.
for ax, (title, data) in zip(axes.flat, plots):
    first_year = data.index.min().year
    last_year = data.index.max().year
    sns.histplot(data, ax=ax)
    ax.set_title(
        "Histogram of {} gold prices from \n {} to {}".format(
            title.lower(), first_year, last_year
        ),
        pad=-30,
    )
plt.show()

In [None]:
# 2x2 grid of histograms: the full monthly series, the part up to 1975,
# the log of the full series and the log of the series from 1970 onwards.
fig, axes = plt.subplots(2, 2)
plots = [
    ('monthly', monthly),
    ('monthly', monthly[:'1975']),
    ('log of monthly', np.log(monthly)),
    ('log of monthly', np.log(monthly['1970':])),
]

# axes.flat walks the grid row by row, which matches the order of `plots`.
for ax, (title, data) in zip(axes.flat, plots):
    first_year = data.index.min().year
    last_year = data.index.max().year
    sns.histplot(data, ax=ax)
    ax.set_title(
        "Histogram of {} gold prices from \n {} to {}".format(
            title.lower(), first_year, last_year
        ),
        pad=-30,
    )
plt.show()

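The log of the prices from 1970 onwards looks roughly normally distributed (i.e. the prices themselves are approximately log-normal), although the distribution is left-skewed.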

### Time Series Analysis

In [None]:
# From here on only the data from 1970 onwards is used.
# The subsets are copied so later changes cannot touch the original tables.
annual_prices = annual.loc['1970':].copy()
monthly_prices = monthly.loc['1970':].copy()

# First differences (period-over-period price changes); the first row has no
# predecessor, so its NaN is dropped.
annual_price_changes = annual_prices.diff().dropna()
monthly_price_changes = monthly_prices.diff().dropna()

### Differencing

In [None]:
# Line plot of the year-over-year changes in the annual gold price.
fig, ax = plt.subplots()
sns.lineplot(data=annual_price_changes, ax=ax)
# Title fixed: was "Changes in anual gold price changes" (typo and duplicated
# word); it now mirrors the title used for the monthly plot.
ax.set_title("Changes in annual gold price from \n {} to {}".format(
    annual_price_changes.index.min().year, annual_price_changes.index.max().year
), pad=-30)
ax.set_ylabel("Change in gold price")
plt.show()

In [None]:
# Line plot of the month-over-month changes in the monthly gold price.
fig, ax = plt.subplots()
sns.lineplot(data=monthly_price_changes, ax=ax)
first_year = monthly_price_changes.index.min().year
last_year = monthly_price_changes.index.max().year
ax.set_title(
    "Changes in monthly gold price from \n {} to {}".format(first_year, last_year),
    pad=-30,
)
ax.set_ylabel("Change in gold price")
plt.show()

### Plotting the autocorrelation

In [None]:
# Autocorrelation of the annual price level at every lag.
ax = pd.plotting.autocorrelation_plot(annual_prices)
ax.set_title("Autocorrelation plot of annual gold prices")
plt.show()

In [None]:
# Autocorrelation of the year-over-year price changes at every lag.
ax = pd.plotting.autocorrelation_plot(annual_price_changes)
ax.set_title("Autocorrelation plot of changes in annual gold price")
plt.show()

The annual series appears to stay autocorrelated for lags of up to about 10 years. This suggests that this year's gold price depends on the prices of the previous 10 years, i.e. on last year's price and so on, back to 10 years ago.

In [None]:
# Autocorrelation of the monthly price level at every lag.
ax = pd.plotting.autocorrelation_plot(monthly_prices)
ax.set_title("Autocorrelation plot of monthly gold prices")
plt.show()

In [None]:
# Autocorrelation of the month-over-month price changes at every lag.
ax = pd.plotting.autocorrelation_plot(monthly_price_changes)
ax.set_title("Autocorrelation plot of changes in monthly gold price")
plt.show()

The monthly series appears to stay autocorrelated for lags of up to about 150 months. This suggests that this month's gold price depends on the prices of the previous 150 months, i.e. on last month's price and so on, back to 12.5 years ago.

### Time series decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Decompose the annual prices using a 10-observation period and plot the
# components. The printed line serves as the caption of the figure.
decomposition = seasonal_decompose(annual_prices, period=10)
decomposition.plot()
print("Time series decomposition of annual gold prices")
plt.show()

In [None]:
# Decompose the year-over-year price changes and plot the components.
# `period=10` is passed explicitly, matching the decomposition of the annual
# price level. Without it the period has to be inferred from the index, which
# either fails or (presumably, for yearly data) collapses to a period of 1.
seasonal_decompose(annual_price_changes, period=10).plot()
print("Time series decomposition of changes in annual gold prices")
plt.show()

In [None]:
# Decompose the monthly prices using a 150-observation period and plot the
# components. The printed line serves as the caption of the figure.
decomposition = seasonal_decompose(monthly_prices, period=150)
decomposition.plot()
print("Time series decomposition of monthly gold prices")
plt.show()

In [None]:
# Decompose the month-over-month price changes using a 150-observation period
# and plot the components. The printed line serves as the figure caption.
decomposition = seasonal_decompose(monthly_price_changes, period=150)
decomposition.plot()
print("Time series decomposition of changes in monthly gold prices")
plt.show()

The annual data seems to have a nice trend and some seasonality, just as we expected. The monthly data, however, is quite noisy and though it had good-looking results on the autocorrelation plot, the same doesn't translate to the time series decomposition.

## Modelling

In [None]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# utility functions
def time_series_split(data, split_point):
    """Split ``data`` chronologically into a leading and a trailing part.

    The leading part is ``data[:split_point]``; the trailing part is whatever
    follows it. The two parts never overlap and together cover ``data``.

    This matters for label-based split points: slicing a pandas object with a
    ``DatetimeIndex`` by a string such as ``'2010'`` is inclusive on both
    ends, so ``data[:'2010']`` and ``data['2010':]`` would both contain the
    2010 rows. Here those rows belong to the leading (training) part only.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or any sliceable sequence
        Observations in chronological order.
    split_point : int or str
        Position, or index label, at which to split.

    Returns
    -------
    tuple
        ``(head, tail)``.
    """
    head = data[:split_point]
    n_head = len(head)
    # Take the remainder by position so it starts right after the last row of
    # `head`, whatever kind of slicing produced `head`.
    tail = data.iloc[n_head:] if hasattr(data, 'iloc') else data[n_head:]
    return head, tail

def get_features_and_targets(data, target_col='Price'):
    """Separate the target column of ``data`` from everything else.

    The index of ``data`` is first turned back into a regular column, so it
    ends up among the features. ``data`` itself is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``target_col``.
    target_col : str, default 'Price'
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` holds the remaining columns with length-one
        dimensions squeezed away (a single remaining column comes back as a
        Series), ``y`` is the target column.
    """
    flat = data.reset_index().copy()
    target = flat.pop(target_col)
    return np.squeeze(flat), target

In [None]:
import time

def train_model(model, X, **fit_params):
    """Build ``model(X, **fit_params)``, fit it and report how long that took.

    Note that the keyword arguments go to the model constructor, not to
    ``fit()``, which is called without arguments.

    Parameters
    ----------
    model : callable
        Model class (or factory) whose instances expose ``fit()``.
    X : object
        Data handed to the model constructor as its first argument.
    **fit_params
        Extra keyword arguments for the model constructor.

    Returns
    -------
    object
        Whatever ``fit()`` returns.
    """
    started_at = time.perf_counter()
    fitted = model(X, **fit_params).fit()
    # The measured time covers both construction and fitting.
    fit_time = time.perf_counter() - started_at
    print(f'Fit time: {fit_time}s')
    return fitted

In [None]:
def _plot_actual_vs_predicted(model, X, y, ax):
    """Draw actual prices and the model's predictions over the span of X on ax.

    Returns the predictions so the caller can score them.
    """
    predicted = model.predict(start=X.min(), end=X.max())
    sns.lineplot(x=X, y=y, ax=ax)
    sns.lineplot(x=X, y=predicted.values, ax=ax)
    ax.set_title("Actual vs predicted gold price from \n {} to {}".format(
        X.min().year, X.max().year
    ), pad=-30)
    ax.set_ylabel("Gold price")
    ax.legend(["Actual", "Predicted"])
    return predicted


def plot_model_performance(model, X, y, X_test, y_test):
    """Plot actual vs predicted prices for the train and test periods.

    Two stacked panels are drawn: the training period on top and the test
    period below. After the figure is shown, the RMSE on the test period is
    printed. No train RMSE is reported.

    Parameters
    ----------
    model : fitted model
        Must expose ``predict(start=..., end=...)`` returning an object with
        a ``.values`` attribute (presumably a statsmodels results object).
    X, X_test : pandas.Series
        Timestamps of the train / test observations. Their ``min()`` and
        ``max()`` bound the prediction range and must expose ``.year``.
    y, y_test : array-like
        Actual prices for ``X`` / ``X_test``.
    """
    fig, axes = plt.subplots(2, 1, figsize=(14, 16))

    _plot_actual_vs_predicted(model, X, y, axes[0])
    y_pred = _plot_actual_vs_predicted(model, X_test, y_test, axes[1])
    plt.show()

    # The `squared=False` argument of mean_squared_error is deprecated and
    # removed in recent scikit-learn releases. Taking the square root of the
    # MSE gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('Test RMSE: {}'.format(rmse))

### Annual Gold Prices

In [None]:
# Split the annual prices (1970 onwards) at 2010 into train and test frames.
split_point = '2010'
Xa_train, Xa_test = time_series_split(annual_prices, split_point)

# Separate dates (X) from prices (y). Xa_train stays a frame because the models
# are fitted on it directly; Xa_test is rebound from the test frame to its dates.
Xa, ya = get_features_and_targets(Xa_train)
Xa_test, ya_test = get_features_and_targets(Xa_test)

In [None]:
# Fit an autoregressive model with 10 lags on the annual training frame
# (train_model prints the fit time), then plot its predictions against the
# actual prices for the train and test periods.
annual_ar_model = train_model(AutoReg, Xa_train, lags=10)

plot_model_performance(annual_ar_model, Xa, ya, Xa_test, ya_test)

The ARIMA model gives accurate predictions for the first 2 years.

In [None]:
# Fit an ARIMA model with order (10, 1, 0) on the annual training frame
# (train_model prints the fit time), then plot its predictions against the
# actual prices for the train and test periods.
annual_arima_model = train_model(ARIMA, Xa_train, order=(10,1,0))

plot_model_performance(annual_arima_model, Xa, ya, Xa_test, ya_test)

### Monthly Gold Prices

In [None]:
# Split the monthly prices (1970 onwards) into train and test frames, reusing
# the split_point defined for the annual data.
Xm_train, Xm_test = time_series_split(monthly_prices, split_point)

# Separate dates (X) from prices (y). Xm_train stays a frame because the models
# are fitted on it directly; Xm_test is rebound from the test frame to its dates.
Xm, ym = get_features_and_targets(Xm_train)
Xm_test, ym_test = get_features_and_targets(Xm_test)

In [None]:
# Fit an autoregressive model with 150 lags on the monthly training frame
# (train_model prints the fit time), then plot its predictions against the
# actual prices for the train and test periods.
monthly_ar_model = train_model(AutoReg, Xm_train, lags=150)

plot_model_performance(monthly_ar_model, Xm, ym, Xm_test, ym_test)

In [None]:
# Fit an ARIMA model with order (150, 1, 0) on the monthly training frame
# (train_model prints the fit time), then plot its predictions against the
# actual prices for the train and test periods.
# NOTE(review): 150 autoregressive terms presumably make this fit slow --
# confirm the runtime is acceptable.
monthly_arima_model = train_model(ARIMA, Xm_train, order=(150,1,0))

plot_model_performance(monthly_arima_model, Xm, ym, Xm_test, ym_test)

The ARIMA model gives accurate predictions for the first 12 months.

## Prediction
What will be the price of gold in 2020 and 2021?

In [None]:
# Refit the 10-lag autoregressive model on all annual prices from 1970 onwards
# (not only the training part); this rebinds annual_ar_model.
annual_ar_model = AutoReg(annual_prices, lags=10).fit()

# Ask the refitted model for predictions for 2020 through 2021.
ya_pred = annual_ar_model.predict(start='2020', end='2021')

In [None]:
# Put the predictions after the observed annual prices.
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# - Building the frame from a dict always stores the predictions under 'Price',
#   whatever name the prediction series carries.
annual_data = pd.concat([annual_prices, pd.DataFrame({'Price': ya_pred})])

# Show the end of the combined table so the predicted values are visible
# (the original cell displayed annual_prices, which has no predictions in it).
annual_data.tail(5)