# Exploration of historical monthly gold prices

## Table of Contents
* [Explore price development](#1)
* [Autocorrelations](#2)
* [Most extreme movements](#3)
* [Evaluate annual returns for each year](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt

In [None]:
# read the monthly gold price series
df = pd.read_csv('../input/gold-prices/monthly_csv.csv')

# derived columns: log price, its month-over-month difference,
# and the plain relative price change
log_price = np.log(df['Price'])
df['logPrice'] = log_price
df['logChange'] = log_price.diff()
df['percChange'] = df['Price'].pct_change()

# the year is the first four characters of the Date string
df['Year'] = df['Date'].str.slice(0, 4).astype(int)

# display the enriched frame
df

In [None]:
# quick look at the complete price history
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df['Date'], df['Price'], marker='.')
# one tick per month would be unreadable, so cap the number of x ticks
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
ax.grid()
plt.show()

#### We see that not much happened before the 1970s. This was due to the Bretton Woods system, which tied currencies, including the USD, to gold. In August 1971 the US ended the convertibility of the USD into gold, which meant the end of the Bretton Woods system. For more details, see: https://en.wikipedia.org/wiki/Bretton_Woods_system

#### In this analysis we will therefore use only the data from 1971 onwards.

In [None]:
# use only years from 1971 on:
# select on the Year column instead of a fixed row offset, so the cut does not
# depend on which month the file starts with, and re-running this cell does
# not drop further rows
df = df[df.Year >= 1971].copy().reset_index(drop=True)
# preview
df.head()

<a id='1'></a>
# Explore price development

In [None]:
# price history for the data frame as it stands at this point
fig, ax = plt.subplots(figsize=(16, 6))
dates, prices = df['Date'], df['Price']
ax.plot(dates, prices, marker='.')
# limit the x axis to at most 20 major ticks to keep labels legible
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
ax.grid()
plt.show()

In [None]:
# summary statistics of the numeric columns (one observation per month)
monthly_stats = df.describe()
monthly_stats

In [None]:
# month-over-month relative changes, one dot per month
fig, ax = plt.subplots(figsize=(16, 6))
ax.scatter(df['Date'], df['percChange'], marker='.')
# limit the x axis to at most 20 major ticks to keep labels legible
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
ax.set_title('Relative changes [monthly]')
ax.grid()
plt.show()

In [None]:
# histogram of the monthly relative changes (50 bins)
plt.figure(figsize=(12, 4))
df['percChange'].plot.hist(bins=50, color='gold')
plt.title('Relative changes [monthly]')
plt.grid()
plt.show()

In [None]:
# histogram of the monthly log changes (50 bins)
plt.figure(figsize=(12, 4))
df['logChange'].plot.hist(bins=50, color='gold')
plt.title('Log changes [monthly]')
plt.grid()
plt.show()

<a id='2'></a>
# Autocorrelations

In [None]:
# look for autocorrelations
# plt.acorr applies no detrending by default, so the series is centred on its
# mean first; without this the non-zero average monthly change biases the bar
# at every lag and the plot is not a proper autocorrelation
centred_change = df.percChange - df.percChange.mean()
plt.acorr(centred_change, maxlags=20)
plt.title('Autocorrelations of percChange')
plt.grid()
plt.show()

#### For a 1-month lag there seems to be some correlation. Let's dive deeper into that:

In [None]:
# Pearson correlation between each month's relative change and the
# relative change of the month before it (lag 1)
previous_month = df['percChange'].shift(1)
df['percChange'].corr(previous_month, method='pearson')

In [None]:
# same lag-1 comparison, but with Spearman rank correlation as a cross-check
previous_month = df['percChange'].shift(1)
df['percChange'].corr(previous_month, method='spearman')

In [None]:
# scatter each month's relative change against the previous month's
lagged = df['percChange'].shift(1).to_numpy()
current = df['percChange'].to_numpy()

# shifting by one leaves no predecessor for the first row, so skip it
lagged_valid = lagged[1:]
current_valid = current[1:]

plt.figure(figsize=(6, 6))
plt.scatter(lagged_valid, current_valid, alpha=0.5)
# least-squares straight line through the valid pairs
slope, intercept = np.polyfit(lagged_valid, current_valid, 1)
plt.plot(lagged, slope * lagged + intercept, c='magenta')
plt.title('Autocorrelation - Lag = 1 Month')
plt.xlabel('Month')
plt.ylabel('Month + 1')
plt.grid()
plt.show()

#### Try the same with lag 2 (months):

In [None]:
# Pearson correlation between each month's relative change and the
# relative change two months earlier (lag 2)
two_months_back = df['percChange'].shift(2)
df['percChange'].corr(two_months_back, method='pearson')

In [None]:
# visualize: each month's relative change against the change two months earlier
xx = np.asarray(df.percChange.shift(2))
yy = np.asarray(df.percChange)

plt.figure(figsize=(6,6))
plt.scatter(xx[2:],yy[2:], alpha=0.5) # leave out first two points (prev = NA)
# add regression line (fitted on the same pairs that are plotted)
mm,bb = np.polyfit(xx[2:],yy[2:],1)
plt.plot(xx, mm*xx + bb, c='magenta')
plt.title('Autocorrelation - Lag = 2 Month')
plt.xlabel('Month')
# the y axis is two months after the x axis here, not one
plt.ylabel('Month + 2')
plt.grid()
plt.show()

<a id='3'></a>
# Most extreme movements

In [None]:
# months whose relative change exceeds the threshold in either direction
threshold_x = 0.15
is_extreme = df['percChange'].abs() > threshold_x
df_ext = df.loc[is_extreme]
df_ext

<a id='4'></a>
# Evaluate annual returns for each year

In [None]:
# log changes are additive, so the per-year sum of the monthly log changes
# is the log change over that year
df_annual_returns = df.groupby('Year')[['logChange']].sum()
# convert the yearly log change to a simple return: exp(logR) - 1
yearly_log_change = df_annual_returns['logChange']
df_annual_returns['Return'] = np.exp(yearly_log_change) - 1
df_annual_returns

In [None]:
# bar chart with one bar per year of simple annual return
plt.figure(figsize=(16, 4))
df_annual_returns['Return'].plot.bar()
plt.title('Annual returns')
plt.grid()
plt.show()

#### Please note that 2020 is not a complete year here!

#### Let's look at the most successful year, 1979, in detail:

In [None]:
# all of 1979, plus December 1978 as the "offset" month before it
in_1979 = df['Year'] == 1979
is_dec_1978 = df['Date'] == '1978-12'
df[in_1979 | is_dec_1978]