In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GroupKFold, GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import BayesianRidge,LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, OrthogonalMatchingPursuit
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.neighbors import KNeighborsRegressor, KernelDensity, KDTree
from sklearn.metrics import *

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import sys, os
import random 

# Silence every warning unless the interpreter was started with explicit
# -W options (sys.warnoptions is empty when none were given).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from IPython import display, utils

# Raise pandas' display limits for rows, columns and column width.
for option_name, option_value in (
    ('display.max_rows', 300),
    ('display.max_columns', 300),
    ('max_colwidth', 400),
):
    pd.set_option(option_name, option_value)


def set_seed(seed=4242):
    """Seed the RNGs used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once at import time with the default value.
set_seed()

In [None]:
!pip install quandl

In [None]:
import quandl
import warnings
import itertools
import numpy as np
import pandas as pd


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import PowerTransformer

# Seaborn defaults: 'whitegrid' axes style and the 'talk' plotting context.
sns.set_style('whitegrid')
sns.set_context('talk')


from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels import tsa
from scipy import stats


#from arima_utils import ad_fuller_test, plot_rolling_stats
#from arima_utils import plot_acf_pacf, arima_gridsearch_cv

In [None]:
import os

import quandl

# Never commit API tokens to a notebook: read the key from the QUANDL_API_KEY
# environment variable instead. The token that used to be hard-coded here has
# been exposed and should be revoked.
quandl_api_key = os.environ.get("QUANDL_API_KEY")
if quandl_api_key:
    quandl.ApiConfig.api_key = quandl_api_key

# Download the WGC/GOLD_DAILY_USD dataset.
gold_df = quandl.get("WGC/GOLD_DAILY_USD")

In [None]:
# Turn the index of gold_df into an ordinary column and preview the result.
data = gold_df.reset_index(drop=False)
data.head(n=10)

## **Time Series Analysis**

In [None]:
# Line plot of the Value column of the downloaded gold data.
plt.style.use('seaborn')
gold_df['Value'].plot(color='darkcyan', figsize=(15, 6))
plt.show()

In [None]:
from pylab import rcParams
# Default figure size (inches) and line colour requested for the plot below.
# NOTE(review): sns.set() and plt.style.use('bmh') run afterwards and may
# override some of these rcParams -- confirm the rendered size/colour.
rcParams['figure.figsize'] = 17,15
rcParams['lines.color'] = 'teal'

# Plain NumPy array of the Value column (no date index attached).
series = gold_df.Value.values
# Additive decomposition using a period of 120 observations.
result = seasonal_decompose(series, model='additive', period=120)
sns.set()

plt.style.use('bmh')
# Draw the components of the decomposition.
result.plot()

plt.show()

In [None]:
plt.style.use('fivethirtyeight')
plt.figure(figsize=(9, 6))
# Histogram + KDE of the gold price with a fitted normal curve overlaid.
sns.distplot(data.Value , bins=50, kde=True, hist=True, fit=norm, color = 'darkcyan');

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(data.Value)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# Raw string: '\m' and '\s' are invalid escape sequences in a normal string
# literal; the resulting text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
# The data is the gold price, not passenger counts.
plt.title('Gold price distribution')

# Get also the QQ-plot
fig = plt.figure(figsize=(9, 6))
res = stats.probplot(data.Value, plot=plt)
plt.show()

### Effects of power transform on time series

In [None]:
plt.style.use('fivethirtyeight')

# Box-Cox transform of the Value column (not standardized).
pt = PowerTransformer(method='box-cox', standardize=False)
ptTargetbc = pd.DataFrame(pt.fit_transform(data.Value.values.reshape(-1, 1)))

# Yeo-Johnson transform of the same column (standardized).
pt2 = PowerTransformer(method='yeo-johnson', standardize=True)
ptTargetyc = pd.DataFrame(pt2.fit_transform(data.Value.values.reshape(-1, 1)))

# For each transform: a histogram on one figure, then a probability plot on
# the next figure, shown together.
figure_number = 1
for plot_title, transformed, hist_color in (
    ('Box-Cox', ptTargetbc, 'darkcyan'),
    ('yeo-johnson', ptTargetyc, 'darkgreen'),
):
    plt.figure(figure_number, figsize=(8, 4))
    plt.title(plot_title)
    sns.distplot(transformed, kde=False, bins=30, color=hist_color)
    plt.figure(figure_number + 1, figsize=(8, 4))
    res = stats.probplot(transformed.values.ravel(), plot=plt)
    plt.show()
    figure_number += 2

In [None]:
# Scatter of each observation against the one a single step earlier.
sns.set()
plt.style.use('seaborn')
pd.plotting.lag_plot(data.Value, lag=1)

In [None]:
# Same lag plot, this time against the observation two steps earlier.
sns.set()
plt.style.use('seaborn')
pd.plotting.lag_plot(data.Value, lag=2)

In [None]:
# Autocorrelation of the Value column across all lags.
plt.style.use('seaborn-poster')
pd.plotting.autocorrelation_plot(data['Value'])

### Rolling Plot

In [None]:
# Value column with its 120-observation rolling mean overlaid in black.
plt.figure(figsize=(20, 12))
price = data['Value']
price.plot(color='darkorange', lw=3)
price.rolling(window=120).mean().plot(color='k', lw=2)

### upsampling

In [None]:
import os

import quandl

# Never commit API tokens to a notebook: read the key from the QUANDL_API_KEY
# environment variable instead. The token that used to be hard-coded here has
# been exposed and should be revoked.
quandl_api_key = os.environ.get("QUANDL_API_KEY")
if quandl_api_key:
    quandl.ApiConfig.api_key = quandl_api_key

# Fresh download of WGC/GOLD_DAILY_USD; `data` is rebound to this frame as
# returned by quandl (no reset_index applied).
data = quandl.get("WGC/GOLD_DAILY_USD")
data

In [None]:
# Preview the first 20 rows of the re-downloaded frame.
data.head(20)

In [None]:
# Preview the last 20 rows of the re-downloaded frame.
data.tail(20)

In [None]:
# Resample onto a calendar-day grid; days without an observation come out
# as NaN rows.
daily_bins = data.resample('D')
upsampled = daily_bins.mean()
upsampled.head(n=10)

In [None]:
# (rows, columns) before resampling.
data.shape

In [None]:
# (rows, columns) after resampling to daily frequency.
upsampled.shape

>## Interpolation
>### You can use the `interpolate` function to fill the NaN rows created by the resampling above, using methods such as:
>‘linear’: Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes.

>‘time’: Works on daily and higher resolution data to interpolate given length of interval.

>‘index’, ‘values’: use the actual numerical values of the index.

>‘pad’: Fill in NaNs using existing values.

>‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’: Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. Both ‘polynomial’ and ‘spline’ require that you also specify an order (int), e.g. df.interpolate(method='polynomial', order=5).

>‘krogh’, ‘piecewise_polynomial’, ‘spline’, ‘pchip’, ‘akima’, ‘cubicspline’: Wrappers around the SciPy interpolation methods of similar names. See Notes.

>‘from_derivatives’: Refers to scipy.interpolate.BPoly.from_derivatives which replaces ‘piecewise_polynomial’ interpolation method in scipy 0.18.
>#### https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.interpolate.html

In [None]:
plt.style.use('fivethirtyeight')

# Fill the NaN rows of the resampled frame by linear interpolation.
lin_interpolated = upsampled.interpolate(method='linear')
print(lin_interpolated.head(32))

lin_interpolated.plot(color='teal')
plt.show()

#### *‘polynomial’ and ‘spline’ require that you also specify an order (int), e.g. df.interpolate(method='polynomial', order=5)*

In [None]:
plt.style.use('seaborn-poster')

# Fill the NaN rows of the resampled frame with an order-5 polynomial.
pol_interpolated = upsampled.interpolate(method='polynomial', order=5)
print(pol_interpolated.head(32))

pol_interpolated.plot(color='darkred')
plt.show()

In [None]:
plt.style.use('seaborn-poster')

# Fill the NaN rows of the resampled frame with an order-5 spline.
# NOTE(review): the result is stored under the name `pol_interpolated`, which
# replaces the polynomial result computed in the previous cell.
pol_interpolated = upsampled.interpolate(method='spline', order=5)
print(pol_interpolated.head(32))

pol_interpolated.plot()
plt.show()

In [None]:
# Display `series`, the array taken from gold_df.Value.values in the
# seasonal-decomposition cell.
series

# **Box-Jenkins method with ARIMA**

>The Box-Jenkins methodology consists of a wide range of statistical models that are widely used to model
time series for forecasting. In this section we concentrate on one such model, called ARIMA.
ARIMA stands for Auto Regressive Integrated Moving Average. Let’s look at the basics and constituents
of this model and then build on that understanding to forecast gold prices.

>- Auto Regressive or AR Modeling: A simple linear regression model where the current
observation is regressed upon one or more prior observations. The dependency on prior values is denoted by p, the order of the AR model.
>- Moving Average or MA Modeling: Again essentially a linear regression model, one that
models the impact of noise/error from prior observations on the current one. The number of prior error terms used is denoted by q, the order of the MA model.


>### *The AR and MA models were known long before the Box-Jenkins methodology was presented. Yet this methodology presented a classic approach to identify and apply these models for forecasting.*

In [None]:
import itertools
import numpy as np
import pandas as pd


import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.model_selection import TimeSeriesSplit

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Seaborn defaults: 'whitegrid' axes style and the 'talk' plotting context.
sns.set_style('whitegrid')
sns.set_context('talk')

import warnings

def fxn():
    """Emit one DeprecationWarning whose message is "deprecated"."""
    warnings.warn("deprecated", category=DeprecationWarning)


# Call fxn() with all warnings ignored; catch_warnings() restores the previous
# filter configuration when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

In [None]:
from matplotlib.pyplot import figure

def plot_rolling_stats(ts, window=24):
    """Plot a series together with its rolling mean and rolling std.

    Parameters
    ----------
    ts : pandas.Series
        Series to plot; it must provide ``.rolling``.
    window : int, default 24
        Number of observations in each rolling window. The default keeps the
        value that used to be hard-coded.

    Returns
    -------
    None
        The figure is displayed with ``plt.show(block=False)``.
    """
    plt.figure(num=None, figsize=(18, 7), dpi=80, linewidth=5)
    rolling_mean = ts.rolling(window=window, center=False).mean()
    rolling_std = ts.rolling(window=window, center=False).std()

    # Plot the original series and both rolling statistics on one axes.
    plt.plot(ts, color='c', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')

    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

>Stationarity is one of the key assumptions behind the ARIMA models we will be
discussing next. Stationarity refers to the property where, for a time series, its mean,
variance, and autocorrelation are time invariant. In other words, mean, variance,
and autocorrelation do not change with time.

>There are statistical tests that help us understand whether a given series is stationary
or not. The **Augmented Dickey Fuller test** begins with a null hypothesis of the series being
non-stationary.

>If the test statistic of the AD Fuller test is less than the critical value(s), we reject the null hypothesis of non-stationarity. The AD Fuller test is available as part of the statsmodels library. Since it is quite evident that our original series of gold prices is non-stationary, we will perform a log transformation and see if we are able to obtain stationarity.

In [None]:
def ad_fuller_test(ts):
    """Run the Augmented Dickey-Fuller test on ``ts`` and print the result.

    Parameters
    ----------
    ts : array-like
        Series passed to ``adfuller`` (lag order chosen with ``autolag='AIC'``).

    Returns
    -------
    None
        The summary (test statistic, p-value, lags used, number of
        observations and one row per critical value) is printed once.
    """
    dftest = adfuller(ts, autolag='AIC')

    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value

    # Print after the loop: the print used to sit inside it, so the summary
    # was emitted once per critical value, the first times incomplete.
    print(dfoutput)


The ARIMA model is a logical progression and combination of the two models: if we combine AR
and MA with a differenced series, what we get is called an ARIMA(p,d,q) model,
where

    • p is the order of Autoregression

    • q is the order of Moving average

    • d is the order of differencing

Thus, for a stationary time series, ARIMA models combine autoregressive and moving average concepts
to model the behavior of a long-running time series and help in forecasting. Let’s now apply these concepts
to model gold price forecasting.

In [None]:
def auto_arima(param_max=1, series=None, verbose=True):
    """Grid-search ARIMA(p, d, q) orders and keep the fit with the lowest AIC.

    Parameters
    ----------
    param_max : int, default 1
        p, d and q each range over ``0..param_max`` inclusive.
    series : pandas.Series, optional
        Series to fit. Defaults to an empty float Series (created per call
        instead of once at definition time).
    verbose : bool, default True
        Print the AIC of every fitted model and the best model at the end.

    Returns
    -------
    (dict, list of dict)
        ``best_model`` with keys ``'aic'``, ``'params'`` and ``'model_obj'``
        (an empty dict when no order could be fitted), and the list of all
        successfully fitted models in the same format.
    """
    if series is None:
        series = pd.Series(dtype=float)

    # Define the p, d and q parameters to take any value
    # between 0 and param_max
    p = d = q = range(0, param_max + 1)
    print('p=', p)
    print('d=', d)
    print('q=', q)
    # Every (p, d, q) combination
    pdq = list(itertools.product(p, d, q))

    model_resuls = []
    best_model = {}
    # Start from +inf so any finite AIC is accepted, however large.
    min_aic = float('inf')
    for param in pdq:
        try:
            mod = sm.tsa.ARIMA(series, order=param)

            results = mod.fit()

            if verbose:
                print('ARIMA{}- AIC:{}'.format(param, results.aic))
            model_resuls.append({'aic': results.aic,
                                 'params': param,
                                 'model_obj': results})
            if min_aic > results.aic:
                best_model = {'aic': results.aic,
                              'params': param,
                              'model_obj': results}
                min_aic = results.aic
        except Exception as ex:
            # Best effort: an order that cannot be fitted is reported and skipped.
            print(ex)
    if verbose:
        if best_model:
            print("Best Model params:{} AIC:{}".format(best_model['params'],
                  best_model['aic']))
        else:
            # Avoid a KeyError on the empty dict when every fit failed.
            print("No ARIMA model could be fitted")

    return best_model, model_resuls


def arima_gridsearch_cv(series, cv_splits=2, verbose=True, show_plots=True):
    """Run the ARIMA order search on each fold of a time-series split.

    Parameters
    ----------
    series : pandas.Series
        Series to model; it is sliced positionally with ``.iloc``.
    cv_splits : int, default 2
        Number of splits given to ``TimeSeriesSplit``.
    verbose : bool, default True
        Print the fold indices and best-model summary; also forwarded to
        ``auto_arima`` so the per-model output follows the same switch.
    show_plots : bool, default True
        Draw residual, KDE and forecast plots for the best model of each fold.

    Returns
    -------
    dict
        ``'cv_split_index'`` (list of ``{'train', 'test'}`` index dicts),
        ``'all_models'`` and ``'best_models'`` (one entry per fold, as
        returned by ``auto_arima``).
    """
    # prepare train-test split object
    tscv = TimeSeriesSplit(n_splits=cv_splits)

    splits = []
    best_models = []
    all_models = []

    # loop through each CV split
    for i, (train_index, test_index) in enumerate(tscv.split(series), start=1):
        print("*"*20)
        print("Iteration {} of {}".format(i, cv_splits))

        # print train and test indices
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        splits.append({'train': train_index, 'test': test_index})

        # split train and test sets
        train_series = series.iloc[train_index]
        test_series = series.iloc[test_index]

        print("Train shape:{}, Test shape:{}".format(train_series.shape,
              test_series.shape))

        # perform auto arima, honouring the caller's verbosity
        _best_model, _all_models = auto_arima(series=train_series, verbose=verbose)
        best_models.append(_best_model)
        all_models.append(_all_models)

        # Nothing to summarise or plot when no order could be fitted; keep the
        # (empty) entry so the result lists stay aligned with the splits.
        if not _best_model:
            print("No ARIMA model could be fitted for this split; "
                  "skipping summary and plots")
            continue

        results = _best_model['model_obj']

        # display summary for best fitting model
        if verbose:
            print(results.summary())

        if show_plots:
            # show residual plots
            residuals = pd.DataFrame(results.resid)
            residuals.plot(figsize=(14, 6))
            plt.title('Residual Plot')
            plt.show()
            residuals.plot(kind='kde', figsize=(14, 6))
            plt.title('KDE Plot')
            plt.show()
            print(residuals.describe())

            # show forecast plot
            # NOTE(review): plot_predict on the results object depends on the
            # installed statsmodels version -- confirm it is still available.
            fig, ax = plt.subplots(figsize=(18, 4))
            fig.autofmt_xdate()
            ax = train_series.plot(ax=ax)
            test_series.plot(ax=ax)
            fig = results.plot_predict(test_series.index.min(),
                                       test_series.index.max(),
                                       dynamic=True, ax=ax,
                                       plot_insample=False)
            plt.title('Forecast Plot ')
            plt.legend()
            plt.show()
    return {'cv_split_index': splits,
            'all_models': all_models,
            'best_models': best_models}
    

In [None]:
if __name__ == '__main__':

    import os

    import quandl

    # Never commit API tokens to a notebook: read the key from the
    # QUANDL_API_KEY environment variable instead. The token that used to be
    # hard-coded here has been exposed and should be revoked.
    quandl_api_key = os.environ.get("QUANDL_API_KEY")
    if quandl_api_key:
        quandl.ApiConfig.api_key = quandl_api_key
    gold_df = quandl.get("WGC/GOLD_DAILY_USD")

    # Reindex onto a gap-free daily calendar and forward-fill the days that
    # have no observation (.ffill() replaces the deprecated
    # fillna(method='ffill')).
    full_range = pd.date_range(gold_df.index.min(),
                               gold_df.index.max(),
                               freq='D')
    new_df = gold_df.reindex(full_range).ffill()
    print(new_df.shape)
    gold_df.plot(figsize=(15, 6))
    plt.show()

    # log series
    log_series = np.log(new_df.Value)

    ad_fuller_test(log_series)
    plot_rolling_stats(log_series)

    # First difference of the log series (log_series - log_series.shift()),
    # with the leading NaN dropped, to make it stationary
    log_series_shift = log_series.diff().dropna()

    ad_fuller_test(log_series_shift)
    plot_rolling_stats(log_series_shift)

    new_df['log_series'] = log_series
    # Aligned on the index, so the first row of this column is NaN.
    new_df['log_series_shift'] = log_series_shift
    print(new_df.head())
    # cross validate
    results_dict = arima_gridsearch_cv(new_df.log_series, cv_splits=5)

>In this case, we generated forecasts for time periods for which we already had data. This helps us in
visualizing and understanding how the model is performing. This is also called **back testing**. Out-of-sample
forecasting is also supported by statsmodels through its forecast() method. Also, the plots
showcase values in the transformed scale, i.e. the log scale. An inverse transformation can easily be applied to get
the data back in its original form.
You should also note that commodity prices are impacted by a whole lot of other factors, such as global
demand and economic conditions like recessions. Hence, what we showcased here was in certain ways
a naïve modeling of a complex process. We would need more features and attributes to produce more sophisticated
forecasts.