In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use(['bmh'])
import seaborn as sns
import scipy.stats as ss
import statsmodels.tsa.api as smt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from sklearn.metrics import mean_squared_error
from math import sqrt

Import and show a few rows of the data.

In [None]:
# Raw GDP table, read with pandas' default parsing; the last line previews it.
GDP_CSV = "../input/finland-gdp-and-business-and-consumer-surveys/data/GDP_FINLAND.csv"
gdp = pd.read_csv(GDP_CSV)
gdp.head()

In [None]:
# BCS indicator table: the first column becomes the index and is parsed as dates.
BCS_CSV = "../input/finland-gdp-and-business-and-consumer-surveys/data/BCS_FINLAND.csv"
bcs = pd.read_csv(BCS_CSV, index_col=0, parse_dates=True)
bcs.head()

Drop the redundant columns, convert the GDP time period to the same quarter-end date format as the BCS indicator, and set it as the index of the table.

In [None]:
# Remove the descriptor columns that are not used in the analysis.
gdp = gdp.drop(columns=['geo', 'unit', 's_adj', 'na_item'])

# Map every period to the (midnight-normalised) last day of its quarter so the
# GDP index can be joined with the quarterly BCS series later on.
quarter_end = (
    pd.PeriodIndex(pd.to_datetime(gdp.time), freq='Q')
    .to_timestamp(how='end')
    .normalize()
)
gdp = gdp.assign(time=quarter_end).set_index('time')

Print the number of missing values in each dataset.

In [None]:
# Per-column count of missing values in each table.
gdp_missing = gdp.isnull().sum()
bcs_missing = bcs.isnull().sum()
print(f"Missing value in GDP: \n{gdp_missing}\n\nMissing value in BCS indicator: \n{bcs_missing}")

Resample the monthly BCS indicator to quarterly frequency by taking the mean of each quarter.

In [None]:
# Average the indicator within each quarter (bins labelled by quarter end).
bcs_quarterly_mean = bcs.loc[:, 'INDUFITOTCOFBSM'].resample('Q').mean()

# Data exploration

Plot 2 series

In [None]:
figsize = (12, 5)

# Figure 1: GDP with missing values removed.
ax_gdp = gdp['value'].dropna().plot(figsize=figsize, label='Quarterly GDP')
ax_gdp.grid(True)
ax_gdp.set(title="Finland Quarterly GDP", ylabel="GDP")

# Figure 2: the raw indicator overlaid with its quarterly mean.
start, end = '1985', '2020'
bcs_monthly = bcs.loc[start:end, 'INDUFITOTCOFBSM']
bcs_quarterly = bcs_quarterly_mean.loc[start:end]

fig, ax_bcs = plt.subplots(figsize=figsize)
ax_bcs.plot(bcs_monthly, marker='.', linestyle='-', linewidth=0.5, label='Monthly indicator')
ax_bcs.plot(bcs_quarterly, marker='o', markersize=8, linestyle='-', label='Quarterly indicator')
ax_bcs.set(ylabel='BCS indicator', title="Finland BCS indicator")
ax_bcs.legend()

plt.show()

GDP seems to show a trend, while the BCS indicator seems to be stationary.

In [None]:
lags = 30

# One row of panels per series: ACF on the left, PACF on the right.
correlogram_inputs = [
    ('GDP', gdp.value.dropna()),
    ('BCS Indicator', bcs_quarterly_mean.dropna()),
]

fig, ax = plt.subplots(2, 2, figsize=(12, 7))
for row, (series_name, series) in enumerate(correlogram_inputs):
    smt.graphics.plot_acf(series, lags=lags, ax=ax[row, 0])
    ax[row, 0].set_title(f'{series_name} ACF')
    smt.graphics.plot_pacf(series, lags=lags, ax=ax[row, 1])
    ax[row, 1].set_title(f'{series_name} PACF')

Define a function that runs the Augmented Dickey-Fuller (ADF) test with different regression specifications: 'ct' means the model contains a constant and a trend, 'c' contains only a constant, and 'nc' drops both the constant and the trend.

In [None]:
def adf_test(timeseries, regressions=('ct', 'c', 'nc')):
    """Print Augmented Dickey-Fuller results for several regression specifications.

    For every entry in ``regressions`` the test is run with ``autolag='AIC'``
    and the test statistic and p-value are printed. Nothing is returned.

    Parameters
    ----------
    timeseries : array-like
        Observations handed unchanged to ``adfuller``.
    regressions : iterable of str, optional
        Specifications to run, in order: 'ct' (constant and trend),
        'c' (constant only), 'nc' or 'n' (neither).

    Raises
    ------
    ValueError
        Propagated from ``adfuller`` when no accepted spelling of the
        requested specification succeeds.

    Notes
    -----
    statsmodels renamed the "no constant, no trend" option from 'nc' to 'n',
    and which spelling is accepted depends on the installed release. Both are
    tried so the function runs on either side of that change; the printed
    label stays exactly what the caller asked for.
    """
    # Spellings of the "no deterministic terms" option, current name first.
    no_trend_spellings = ('n', 'nc')

    for regression in regressions:
        print('\nResults of Dickey-Fuller test with regression\t' + str(regression))

        if regression in no_trend_spellings:
            candidates = no_trend_spellings
        else:
            candidates = (regression,)

        dftest = None
        for candidate in candidates:
            try:
                dftest = adfuller(timeseries, autolag='AIC', regression=candidate)
            except ValueError:
                # Only give up once every spelling has been rejected.
                if candidate == candidates[-1]:
                    raise
            else:
                break

        # adfuller returns (statistic, p-value, ...); only the first two are shown.
        dfoutput = pd.Series(dftest[0:2], index=['Test Statistic', 'p-value'])
        print(dfoutput)
        
# Run the ADF report on both series in levels, ignoring missing observations.
for level_series in (gdp.value, bcs_quarterly_mean):
    adf_test(level_series.dropna())

The ADF tests show that **GDP is a random walk**, which can become stationary after differencing, and that the quarterly **BCS indicator is stationary**. Let's check whether GDP is stationary in first differences.

In [None]:
# Check stationarity of the GDP on first difference
# First difference of GDP: each quarter minus the previous one.
gdp_diff = gdp.diff()
gdp_diff_observed = gdp_diff.dropna()

# Stationarity check on the differenced series.
adf_test(gdp_diff.value.dropna())

# Layout: the series across the top row, ACF and PACF side by side below it.
fig = plt.figure(figsize=(12, 7))
grid_shape = (2, 2)
ts_ax = plt.subplot2grid(grid_shape, (0, 0), colspan=2)
acf_ax = plt.subplot2grid(grid_shape, (1, 0))
pacf_ax = plt.subplot2grid(grid_shape, (1, 1))

gdp_diff_observed.plot(ax=ts_ax)
ts_ax.set_title('First difference GDP')
smt.graphics.plot_acf(gdp_diff_observed, lags=lags, ax=acf_ax)
smt.graphics.plot_pacf(gdp_diff_observed, lags=lags, ax=pacf_ax)
plt.tight_layout()

The first-differenced GDP is stationary, and these plots seem to suggest a first-order autoregressive term and a first-order moving-average term. Thus, in the following section I first apply an ARIMAX model, evaluated with **walk-forward validation**.

In [None]:
# Combine GDP with the quarterly indicator, keep only quarters where both are
# observed, and give the GDP column a descriptive name.
data = (
    gdp.join(bcs_quarterly_mean, how='outer')
    .dropna()
    .rename(columns={'value': 'gdp'})
)
data.head()

The loop below illustrates how I split the data into train and test sets using the walk-forward validation method. The minimum number of training observations is set to 80% of the total observations.

In [None]:
n_train = 87 # 80%
n_records = len(data)

# Expanding window: every step trains on all rows before `split_at` and
# validates on the single row that follows.
for split_at in range(n_train, n_records):
    train = data[:split_at]
    valid = data[split_at:split_at + 1]
    print(f'train={len(train)}, valid={len(valid)}')

# ARIMAX

After testing *p* = 1, 2, 3, *d* = 1 and *q* = 0, 1, 2, 3, the model that minimises the root mean squared error (RMSE) is ARIMAX(1, 1, 1).

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

preds_arimax = []

for i in range(n_train, n_records):
    # Expanding window: fit on every row before position i, forecast row i.
    train, valid = data.iloc[:i], data.iloc[i:i + 1]
    arimax = SARIMAX(train.gdp, exog=train.INDUFITOTCOFBSM, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0))
    arimax_fit = arimax.fit(disp=False)
    # forecast() produces the out-of-sample step for `valid` and needs that
    # row's exogenous value (as a steps x 1 frame). Calling predict(1, ...)
    # without `end` stops at the last training observation, so its final value
    # is an in-sample fit for the previous quarter rather than a forecast.
    yhat_arimax = arimax_fit.forecast(steps=len(valid), exog=valid[['INDUFITOTCOFBSM']])
    preds_arimax.extend(np.asarray(yhat_arimax, dtype=float))

# Label every forecast with the date of the row it targets. `idx` is reused by
# later cells to label their own predictions.
idx = data.index[n_train:n_records]
preds_arimax = pd.Series(preds_arimax, index=idx)

In [None]:
# Evaluation window shared with the RMSE cell that follows.
start, end = '2014-12-31', '2020-03-31'
actual_gdp = data.gdp[start:end].sort_index()
arimax_forecast = preds_arimax[start:end].sort_index()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(actual_gdp, marker='.', linestyle='-', linewidth=0.5, label='Actual value')
ax.plot(arimax_forecast, marker='o', markersize=6, linestyle='-', label='Predictions by SARIMAX')
ax.set_title("Finland quarterly GDP")
ax.legend()

In [None]:
# Root mean squared error of the ARIMAX predictions over the evaluation window.
actual_window = data.gdp[start:end]
mse_arimax = mean_squared_error(actual_window, preds_arimax)
rmse_arimax = sqrt(mse_arimax)
print(f'RMSE: {rmse_arimax:.3f}')

# XGBoost

For the XGBoost model, I use the first lag of GDP and the BCS indicator as covariates, so that the performance of the models can be compared.

In [None]:
# Add the previous quarter's GDP as a feature. The lag is taken on the full
# GDP table and aligned to `data` by date; rows without a lagged value are
# dropped. assign() overwrites `gdp_lag1` when it already exists, so re-running
# this cell no longer appends a second, duplicate `gdp_lag1` column.
data = data.assign(gdp_lag1=gdp['value'].shift(1)).dropna()

X = ["INDUFITOTCOFBSM", "gdp_lag1"]

In [None]:
from xgboost import XGBRegressor

preds_xgb = []

# Trailing training quarters held back to decide when boosting should stop.
n_stop = 8

for i in range(n_train, n_records):
    train, valid = data.iloc[:i], data.iloc[i:i + 1]
    fit_part, stop_part = train.iloc[:-n_stop], train.iloc[-n_stop:]

    # Pass 1: pick the number of boosting rounds from past data only. Early
    # stopping must never see `valid`: it is the observation being forecast,
    # and monitoring it tunes the model on its own test target.
    # NOTE(review): recent xgboost releases take `early_stopping_rounds` in the
    # constructor rather than in fit() - confirm against the installed version.
    probe = XGBRegressor(n_estimators=1000, learning_rate=0.05)
    probe.fit(fit_part[X], fit_part.gdp,
              eval_set=[(stop_part[X], stop_part.gdp)],
              early_stopping_rounds=10,
              verbose=False)
    n_rounds = probe.best_iteration + 1  # best_iteration is zero-based

    # Pass 2: refit on the whole training window with that many rounds so the
    # most recent quarters are not wasted, then forecast the held-out row.
    xgb = XGBRegressor(n_estimators=n_rounds, learning_rate=0.05)
    xgb.fit(train[X], train.gdp, verbose=False)
    preds_xgb.extend(xgb.predict(valid[X]))

# `idx` holds the forecast dates defined in the ARIMAX section.
preds_xgb = pd.Series(preds_xgb, index=idx)

In [None]:
# Evaluation window shared with the RMSE cell that follows.
start, end = '2014-12-31', '2020-03-31'
actual_gdp = data.gdp[start:end].sort_index()
xgb_forecast = preds_xgb[start:end].sort_index()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(actual_gdp, marker='.', linestyle='-', linewidth=0.5, label='Actual value')
ax.plot(xgb_forecast, marker='o', markersize=6, linestyle='-', label='Predictions by XGBoost')
ax.set_title("Finland quarterly GDP")
ax.legend()

In [None]:
# Root mean squared error of the XGBoost predictions over the evaluation window.
actual_window = data.gdp[start:end]
mse_xgb = mean_squared_error(actual_window, preds_xgb)
rmse_xgb = sqrt(mse_xgb)
print(f'RMSE: {rmse_xgb:.3f}')

The RMSE of the XGBoost model is much lower, but we still observe a big gap between the predictions and the actual values. So I further apply a neural network model to see if it performs better.

# Neural network model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf

preds_nn = []

# Seed NumPy and TensorFlow so that repeated runs are comparable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Trailing training quarters held back to drive early stopping.
n_stop = 8

for i in range(n_train, n_records):
    train, valid = data.iloc[:i], data.iloc[i:i + 1]
    # Early stopping (with restore_best_weights) must never see `valid`: it is
    # the observation being forecast, and monitoring it would select the epoch
    # that best matches the test target. Use the most recent training quarters
    # for that instead; the network is fitted on the quarters before them.
    fit_part, stop_part = train.iloc[:-n_stop], train.iloc[-n_stop:]

    input_shape = [train[X].shape[1]]

    # A fresh network is built for every step of the walk-forward loop.
    nn = keras.Sequential([
        layers.Dense(80, activation='relu', input_shape=input_shape),
        layers.Dropout(0.3),
        layers.Dense(60, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='relu')])

    nn.compile(
        optimizer='adam',
        loss='mse',
        metrics=[keras.metrics.RootMeanSquaredError()])

    early_stopping = keras.callbacks.EarlyStopping(
        patience=10,
        min_delta=0.001,
        restore_best_weights=True)

    history = nn.fit(
        fit_part[X], fit_part.gdp,
        validation_data=(stop_part[X], stop_part.gdp),
        epochs=200,
        callbacks=[early_stopping],
        verbose=0)

    # One forecast per step; verbose=0 keeps the loop from printing a
    # progress bar on every iteration.
    yhat_nn = nn.predict(valid[X], verbose=0)
    preds_nn.extend(yhat_nn)

# One row per forecast, labelled with the forecast dates in `idx` (defined in
# the ARIMAX section).
preds_nn = pd.DataFrame(preds_nn)
preds_nn = preds_nn.set_index(idx)

In [None]:
# Evaluation window shared with the RMSE cell that follows.
start, end = '2014-12-31', '2020-03-31'
actual_gdp = data.gdp[start:end]
nn_forecast = preds_nn[start:end]

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(actual_gdp, marker='.', linestyle='-', linewidth=0.5, label='Actual value')
ax.plot(nn_forecast, marker='o', markersize=6, linestyle='-', label='Predictions by neural network')
ax.set_title("Finland quarterly GDP")
ax.legend()

In [None]:
# Root mean squared error of the neural-network predictions over the evaluation window.
actual_window = data.gdp[start:end]
mse_nn = mean_squared_error(actual_window, preds_nn)
rmse_nn = sqrt(mse_nn)
print(f'RMSE: {rmse_nn:.3f}')

As we can see, the RMSE decreases again: the neural network model performs best among these models.