# Imports

In [81]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

from statsmodels.stats.stattools import jarque_bera
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

# Question 1

(1 point) Import the time series data. 

Create a time series of quarterly growth rates of GDP by taking
the first differences of the log-levels and multiply by 100. 

Denote this series by ∆yt. Are the quarterly
growth rates normally distributed?

In [82]:
# import GDPEA.csv
df = pd.read_csv('GDPEA.csv')

In [83]:
df

Unnamed: 0,DATE,GDP
0,1995-01-01,1835998.0
1,1995-04-01,1846613.1
2,1995-07-01,1853116.3
3,1995-10-01,1859777.7
4,1996-01-01,1862148.5
...,...,...
108,2022-01-01,2716755.9
109,2022-04-01,2741228.7
110,2022-07-01,2751656.0
111,2022-10-01,2750179.8


In [84]:
# take first difference of the log-levels and multiply by 100 of the column 'GDP'
df['GDP_growth'] = 100 * np.log(df['GDP']).diff()

# print the first 5 rows of the data frame
df.head()

Unnamed: 0,DATE,GDP,GDP_growth
0,1995-01-01,1835998.0,
1,1995-04-01,1846613.1,0.5765
2,1995-07-01,1853116.3,0.35155
3,1995-10-01,1859777.7,0.358826
4,1996-01-01,1862148.5,0.127396


In [109]:
def plot_hist_and_jb_test(df, col_name, title):
    fig = px.histogram(df, x=col_name, nbins=50, title=f'Histogram of {title}')
    fig.show()

    jb_test = jarque_bera(df[col_name].dropna())
    print('The test statistic is: ', jb_test[0])
    print('The p-value is: ', jb_test[1])
    print('Skewness is: ', jb_test[2])
    print('Kurtosis is: ', jb_test[3])

    if jb_test[1] < 0.05:
        return 'The growth rates are not normally distributed.'
    else:
        return 'The growth rates are normally distributed.'

In [86]:
plot_hist_and_jb_test(df, 'GDP_growth', 'GDP growth rates')

The test statistic is:  6567.305750040752
The p-value is:  0.0
Skewness is:  -1.0642473408443531
Kurtosis is:  40.45329510163537


'The growth rates are not normally distributed.'

# Question 2

(2 points) Repeat question (1.), but using the sample Q1 1995 to Q3 2008 and (b) Q1 1995 to Q4 2019.

What are the most striking differences compared to (1.)? What does this tell you about the observations
in Q4 2008, Q1 2009, and in 2020? 

Also plot the time series over the full sample to argue about those
observations.

In [87]:
# Create sample from 1995-01-01 till 2008-07-01 and another sample from 1995-01-01 till 2019-10-01
df_sample1 = df[(df['DATE'] >= '1995-01-01') & (df['DATE'] <= '2008-07-01')]
df_sample2 = df[(df['DATE'] >= '1995-01-01') & (df['DATE'] <= '2019-10-01')]

In [110]:
plot_hist_and_jb_test(df_sample1, 'GDP_growth', 'GDP growth rates from 1995-01-01 till 2008-07-01')

The test statistic is:  0.779508310613518
The p-value is:  0.677223345799987
Skewness is:  -0.28563410963352354
Kurtosis is:  3.141777148038425


'The growth rates are normally distributed.'

In [108]:
plot_hist_and_jb_test(df_sample2, 'GDP_growth', 'GDP growth rates from 1995-01-01 till 2019-10-01')

The test statistic is:  1045.3700625623092
The p-value is:  1.0017864222482736e-227
Skewness is:  -2.937079164188037
Kurtosis is:  17.795854466803217


'The growth rates are not normally distributed.'

In [90]:
# plot the obeservation of 2008-10-01, 2009-01-01, 2020-01-01, 2020-04-01, 2020-07-01, 2020-10-01
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['DATE'], y=df['GDP_growth'], mode='lines', name='GDP growth'))

obs_dates = ['2008-10-01', '2009-01-01', '2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01']
obs_values = df.loc[df['DATE'].isin(obs_dates), 'GDP_growth']

fig.add_trace(go.Scatter(x=obs_dates,
                         y=obs_values,
                         mode='markers',
                         name='Outliers',
                         text=obs_dates,
                         textposition='top center'))

fig.update_layout(title='GDP growth rates with outliers of 2008-10-01, 2009-01-01, 2020-01-01, 2020-04-01, 2020-07-01, 2020-10-01',
                    xaxis_title='Date',
                    yaxis_title='GDP growth rates')

fig.show()

# Question 3

(3 points) Estimate an AR(p) model for ∆yt selecting the lag length based on AIC (with a maximal
lag length of 8). 

Evaluate the model by inspecting the properties of the residuals by applying tests for
normality and no-autocorrelation. 

What kind of patterns do you observe in the residuals around Q1 2020
to Q4 2020?

In [91]:
diff_y = df['GDP_growth'].values[1:]

In [92]:
def find_optimal_lag(y, max_lag=8):
    """
    Finds the optimal lag length using AIC and plots the AIC values.
    
    Parameters:
    y (array-like): The time series data.
    max_lag (int): The maximum lag length to consider.
    
    Returns:
    int: The optimal lag length.
    """
    aic_values = []
    for p in range(1, max_lag+1):
        model = ARIMA(endog=y, order=(p, 0, 0))
        model_fit = model.fit()
        aic_values.append(model_fit.aic)

    print('AIC values for lag length 1 to {}: {}'.format(max_lag, aic_values))

    # Plot the AIC values
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(1, max_lag+1)), y=aic_values, mode='lines+markers'))
    fig.update_layout(title='AIC values for lag length 1 to {}'.format(max_lag),
                        xaxis_title='Lag length',
                        yaxis_title='AIC values')
    # make the minimum value red
    fig.add_trace(go.Scatter(x=[aic_values.index(min(aic_values))+1], y=[min(aic_values)], mode='markers', marker_color='red'))
    fig.show()
    
    return aic_values.index(min(aic_values)) + 1

In [93]:
best_lag = find_optimal_lag(diff_y)
print('The optimal lag length is: ', best_lag)

AIC values for lag length 1 to 8: [441.7112290925104, 441.8546445573909, 443.670416484652, 445.52167024476245, 446.808919215419, 448.08363418405656, 449.9634665877808, 451.5846148050765]


The optimal lag length is:  1


In [94]:
def plot_residuals(y, best_lag):
    """
    Plots the residuals of an ARIMA model with a specified lag.
    
    Parameters:
    y (array-like): The time series data.
    best_lag (int): The optimal lag length.
    """
    model = ARIMA(endog=y, order=(best_lag, 0, 0))
    model_fit = model.fit()
    residuals = model_fit.resid

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['DATE'].values[best_lag:], y=residuals, mode='lines+markers'))
    fig.update_layout(title='Residuals of the AR({}) model'.format(best_lag),
                        xaxis_title='Date',
                        yaxis_title='Residuals')
    fig.show()

    return residuals

In [95]:
residuals = plot_residuals(diff_y, best_lag)

Note, comment about the outliers of 2020-Q1 till 2020-Q4

In [98]:
# plot the histogram of the residuals
plot_hist_and_jb_test(pd.DataFrame(residuals), 0, 'residuals of the AR(1) model')

The test statistic is:  7076.422018358543
The p-value is:  0.0
Skewness is:  -3.540914520668044
Kurtosis is:  41.29130246179157


'The growth rates are not normally distributed.'

In [99]:
dw = durbin_watson(residuals)
print('The Durbin-Watson test statistic is: ', dw)
# Interpret the test statistic
if dw < 1.5:
    print('Positive autocorrelation')
elif dw > 2.5:
    print('Negative autocorrelation')
else:
    print('No autocorrelation')

The Durbin-Watson test statistic is:  2.0624496738369174
No autocorrelation


In general, if d is less than 1.5 or greater than 2.5 then there is potentially a serious autocorrelation problem. 

Otherwise, if d is between 1.5 and 2.5 then autocorrelation is likely not a cause for concern.

# Question 4

(2 points) Test whether the observations in Q4 2008, Q1 2009, Q1 2020, Q2 2020, and Q3 2020 are
innovation outliers. 

What do you conclude from the test results?

-> Approach:

To test whether the observations in Q4 2008, Q1 2009, Q1 2020, Q2 2020, and Q3 2020 are innovation outliers, 

you can use statistical methods such as the Z-score or modified Z-score.

In [100]:
# Create a matrix of zeros for the dummy variables
z = np.zeros((len(df), 5))

# Set the values of the dummy variables for the outliers
z[df['DATE'] == '2008-10-01', 0] = 1
z[df['DATE'] == '2009-01-01', 1] = 1
z[df['DATE'] == '2020-01-01', 2] = 1
z[df['DATE'] == '2020-04-01', 3] = 1
z[df['DATE'] == '2020-07-01', 4] = 1

# Add the dummy variables to the DataFrame
df['outlier_1'] = z[:, 0]
df['outlier_2'] = z[:, 1]
df['outlier_3'] = z[:, 2]
df['outlier_4'] = z[:, 3]
df['outlier_5'] = z[:, 4]

In [101]:
# Set up the regression model
X = df[['outlier_1', 'outlier_2', 'outlier_3', 'outlier_4', 'outlier_5']]
y = df['GDP_growth']
X = sm.add_constant(X)

# drop first row
X = X.iloc[1:, :]
y = y.iloc[1:]

# Fit the regression model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             GDP_growth   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.934
Method:                 Least Squares   F-statistic:                     316.6
Date:                Tue, 30 May 2023   Prob (F-statistic):           5.21e-62
Time:                        17:02:36   Log-Likelihood:                -65.856
No. Observations:                 112   AIC:                             143.7
Df Residuals:                     106   BIC:                             160.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4617      0.043     10.666      0.0

# Question 5

(2 points) Repeat question (3.) but treating the observations Q1 2009, Q1 2020, Q2 2020 and Q3 2020 as
outliers. 

Estimate an AR(2) model treating these observations as innovation outliers. 

Evaluate the model
by inspecting the properties of the residuals testing for normality and no-autocorrelation. 

Compare the
properties of the residuals to those obtained in question (3.), also comparing the plotted residuals.

In [102]:
# remove outliers from data
df_no_outliers = df[(df['DATE'] != '2009-01-01') & (df['DATE'] != '2020-01-01') & (df['DATE'] != '2020-04-01') & (df['DATE'] != '2020-07-01')]
df_no_outliers = df_no_outliers.reset_index(drop=True)

In [103]:
y_diff_no_outliers = df_no_outliers['GDP_growth'].values[1:]
best_lag_no_outliers = find_optimal_lag(y_diff_no_outliers)

AIC values for lag length 1 to 8: [129.8292632373391, 131.78287037317193, 133.3257358223754, 135.29942304612922, 136.1042365478785, 137.94273032237106, 139.7333442687439, 138.62586732094064]


In [104]:
residuals_no_outliers = plot_residuals(y_diff_no_outliers, best_lag_no_outliers)

In [111]:
# plot the histogram of the residuals
plot_hist_and_jb_test(pd.DataFrame(residuals_no_outliers), 0, 'residuals of the AR(1) model wihout outliers')

The test statistic is:  69.30209756321887
The p-value is:  8.938007649001005e-16
Skewness is:  0.024003867662763124
Kurtosis is:  6.924049105516986


'The growth rates are not normally distributed.'

# Question 6

Compare the in-sample fit of 

the previously estimated AR(1) model 

and 

the estimated AR(1)
model treating the observations Q1 2009, Q1 2020, Q2 2020, and Q3 2020 as innovation outliers 

based on
the value of the log likelihood and the AIC. 

Based on those in-sample fit measures which model would
you select?

In [114]:
# fit the model
model = ARIMA(endog=diff_y, order=(1, 0, 0))
model = model.fit()

# print the summary
print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  112
Model:                 ARIMA(1, 0, 0)   Log Likelihood                -217.856
Date:                Tue, 30 May 2023   AIC                            441.711
Time:                        17:08:48   BIC                            449.867
Sample:                             0   HQIC                           445.020
                                - 112                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3615      0.280      1.291      0.197      -0.187       0.910
ar.L1         -0.2283      0.043     -5.334      0.000      -0.312      -0.144
sigma2         2.8631      0.126     22.790      0.0

In [112]:
# fit the model
model_no_outliers = ARIMA(endog=y_diff_no_outliers, order=(best_lag_no_outliers, 0, 0))
model_fit_no_outliers = model_no_outliers.fit()

# print the summary
print(model_fit_no_outliers.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  108
Model:                 ARIMA(1, 0, 0)   Log Likelihood                 -61.915
Date:                Tue, 30 May 2023   AIC                            129.829
Time:                        17:08:02   BIC                            137.876
Sample:                             0   HQIC                           133.092
                                - 108                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4383      0.084      5.224      0.000       0.274       0.603
ar.L1          0.4982      0.055      9.105      0.000       0.391       0.605
sigma2         0.1838      0.015     12.364      0.0

In [116]:
print('Log likelihood of the model with outliers: ', np.round(model.llf, 2))
print('Log likelihood of the model without outliers: ', np.round(model_fit_no_outliers.llf, 2))

print('AIC of the model with outliers: ', np.round(model.aic, 2))
print('AIC of the model without outliers: ', np.round(model_fit_no_outliers.aic, 2))

Log likelihood of the model with outliers:  -217.86
Log likelihood of the model without outliers:  -61.91
AIC of the model with outliers:  441.71
AIC of the model without outliers:  129.83


In [118]:
print('The model without outliers is better because it has a higher log likelihood and a lower AIC.')

The model without outliers is better because it has a higher log likelihood and a lower AIC.


# Question 7

(2 points) The CSV file IEA contains quarterly observations on investment in growth rates for the Euro
Area over the period Q1 1995 until Q4 2022. 

We denote the series by xt. 

Import the time series and plot
the data together with ∆yt. 

What do you observe? What is the correlation between the two series?

In [119]:
# Import IEA.csv
df_xt = pd.read_csv('IEA.csv')
df_xt

Unnamed: 0,DATE,IN
0,1995-01-01,1.12130
1,1995-04-01,1.05847
2,1995-07-01,1.04738
3,1995-10-01,1.77690
4,1996-01-01,-0.92144
...,...,...
107,2021-10-01,6.65805
108,2022-01-01,-2.84848
109,2022-04-01,9.57580
110,2022-07-01,-3.75747


In [122]:
# Plot the column 'IN' of df_xt together with the column 'GDP_growth' of df
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['DATE'], y=df['GDP_growth'], mode='lines+markers', name='GDP growth'))
fig.add_trace(go.Scatter(x=df_xt['DATE'], y=df_xt['IN'], mode='lines+markers', name='Investment in growth rates for the Euro Area'))
fig.update_layout(title='GDP growth and investment in growth rates for the Euro Area',
                    xaxis_title='Date',
                    yaxis_title='Value')
fig.show()

In [124]:
print('We observe potential high correlation between the two time series.')

We observe potential high correlation between the two time series.


In [123]:
# correlation between GDP growth and investment
print('The correlation between GDP growth and investment is: ', np.round(df['GDP_growth'].corr(df_xt['IN']), 2))

The correlation between GDP growth and investment is:  0.82


# Question 8

(3 points) Estimate an AR(p) model for xt selecting the lag length based on AIC 
(with a maximal lag
length of 8). 

Next, estimate an AR(p) model for xt including ∆yt as an additional regressor. 

Select the
lag length of the model based on AIC (with a maximal lag length of 8). 

Evaluate the two models in the
usual way, by considering the properties of the residuals. 

Is the second model an improvement over the
AR(p) model for xt?

# Question 9

(3 points) Estimate a threshold model for xt with an AR(1) model in both regimes, with ∆yt as the
threshold variable, and with the threshold fixed at zero. 

Discuss the differences between the AR(1)
models in the two regimes based on the estimated coefficients. 

Also, explain the interpretation of setting
the threshold fixed at zero for the specified threshold variable.

# Question 10

(2 points) Estimate the same threshold model but now with the threshold as an unknown parameter
(instead of fixing it at 0). 

Discuss the differences between the results for the threshold model with fixed
threshold at zero and the threshold model with unknown threshold.

# Question 11

(2 points) Estimate a logistic smooth transition model (including an intercept and one lag of xt and using
∆yt as threshold variable). 

Estimate the smoothing parameter (γ) and threshold parameter (c). 

Discuss
the differences between the results for this model and the threshold model with unknown threshold.

# Question 12

(2 points) Use the SupF approach to test for threshold nonlinearity in the threshold model, but with the
threshold as an unknown parameter (instead of fixing it at 0). 

Discuss the test results.

# Question 13

(4 points) Use the threshold model with unknown threshold and the logistic smooth transition model
to obtain fitted values. 

Evaluate the relative accuracy of these in-sample forecasts based on mean
squared prediction errors. 

Which of the nonlinear models delivers more accurate forecasts? 

Are potential
improvements in forecast accuracy statistically significant? Discuss also the limitation of comparing the
models only based on in-sample fit.
