In [None]:
# Import modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller


## Atmospheric carbon dioxide data

In [None]:
# Define column names
col_names = ['year','month','decimal_date','avg_co2',
             'de_seasonalized','days','std','uncertainty']

# Read dataset with custom column names.
# Text after a '#' is ignored as a comment and fields are split on runs of
# whitespace. The separator regex is a raw string so that '\s' is not parsed
# as an (invalid) string escape sequence.
df = pd.read_csv('../datasets/co2_mm_mlo.txt', comment='#', sep=r'\s+', names=col_names)

# Display a few rows
df.head(3)


In [None]:
# Build a first-of-month timestamp for each row from its year and month
date_parts = {'year': df['year'], 'month': df['month'], 'day': 1}
df['date'] = pd.to_datetime(date_parts)

# Promote the timestamps to the index and declare a month-start frequency
# (the freq is specified for the statsmodels package;
#  print(df.index.freq) to check that it is not None)
df.set_index('date', inplace=True)
df.index.freq = 'MS'
df.head(3)


In [None]:
# Count the missing (NaN) entries in each column
missing_per_column = df.isna().sum()
missing_per_column


In [None]:
# Plot the CO2 series against its timestamp index
fig, ax = plt.subplots(figsize=(6,4))
ax.plot(df['avg_co2'])
ax.set_ylabel('$CO_2$ (ppm)')
plt.show()


### Test for stationarity

Stationarity in a time series implies that the statistical properties of the series like mean, variance, and autocorrelation are constant over time. In a stationary time series, these properties do not depend on the time at which the series is observed, meaning that the series does not exhibit trends or seasonal effects. Non-stationary data typically show clear trends, cyclical patterns, or other systematic changes over time. Non-stationary time series often need to be transformed (or de-trended) to become stationary before analysis.

The augmented Dickey-Fuller test, available as `adfuller` in the `statsmodels` library, can be used to statistically test for stationarity.

**Dickey-Fuller test**
- Null hypothesis: The series is NOT stationary.
- Alternative hypothesis: The series is stationary.

The null hypothesis can be rejected if `p-value<0.05`. Hence, if the p-value is >0.05, we cannot reject the null hypothesis and the series is treated as non-stationary.


In [None]:
# Augmented Dickey-Fuller test on the CO2 series (lag length chosen by AIC).
# adfuller returns a tuple; unpack it so each value is reported with a label
# instead of being shown as an anonymous tuple.
adf_stat, p_value, used_lag, n_obs, critical_values, ic_best = adfuller(df['avg_co2'], autolag="AIC")

print(f'ADF statistic: {adf_stat:.3f}')
print(f'p-value: {p_value:.3f}')
print('Critical values:', critical_values)

### Create training and testing sets

In [None]:
# Chronological split: the first 95% of the rows form the training set and
# the remaining rows are held out for testing. The test set starts at row
# N_train (the first row not in df_train) so that no observation is dropped
# between the two sets.
N_train = round(df.shape[0] * 0.95)
df_train = df[:N_train]
df_test = df[N_train:]


### Decompose time series

In [None]:
# Split the training series into trend, seasonal, and residual components
# using an additive model with a period of 12 samples.
# extrapolate_trend='freq' extrapolates the trend to avoid NaNs.
decompose_options = dict(model='additive', period=12, extrapolate_trend='freq')
results = seasonal_decompose(df_train['avg_co2'], **decompose_options)
 

In [None]:
# Create figure with one panel per decomposed component
# Each entry: (panel title, component series, line label)
components = [('Trend', results.trend, 'Trend'),
              ('Seasonality', results.seasonal, 'Seasonal'),
              ('Residuals', results.resid, 'Residuals')]

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(6,6))
for panel, (title, series, label) in zip(axes, components):
    panel.set_title(title)
    panel.plot(series, label=label)
    panel.set_ylabel('$CO_2$ (ppm)')

fig.subplots_adjust(hspace=0.5)
plt.show()

## Examine autocorrelation lags

The `statsmodels` module offers an extensive library of functions for time series analysis. In addition to the autocorrelation function (ACF), we can also apply the partial autocorrelation function (PACF), which removes the effect of intermediate lags. For instance, the PACF between time `t` and time `t-4` is the pure autocorrelation without the effect of `t-1`, `t-2`, and `t-3`.

In [None]:
# Create figure with one row per correlation function
fig, ax = plt.subplots(figsize=(8,5), ncols=1, nrows=2)

# Plot the autocorrelation function.
# The series has a monthly ('MS') index, so each lag is one month.
plot_acf(df['avg_co2'], ax=ax[0])
ax[0].set_xlabel('Lag (months)')
ax[0].set_ylabel('Correlation coefficient')

# Plot the partial autocorrelation function
plot_pacf(df['avg_co2'], ax=ax[1], method='ywm')
ax[1].set_xlabel('Lag (months)')
ax[1].set_ylabel('Correlation coefficient')

fig.subplots_adjust(hspace=0.5)
plt.show()


In [None]:
# Fit model to train set
#   order          (p,d,q)   => autoregressive, differences, and moving average
#   seasonal_order (P,D,Q,s) => autoregressive, differences, moving average, and periodicity
ar_order = (2,0,0)
seasonal_ar_order = (1,0,0,12)  # e.g. (3,0,0,12) would add 12, 24, and 36 month lags
quadratic_trend = [1,1,1]       # Use quadratic polynomial to approximate main trend

sarima = ARIMA(df_train['avg_co2'],
               order=ar_order,
               seasonal_order=seasonal_ar_order,
               dates=df_train.index,
               trend=quadratic_trend)
model = sarima.fit()

In [None]:
# Report the mean absolute error of the fitted model
mae = model.mae
print(mae,'ppm')


In [None]:
# Show the summary table of the fitted model
fit_summary = model.summary()
fit_summary


### Predict with autoregressive model

In [None]:
# Predict over the whole test period, from its first to its last timestamp,
# so that every held-out observation has a matching prediction.
pred_values = model.predict(start=df_test.index[0],
                            end=df_test.index[-1])

In [None]:
# Compare the model predictions with the held-out observations
plt.figure(figsize=(6,4))
plt.plot(pred_values, label='Predicted')
plt.plot(df_test['avg_co2'], label='Observed')
plt.ylabel('$CO_2$ (ppm)')
plt.legend()
plt.show()

In [None]:
# Plot residual diagnostics of the fitted model.
# plot_diagnostics draws on its own figure, so the size is passed to it
# directly instead of opening a separate (empty) figure beforehand.
model.plot_diagnostics(figsize=(6,6))
plt.show()


## Practice

- Use the `statsmodels` library to add a deterministic process to the model in the form of a Fourier series. [Hint](https://www.statsmodels.org/stable/examples/notebooks/generated/autoregressions.html)