In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import acf
from statsmodels.graphics.tsaplots import pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

In [None]:
# The purpose of this notebook is for me to get a better understanding of the confidence intervals from plot_acf() and plot_pacf().
# Standard texts suggest that the confidence intervals for these analyses should be +/- 1.96/(N^0.5) at the 95% level, where N is the number of observations.
# This leads to straight lines, but the CIs from those functions are not straight. I want to be clear on what we're looking at and what the difference is.

In [None]:
# Read the chill series from CSV and draw the raw 'chill' column as a line plot.
chill = pd.read_csv('data_csvs/everest_8000m_chill_time_series.csv')

# Series.plot() hands back the Axes it drew on, so the title and axis labels
# can be attached in one call instead of through the pyplot state machine.
chill['chill'].plot().set(
    title='Chill Time Series, Everest 8000m, May 13 to July 1 2024',
    xlabel='data point',
    ylabel='chill (degrees C)',
)
plt.show()

In [None]:
# Sample autocorrelation of the chill series; alpha=0.05 requests the 95% band.
plot_acf(chill['chill'], alpha=0.05)

# Label the axes that plot_acf() just drew on (the current axes).
plt.gca().set(
    title='Autocorrelation of Chill Time Series',
    xlabel='lag',
    ylabel='ACF',
)
plt.show()

In [None]:
# Same computation as the plot above, but returned as numbers: the ACF estimate
# per lag and, because alpha is given, the matching confidence intervals.
acf_values, confint = acf(chill['chill'], alpha=0.05)

# Show both arrays, one after the other.
print(acf_values, confint, sep='\n')

In [None]:
# So, the confidence intervals that are returned from acf() are centered around the actual ACF value for each lag.
# To some extent, the higher autocorrelations will have their CI size limited by 1.0.

# The plot draws its band around 0, while acf() reports intervals around each
# estimate. Subtracting the estimate from both interval columns (lag 0 skipped)
# recovers the zero-centred band; the transpose splits it into lower / upper.
lower_bound, upper_bound = (confint[1:] - acf_values[1:, None]).T

# Row 0 holds the lower bounds, row 1 the upper bounds; one column per lag.
bounds = pd.DataFrame([lower_bound, upper_bound])
print(bounds)

In [None]:
# So it goes from about 0.16 to just over 0.5, matching what we see in the plot.

In [None]:
# According to the documentation:
# https://tedboy.github.io/statsmodels_doc/generated/statsmodels.graphics.tsaplots.acf.html
# the standard deviations are determined by Bartlett's Formula.

# To me the question is: why are these values so high in this case? Why are they so different from +/- 1.96/(N^0.5), where N is the number of observations?