Autoregressive Models


In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from IPython.display import VimeoVideo
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg

# Ignore every FutureWarning raised from this point on.
warnings.simplefilter(action="ignore", category=FutureWarning)

Prepare Data

Import

In [None]:
# Create a client for the MongoDB server at localhost:27017, assign the
# "air-quality" database to db, and assign its "nairobi" collection to nairobi.
client = MongoClient(host='localhost', port=27017)
db = client["air-quality"]
nairobi = db["nairobi"]

In [None]:
# Change the wrangle function below so that it returns a Series of the resampled data instead of a DataFrame.
def wrangle(collection):
    """Return the hourly mean "P2" readings for site 29 as a Series.

    Parameters
    ----------
    collection
        Object exposing a pymongo-style ``find(filter, projection=...)``
        method. Each returned document must provide a "P2" value and a
        "timestamp"; the timestamps must be timezone-naive because they are
        localized to UTC here.

    Returns
    -------
    pandas.Series
        "P2" values averaged over one-hour windows, indexed by
        "Africa/Nairobi" timestamps. Hours without any reading take the
        value of the previous hour.
    """
    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read data into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Treat the stored timestamps as UTC, then express them in Nairobi time
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers: keep only readings below 500
    df = df[df["P2"] < 500]

    # Resample to 1-hour windows and forward-fill the empty ones.
    # ``.ffill()`` and the lowercase "h" alias replace the deprecated
    # ``fillna(method="ffill")`` and "H" spellings.
    y = df["P2"].resample("1h").mean().ffill()

    return y


In [None]:
# Use the wrangle function to read the data from the nairobi collection into
# the Series y, then preview its first rows.
y = wrangle(nairobi)
y.head()

In [None]:
# Correlation between y and a copy of itself shifted by 5 periods
# (the lag-5 autocorrelation).
y.corr(y.shift(5))

In [None]:
# ACF plot of y, with the x-axis labelled "Lag [hours]" and the y-axis
# labelled "Correlation Coefficient".
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

In [None]:
# ACF plot of y, with the x-axis labelled "Lag [hours]" and the y-axis
# labelled "Correlation Coefficient".
# NOTE(review): this cell repeats the ACF plot from the previous cell — confirm
# whether the duplicate is intentional.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
plt.xlabel("Lag [hours]")
plt.ylabel("Correlation Coefficient");

In [None]:
# PACF plot of y, with the x-axis labelled "Lag [hours]" and the y-axis
# labelled "Correlation Coefficient".
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

In [None]:
# Split y in order: the first 95% of the observations form the training set
# and the remaining 5% form the test set.
cutoff_test = int(0.95 * len(y))
y_train, y_test = y.iloc[:cutoff_test], y.iloc[cutoff_test:]


In [None]:
# Baseline: predict the training mean for every training observation and
# score that constant prediction with the mean absolute error.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean for _ in range(len(y_train))]
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

In [None]:
# Instantiate an AutoReg model with the lags argument set to 26 and fit it to
# the training data y_train.
model = AutoReg(y_train, lags=26).fit()

In [None]:
# Predictions of the fitted model, called with no arguments; chain
# .isnull().sum() onto the call to count how many of them are missing.
model.predict()

In [None]:
# Generate the training predictions and use them to calculate the training
# mean absolute error. Predictions that come back as NaN are dropped.
y_pred = model.predict().dropna()
# Select the matching true values by index instead of hard-coding the number
# of skipped rows, so this stays correct if the model's lag count changes.
training_mae = mean_absolute_error(y_train.loc[y_pred.index], y_pred)
print("Training MAE:", training_mae)

In [None]:
# Residuals of the fitted model, read from model.resid
# (an alternative to computing y_train - y_pred by hand).
y_train_resid=model.resid
y_train_resid.tail()

In [None]:
# Plot y_train_resid as a line chart with the y-axis labelled "Residual Value".
# The trailing semicolon matches the other plotting cells and keeps the
# notebook from echoing the returned Axes object.
fig, ax = plt.subplots(figsize=(15, 6))
y_train_resid.plot(ylabel="Residual Value", ax=ax);

In [None]:
# Histogram of y_train_resid with labelled axes and a title.
y_train_resid.hist()
plt.xlabel("Residual Value")
plt.ylabel("frequency")
plt.title("Distribution of Residuals");

In [None]:
# ACF plot of the training residuals y_train_resid.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y_train_resid, ax=ax);

Evaluate

In [None]:
# Test MAE: predict from the earliest to the latest timestamp in the test
# index and compare those predictions with y_test.
y_pred_test = model.predict(start=y_test.index.min(), end=y_test.index.max())
test_mae = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", test_mae)

In [None]:
# Build the DataFrame df_pred_test with two columns: "y_test" holds the true
# test values and "y_pred" holds the model's predictions. Its index is the
# index of y_test.
df_pred_test = pd.DataFrame(
    data={"y_test": y_test, "y_pred": y_pred_test},
    index=y_test.index,
)
df_pred_test.head()

In [None]:
# Time series plot of the values in df_pred_test using plotly express, with
# the y-axis labelled "P2".
fig = px.line(df_pred_test, labels={"value": "P2"})
fig.show()

In [None]:
# Perform walk-forward validation over the entire test set y_test and store
# the model's predictions in the Series y_pred_wfv.
#
# For each test observation the model is refit on everything seen so far, the
# next value is forecast, and the true observation for that timestamp is then
# added to the history. Series.append was removed in pandas 2.0, so pd.concat
# is used; forecasts are gathered in a list and joined once after the loop.
wfv_forecasts = []
history = y_train.copy()
for _ in range(len(y_test)):
    model = AutoReg(history, lags=26).fit()
    next_pred = model.forecast()
    wfv_forecasts.append(next_pred)
    # Add the observed value at the forecast timestamp before the next refit.
    history = pd.concat([history, y_test[next_pred.index]])

# An empty test set yields an empty float Series rather than a concat error.
y_pred_wfv = pd.concat(wfv_forecasts) if wfv_forecasts else pd.Series(dtype="float64")

In [None]:
# Calculate the test mean absolute error of the walk-forward predictions
# y_pred_wfv and print it rounded to two decimals.
test_mae = mean_absolute_error(y_test, y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))

Communicate Results

In [None]:
# Print out the parameters of the trained model.
# NOTE: the walk-forward loop rebinds `model` on every iteration, so after that
# cell has run this prints the parameters of the last refit model.
print(model.params)

In [None]:
# Put y_test and y_pred_wfv side by side in the DataFrame df_pred_test
# (the two Series are aligned on their index), then plot it with plotly
# express.
df_pred_test = pd.DataFrame({"y_test": y_test, "y_pred_wfv": y_pred_wfv})
fig = px.line(df_pred_test, labels={"value": "PM2.5"})
fig.show()