Air Quality in Dar es Salaam, TZ


In [None]:
import warnings

import wqet_grader

# Ignore every FutureWarning from here on so cell output is not flooded with them.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Initialise the grader for "Project 3 Assessment".
# NOTE(review): presumably this registers the session with the course grader — confirm.
wqet_grader.init("Project 3 Assessment")

In [None]:
# Import libraries here
from pymongo import MongoClient
import pandas as pd
from pprint import PrettyPrinter
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.ar_model import AutoReg
import plotly.express as px

Prepare Data

Connect

In [None]:
# Open a client against the MongoDB server at localhost:27017, select the
# "air-quality" database, and bind its "dar-es-salaam" collection to `dar`.
client = MongoClient("localhost", 27017)
db = client["air-quality"]
dar = db["dar-es-salaam"]

In [None]:
# Fetch one document (empty filter) to inspect which fields the collection stores.
dar.find_one({})

In [None]:
# Collect the distinct values of "metadata.site" found in the Dar es Salaam
# collection. The assessment expects `sites` to be a list of integers.
sites = dar.distinct("metadata.site")
sites

In [None]:
# Pretty-printer with a 2-space indent for displaying nested documents.
# NOTE(review): `pp` is not used by any cell visible in this notebook.
pp = PrettyPrinter(indent=2)

In [None]:
# Count the readings (of any measurement type, not just PM2.5) recorded by each
# sensor site. `readings_per_site` is a list of dictionaries, one per site, with
# the site number under "_id" and the number of readings under "count".
pipeline = [
    {"$group": {"_id": "$metadata.site", "count": {"$count": {}}}},
]
result = dar.aggregate(pipeline)
readings_per_site = list(result)
readings_per_site

In [None]:
# List the distinct values of "metadata.measurement" stored in the collection.
dar.distinct("metadata.measurement")

Import

In [None]:
def wrangle(collection, site=11, outlier_cutoff=100):
    """Return a cleaned, hourly PM2.5 series for one sensor site.

    The function performs the following steps:

    1. Query ``collection`` for the "P2" (PM2.5) readings of ``site``, keeping
       only the "P2" and "timestamp" fields.
    2. Drop outlier readings (only readings below ``outlier_cutoff`` are kept).
    3. Treat the stored timestamps as UTC and convert them to the
       "Africa/Dar_es_Salaam" timezone.
    4. Resample to the mean PM2.5 reading for each hour.
    5. Impute missing hours with the forward-fill method.

    Parameters
    ----------
    collection
        Object exposing ``find(filter, projection=...)`` that yields documents
        with "P2" and "timestamp" fields (e.g. a pymongo collection).
    site : int, default 11
        Value matched against "metadata.site". The default is the site used by
        the original analysis.
    outlier_cutoff : float, default 100
        Readings greater than or equal to this value are discarded.

    Returns
    -------
    pandas.Series
        Hourly mean "P2" readings indexed by timezone-aware timestamps.
    """
    results = collection.find(
        {"metadata.site": site, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    df = pd.DataFrame(results).set_index("timestamp")

    # Remove outliers.
    # NOTE(review): the task text says "above 100"; the original strict `<`
    # comparison is kept, so a reading of exactly the cutoff is dropped too.
    df = df[df["P2"] < outlier_cutoff]

    # Stored timestamps are timezone-naive; interpret them as UTC, then convert.
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam")

    # Hourly mean, then forward-fill the empty hours. `.ffill()` replaces the
    # deprecated `fillna(method="ffill")`, and the lowercase "h" alias avoids
    # the deprecated uppercase "H" frequency string.
    y = df["P2"].resample("1h").mean().ffill()
    return y

In [None]:
# Run `wrangle` on the `dar` collection and preview the first rows of the
# cleaned hourly series.
y = wrangle(dar)
y.head()

In [None]:
# Time series plot of the readings in `y`.
# Required labels: x-axis "Date", y-axis "PM2.5 Level",
# title "Dar es Salaam PM2.5 Levels".
fig, ax = plt.subplots(figsize=(15, 6))
y.plot(
    ax=ax,
    xlabel="Date",
    ylabel="PM2.5 Level",
    title="Dar es Salaam PM2.5 Levels",
)
# Don't delete the code below 👇
plt.savefig("images/3-5-5.png", dpi=150)


In [None]:
# NOTE(review): this expression is not the last one in the cell, so its value is not displayed.
y.head()
# Rebind `y` to a DataFrame built from the previous `y`; every later cell that
# reads `y` (and the train/test split derived from it) sees the DataFrame.
# NOTE(review): later cells that index `y_test[...]` or place `y_test` inside a
# dict of columns look written for a Series — verify with Restart & Run All.
y=pd.DataFrame(y)
y.head()

In [None]:
# Rolling average of the readings in `y` with a window of 168 hours (one week).
# Required labels: x-axis "Date", y-axis "PM2.5 Level",
# title "Dar es Salaam PM2.5 Levels, 7-Day Rolling Average".
fig, ax = plt.subplots(figsize=(15, 6))
y["P2"].rolling(168).mean().plot(
    ax=ax,
    xlabel="Date",
    ylabel="PM2.5 Level",
    title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average",
)
# Don't delete the code below 👇

plt.savefig("images/3-5-6.png", dpi=150)

In [None]:
# ACF plot for the data in `y`.
# Required labels: x-axis "Lag [hours]", y-axis "Correlation Coefficient",
# title "Dar es Salaam PM2.5 Readings, ACF".
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
# Set the labels on `ax` explicitly rather than through pyplot's current-axes state.
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient")
ax.set_title("Dar es Salaam PM2.5 Readings, ACF")
# Don't delete the code below 👇
plt.savefig("images/3-5-7.png", dpi=150)

In [None]:
# PACF plot for the data in `y`.
# Required labels: x-axis "Lag [hours]", y-axis "Correlation Coefficient",
# title "Dar es Salaam PM2.5 Readings, PACF".
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
# Set the labels on `ax` explicitly rather than through pyplot's current-axes state.
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient")
ax.set_title("Dar es Salaam PM2.5 Readings, PACF")
# Don't delete the code below 👇
plt.savefig("images/3-5-8.png", dpi=150)

Split

In [None]:
# Chronological split of `y`: the first 90% of the rows form the training set,
# the remaining 10% form the test set.
cutoff_test = int(len(y) * 0.9)
y_train, y_test = y.iloc[:cutoff_test], y.iloc[cutoff_test:]
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Baseline: predict the training mean for every training observation and score
# that constant guess with the mean absolute error.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", y_train_mean)
print("Baseline MAE:", mae_baseline)

In [None]:
# Hyperparameter search for the AR model: for every lag count p from 1 to 30,
# fit AutoReg on the training data, compute the training mean absolute error,
# and append it to `maes`. The results are then stored in the Series
# `mae_series`, indexed by p.
p_params = range(1, 31)
maes = []
for p in p_params:
    model=AutoReg(y_train, lags=p).fit()
    # In-sample predictions; entries without a prediction are dropped.
    y_pred=model.predict().dropna()
    # The target is trimmed by its first p rows so it lines up with `y_pred`.
    mae = mean_absolute_error(y_train.iloc[p:], y_pred)
    maes.append(mae)
mae_series = pd.Series(maes, name="mae", index=p_params)
mae_series.head(30)

In [None]:
# Build and train the final model with the chosen number of lags (28).
# NOTE(review): 28 was presumably picked by inspecting `mae_series` — confirm.
# NOTE(review): despite its name, `best_p` refers to the fitted model, not to
# the integer p; both names are bound to the same object.
best_model = AutoReg(y_train, lags=28).fit()
best_p = best_model

In [None]:
# Training residuals of `best_model`, stored in `y_train_resid` under the
# Series name "residuals".
y_train_resid = best_model.resid
# No copy is made above, so this renames the object returned by `best_model.resid`.
y_train_resid.name = "residuals"
y_train_resid.head()

In [None]:
# Histogram of `y_train_resid` with x-axis "Residuals", y-axis "Frequency" and
# title "Best Model, Training Residuals".
# The pyplot calls below all act on pyplot's current figure/axes.
plt.hist(y_train_resid)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Best Model, Training Residuals")
# Don't delete the code below 👇
plt.savefig("images/3-5-14.png", dpi=150)

In [None]:
# ACF plot for `y_train_resid`.
# Required labels: x-axis "Lag [hours]", y-axis "Correlation Coefficient",
# title "Dar es Salaam, Training Residuals ACF".
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y_train_resid, ax=ax)
# Set the labels on `ax` explicitly rather than through pyplot's current-axes state.
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient")
ax.set_title("Dar es Salaam, Training Residuals ACF")
# Don't delete the code below 👇
plt.savefig("images/3-5-15.png", dpi=150)


Evaluate

In [None]:
# Walk-forward validation over the entire test set `y_test`.
# At every step an AR(28) model is refit on `history`, a one-step forecast is
# made, and the true observation for that timestamp is added to `history`.
# The predictions are stored in the Series `y_pred_wfv`, named "prediction",
# whose index is named "timestamp".
wfv_predictions = []  # one single-row Series per step; concatenated once below
history = y_train.copy()
for i in range(len(y_test)):
    model = AutoReg(history, lags=28).fit()
    next_pred = model.forecast()
    wfv_predictions.append(next_pred)
    # `.loc` is a label-based row lookup for both a Series and a DataFrame;
    # plain `y_test[...]` would be a column lookup if `y_test` is a DataFrame.
    # `pd.concat` replaces `.append`, which was removed in pandas 2.0.
    history = pd.concat([history, y_test.loc[next_pred.index]])

if wfv_predictions:
    y_pred_wfv = pd.concat(wfv_predictions)
else:
    # Empty test set: `pd.concat` rejects an empty list, so build the empty
    # result explicitly.
    y_pred_wfv = pd.Series(dtype="float64")

y_pred_wfv.name = "prediction"
y_pred_wfv.index.name = "timestamp"
y_pred_wfv.head()

Communicate Results

In [None]:
# Combine the test values and the walk-forward predictions in `df_pred_test`
# (aligned on their shared timestamp index) and plot both with plotly express.
# Labels: x-axis "Date", y-axis "PM2.5 Level",
# title "Dar es Salaam, WFV Predictions".

# `y_test` is a one-column DataFrame when `y` has been rebound to a DataFrame
# earlier in the notebook. A dict of columns needs 1-D values, so take the
# "P2" column in that case and use `y_test` unchanged when it is a Series.
y_test_values = y_test["P2"] if isinstance(y_test, pd.DataFrame) else y_test
df_pred_test = pd.DataFrame(
    {"y_test": y_test_values, "y_pred_wfv": y_pred_wfv}
)
fig = px.line(df_pred_test)
fig.update_layout(
    title="Dar es Salaam, WFV Predictions",
    xaxis_title="Date",
    yaxis_title="PM2.5 Level",
)
# Don't delete the code below 👇
fig.write_image("images/3-5-18.png", scale=1, height=500, width=700)

fig.show()