In [1]:
import pandas as pd
from datetime import timedelta, date, datetime
import plotly.express as px


# Lift pandas' display truncation so wide/long frames render in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Data
Here is where you can download the data from EPA: https://www.epa.gov/outdoor-air-quality-data/download-daily-data

Select: 
- Pollutant = PM2.5
- Year = 2018 thru 2024 (individual downloads for each, sigh)
- Geographic Area = Houston - The Woodlands - Sugar Land CBSA + All Sites

Subsequent downloads of past data aren't always identical, telling me that they likely continue to update values long after the dates have passed. So just download files fresh every time you're doing this. I would assume the further back you go, the less likely things are to change but... again... I'm not wasting time on inspecting that now.

In [12]:
# Load one EPA daily PM2.5 export per year and stack them into a single frame.
# Files are expected at {base_path}{year}{file_suffix}.csv.
base_path = '../data/source/EPA/'
file_suffix = '_houmetro_epa_ad_viz_plotval_data'

# range() excludes its upper bound, so this covers 2018 through 2024.
years = range(2018, 2025)
pm25_dfs = []
for year in years:
    df = pd.read_csv(f'{base_path}{year}{file_suffix}.csv')
    # The export writes dates as MM/DD/YYYY (e.g. 01/20/2018); spell the
    # format out so month and day can never be swapped by inference.
    df['datetime'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
    # Calendar helper columns used for grouping/filtering further down.
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day_of_week'] = df['datetime'].dt.day_name()
    pm25_dfs.append(df)

# Every yearly file carries its own 0..n index; renumber on concat so the
# combined frame does not end up with duplicate index labels.
pm25 = pd.concat(pm25_dfs, ignore_index=True)

In [13]:
# Sanity check on the combined frame: column dtypes, the date span that
# was actually loaded, and a peek at the first few rows.
observed_dates = pm25['datetime']
print(pm25.dtypes, observed_dates.min(), observed_dates.max(), sep='\n')
display(pm25.head())

Date                                      object
Source                                    object
Site ID                                    int64
POC                                        int64
Daily Mean PM2.5 Concentration           float64
Units                                     object
Daily AQI Value                            int64
Local Site Name                           object
Daily Obs Count                            int64
Percent Complete                         float64
AQS Parameter Code                         int64
AQS Parameter Description                 object
Method Code                              float64
Method Description                        object
CBSA Code                                  int64
CBSA Name                                 object
State FIPS Code                            int64
State                                     object
County FIPS Code                           int64
County                                    object
Site Latitude       

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,AQS Parameter Code,AQS Parameter Description,Method Code,Method Description,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude,datetime,year,month,day_of_week
0,01/02/2018,AQS,481671034,1,4.7,ug/m3 LC,26,Galveston 99th Street,1,100.0,88101,PM2.5 - Local Conditions,145.0,R & P Model 2025 PM-2.5 Sequential Air Sampler...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,167,Galveston,29.254474,-94.861289,2018-01-02,2018,1,Tuesday
1,01/08/2018,AQS,481671034,1,7.1,ug/m3 LC,39,Galveston 99th Street,1,100.0,88101,PM2.5 - Local Conditions,145.0,R & P Model 2025 PM-2.5 Sequential Air Sampler...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,167,Galveston,29.254474,-94.861289,2018-01-08,2018,1,Monday
2,01/14/2018,AQS,481671034,1,3.4,ug/m3 LC,19,Galveston 99th Street,1,100.0,88101,PM2.5 - Local Conditions,145.0,R & P Model 2025 PM-2.5 Sequential Air Sampler...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,167,Galveston,29.254474,-94.861289,2018-01-14,2018,1,Sunday
3,01/20/2018,AQS,481671034,1,4.6,ug/m3 LC,26,Galveston 99th Street,1,100.0,88101,PM2.5 - Local Conditions,145.0,R & P Model 2025 PM-2.5 Sequential Air Sampler...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,167,Galveston,29.254474,-94.861289,2018-01-20,2018,1,Saturday
4,01/26/2018,AQS,481671034,1,6.4,ug/m3 LC,36,Galveston 99th Street,1,100.0,88101,PM2.5 - Local Conditions,145.0,R & P Model 2025 PM-2.5 Sequential Air Sampler...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,167,Galveston,29.254474,-94.861289,2018-01-26,2018,1,Friday


In [19]:
# Quick-and-dirty first pass: pool every site together, average the PM2.5
# readings for each calendar date, then chart each summer with plotly.
pm25_col = 'Daily Mean PM2.5 Concentration'
date_keys = ['datetime', 'year', 'month', 'day_of_week']
by_date = pm25.groupby(date_keys)[pm25_col].mean().reset_index()

# Summer = June through August; between() is inclusive on both ends.
just_summer = by_date[by_date['month'].between(6, 8)]

# One line chart per year present in the summer subset.
years = just_summer['year'].unique()
for year in years:
    annual_data = just_summer[just_summer['year'] == year]
    fig = px.line(
        annual_data,
        x='datetime',
        y=pm25_col,
        title=str(year),
        hover_data=['day_of_week'],
    )
    fig.show()

In [15]:
# Show the June-August daily averages; as the cell's last expression the
# frame is rendered as the cell output.
# NOTE(review): display.max_rows is set to None at the top of the notebook,
# so this renders every row - consider just_summer.head() if that is too much.
just_summer

Unnamed: 0,datetime,year,month,day_of_week,Daily Mean PM2.5 Concentration
151,2018-06-01,2018,6,Friday,13.56
152,2018-06-02,2018,6,Saturday,9.95
153,2018-06-03,2018,6,Sunday,8.27
154,2018-06-04,2018,6,Monday,9.425
155,2018-06-05,2018,6,Tuesday,8.2
156,2018-06-06,2018,6,Wednesday,7.18
157,2018-06-07,2018,6,Thursday,7.026667
158,2018-06-08,2018,6,Friday,6.37
159,2018-06-09,2018,6,Saturday,10.79
160,2018-06-10,2018,6,Sunday,21.227273
