This is a test notebook!

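This notebook calls the EPA AQS API (`annualData/byState` endpoint) to download annual air-quality summaries for a chosen set of states, years, and pollutants, then trims the result to the columns needed for analysis.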

In [5]:
import requests
import pandas as pd
import os
import time
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment, so credentials never have to be typed into the notebook.
load_dotenv()

# Credentials used in the query string of every API request in this notebook.
api_key = os.getenv('API_KEY')
email = os.getenv('EMAIL')

# Fail fast when a credential is absent: os.getenv returns None for an unset
# variable, and None would otherwise be formatted into the request URL as the
# literal text "None", producing confusing per-request errors later on.
missing_vars = [
    name for name, value in (('API_KEY', api_key), ('EMAIL', email)) if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Set them in your environment or in a .env file."
    )

In [6]:
# States to query, given as two-digit codes kept as strings so that leading
# zeros survive (the output further down shows "06" is California).
# Add or remove codes here to change the geographic coverage.
target_states = ["06", "48", "36", "53", "42", "37"]

# Years to download. range() excludes its end value, so this covers
# 2018 through 2023; widen or narrow it to change the time span.
years = range(2018, 2024)

# Pollutant parameter codes mapped to short display names.
# This is the single place to add or remove a pollutant.
pollutant_map = {
    '44201': 'Ozone',
    '42602': 'NO2',
    '42101': 'CO',
    '81102': 'PM10',
    '88101': 'PM2.5',
}

# Comma-separated parameter codes sent in the `param` field of each request.
# Derived from pollutant_map (dicts keep insertion order) so the codes that are
# requested and the codes that can be named never drift apart.
# NOTE(review): the API may cap how many codes a single request accepts —
# confirm before adding more entries to pollutant_map.
params = ",".join(pollutant_map)

In [7]:
# Download one annual-summary response per (state, year) pair and keep a
# DataFrame for every successful, non-empty response in `all_data`.
# Failures are reported and skipped so that one bad request does not abort
# the whole download.
AQS_ANNUAL_BY_STATE_URL = "https://aqs.epa.gov/data/api/annualData/byState"

# Re-created on every run so re-executing this cell never duplicates rows.
all_data = []

for state in target_states:
    for year in years:
        # Each request spans one full calendar year, formatted as YYYYMMDD.
        bdate = f"{year}0101"
        edate = f"{year}1231"

        query = {
            "email": email,
            "key": api_key,
            "bdate": bdate,
            "edate": edate,
            "param": params,
            "state": state,
        }

        try:
            # Passing the query as `params` lets requests percent-encode every
            # value, so characters such as "+" or "&" in the email or key
            # cannot corrupt the query string.
            r = requests.get(AQS_ANNUAL_BY_STATE_URL, params=query, timeout=10)
            if r.status_code == 200:
                # A response without a 'Data' entry is treated as "no rows".
                data = r.json().get('Data', [])
                if data:
                    temp_df = pd.DataFrame(data)
                    all_data.append(temp_df)
            else:
                print(f"  > Error: State {state} | Year {year} | Status: {r.status_code}")
        except (requests.RequestException, ValueError) as e:
            # Covers network/HTTP failures and unparsable response bodies.
            # Only the exception type is printed: the message text of a
            # requests error can contain the full request URL, which would
            # write the API key into the notebook's saved output.
            print(f"  > Failed: State {state} | Year {year} | {type(e).__name__}")

        # Short pause between requests — presumably to stay within the API's
        # rate limits; TODO confirm the required interval.
        time.sleep(0.5)

In [8]:
# Combine the per-request frames into a single raw table.
# Stop here when nothing was downloaded: otherwise df_raw would stay undefined
# (or keep a value left over from an earlier run of this cell) and the cells
# that follow would fail obscurely or silently reuse old data.
if not all_data:
    raise RuntimeError(
        "No data was downloaded; check the credentials and the request "
        "errors printed above."
    )

# ignore_index=True renumbers rows 0..n-1 across all the concatenated frames.
df_raw = pd.concat(all_data, ignore_index=True)
print(f"total rows: {len(df_raw)}")
print(df_raw.head(3))

total rows: 42171
  state_code county_code site_number parameter_code  poc  latitude  longitude  \
0         06         025        1003          42602    1  32.79222 -115.56306   
1         06         025        0005          42602    1  32.67618 -115.48307   
2         06         007        0008          42602    1  39.76168 -121.84047   

   datum               parameter sample_duration_code  ...  \
0  WGS84  Nitrogen dioxide (NO2)                    1  ...   
1  NAD83  Nitrogen dioxide (NO2)                    1  ...   
2  NAD83  Nitrogen dioxide (NO2)                    1  ...   

  fiftieth_percentile tenth_percentile        local_site_name  \
0                11.9              4.0   El Centro-9th Street   
1                28.0              8.8  Calexico-Ethel Street   
2                13.8              5.8      Chico-East Avenue   

                          site_address       state    county       city  \
0               150 9TH ST., EL CENTRO  California  Imperial  El Centro 

In [9]:
# Columns carried forward from the raw download (edit this list as needed).
cols_to_keep = [
    'state_code',
    'county_code',
    'year',
    'parameter_code',
    'first_max_value',
    'arithmetic_mean',
    'observation_count',
]

# Build the cleaned table in one chain: take the selected columns, then add
#   County_ID - state and county codes joined with "_" (e.g. "06_025")
#   Pollutant - short name looked up from pollutant_map; codes that are not
#               in the map come out as NaN
# .assign returns a new frame, so df_raw itself is left unchanged.
df_clean = df_raw.loc[:, cols_to_keep].assign(
    County_ID=lambda frame: frame['state_code'] + "_" + frame['county_code'],
    Pollutant=lambda frame: frame['parameter_code'].map(pollutant_map),
)

print(df_clean.head())

  state_code county_code  year parameter_code  first_max_value  \
0         06         025  2018          42602             34.1   
1         06         025  2018          42602             73.0   
2         06         007  2018          42602             51.9   
3         06         023  2018          42602             58.1   
4         06         023  2018          42602             11.0   

   arithmetic_mean  observation_count County_ID Pollutant  
0        13.433537               3794    06_025       NO2  
1        28.111461               8092    06_025       NO2  
2        15.176731               8338    06_007       NO2  
3         7.043844               7962    06_023       NO2  
4         1.633441               7465    06_023       NO2  
