# Create AQI file from EPA data

In [2]:
# Libraries and Preliminaries

import pandas as pd
import os
import numpy as np
from functools import reduce
# Root directory of the AQI data. The yearly input files are looked up here as
# 'all_sites_<pollutant>_<year>.csv' and the cleaned output is written to the
# 'final_data' subfolder. Replace the placeholder with a real path before running.
project_root_dir = 'put_your_root_directory_here/aqi_data/'

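The following code builds the EPA AQI dataset: it loads the yearly ozone and PM2.5 daily AQI files, merges them by date and site, and assigns each record an AQI class.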

In [3]:
# AQI category names, ordered from the lowest AQI range to the highest.
aqi_classes = [
    'Good',
    'Moderate',
    'Unhealthy for sensitive groups',
    'Unhealthy',
    'Very unhealthy',
    'Hazardous',
]

# Bin edges for the categories above (one more edge than categories);
# the last bin is open-ended.
aqi_cutoffs = np.array([0, 50, 100, 150, 200, 300, float('inf')])

# Years of data to load, 2016 through 2020 inclusive.
year_list = list(range(2016, 2021))

In [None]:
def load_data_in_year_range(base_file, file_type, year_range):
    """
    Loads all the data in a given year range for the specified pollutant.

    One CSV per year is read from '<base_file>_<file_type>_<year>.csv' and the
    yearly tables are stacked in the order given by year_range.

    Args:
        base_file: file prefix for loading
        file_type: one of 'ozone' or 'pm2.5' (can add more if using every pollutant);
            also used as the prefix of the returned AQI column name
        year_range: list of years to look for in the directory (must not be empty)

    Returns:
        Dataframe with columns 'Date' (parsed to datetime), 'Site_ID' and
        '<file_type>_AQI' for the specified range and particle, indexed 0..n-1.

    Raises:
        ValueError: if year_range contains no years.
    """
    years = list(year_range)
    if not years:
        raise ValueError('year_range must contain at least one year')

    yearly_frames = [
        pd.read_csv('{}_{}_{}.csv'.format(base_file, file_type, year))
        for year in years
    ]

    # Concatenate once instead of pairwise: pairwise concatenation re-copies the
    # accumulated rows at every step.
    combined = pd.concat(yearly_frames, ignore_index=True)

    # Keep only the columns needed downstream and give them code-friendly names.
    combined = combined[["Date", "Site ID", "DAILY_AQI_VALUE"]]
    combined = combined.rename(
        columns={'Site ID': 'Site_ID', 'DAILY_AQI_VALUE': file_type + '_AQI'}
    )
    combined['Date'] = pd.to_datetime(combined['Date'])
    return combined

In [None]:
def clean_AQI_csv(aqi_classes, aqi_cutoffs, year_list, date_cutoff='2020-10-11'):
    """
    Clean all of the AQI files in the current AQI directory.

    Loads the yearly ozone and PM2.5 files found under project_root_dir, merges
    them on date and site, derives the overall AQI as the larger of the two
    pollutant AQIs, bins the values into classes, and writes the result to
    '<project_root_dir>/final_data/all_sites_data.csv'.

    Args:
        aqi_classes: list of air quality class names under consideration
        aqi_cutoffs: numpy array of the cutoffs corresponding to the above classes
            (one more edge than there are classes)
        year_list: list of years to be loaded
        date_cutoff: last date (inclusive) to keep; the default replicates the
            original analysis

    Returns:
        Dataframe with AQI information for training (the same table that is
        written to disk).
    """

    base_file = os.path.join(project_root_dir, 'all_sites')

    # Load datasets
    ozone_data = load_data_in_year_range(base_file, 'ozone', year_list)
    pm_data = load_data_in_year_range(base_file, 'pm2.5', year_list)

    # Merge the datasets; a pollutant with no reading for a date/site becomes 0
    site = pd.merge(ozone_data, pm_data, how='outer', on=['Date', 'Site_ID'])
    site = site.fillna(0)

    # Drop rows where both pollutants are 0. The explicit copy makes the column
    # assignments below write to an independent frame rather than a slice.
    has_reading = (site["ozone_AQI"] != 0) | (site["pm2.5_AQI"] != 0)
    site = site[has_reading].copy()

    # Overall AQI is the max of each particle's AQI (vectorised row-wise max)
    site['AQI'] = site[['ozone_AQI', 'pm2.5_AQI']].max(axis=1)

    # Cast the AQI into classes. Integer codes are derived from the number of
    # bins so they stay in sync with the cutoffs that were passed in.
    # NOTE: pd.cut uses right-closed bins by default, so a value of exactly 0
    # (e.g. a pollutant that was missing and filled with 0) gets a missing class.
    class_codes = np.arange(len(aqi_cutoffs) - 1)
    site['class'] = pd.cut(site['AQI'], bins=aqi_cutoffs, labels=class_codes)
    site['pm2.5_class'] = pd.cut(site['pm2.5_AQI'], bins=aqi_cutoffs, labels=class_codes)
    site['AQI_class'] = pd.cut(site['AQI'], bins=aqi_cutoffs, labels=aqi_classes)
    site['pm2.5_AQI_class'] = pd.cut(site['pm2.5_AQI'], bins=aqi_cutoffs, labels=aqi_classes)

    # Date cutoff to replicate our analysis, then keep the first row per date/site
    site = site.sort_values(['Date', 'Site_ID'])
    site = site[site['Date'] <= date_cutoff]
    site = site.drop_duplicates(subset=['Date', 'Site_ID'])

    # Save final output file, creating the output folder if it does not exist yet
    output_dir = os.path.join(project_root_dir, 'final_data')
    os.makedirs(output_dir, exist_ok=True)
    site.to_csv(os.path.join(output_dir, 'all_sites_data.csv'), index=False)

    return site

In [None]:
# Run the cleaning function with the classes, cutoffs and years defined above.
# This writes 'all_sites_data.csv' into the 'final_data' folder under project_root_dir.
clean_AQI_csv(aqi_classes, aqi_cutoffs, year_list)