In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE: disabled helper, kept for reference only; nothing below is executed.
# It imports google.colab and writes under '/content/', so it presumably only
# works inside a Colab runtime. It reads every member of a zip archive as CSV,
# re-saves it under '/content/' and triggers a browser download of the copy.
# import zipfile
# import io
# from google.colab import files

# def save_csv_from_zip(zip_file_path):
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         for file_name in zip_ref.namelist():
#             with zip_ref.open(file_name) as csv_file:
#                 csv_data = io.StringIO(csv_file.read().decode('utf-8'))
#                 df = pd.read_csv(csv_data)
#                 NOTE(review): file_name is the archive member name, so this
#                 replace only has an effect if that name itself contains '.zip'.
#                 file_name = file_name.replace('.zip', '.csv')
#                 file_path = '/content/' + file_name
#                 df.to_csv(file_path, index=False)
#                 files.download(file_path)

From https://www.epa.gov/outdoor-air-quality-data/about-air-data-reports:

"The AirData Air Quality Index Summary Report displays an annual summary of Air Quality Index (AQI) values for counties or core based statistical areas (CBSA). Air Quality Index is an indicator of overall air quality, because it takes into account all of the criteria air pollutants measured within a geographic area. Although AQI includes all available pollutant measurements, you should be aware that many areas have monitoring stations for some, but not all, of the pollutants. Each row of the AQI Report lists summary values for one year for one county or CBSA. The summary values include both qualitative measures (days of the year having "good" air quality, for example) and descriptive statistics (median AQI value, for example)."



# EPA Air Quality Index

Use the EPA's annual AQI-by-CBSA files (`annual_aqi_by_cbsa_<year>.csv`), downloaded from:
https://aqs.epa.gov/aqsweb/airdata/download_files.html#Annual

## Function to get annual median AQI values (averaged across each state's CBSAs)

In [3]:
# abbreviations of the states of interest, passed to get_annual_median_AQI_values
state_names = [
    "CA", "TX", "ND", "WY", "PA",
    "WV", "OK", "FL", "NY", "OH",
]

In [4]:
def get_annual_median_AQI_values(year_csv, year, state_names):
    """Build a one-row table of per-state average 'Median AQI' for one year.

    Every CBSA row is labelled with the text that follows ', ' in its 'CBSA'
    name, and the 'Median AQI' values of all CBSAs sharing a label are
    averaged (arithmetic mean).

    Parameters
    ----------
    year_csv : str, path object or file-like
        Passed straight to ``pandas.read_csv``; the data must contain the
        columns 'CBSA' and 'Median AQI'.
    year : object
        Stored unchanged in the 'year' column of the result.
    state_names : list of str
        State abbreviations to report. Each one becomes a column of the
        result, in the order given.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'year' followed by one column per entry
        of ``state_names``. A state with no matching CBSA gets ``None``.

    Notes
    -----
    Labels are matched verbatim against ``state_names``. A CBSA whose suffix
    lists several states (for example 'OH-KY-IN') therefore does not
    contribute to any of the individual states.
    """
    annual_aqi_df = pd.read_csv(year_csv)

    # Build a fresh two-column frame instead of assigning a new column onto a
    # column subset, which can raise SettingWithCopyWarning on some pandas versions.
    state_aqi_df = pd.DataFrame({
        'state': annual_aqi_df['CBSA'].str.split(', ').str[1],
        'Median AQI': annual_aqi_df['Median AQI'],
    })

    # keep only the requested states, then average Median AQI within each state
    wanted_rows = state_aqi_df['state'].isin(state_names)
    state_aqi_medians = (
        state_aqi_df[wanted_rows]
        .groupby('state')['Median AQI']
        .mean()
    )

    # One column per requested state (driven by state_names rather than a
    # hard-coded list); Series.get yields None for states without any data.
    row = {'year': [year]}
    for state in state_names:
        row[state] = [state_aqi_medians.get(state)]

    return pd.DataFrame(row)


# Retrieve data and create CSV

#### Create the initial data frame with 1 row (year 2000)

In [5]:
# single-row frame for the first year (2000); later years are stacked onto it
annual_median_AQI_2000_df = get_annual_median_AQI_values(
    year_csv='annual_aqi_by_cbsa_2000.csv',
    year='2000',
    state_names=state_names,
)
annual_median_AQI_2000_df

Unnamed: 0,year,CA,TX,ND,WY,PA,WV,OK,FL,NY,OH
0,2000,53.6,41.588235,35.0,27.714286,40.722222,52.666667,41.076923,40.3,32.066667,41.090909


In [6]:
# start the all-years table from an independent (deep) copy of the 2000 row
total_annual_median_AQI_df = annual_median_AQI_2000_df.copy(deep=True)
total_annual_median_AQI_df

Unnamed: 0,year,CA,TX,ND,WY,PA,WV,OK,FL,NY,OH
0,2000,53.6,41.588235,35.0,27.714286,40.722222,52.666667,41.076923,40.3,32.066667,41.090909


In [7]:
# years 2001 through 2022 as strings (the range end, 2023, is exclusive)
years = [str(calendar_year) for calendar_year in range(2001, 2023)]
print(years)

['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']


In [8]:
# Collect one single-row frame per year. The list starts from the 2000 frame
# (not from total_annual_median_AQI_df) so that re-running this cell rebuilds
# the table instead of appending the same years a second time.
yearly_frames = [annual_median_AQI_2000_df]

# for each year
for year in years:

  file_name = 'annual_aqi_by_cbsa_'+year+'.csv'

  # create df for that year
  annual_median_AQI_df = get_annual_median_AQI_values(file_name, year, state_names)

  yearly_frames.append(annual_median_AQI_df)

# stack all rows with one concat call rather than re-copying the growing
# frame on every iteration; row labels are left as-is (all 0) at this point
total_annual_median_AQI_df = pd.concat(yearly_frames, axis=0)

In [9]:
# renumber the rows 0..n-1, discarding the old row labels instead of
# keeping them as an extra column
total_annual_median_AQI_df = (
    total_annual_median_AQI_df
    .reset_index(drop=True)
)
total_annual_median_AQI_df

Unnamed: 0,year,CA,TX,ND,WY,PA,WV,OK,FL,NY,OH
0,2000,53.6,41.588235,35.0,27.714286,40.722222,52.666667,41.076923,40.3,32.066667,41.090909
1,2001,54.257143,41.611111,35.0,31.571429,45.333333,52.166667,41.083333,38.681818,35.0,43.333333
2,2002,57.617647,41.388889,33.5,30.142857,43.666667,48.5,38.833333,38.52381,34.6,45.894737
3,2003,55.0,41.736842,29.0,31.571429,42.055556,47.833333,40.5,38.380952,33.133333,44.315789
4,2004,53.294118,38.555556,30.0,28.428571,39.684211,46.5,37.636364,41.761905,32.769231,41.105263
5,2005,49.264706,44.25,34.5,28.0,43.631579,49.5,41.0,41.857143,36.153846,43.388889
6,2006,51.382353,44.052632,36.5,32.571429,40.1,47.666667,42.777778,43.47619,33.0,39.888889
7,2007,53.323529,43.3,34.5,36.625,41.35,52.5,41.777778,41.761905,36.153846,41.5
8,2008,54.794118,42.428571,37.0,36.0,41.631579,45.666667,40.555556,39.5,36.307692,41.388889
9,2009,51.294118,40.565217,25.333333,33.875,40.631579,44.666667,45.0,39.318182,34.923077,38.277778


In [10]:
# write the all-years table to disk; the row index is written too (pandas'
# default), so the file starts with an unnamed index column
total_annual_median_AQI_df.to_csv(
    'total_annual_median_AQI.csv',
    index=True,
)