### AQS Data Exploration

In [1]:
import pandas as pd
import pandasql as ps
import numpy as np
from datetime import datetime 

In [2]:
# Read the raw AQS export that the rest of the notebook cleans and reshapes.
aqs_df = pd.read_csv(filepath_or_buffer='../data/aqs_data.csv')

In [3]:
# Load the AQI breakpoint table and normalise its headers to snake_case
# (lower-cased, spaces replaced by underscores).
aqi_bp = pd.read_csv('../data/aqi_breakpoints.csv')
aqi_bp = aqi_bp.rename(columns=lambda header: header.lower().replace(' ', '_'))
# Discard breakpoint rows whose category is the literal string 'NONE'.
aqi_bp = aqi_bp[aqi_bp['aqi_category'] != 'NONE']

In [4]:
# Sanity check: no breakpoint row should still carry the 'NONE' category.
# The filter is already applied when the table is loaded, so repeating it here
# was a no-op; asserting the invariant documents the expectation instead.
assert (aqi_bp['aqi_category'] != 'NONE').all(), "unexpected 'NONE' AQI categories remain"

#### Declare the Static Variables

In [5]:
# County label -> associated city. Keys are the county labels as they appear
# in the `county` column (including the joint 'Fulton, DeKalb' label); values
# are the city each county is associated with.
city_county_dict = dict([
    ('Los Angeles', 'Los Angeles'),
    ('San Diego', 'San Diego'),
    ('Denver', 'Denver'),
    ('Davidson', 'Nashville'),
    ('Fulton, DeKalb', 'Atlanta'),
])

#### Give More Concise Names to Relevant Fields

In [6]:
# Shorten the verbose AQS column names. The mapping is grouped by theme:
# units/duration, the four highest values with their timestamps, percentiles.
aqs_df = aqs_df.rename(columns=dict([
    # Units and sample duration
    ('units_of_measure', 'measure_units'),
    ('sample_duration_code', 'duration_code'),
    # Four highest measurement values and when they occurred
    ('first_max_value', 'max_value1'),
    ('first_max_datetime', 'max_month1'),
    ('second_max_value', 'max_value2'),
    ('second_max_datetime', 'max_month2'),
    ('third_max_value', 'max_value3'),
    ('third_max_datetime', 'max_month3'),
    ('fourth_max_value', 'max_value4'),
    ('fourth_max_datetime', 'max_month4'),
    # Percentiles
    ('ninety_ninth_percentile', 'percentile99'),
    ('ninety_fifth_percentile', 'percentile95'),
    ('ninetieth_percentile', 'percentile90'),
    ('seventy_fifth_percentile', 'percentile75'),
]))

#### Create a Calculated Field for the Average of the Four Max Measurement Values

In [7]:
# Row-wise mean of the four highest measurement values, rounded to one decimal.
# Vectorised replacement for the per-row iterrows() loop; skipna=False keeps
# the original semantics where any missing max value yields a missing average.
# NOTE(review): one-decimal rounding is coarse for parameters reported in
# parts per million — confirm it is adequate before matching against
# AQI breakpoints.
aqs_df['avg_max_value'] = (
    aqs_df[['max_value1', 'max_value2', 'max_value3', 'max_value4']]
    .mean(axis=1, skipna=False)
    .round(1)
)

##### TODO: Classify Measurement Percentile?

#### Relabel Params

In [8]:
# How many records exist for each parameter name?
aqs_df['parameter'].value_counts()

PM2.5 - Local Conditions                  718
Ozone                                     380
Nitrogen dioxide (NO2)                    202
Nitric oxide (NO)                         155
Acceptable PM2.5 AQI & Speciation Mass    155
Sulfur dioxide                            149
Carbon monoxide                           140
PM10 Total 0-10um STP                     123
Benzene                                    75
Name: parameter, dtype: int64

#### Relabel Atlanta's Counties as a Joint County

In [9]:
# Treat Atlanta's counties as one joint county: every Georgia record receives
# the combined label, all other records keep their own county value.
# Vectorised replacement for the per-row iterrows() loop.
aqs_df['county'] = aqs_df['county'].mask(aqs_df['state'] == 'Georgia', 'Fulton, DeKalb')

#### Add Associated Cities (for potential merging purposes)

In [10]:
# Look up the associated city for every record's county in one vectorised
# pass instead of iterating row by row.
assoc_city = aqs_df['county'].map(city_county_dict)

# Series.map yields NaN for counties missing from the lookup; keep the
# fail-fast behaviour of a plain dict lookup by raising KeyError explicitly.
unmapped = aqs_df.loc[assoc_city.isna(), 'county'].unique()
if len(unmapped):
    raise KeyError(f'No associated city configured for counties: {list(unmapped)}')

aqs_df['assoc_city'] = assoc_city

#### Subset for Parameters with Data in All Four States

In [11]:
# Count the records for every (parameter_code, state) pair and flatten the
# result into three columns: parameter_code, state, count.
state_counts = (
    aqs_df.groupby(['parameter_code', 'state'])['state']
    .count()
    .reset_index(name='count')
)

In [12]:
# Number of state rows recorded in state_counts for each parameter code.
aqs_params = state_counts.groupby('parameter_code')['state'].count()

# Keep only the parameter codes that have data in all four states.
valid_params = aqs_params[aqs_params == 4].index.tolist()

# .copy() makes the filtered frame independent of the original, so later
# column assignments on aqs_df do not raise SettingWithCopyWarning.
aqs_df = aqs_df.loc[aqs_df['parameter_code'].isin(valid_params)].copy()
valid_params

[42101, 42401, 42601, 42602, 44201, 81102, 88101, 88502]

#### Extract the Month from the Date Fields

In [13]:
# Replace each "max" timestamp column with the month number it falls in.
# assign() returns a new frame, so no chained-assignment warning is raised and
# the warning no longer has to be disabled globally for the whole session.
# NOTE(review): this cell does not look re-runnable — on a second run the
# columns already hold integers, which to_datetime would re-interpret as
# timestamps. Confirm and re-run from the load cell if needed.
cols = ['max_month1', 'max_month2', 'max_month3', 'max_month4']
aqs_df = aqs_df.assign(**{col: pd.to_datetime(aqs_df[col]).dt.month for col in cols})
aqs_df.head(1)

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,duration_code,...,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change,avg_max_value,assoc_city
0,6,37,6012,42101,1,34.38344,-118.5284,WGS84,Carbon monoxide,1,...,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,1.2,Los Angeles


#### Subset for and Reorder Relevant Fields

In [14]:
# Keep only the fields used from here on, in presentation order.
aqs_df = aqs_df.loc[:, [
    # Record identity
    'parameter', 'year', 'state', 'assoc_city', 'county',
    # Units and the derived summary value
    'measure_units', 'avg_max_value',
    # Four highest values with the month each occurred in
    'max_month1', 'max_value1',
    'max_month2', 'max_value2',
    'max_month3', 'max_value3',
    'max_month4', 'max_value4',
    # Percentiles
    'percentile99', 'percentile95', 'percentile90', 'percentile75',
    # Codes and sample duration
    'parameter_code', 'duration_code', 'sample_duration',
    # Coordinates
    'latitude', 'longitude',
]]

In [15]:
# Record count per (parameter code, unit of measure) pair — shows which unit
# each parameter code is reported in.
aqs_df.groupby(['parameter_code', 'measure_units'])['parameter'].count()

parameter_code  measure_units                
42101           Parts per million                140
42401           Parts per billion                149
42601           Parts per billion                155
42602           Parts per billion                202
44201           Parts per million                380
81102           Micrograms/cubic meter (25 C)    123
88101           Micrograms/cubic meter (LC)      718
88502           Micrograms/cubic meter (LC)      155
Name: parameter, dtype: int64

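# NOTE: 300 records are flagged 'NOT VALID' in `validity_indicator`.
# This field can be used as a filter once more years of data are included.
# (`validity_indicator` is not among the columns kept in the subset above, so the check below must run before that step.)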
aqs_df.validity_indicator.value_counts()

In [16]:
# Persist the cleaned AQS data; the DataFrame index is not written.
aqs_df.to_csv(path_or_buf='../data/aqs_data_cleaned.csv', index=False)

In [17]:
# (rows, columns) of the cleaned frame after filtering and column selection.
aqs_df.shape

(2022, 24)

#### Merge the AQI Category Data with the Filtered Parameter Data

In [18]:
# Column dtypes and non-null counts of the cleaned frame, checked before the join.
aqs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2022 entries, 0 to 2096
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   parameter        2022 non-null   object 
 1   year             2022 non-null   int64  
 2   state            2022 non-null   object 
 3   assoc_city       2022 non-null   object 
 4   county           2022 non-null   object 
 5   measure_units    2022 non-null   object 
 6   avg_max_value    2022 non-null   float64
 7   max_month1       2022 non-null   int64  
 8   max_value1       2022 non-null   float64
 9   max_month2       2022 non-null   int64  
 10  max_value2       2022 non-null   float64
 11  max_month3       2022 non-null   int64  
 12  max_value3       2022 non-null   float64
 13  max_month4       2022 non-null   int64  
 14  max_value4       2022 non-null   float64
 15  percentile99     2022 non-null   float64
 16  percentile95     2022 non-null   float64
 17  percentile90  

In [19]:
# Parameter names that have AQI breakpoints defined.
aqi_bp['parameter'].unique()

array(['Acceptable PM2.5 AQI & Speciation Mass', 'Carbon monoxide',
       'Nitrogen dioxide (NO2)', 'Ozone', 'PM10 Total 0-10um STP',
       'PM2.5 - Local Conditions', 'Sulfur dioxide'], dtype=object)

In [20]:
# Parameter names present in the cleaned AQS data.
aqs_df['parameter'].unique()

array(['Carbon monoxide', 'Sulfur dioxide', 'Nitric oxide (NO)',
       'Nitrogen dioxide (NO2)', 'Ozone', 'PM10 Total 0-10um STP',
       'PM2.5 - Local Conditions',
       'Acceptable PM2.5 AQI & Speciation Mass'], dtype=object)

In [21]:
# Distinct parameter names with AQI breakpoints, as a plain Python list.
aqi_params = aqi_bp['parameter'].unique().tolist()
aqi_params

['Acceptable PM2.5 AQI & Speciation Mass',
 'Carbon monoxide',
 'Nitrogen dioxide (NO2)',
 'Ozone',
 'PM10 Total 0-10um STP',
 'PM2.5 - Local Conditions',
 'Sulfur dioxide']

In [22]:
# Distinct parameter names in the cleaned AQS data, as a plain Python list.
aqs_params = aqs_df['parameter'].unique().tolist()
aqs_params

['Carbon monoxide',
 'Sulfur dioxide',
 'Nitric oxide (NO)',
 'Nitrogen dioxide (NO2)',
 'Ozone',
 'PM10 Total 0-10um STP',
 'PM2.5 - Local Conditions',
 'Acceptable PM2.5 AQI & Speciation Mass']

In [23]:
# AQS parameters that have no AQI breakpoints, in their AQS order.
unique_params = list(filter(lambda name: name not in aqi_params, aqs_params))
print(unique_params)

['Nitric oxide (NO)']


In [24]:
# Duration descriptions that have breakpoints for parameter code 42401.
aqi_bp.loc[aqi_bp['parameter_code'] == 42401, 'duration_description'].unique()

array(['1 HOUR', '24-HR BLK AVG'], dtype=object)

In [25]:
# Sample durations recorded in the AQS data for parameter code 42401.
aqs_df.loc[aqs_df['parameter_code'] == 42401, 'sample_duration'].unique()

array(['1 HOUR', '24-HR BLK AVG', '3-HR BLK AVG', '5 MINUTE'],
      dtype=object)

In [26]:
# Duration descriptions that have breakpoints for parameter code 42101.
aqi_bp.loc[aqi_bp['parameter_code'] == 42101, 'duration_description'].unique()

array(['8-HR RUN AVG END HOUR'], dtype=object)

In [27]:
# Sample durations recorded in the AQS data for parameter code 42101.
aqs_df.loc[aqs_df['parameter_code'] == 42101, 'sample_duration'].unique()

array(['1 HOUR', '8-HR RUN AVG END HOUR'], dtype=object)

In [28]:
# Attach an AQI category to each AQS record. A record matches a breakpoint row
# when parameter name, parameter code, duration code and duration description
# all agree and avg_max_value lies within [low_breakpoint, high_breakpoint].
#
# The join predicates are combined with the logical AND operator. The previous
# '&' is SQLite's bitwise operator: it returned the same rows for 0/1
# comparison results, but it is not standard join syntax and may hide the
# equality terms from the query planner.
#
# This is an INNER JOIN, so records without a matching breakpoint row (for
# example parameters or sample durations that have no breakpoints) are dropped.
query = """
        SELECT aqs_df.year,
               aqs_df.state,
               aqs_df.assoc_city,
               aqs_df.county,
               aqs_df.parameter_code,
               aqs_df.parameter,
               aqs_df.measure_units,
               aqs_df.duration_code AS dur_code,
               aqs_df.sample_duration AS dur_desc,
               aqs_df.max_month1,
               aqs_df.max_value1,
               aqs_df.max_month2,
               aqs_df.max_value2,
               aqs_df.max_month3,
               aqs_df.max_value3,
               aqs_df.max_month4,
               aqs_df.max_value4,
               aqs_df.percentile99,
               aqs_df.percentile95,
               aqs_df.percentile90,
               aqs_df.percentile75,
               aqs_df.avg_max_value,
               aqi_bp.low_breakpoint,
               aqi_bp.high_breakpoint,
               aqi_bp.aqi_category
        FROM aqs_df
        INNER JOIN aqi_bp
        ON (aqs_df.parameter = aqi_bp.parameter) AND
           (aqs_df.parameter_code = aqi_bp.parameter_code) AND
           (aqs_df.duration_code = aqi_bp.duration_code) AND
           (aqs_df.sample_duration = aqi_bp.duration_description) AND
           (aqs_df.avg_max_value BETWEEN aqi_bp.low_breakpoint AND aqi_bp.high_breakpoint)
        """

# locals() exposes the notebook's DataFrames (aqs_df, aqi_bp) to the query as tables.
aqs_category_data = ps.sqldf(query,locals())

In [29]:
# (rows, columns) of the joined result; compare with aqs_df.shape to see how
# many records found a matching breakpoint row.
aqs_category_data.shape

(1489, 25)

In [30]:
# Preview the first three joined records.
aqs_category_data.head(n=3)

Unnamed: 0,year,state,assoc_city,county,parameter_code,parameter,measure_units,dur_code,dur_desc,max_month1,...,max_month4,max_value4,percentile99,percentile95,percentile90,percentile75,avg_max_value,low_breakpoint,high_breakpoint,aqi_category
0,2011,California,Los Angeles,Los Angeles,42101,Carbon monoxide,Parts per million,Z,8-HR RUN AVG END HOUR,12,...,12,0.8,0.6,0.5,0.4,0.3,0.8,0.0,4.4,GOOD
1,2011,California,Los Angeles,Los Angeles,42101,Carbon monoxide,Parts per million,Z,8-HR RUN AVG END HOUR,12,...,1,1.3,1.1,0.8,0.7,0.5,1.4,0.0,4.4,GOOD
2,2011,California,Los Angeles,Los Angeles,42101,Carbon monoxide,Parts per million,Z,8-HR RUN AVG END HOUR,12,...,12,1.5,1.1,0.8,0.7,0.5,1.5,0.0,4.4,GOOD


In [31]:
# Export the categorised records to a single-sheet Excel workbook; the
# DataFrame index is not written.
aqs_category_data.to_excel(
    excel_writer='../data/aqs_category_data.xlsx',
    sheet_name='aqs_category_data',
    index=False,
)