### AQS Data Exploration

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime 

In [2]:
aqs_df = pd.read_csv('../data/epa_aqs_data.csv', index_col=0)

#### Filter the Resulting DataFrame (Only Keep Parameters with Data For All Four States)

In [3]:
aqs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2191 entries, 0 to 2190
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   state_code                   2191 non-null   int64  
 1   county_code                  2191 non-null   int64  
 2   site_number                  2191 non-null   int64  
 3   parameter_code               2191 non-null   int64  
 4   poc                          2191 non-null   int64  
 5   latitude                     2191 non-null   float64
 6   longitude                    2191 non-null   float64
 7   datum                        2191 non-null   object 
 8   parameter                    2191 non-null   object 
 9   sample_duration_code         2191 non-null   object 
 10  sample_duration              2191 non-null   object 
 11  pollutant_standard           1638 non-null   object 
 12  metric_used                  2191 non-null   object 
 13  method            

In [4]:
aqs_df.head()

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration_code,...,fiftieth_percentile,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,6,37,6012,42101,1,34.38344,-118.5284,WGS84,Carbon monoxide,1,...,0.2,0.1,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08
1,6,37,6012,42101,1,34.38344,-118.5284,WGS84,Carbon monoxide,Z,...,0.2,0.1,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08
2,6,37,113,42101,1,34.05111,-118.45636,WGS84,Carbon monoxide,1,...,0.3,0.2,West Los Angeles,"VA HOSPITAL, WEST LOS ANGELES",California,Los Angeles,West Los Angeles,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-18
3,6,37,113,42101,1,34.05111,-118.45636,WGS84,Carbon monoxide,Z,...,0.3,0.2,West Los Angeles,"VA HOSPITAL, WEST LOS ANGELES",California,Los Angeles,West Los Angeles,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-18
4,6,37,1701,42101,1,34.06703,-117.7514,WGS84,Carbon monoxide,1,...,0.3,0.0,Pomona,"924 N. GAREY AVE., POMONA",California,Los Angeles,Pomona,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08


#### Rename Relevant Fields

In [5]:
aqs_df = aqs_df.rename(columns={'first_max_value':'max_value_1', 
                                'first_max_datetime':'max_month_1', 
                                'second_max_value':'max_value_2', 
                                'second_max_datetime':'max_month_2',
                                'third_max_value':'max_value_3', 
                                'third_max_datetime':'max_month_3', 
                                'fourth_max_value':'max_value_4', 
                                'fourth_max_datetime':'max_month_4',
                                'ninety_ninth_percentile':'perc_99',
                                'ninety_fifth_percentile':'perc_95',
                                'ninetieth_percentile':'perc_90',
                                'seventy_fifth_percentile':'perc_75'})



#### Create a Calculated Field for the Average of the Four Max Measurement Values

In [6]:
aqs_df['avg_max_value'] = [round(np.mean([row.max_value_1, row.max_value_2, row.max_value_3, row.max_value_4]), 1) for ind, row in aqs_df.iterrows()]

In [7]:
aqs_df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration_code,...,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change,avg_max_value
0,6,37,6012,42101,1,34.383440,-118.52840,WGS84,Carbon monoxide,1,...,0.1,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,1.2
1,6,37,6012,42101,1,34.383440,-118.52840,WGS84,Carbon monoxide,Z,...,0.1,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,0.8
2,6,37,113,42101,1,34.051110,-118.45636,WGS84,Carbon monoxide,1,...,0.2,West Los Angeles,"VA HOSPITAL, WEST LOS ANGELES",California,Los Angeles,West Los Angeles,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-18,2.8
3,6,37,113,42101,1,34.051110,-118.45636,WGS84,Carbon monoxide,Z,...,0.2,West Los Angeles,"VA HOSPITAL, WEST LOS ANGELES",California,Los Angeles,West Los Angeles,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-18,1.4
4,6,37,1701,42101,1,34.067030,-117.75140,WGS84,Carbon monoxide,1,...,0.0,Pomona,"924 N. GAREY AVE., POMONA",California,Los Angeles,Pomona,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2186,47,37,23,88502,3,36.176382,-86.73890,WGS84,Acceptable PM2.5 AQI & Speciation Mass,X,...,5.9,Lockeland,105 SOUTH 17TH ST @ LOCKELAND SCHOOL,Tennessee,Davidson,Nashville,34980,"Nashville-Davidson--Murfreesboro--Franklin, TN",2020-12-21,25.5
2187,47,37,23,88502,3,36.176382,-86.73890,WGS84,Acceptable PM2.5 AQI & Speciation Mass,1,...,4.6,Lockeland,105 SOUTH 17TH ST @ LOCKELAND SCHOOL,Tennessee,Davidson,Nashville,34980,"Nashville-Davidson--Murfreesboro--Franklin, TN",2020-12-21,37.2
2188,47,37,23,88502,5,36.176382,-86.73890,WGS84,Acceptable PM2.5 AQI & Speciation Mass,7,...,5.3,Lockeland,105 SOUTH 17TH ST @ LOCKELAND SCHOOL,Tennessee,Davidson,Nashville,34980,"Nashville-Davidson--Murfreesboro--Franklin, TN",2020-05-21,23.7
2189,47,37,23,88502,3,36.176382,-86.73890,WGS84,Acceptable PM2.5 AQI & Speciation Mass,X,...,6.3,Lockeland,105 SOUTH 17TH ST @ LOCKELAND SCHOOL,Tennessee,Davidson,Nashville,34980,"Nashville-Davidson--Murfreesboro--Franklin, TN",2020-12-21,18.6


##### Classify Measurement Percentile???

#### Relabel Params

In [34]:
aqs_df.parameter.value_counts()

PM2.5 - Local Conditions                  718
Ozone                                     380
Nitrogen dioxide (NO2)                    202
Nitric oxide (NO)                         155
Acceptable PM2.5 AQI & Speciation Mass    155
Sulfur dioxide                            149
Carbon monoxide                           140
PM10 Total 0-10um STP                     123
Name: parameter, dtype: int64

In [8]:
for ind, row in aqs_df.iterrows():
    if row.parameter == 'PM2.5 - Local Conditions':
        aqs_df.loc[ind, 'parameter'] = 'PM2.5 - LC'
    elif row.parameter == 'Acceptable PM2.5 AQI & Speciation Mass':
        aqs_df.loc[ind, 'parameter'] = 'PM2.5 - Acc/SM'
aqs_df['parameter'] = ['PM2.5 - LC' if row.parameter == 'PM2.5 - Local Conditions' else row.county for index, row in aqs_df.iterrows()]

#### Relabel Atlanta's Counties as a Joint County

In [8]:
aqs_df['county'] = ['Fulton, DeKalb' if row.state == 'Georgia' else row.county for index, row in aqs_df.iterrows()]

#### Subset for Parameters with Data in All Four States

In [9]:
state_counts = pd.DataFrame(aqs_df.groupby(['parameter_code','state']).state.count())
state_counts = state_counts.rename(columns={'state':'count'}).reset_index()

In [10]:
aqs_params = state_counts.groupby('parameter_code').state.count()
valid_params = list(aqs_params[aqs_params.values == 4].index)
aqs_df = aqs_df.loc[aqs_df.parameter_code.isin(valid_params)]

#### Extract the Month from the Date Fields

In [11]:
aqs_df['max_month_1']

0       2011-02-07 08:00
1       2011-12-21 23:00
2       2011-10-04 05:00
3       2011-12-31 23:00
4       2011-12-31 20:00
              ...       
2186    2011-05-31 00:00
2187    2011-08-01 06:00
2188    2011-06-08 00:00
2189    2016-07-22 00:00
2190    2016-11-13 21:00
Name: max_month_1, Length: 2022, dtype: object

In [12]:
pd.to_datetime(aqs_df['max_month_1']).dt.month

0        2
1       12
2       10
3       12
4       12
        ..
2186     5
2187     8
2188     6
2189     7
2190    11
Name: max_month_1, Length: 2022, dtype: int64

In [13]:
pd.set_option('mode.chained_assignment', None)
cols = ['max_month_1', 'max_month_2', 'max_month_3', 'max_month_4']
for col in cols:
    aqs_df[col] = pd.to_datetime(aqs_df[col]).dt.month
aqs_df.head()

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration_code,...,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change,avg_max_value
0,6,37,6012,42101,1,34.38344,-118.5284,WGS84,Carbon monoxide,1,...,0.1,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,1.2
1,6,37,6012,42101,1,34.38344,-118.5284,WGS84,Carbon monoxide,Z,...,0.1,Santa Clarita,"22224 PLACERITA CANYON RD, SANTA CLARITA",California,Los Angeles,Santa Clarita,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,0.8
2,6,37,113,42101,1,34.05111,-118.45636,WGS84,Carbon monoxide,1,...,0.2,West Los Angeles,"VA HOSPITAL, WEST LOS ANGELES",California,Los Angeles,West Los Angeles,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-18,2.8
3,6,37,113,42101,1,34.05111,-118.45636,WGS84,Carbon monoxide,Z,...,0.2,West Los Angeles,"VA HOSPITAL, WEST LOS ANGELES",California,Los Angeles,West Los Angeles,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-18,1.4
4,6,37,1701,42101,1,34.06703,-117.7514,WGS84,Carbon monoxide,1,...,0.0,Pomona,"924 N. GAREY AVE., POMONA",California,Los Angeles,Pomona,31080,"Los Angeles-Long Beach-Anaheim, CA",2021-11-08,2.0


#### Subset for Relevant Fields

In [14]:
aqs_df = aqs_df[['parameter', 'year', 'state', 'county', 'metric_used', 'method', 
                 'event_type', 'observation_count', 'observation_percent', 'validity_indicator', 'valid_day_count', 'required_day_count', 
                 'poc', 'exceptional_data_count', 'primary_exceedance_count', 'certification_indicator',
                 'arithmetic_mean', 'standard_deviation', 'units_of_measure', 'avg_max_value',
                 'max_month_1', 'max_value_1', 'max_month_2', 'max_value_2', 
                 'max_month_3', 'max_value_3', 'max_month_4', 'max_value_4', 
                 'perc_99', 'perc_95', 'perc_90', 'perc_75', 
                 'state_code', 'county_code', 'parameter_code', 'duration''latitude', 'longitude']]

In [15]:
aqs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2022 entries, 0 to 2190
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   year                      2022 non-null   int64  
 1   state                     2022 non-null   object 
 2   county                    2022 non-null   object 
 3   parameter                 2022 non-null   object 
 4   metric_used               2022 non-null   object 
 5   method                    1348 non-null   object 
 6   event_type                2022 non-null   object 
 7   observation_count         2022 non-null   int64  
 8   observation_percent       2022 non-null   float64
 9   validity_indicator        2022 non-null   object 
 10  valid_day_count           2022 non-null   int64  
 11  required_day_count        2022 non-null   int64  
 12  poc                       2022 non-null   int64  
 13  exceptional_data_count    2022 non-null   int64  
 14  primary_

In [16]:
aqs_df.metric_used.value_counts()

Daily Mean                                                              439
Observed Values                                                         418
Quarterly Means of Daily Means                                          341
Daily maximum of 8 hour running average of observed hourly values       190
Observed values                                                         101
Daily Maximum 1-hour average                                            101
Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)     95
Daily maximum of 8-hour running average                                  95
Obseved hourly values                                                    70
8-Hour running average (end hour) of observed hourly values              70
Daily maximum 1-hour average                                             34
Daily Average of observed values                                         34
3-Hour block average of observed hourly values                           34
Name: metric

In [17]:
aqs_df.groupby(['parameter_code', 'units_of_measure']).parameter.count()

parameter_code  units_of_measure             
42101           Parts per million                140
42401           Parts per billion                149
42601           Parts per billion                155
42602           Parts per billion                202
44201           Parts per million                380
81102           Micrograms/cubic meter (25 C)    123
88101           Micrograms/cubic meter (LC)      718
88502           Micrograms/cubic meter (LC)      155
Name: parameter, dtype: int64

In [22]:
# NOTE: 300 Records are 'NOT VALID'
aqs_df.validity_indicator.value_counts()

Y    1707
N     315
Name: validity_indicator, dtype: int64

In [30]:
param_avgs = aqs_df.groupby(['parameter', 'year', 'state', 'county', 'units_of_measure']).avg_max_value.mean()

parameter                               year  state       county          units_of_measure           
Acceptable PM2.5 AQI & Speciation Mass  2011  California  Los Angeles     Micrograms/cubic meter (LC)     68.407407
                                                          San Diego       Micrograms/cubic meter (LC)     79.020000
                                              Colorado    Denver          Micrograms/cubic meter (LC)     16.700000
                                              Georgia     Fulton, DeKalb  Micrograms/cubic meter (LC)     43.133333
                                              Tennessee   Davidson        Micrograms/cubic meter (LC)     28.800000
                                        2016  California  Los Angeles     Micrograms/cubic meter (LC)     65.666667
                                                          San Diego       Micrograms/cubic meter (LC)     50.057143
                                              Georgia     Fulton, DeKalb  Micrograms/c

In [31]:
param_avgs.head(20)

parameter                               year  state       county          units_of_measure           
Acceptable PM2.5 AQI & Speciation Mass  2011  California  Los Angeles     Micrograms/cubic meter (LC)     68.407407
                                                          San Diego       Micrograms/cubic meter (LC)     79.020000
                                              Colorado    Denver          Micrograms/cubic meter (LC)     16.700000
                                              Georgia     Fulton, DeKalb  Micrograms/cubic meter (LC)     43.133333
                                              Tennessee   Davidson        Micrograms/cubic meter (LC)     28.800000
                                        2016  California  Los Angeles     Micrograms/cubic meter (LC)     65.666667
                                                          San Diego       Micrograms/cubic meter (LC)     50.057143
                                              Georgia     Fulton, DeKalb  Micrograms/c

In [20]:
aqs_df.to_csv('../data/aqs_data_cleaned.csv')

#### Merge the AQI Category Data with the Filtered Parameter Data

#You can use the module pandasql to join two dataframed based on value between two values

print df_1

  timestamp              A          B
0 2016-05-14 10:54:33    0.020228   0.026572
1 2016-05-14 10:54:34    0.057780   0.175499
2 2016-05-14 10:54:35    0.098808   0.620986
3 2016-05-14 10:54:36    0.158789   1.014819
4 2016-05-14 10:54:39    0.038129   2.384590


print df_2

  start                end                  event    
0 2016-05-14 10:54:31  2016-05-14 10:54:33  E1
1 2016-05-14 10:54:34  2016-05-14 10:54:37  E2
2 2016-05-14 10:54:38  2016-05-14 10:54:42  E3

import pandasql as ps

sqlcode = '''
select df_1.timestamp
,df_1.A
,df_1.B
,df_2.event
from df_1 
inner join df_2 
on df_1.timestamp ?[where df_1.timestamp]? between df_2.start and df_2.end
'''

newdf = ps.sqldf(sqlcode,locals())

SELECT *
FROM A,B
WHERE A_value between B_low and B_high