In [1]:
# packages
import numpy as np
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# raw data
air_raw = pd.read_csv('/Users/roshanmehta/Downloads/PSTAT/PSTAT 100/Projects/MP1/air-quality.csv')
cbsa_info = pd.read_csv('/Users/roshanmehta/Downloads/PSTAT/PSTAT 100/Projects/MP1/cbsa-info.csv')

# merging data
data = pd.merge(air_raw, cbsa_info, how = 'left', on = 'CBSA')

# combining columns
data['Pollutant statistic'] = data[['Pollutant','Trend Statistic']].agg('-'.join, axis=1)

# dropping irrelevant columns
data.drop(columns = ['Pollutant', 'Trend Statistic', 'Number of Trends Sites'])

# reordering columns
data = data.loc[:,['CBSA','Core Based Statistical Area','Pollutant statistic', "2000", "2001", "2002", 
                   "2003", "2004", "2005", '2006', "2007", "2008", "2009", "2010", "2011", "2012", 
                   "2013", "2014", "2015", "2016", "2017", "2018", "2019"]]

data.head(3)

Unnamed: 0,CBSA,Core Based Statistical Area,Pollutant statistic,2000,2001,2002,2003,2004,2005,2006,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,10100,"Aberdeen, SD",PM10-2nd Max,50.0,58.0,59.0,66.0,39.0,48.0,51.0,...,46.0,29.0,62.0,66.0,36.0,43.0,65.0,40.0,49.0,35.0
1,10100,"Aberdeen, SD",PM2.5-Weighted Annual Mean,8.6,8.6,7.9,8.4,8.1,9.0,8.2,...,8.7,7.1,7.5,7.3,6.2,6.2,5.4,5.8,6.6,5.9
2,10100,"Aberdeen, SD",PM2.5-98th Percentile,23.0,23.0,20.0,21.0,23.0,23.0,21.0,...,27.0,18.0,23.0,22.0,17.0,14.0,14.0,13.0,22.0,18.0


In [4]:
# tidying the data
# we first need to melt the years into a single 'year' variable and then pivot
tidy_data = data.copy()
tidy_data = tidy_data.drop('Core Based Statistical Area', axis = 1)
tidy_data = tidy_data.melt(
    id_vars =['CBSA', 'Pollutant statistic'],
    value_vars = ["2000", "2001", "2002", "2003", "2004", "2005", '2006', "2007", "2008", "2009", "2010", "2011",
                 "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"],
    var_name = 'Year',
    value_name = 'Concentration'
).pivot_table(
    index = ['CBSA', 'Year'], 
    columns = ['Pollutant statistic'], 
    values = 'Concentration').reset_index()

tidy_data.head(5)

Pollutant statistic,CBSA,Year,CO-2nd Max,NO2-98th Percentile,NO2-Annual Mean,O3-4th Max,PM10-2nd Max,PM2.5-98th Percentile,PM2.5-Weighted Annual Mean,Pb-Max 3-Month Average,SO2-99th Percentile
0,10100,2000,,,,,50.0,23.0,8.6,,
1,10100,2001,,,,,58.0,23.0,8.6,,
2,10100,2002,,,,,59.0,20.0,7.9,,
3,10100,2003,,,,,66.0,21.0,8.4,,
4,10100,2004,,,,,39.0,23.0,8.1,,


In [3]:
## PART I - this is merging the data
##########

# number of CBSAs included in the data
data.CBSA.nunique() # there are 351 unique CBSA

# in how many states and territories do the CBSA's reside?
data_mod1 = data.copy()
data_mod1[['City','State/Territory']] = data_mod1['Core Based Statistical Area'].str.split(", ",expand=True)
data_mod1['State/Territory'].str.split('-').explode('State/Territory').unique()
# there are 52 unique territories in the data set - the two territories are PR and DC. 


# In which years were data values recorded?
print(data.columns[3:24]) # we can see that the data was recorded for years 2000-2019.

# How many observations are recorded? / How many variables are measured?
# we will use the tidied data set
tidy_data.shape # 7020 observations were recorded and 9 different variables were measured over a 20 yr. period

# Which variables are non-missing most of the time (i.e., in at least 50% of instances)?
# 7020/2 = 3510
# from this, we can see that the variables 'O3-4th Max', 'PM2.5-98th Percentile', & 'PM2.5-Weighted Annual Mean'
# are non-missing most of the time since they are over 50%.
(7020 - tidy_data.isna().sum()) / int(tidy_data.shape[0]) * 100

# What is PM 2.5 and why is it important?
# PM is particulate matter and 2.5 specifies the size of that particle (2.5 microns or less in diameter). 
# Particulate matter is airborne and not just a single pollutant. It is made up of a mixture of many 
# different chemical substances. For example, the combustion of gasoline, oil, diesel fuel, or wood produce create emmisions, 
# and these make up much of the PM2.5 pollution found the air outdoors. This is important to measure and keep track of 
# because PM2.5 can travel into our respiratory tract, reaching the lungs and even enter the blood stream, posing huge
# health risks. Exposure to increased levels of PM2.5 for extended periods of time is associated with increased chances 
# of early mortality, lung cancer, and heart disease

Index(['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019'],
      dtype='object')


Pollutant statistic
CBSA                          100.000000
Year                          100.000000
CO-2nd Max                     16.809117
NO2-98th Percentile            19.088319
NO2-Annual Mean                25.356125
O3-4th Max                     80.911681
PM10-2nd Max                   29.344729
PM2.5-98th Percentile          60.968661
PM2.5-Weighted Annual Mean     60.968661
Pb-Max 3-Month Average          4.273504
SO2-99th Percentile            25.356125
dtype: float64

In [5]:
# Has PM 2.5 air pollution improved in the U.S. on the whole since 2000?
MeanP25 = tidy_data.loc[:,['Year', 'PM2.5-Weighted Annual Mean','PM2.5-98th Percentile']].groupby(
    'Year').mean().reset_index().rename(
    columns= {'PM2.5-Weighted Annual Mean': 'PM2.5 Weighted Avg. Mean', 'PM2.5-98th Percentile': 'PM2.5 98th Perc.'})

MeanP25 = MeanP25.melt(id_vars = 'Year',
    var_name = 'P25 Statistic',
    value_name = 'P25 Value').reset_index()

alt.Chart(MeanP25).mark_line().encode(
    x = alt.X('Year:T', scale = alt.Scale(zero = False)),
    y = alt.Y('P25 Value', title = 'Overall PM 2.5', scale = alt.Scale(zero = False)),
    color = alt.Size('P25 Statistic') # change here
).properties(
    width = 250, 
    height = 250
)

In [6]:
# Over time, has PM 2.5 pollution become more variable, less variable, 
# or about equally variable from city to city in the U.S.?

SD_P25 = tidy_data.loc[:,['Year', 'PM2.5-Weighted Annual Mean','PM2.5-98th Percentile' ]].groupby(
    'Year').std().reset_index().rename(
    columns= {'PM2.5-Weighted Annual Mean': 'PM2.5 Weighted Avg. Mean', 'PM2.5-98th Percentile': 'PM2.5 98th Perc.'})

SD_P25 = SD_P25.melt(id_vars = 'Year',
    var_name = 'P25 Statistic',
    value_name = 'P25 Value').reset_index()

alt.Chart(SD_P25).mark_line().encode(
    x = alt.X('Year:T', scale = alt.Scale(zero = False)),
    y = alt.Y('P25 Value', title = 'Overall PM 2.5', scale = alt.Scale(zero = False)),
    color = alt.Size('P25 Statistic')
).properties(
    width = 250, 
    height = 250
)

In [10]:
# Which state has seen the greatest improvement in PM 2.5 pollution over time? Montana!

x = pd.merge(air_raw, cbsa_info, how = 'left', on = 'CBSA')
x['Pollutant statistic'] = x[['Pollutant','Trend Statistic']].agg('-'.join, axis=1)

x[['City','State']] = x['Core Based Statistical Area'].str.split(", ",expand=True)
#x['State/Territory'].str.split('-', expand=True)
#x['State/Territory'].str.split('-').explode('State/Territory')
x_mod1 = x.assign(State = x['State'].str.split('-')).explode('State')

x_mod1 = x_mod1.reset_index().melt(
    id_vars =['CBSA', 'Pollutant statistic', 'Core Based Statistical Area', 'City', 'State'],
    value_vars = ["2000", "2001", "2002", "2003", "2004", "2005", '2006', "2007", "2008", "2009", "2010", "2011",
                 "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"],
    var_name = 'Year',
    value_name = 'Concentration').pivot_table(
    index = ['CBSA', 'Year', 'City', 'State'], 
    columns = ['Pollutant statistic'], 
    values = 'Concentration').reset_index()
x_mod1 = x_mod1[['CBSA', 'Year', 'City', 'State', 'PM2.5-Weighted Annual Mean']]

x_mod1 = x_mod1.drop(columns = ["CBSA"]).groupby(['Year', 'City', 'State']).mean()
x_mod1 = x_mod1.reset_index().pivot_table(
    index = "State",
    columns = 'Year',
    values = 'PM2.5-Weighted Annual Mean')

# change in PM2.5 between 2000 and 2019
x_mod1['change'] = ((x_mod1['2019'] - x_mod1['2000']) / x_mod1['2000'])
x_mod1.sort_values(by=['change']).head()

Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,change
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MT,13.5,7.0,6.8,9.7,8.5,10.3,10.8,12.8,10.1,9.8,...,9.9,11.2,9.6,9.1,10.5,7.3,13.3,5.6,5.3,-0.607407
VA,14.725,14.0,13.425,13.25,13.275,13.775,12.75,12.575,11.325,9.275,...,9.7,8.675,8.15,8.375,7.975,7.15,7.225,6.925,6.85,-0.534805
TN,16.6,14.933333,14.2,14.5,13.533333,14.8,13.633333,14.233333,11.666667,10.166667,...,10.666667,9.633333,9.233333,9.433333,8.366667,8.333333,8.066667,7.766667,7.8,-0.53012
AL,16.944444,14.5,13.755556,14.088889,13.522222,14.333333,13.9,14.322222,11.988889,10.3,...,11.266667,9.666667,9.066667,9.533333,9.088889,8.077778,8.288889,7.8,8.055556,-0.52459
NH,10.4,10.433333,10.133333,10.1,9.666667,10.166667,9.533333,9.233333,8.733333,8.333333,...,8.266667,7.866667,6.966667,6.833333,6.833333,5.333333,5.166667,5.333333,4.966667,-0.522436


In [11]:
# Which city has seen the greatest improvement? Portsmouth!

x = pd.merge(air_raw, cbsa_info, how = 'left', on = 'CBSA')
x['Pollutant statistic'] = x[['Pollutant','Trend Statistic']].agg('-'.join, axis=1)

x[['City','State']] = x['Core Based Statistical Area'].str.split(", ",expand=True)
#x['State/Territory'].str.split('-', expand=True)
#x['State/Territory'].str.split('-').explode('State/Territory')
x_mod1 = x.assign(State = x['State'].str.split('-')).explode('State')

x_mod1 = x_mod1.reset_index().melt(
    id_vars =['CBSA', 'Pollutant statistic', 'Core Based Statistical Area', 'City', 'State'],
    value_vars = ["2000", "2001", "2002", "2003", "2004", "2005", '2006', "2007", "2008", "2009", "2010", "2011",
                 "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"],
    var_name = 'Year',
    value_name = 'Concentration').pivot_table(
    index = ['CBSA', 'Year', 'City', 'State'], 
    columns = ['Pollutant statistic'], 
    values = 'Concentration').reset_index()
x_mod1 = x_mod1[['CBSA', 'Year', 'City', 'State', 'PM2.5-Weighted Annual Mean']]

x_mod1 = x_mod1.drop(columns = ["CBSA"]).groupby(['Year', 'City', 'State']).mean()
x_mod1 = x_mod1.reset_index().pivot_table(
    index = "City",
    columns = 'Year',
    values = 'PM2.5-Weighted Annual Mean')

# change in PM2.5 between 2000 and 2019
x_mod1['change'] = ((x_mod1['2019'] - x_mod1['2000']) / x_mod1['2000'])
x_mod1.sort_values(by=['change']).head()

Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,change
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Portsmouth,21.1,20.3,16.7,14.7,12.9,16.2,14.3,14.0,12.1,10.9,...,10.1,9.8,9.0,8.2,8.5,8.3,6.9,7.1,6.7,-0.682464
Reno,8.9,10.3,9.2,7.3,7.9,8.9,7.6,8.0,10.2,7.9,...,6.7,6.0,10.1,7.6,7.6,6.5,7.4,8.0,3.0,-0.662921
Kingsport-Bristol-Bristol,16.6,15.1,14.1,13.8,13.8,14.3,13.5,13.9,10.6,9.2,...,9.8,8.8,8.5,8.6,7.4,8.0,7.2,6.7,6.4,-0.614458
Butte-Silver Bow,13.5,7.0,6.8,9.7,8.5,10.3,10.8,12.8,10.1,9.8,...,9.9,11.2,9.6,9.1,10.5,7.3,13.3,5.6,5.3,-0.607407
Asheville,15.4,13.5,13.8,12.6,12.3,13.1,12.4,12.2,9.0,8.4,...,9.2,8.6,8.1,7.9,7.0,8.5,6.7,6.0,6.1,-0.603896


In [None]:
# Choosing a meaningful location & checking if that place was in compliance with 
# EPA primary standards as of the most recent measurement.

