In [2]:
pip install altair vega_datasets

Collecting altair
  Downloading altair-4.2.2-py3-none-any.whl (813 kB)
[K     |████████████████████████████████| 813 kB 4.7 MB/s eta 0:00:01
[?25hCollecting vega_datasets
  Downloading vega_datasets-0.9.0-py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 11.1 MB/s eta 0:00:01
Installing collected packages: vega-datasets, altair
Successfully installed altair-4.2.2 vega-datasets-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
# packages
import numpy as np
import pandas as pd
import altair as alt

# raw data
# NOTE: machine-specific absolute location; edit DATA_DIR to run this notebook elsewhere.
DATA_DIR = '/Users/roshanmehta/Downloads/PSTAT/PSTAT 100/Projects/MP1'
air_raw = pd.read_csv(f'{DATA_DIR}/air-quality.csv')
cbsa_info = pd.read_csv(f'{DATA_DIR}/cbsa-info.csv')

# merging data
# left join: every row of air_raw is kept and the cbsa_info columns are attached by CBSA code
data = pd.merge(air_raw, cbsa_info, how = 'left', on = 'CBSA')

# combining columns
# one label per row of the form '<Pollutant>-<Trend Statistic>'
data['Pollutant statistic'] = data[['Pollutant','Trend Statistic']].agg('-'.join, axis=1)

# dropping irrelevant columns
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# without the assignment this step silently did nothing.
data = data.drop(columns = ['Pollutant', 'Trend Statistic', 'Number of Trends Sites'])

# reordering columns
# the yearly columns are labelled with the strings '2000' ... '2019'
year_cols = [str(year) for year in range(2000, 2020)]
data = data.loc[:, ['CBSA', 'Core Based Statistical Area', 'Pollutant statistic'] + year_cols]

data.head(3)

Unnamed: 0,CBSA,Core Based Statistical Area,Pollutant statistic,2000,2001,2002,2003,2004,2005,2006,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,10100,"Aberdeen, SD",PM10-2nd Max,50.0,58.0,59.0,66.0,39.0,48.0,51.0,...,46.0,29.0,62.0,66.0,36.0,43.0,65.0,40.0,49.0,35.0
1,10100,"Aberdeen, SD",PM2.5-Weighted Annual Mean,8.6,8.6,7.9,8.4,8.1,9.0,8.2,...,8.7,7.1,7.5,7.3,6.2,6.2,5.4,5.8,6.6,5.9
2,10100,"Aberdeen, SD",PM2.5-98th Percentile,23.0,23.0,20.0,21.0,23.0,23.0,21.0,...,27.0,18.0,23.0,22.0,17.0,14.0,14.0,13.0,22.0,18.0


In [4]:
# Reshape `data` into tidy form: one row per (CBSA, Year) pair and one column
# per pollutant statistic.
#   step 1 - stack the yearly columns into a single 'Year' variable (melt)
#   step 2 - spread the pollutant statistics back out into columns (pivot)
year_columns = [
    "2000", "2001", "2002", "2003", "2004",
    "2005", "2006", "2007", "2008", "2009",
    "2010", "2011", "2012", "2013", "2014",
    "2015", "2016", "2017", "2018", "2019",
]

# step 1: the area name is not needed for the reshape, so it is removed first;
# drop() returns a new frame, leaving `data` itself untouched
long_form = (
    data
    .drop(columns='Core Based Statistical Area')
    .melt(
        id_vars=['CBSA', 'Pollutant statistic'],
        value_vars=year_columns,
        var_name='Year',
        value_name='Concentration',
    )
)

# step 2: rows are keyed by (CBSA, Year); each pollutant statistic becomes a column
tidy_data = long_form.pivot_table(
    values='Concentration',
    index=['CBSA', 'Year'],
    columns=['Pollutant statistic'],
)

tidy_data.head(21)

Unnamed: 0_level_0,Pollutant statistic,CO-2nd Max,NO2-98th Percentile,NO2-Annual Mean,O3-4th Max,PM10-2nd Max,PM2.5-98th Percentile,PM2.5-Weighted Annual Mean,Pb-Max 3-Month Average,SO2-99th Percentile
CBSA,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10100,2000,,,,,50.0,23.0,8.6,,
10100,2001,,,,,58.0,23.0,8.6,,
10100,2002,,,,,59.0,20.0,7.9,,
10100,2003,,,,,66.0,21.0,8.4,,
10100,2004,,,,,39.0,23.0,8.1,,
10100,2005,,,,,48.0,23.0,9.0,,
10100,2006,,,,,51.0,21.0,8.2,,
10100,2007,,,,,49.0,17.0,8.0,,
10100,2008,,,,,69.0,28.0,7.7,,
10100,2009,,,,,53.0,23.0,8.1,,


In [5]:
## PART I - summary of the merged data
##########
# A notebook cell only displays the value of its final expression, so every
# intermediate answer below is printed explicitly.

# number of CBSAs included in the data
print('Unique CBSAs:', data.CBSA.nunique()) # the original run found 351 unique CBSAs

# in how many states and territories do the CBSAs reside?
data_mod1 = data.copy()
# split on the last ", " only (n=1) so exactly two columns come back even if a
# name were to contain an extra comma
data_mod1[['City','State/Territory']] = data_mod1['Core Based Statistical Area'].str.rsplit(", ", n=1, expand=True)
# the state part may hold several codes joined by '-' (which is what the split
# below assumes), so split it and give each code its own row before de-duplicating
states = data_mod1['State/Territory'].str.split('-').explode().unique()
print('Unique states/territories:', len(states))
print(states)
# the original run found 52 unique values, which include PR and DC.


# In which years were data values recorded?
# the first three columns are identifiers; every remaining column is a year
print(data.columns[3:]) # we can see that the data was recorded for years 2000-2019.

# How many observations are recorded? / How many variables are measured?
# we will use the tidied data set
# the original run gave 7020 observations of 9 different variables over a 20 yr. period
print('tidy_data shape (rows, columns):', tidy_data.shape)

# Which variables are non-missing most of the time (i.e., in at least 50% of instances)?
# notna().mean() is the fraction of non-missing rows per column, so the row
# count does not have to be hard-coded.
# from this, we can see that the variables 'O3-4th Max', 'PM2.5-98th Percentile', & 'PM2.5-Weighted Annual Mean'
# are non-missing most of the time since they are over 50%.
tidy_data.notna().mean() * 100

# What is PM 2.5 and why is it important?
# PM is particulate matter and 2.5 specifies the size of that particle (2.5 microns or less in diameter).
# Particulate matter is airborne and is not a single pollutant: it is made up of a mixture of many
# different chemical substances. For example, the combustion of gasoline, oil, diesel fuel, or wood produces emissions,
# and these make up much of the PM2.5 pollution found in the air outdoors. This is important to measure and keep track of
# because PM2.5 can travel into our respiratory tract, reaching the lungs and even entering the bloodstream, posing huge
# health risks. Exposure to increased levels of PM2.5 for extended periods of time is associated with increased chances
# of early mortality, lung cancer, and heart disease.

Index(['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019'],
      dtype='object')


Pollutant statistic
CO-2nd Max                    16.809117
NO2-98th Percentile           19.088319
NO2-Annual Mean               25.356125
O3-4th Max                    80.911681
PM10-2nd Max                  29.344729
PM2.5-98th Percentile         60.968661
PM2.5-Weighted Annual Mean    60.968661
Pb-Max 3-Month Average         4.273504
SO2-99th Percentile           25.356125
dtype: float64