In [24]:
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Create a reference to the CSV and import it into a Pandas DataFrame
csv_path = "Resources/pollution_us_2000_2016.csv"
df = pd.read_csv(csv_path)

In [26]:
#pandas datetimeindex docs: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DatetimeIndex.html
#efficient way to extract year from string format date
df['year'] = pd.DatetimeIndex(df['Date Local']).year
df['Date Local'] = pd.to_datetime(df['Date Local'],format='%Y-%m-%d') 

In [27]:
# Delete extraneous column
df = df.drop(['Unnamed: 0','State Code','County Code','Address','Site Num','NO2 Units','O3 Units','SO2 Units','CO Units'], axis=1)
# Removing Mexico
df = df[df.State!='Country Of Mexico']
df.head()

Unnamed: 0,State,County,City,Date Local,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,...,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,year
0,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,...,34,3.0,9.0,21,13.0,1.145833,4.2,21,,2000
1,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,...,34,3.0,9.0,21,13.0,0.878947,2.2,23,25.0,2000
2,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,...,34,2.975,6.6,23,,1.145833,4.2,21,,2000
3,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,...,34,2.975,6.6,23,,0.878947,2.2,23,25.0,2000
4,Arizona,Maricopa,Phoenix,2000-01-02,22.958333,36.0,19,34,0.013375,0.032,...,27,1.958333,3.0,22,4.0,0.85,1.6,23,,2000


In [28]:
df.State.unique()

array(['Arizona', 'California', 'Colorado', 'District Of Columbia',
       'Florida', 'Illinois', 'Indiana', 'Kansas', 'Kentucky',
       'Louisiana', 'Michigan', 'Missouri', 'New Jersey', 'New York',
       'North Carolina', 'Oklahoma', 'Pennsylvania', 'Texas', 'Virginia',
       'Massachusetts', 'Nevada', 'New Hampshire', 'Tennessee',
       'South Carolina', 'Connecticut', 'Iowa', 'Maine', 'Maryland',
       'Wisconsin', 'Arkansas', 'Oregon', 'Wyoming', 'North Dakota',
       'Idaho', 'Ohio', 'Georgia', 'Delaware', 'Hawaii', 'Minnesota',
       'New Mexico', 'Rhode Island', 'South Dakota', 'Utah', 'Alabama',
       'Washington', 'Alaska'], dtype=object)

In [6]:
df_grouped = df.groupby(['State','Date Local']).mean()
df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,year
State,Date Local,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama,2013-12-01,17.208333,39.3,18.0,37.0,0.013542,0.026,9.0,24.0,0.28539,0.75,11.0,1.0,0.262879,0.65,8.5,6.0,2013.0
Alabama,2013-12-02,20.6875,32.4,7.0,30.0,0.009375,0.013,0.0,12.0,0.531666,2.1,11.0,3.0,0.352812,0.485,8.0,6.0,2013.0
Alabama,2013-12-03,14.9125,22.4,17.0,21.0,0.008167,0.012,22.0,11.0,0.252632,1.35,12.5,3.0,0.237575,0.325,4.0,3.0,2013.0
Alabama,2013-12-04,7.825,19.3,17.0,18.0,0.011125,0.014,8.0,13.0,0.123052,0.95,17.0,1.0,0.115152,0.22,18.5,2.0,2013.0
Alabama,2013-12-05,8.004762,16.0,7.0,15.0,0.010083,0.014,18.0,13.0,-0.014285,0.6,2.5,1.0,0.117575,0.2,17.5,2.0,2013.0


In [None]:
# Dropping the NA values at this point would end up losing a lot of data for O3 and CO2 data, so we want to maintain separate dfs once we get to that point. 

In [None]:
#NO2 data
NO2_data = df[["State", "County", "City","NO2 Mean","NO2 1st Max Value", "NO2 1st Max Hour", "NO2 AQI","Date Local","year"
                          ]]
NO2_data.head()

In [None]:
#NO2 data group it 
NO2_group = df[["State","County","City","NO2 AQI","Date Local","year"
                          ]]
NO2_group.head()

In [None]:
# Grouping the DataFrame by "Assignee"
citycounty_group = NO2_group.groupby(["State","County","City"])

citycounty_group


In [None]:
# Count how many of each component Assignees worked on and create DataFrame of the data
citycounty_grpdisplay = pd.DataFrame(citycounty_group["City"].value_counts())
citycounty_grpdisplay

In [None]:
#O3 data

O3_data = df[["State", "County", "City","O3 Mean","O3 1st Max Value", "O3 1st Max Hour", "O3 AQI"
                          ]]
O3_data.head()

In [8]:
# the following two groupbys need a dropped dataframe
df_drop = df.dropna(how='any')
df_drop.count()

State                436876
County               436876
City                 436876
Date Local           436876
NO2 Mean             436876
NO2 1st Max Value    436876
NO2 1st Max Hour     436876
NO2 AQI              436876
O3 Mean              436876
O3 1st Max Value     436876
O3 1st Max Hour      436876
O3 AQI               436876
SO2 Mean             436876
SO2 1st Max Value    436876
SO2 1st Max Hour     436876
SO2 AQI              436876
CO Mean              436876
CO 1st Max Value     436876
CO 1st Max Hour      436876
CO AQI               436876
year                 436876
dtype: int64

In [9]:
#SO2 data
SO2_data = df_drop[["State","County", "City","SO2 Mean","SO2 1st Max Value", "SO2 1st Max Hour", "SO2 AQI","Date Local","year"
                   ]]
SO2_data.head()

Unnamed: 0,State,County,City,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,Date Local,year
1,Arizona,Maricopa,Phoenix,3.0,9.0,21,13.0,2000-01-01,2000
5,Arizona,Maricopa,Phoenix,1.958333,3.0,22,4.0,2000-01-02,2000
9,Arizona,Maricopa,Phoenix,5.25,11.0,19,16.0,2000-01-03,2000
13,Arizona,Maricopa,Phoenix,7.083333,16.0,8,23.0,2000-01-04,2000
17,Arizona,Maricopa,Phoenix,8.708333,15.0,7,21.0,2000-01-05,2000


In [29]:
#CO data
CO_data = df_drop[["State", "County", "City","CO Mean","CO 1st Max Value", "CO 1st Max Hour", "CO AQI", "Date Local","year"
                  ]]
state_CO_data = CO_data.groupby(['State']).mean()

In [30]:
state_CO_data.sort_values('CO AQI', ascending=False)

Unnamed: 0_level_0,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,year
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Country Of Mexico,0.856178,1.561309,9.40705,17.698699,2007.623584
District Of Columbia,0.791535,1.018286,7.908412,11.602395,2008.052869
Arizona,0.492256,0.805822,7.067376,9.190681,2007.697521
Colorado,0.445569,0.673503,7.862629,7.724804,2009.410862
Missouri,0.469007,0.6472,7.433192,7.41318,2004.677178
California,0.449641,0.651882,6.514624,7.405755,2007.483773
Kansas,0.410451,0.57501,7.437047,6.541608,2006.123364
Alaska,0.424387,0.581781,7.052632,6.52834,2014.637652
Michigan,0.350807,0.563019,7.189057,6.469956,2003.424035
New Jersey,0.403735,0.56551,7.24091,6.437528,2008.131977
