In [1]:
import pandas as pd
import sqlalchemy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import TextBox
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from ipywidgets import widgets, interactive
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output


In [2]:
#Read in the historic data and store in a DataFrame
path = "data/HistoricalPollution.csv"
historic_df = pd.read_csv(path)

#Take a look at the data
historic_df

Unnamed: 0,Primary Key,CBSA,City,State,Pollutant,Trend Statistic,Number of Sites,1990,1991,1992,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1,10420,Akron,OH,O3,4th Max,2,0.090,0.101,0.087,...,0.059,0.060,0.065,0.060,0.066,0.068,0.062,0.063,0.067,0.070
1,2,10420,Akron,OH,PM2.5,Weighted Annual Mean,3,,,,...,9.700,9.900,10.400,8.200,7.900,7.900,8.200,7.900,8.000,7.100
2,3,10420,Akron,OH,PM2.5,98th Percentile,3,,,,...,24.000,22.000,23.000,17.000,18.000,18.000,21.000,21.000,20.000,17.000
3,4,10420,Akron,OH,SO2,99th Percentile,1,161.000,183.000,181.000,...,23.000,21.000,14.000,8.000,3.000,6.000,3.000,4.000,5.000,6.000
4,5,10500,Albany,GA,PM2.5,Weighted Annual Mean,1,,,,...,10.000,10.300,9.000,8.700,9.400,8.400,9.300,9.100,10.100,8.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,709,49700,Yuba City,CA,NO2,Annual Mean,1,17.000,17.000,17.000,...,10.000,8.000,7.000,7.000,7.000,7.000,6.000,6.000,5.000,7.000
709,710,49700,Yuba City,CA,NO2,98th Percentile,1,70.000,70.000,70.000,...,52.000,44.000,39.000,40.000,42.000,41.000,40.000,38.000,34.000,37.000
710,711,49700,Yuba City,CA,O3,4th Max,1,0.076,0.080,0.090,...,0.060,0.069,0.064,0.063,0.067,0.065,0.061,0.066,0.072,0.058
711,712,49700,Yuba City,CA,PM2.5,Weighted Annual Mean,1,,,,...,8.200,9.400,9.600,8.100,9.300,10.300,8.400,16.400,14.500,10.700


In [3]:
#Dropping the years 1990 - 1999 because most of the pollutant data is missing from these years. We will get more accurate data to compare looking at years 2000-2022

drop_col = ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
mean_2000 = historic_df.drop(columns=drop_col)
mean_2000

Unnamed: 0,Primary Key,CBSA,City,State,Pollutant,Trend Statistic,Number of Sites,2000,2001,2002,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1,10420,Akron,OH,O3,4th Max,2,0.085,0.096,0.10,...,0.059,0.060,0.065,0.060,0.066,0.068,0.062,0.063,0.067,0.070
1,2,10420,Akron,OH,PM2.5,Weighted Annual Mean,3,16.200,16.200,16.00,...,9.700,9.900,10.400,8.200,7.900,7.900,8.200,7.900,8.000,7.100
2,3,10420,Akron,OH,PM2.5,98th Percentile,3,37.000,44.000,40.00,...,24.000,22.000,23.000,17.000,18.000,18.000,21.000,21.000,20.000,17.000
3,4,10420,Akron,OH,SO2,99th Percentile,1,163.000,132.000,145.00,...,23.000,21.000,14.000,8.000,3.000,6.000,3.000,4.000,5.000,6.000
4,5,10500,Albany,GA,PM2.5,Weighted Annual Mean,1,16.600,14.600,13.80,...,10.000,10.300,9.000,8.700,9.400,8.400,9.300,9.100,10.100,8.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,709,49700,Yuba City,CA,NO2,Annual Mean,1,13.000,14.000,15.00,...,10.000,8.000,7.000,7.000,7.000,7.000,6.000,6.000,5.000,7.000
709,710,49700,Yuba City,CA,NO2,98th Percentile,1,67.000,64.000,62.00,...,52.000,44.000,39.000,40.000,42.000,41.000,40.000,38.000,34.000,37.000
710,711,49700,Yuba City,CA,O3,4th Max,1,0.079,0.081,0.08,...,0.060,0.069,0.064,0.063,0.067,0.065,0.061,0.066,0.072,0.058
711,712,49700,Yuba City,CA,PM2.5,Weighted Annual Mean,1,10.600,11.900,13.10,...,8.200,9.400,9.600,8.100,9.300,10.300,8.400,16.400,14.500,10.700


In [4]:
#store all the years as a list to call easier
years = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']

In [5]:
#Create a dataframe containing state, city, and year total pollutiion

state_df = (mean_2000.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_df


Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,Anchorage,31.200,36.900,35.500,35.300,43.300,33.500,44.200,25.800,...,30.700,38.500,37.300,32.100,37.100,25.700,44.800,31.300,27.400,30.100
1,AK,Juneau,30.600,31.600,26.200,29.400,32.600,43.100,41.500,46.600,...,28.900,35.700,27.600,30.000,27.600,28.900,31.800,21.800,21.700,28.300
2,AL,Birmingham,320.791,189.683,148.185,157.877,162.974,180.584,168.486,164.292,...,87.065,84.364,113.268,73.670,70.562,69.265,66.471,61.961,68.861,63.064
3,AL,Daphne,50.500,32.600,33.400,41.200,39.400,37.700,37.200,32.400,...,25.400,25.900,26.600,21.200,26.400,24.100,22.500,24.900,22.300,22.300
4,AL,Decatur,44.100,44.100,44.100,43.700,41.200,49.700,42.200,54.400,...,24.600,30.300,25.800,20.400,23.700,23.400,21.600,23.100,33.000,29.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,WY,Gillette,73.000,65.000,69.000,66.000,63.000,56.000,88.000,94.000,...,64.000,47.000,78.000,50.000,95.000,39.000,39.000,61.000,84.000,72.000
241,WY,Laramie,0.043,0.051,0.044,0.049,0.048,0.038,0.047,0.042,...,0.069,0.065,0.064,0.064,0.066,0.070,0.065,0.066,0.072,0.067
242,WY,Riverton,91.600,85.600,99.600,84.100,99.600,88.700,77.700,72.500,...,75.800,83.700,61.200,58.800,73.400,71.300,67.000,105.300,98.500,81.800
243,WY,Rock Springs,32.200,32.500,42.000,32.200,32.000,31.200,36.500,40.200,...,37.000,29.300,35.300,30.700,45.800,45.000,35.000,69.800,47.300,46.200


In [6]:
#Making a new dataframe from the above containing the averages for a future line graph
state_avg = state_df.groupby('State').mean().reset_index()
state_avg.head()

  state_avg = state_df.groupby('State').mean().reset_index()


Unnamed: 0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,30.9,34.25,30.85,32.35,37.95,38.3,42.85,36.2,35.4,...,29.8,37.1,32.45,31.05,32.35,27.3,38.3,26.55,24.55,29.2
1,AL,83.67325,56.981,51.205875,52.017125,53.2905,58.441375,52.945,55.918125,47.11625,...,30.514,32.12725,36.114625,27.2525,27.500375,27.114875,25.96425,27.372125,28.748875,27.363625
2,AR,24.8095,23.4,22.658,19.5035,20.1475,26.9055,19.1585,21.4435,18.8525,...,15.767,16.1625,14.9115,14.2595,13.659,15.214,14.308,13.157,15.2135,14.5115
3,AZ,74.7196,68.3162,63.182,75.2802,50.197,59.0206,71.339,61.176,61.5572,...,70.1172,71.5148,47.2544,57.8126,62.9152,78.0154,46.8926,59.6956,71.0154,57.314
4,CA,107.987429,114.352964,106.465679,91.628786,86.646643,88.873179,90.562429,90.169214,94.056393,...,79.676286,76.061071,71.457071,65.588286,84.593143,112.890643,65.724429,108.160893,83.341393,71.395857


In [7]:
state_avg.set_index('State', inplace=True)

In [8]:
#Transpose the data to build a line graph
state_avg_pose = state_avg.transpose()
state_avg_pose.head()

State,AK,AL,AR,AZ,CA,CO,CT,DE,FL,GA,...,SC,TN,TX,UT,VA,VT,WA,WI,WV,WY
2000,30.9,83.67325,24.8095,74.7196,107.987429,73.302,72.685667,47.2,96.115667,53.947625,...,51.851333,4.84275,72.9597,63.231667,100.2305,37.58,24.644,45.371857,127.1944,48.7886
2001,34.25,56.981,23.4,68.3162,114.352964,76.385167,76.666,47.0,89.755167,49.681875,...,50.180333,4.58625,67.9428,80.727,104.739,34.419667,23.343,43.190286,129.7734,46.8302
2002,30.85,51.205875,22.658,63.182,106.465679,72.355,72.639667,60.1,80.041,46.156375,...,43.218,4.095,67.7384,73.963,106.645,39.756667,22.7195,38.604857,136.3164,49.7488
2003,32.35,52.017125,19.5035,75.2802,91.628786,67.056833,74.493,43.7,73.236167,45.568,...,42.876667,2.082,67.0863,65.259667,104.6285,38.515667,21.55,38.360714,110.2342,43.6898
2004,37.95,53.2905,20.1475,50.197,86.646643,68.984167,64.746,43.6,77.194,49.467625,...,47.776667,4.07475,64.7533,86.322333,98.973,37.381667,24.34825,38.281714,119.4276,48.7296


In [9]:
state_avg_pose.reset_index(inplace=True)

In [10]:
state_avg_pose.rename(columns={'index':'Year'})

State,Year,AK,AL,AR,AZ,CA,CO,CT,DE,FL,...,SC,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,2000,30.9,83.67325,24.8095,74.7196,107.987429,73.302,72.685667,47.2,96.115667,...,51.851333,4.84275,72.9597,63.231667,100.2305,37.58,24.644,45.371857,127.1944,48.7886
1,2001,34.25,56.981,23.4,68.3162,114.352964,76.385167,76.666,47.0,89.755167,...,50.180333,4.58625,67.9428,80.727,104.739,34.419667,23.343,43.190286,129.7734,46.8302
2,2002,30.85,51.205875,22.658,63.182,106.465679,72.355,72.639667,60.1,80.041,...,43.218,4.095,67.7384,73.963,106.645,39.756667,22.7195,38.604857,136.3164,49.7488
3,2003,32.35,52.017125,19.5035,75.2802,91.628786,67.056833,74.493,43.7,73.236167,...,42.876667,2.082,67.0863,65.259667,104.6285,38.515667,21.55,38.360714,110.2342,43.6898
4,2004,37.95,53.2905,20.1475,50.197,86.646643,68.984167,64.746,43.6,77.194,...,47.776667,4.07475,64.7533,86.322333,98.973,37.381667,24.34825,38.281714,119.4276,48.7296
5,2005,38.3,58.441375,26.9055,59.0206,88.873179,61.2875,73.593333,48.2,75.828333,...,51.180333,4.58125,65.8329,58.364,106.829,37.580667,23.49375,48.432714,113.2126,43.6876
6,2006,42.85,52.945,19.1585,71.339,90.562429,57.705833,66.092,42.9,76.704917,...,41.846667,4.331,72.7729,57.760667,100.8785,31.611,22.1235,43.182286,105.0314,47.1894
7,2007,36.2,55.918125,21.4435,61.176,90.169214,67.487167,62.524667,43.1,74.102167,...,48.076333,4.58475,64.5671,86.294,103.229,32.584667,21.4185,45.044143,115.8332,48.6284
8,2008,35.4,47.11625,18.8525,61.5572,94.056393,64.153,57.114667,39.3,64.2495,...,36.710667,3.82675,59.545,87.826,73.827,27.847,20.89025,42.765571,98.1894,42.6088
9,2009,38.2,37.085375,16.9465,59.873,77.4805,55.5005,54.671667,30.5,55.052917,...,37.031,3.06725,53.1353,91.323667,65.014,25.742667,22.64175,42.194571,95.5846,45.8098


In [60]:
#Creating a dash app to display a line chart that shows overall pollution by year with a dropdown for each state

# Create a Dash app
appState = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in state_avg_pose.columns[1:]]

# Define app layout
appState.layout = html.Div([
    dcc.Dropdown(
        id='dropdown',
        options=[{'label': state, 'value': state} for state in state_avg_pose.columns[1:]],
        value='AK'
    ),
    dcc.Graph(id='line-plot1')
])

# Define callback to update the line plot
@appState.callback(
    Output('line-plot1', 'figure'),
    [Input('dropdown', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(state_avg_pose, x='index', y=selected_column)

    title = f'{selected_column} pollution'

    return fig

# Run the app
if __name__ == '__main__':
    appState.run_server(debug=True, port=8051)

In [11]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between overall pollution in 1990 and 2022.

for state in state_df['State'].unique():
    data_2000 = state_df.loc[state_df['State'] == state, '2000']
    data_2022 = state_df.loc[state_df['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference between pollution in 2000 and 2022')

        if p_value > 0:
            print('There is a significant decrease in overall pollution')

            if t_statistic > 0:
                print('There was a significant decrease in pollution between 2000 and 2022')
            else:
                print('There was a significant increase in pollution between 2000 and 2022')

        else:
            print('There is a significant increase in ovall pollution')

    else:
        print('No significant difference between pollution in 2000 and 2022')

    print('-------------------------------')
    print('-------------------------------')



State: AK
t_statistic: 1.7919573407620772
p_value: 0.21501293588266346
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: AL
t_statistic: 1.509119356832565
p_value: 0.15350272780958907
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: AR
t_statistic: 0.35898337317828977
p_value: 0.7539633195973889
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 0.37094151853907686
p_value: 0.7203114747322884
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 1.6670329313806622
p_value: 0.10129822141399748
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic

  t_statistic, p_value = ttest_ind(data_1990, data_2022)
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero


For the following we will look at each pollutant seperately.

In [13]:
#Create a dataframe containing only the pollutant O3
O3 = ['O3']
O3_df = mean_2000[mean_2000['Pollutant'].isin(O3) == True]
state_O3 = (O3_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_O3

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,Birmingham,0.091,0.083,0.085,0.077,0.074,0.084,0.086,0.092,...,0.065,0.064,0.068,0.070,0.062,0.065,0.071,0.061,0.061,0.064
1,AL,Fort Payne,0.034,0.032,0.029,0.030,0.027,0.030,0.038,0.020,...,0.062,0.062,0.065,0.064,0.058,0.064,0.062,0.057,0.058,0.061
2,AL,Huntsville,0.088,0.080,0.078,0.079,0.077,0.075,0.079,0.082,...,0.064,0.064,0.063,0.066,0.063,0.065,0.063,0.057,0.061,0.065
3,AL,Mobile,0.089,0.076,0.075,0.080,0.074,0.073,0.085,0.077,...,0.060,0.068,0.060,0.062,0.065,0.065,0.059,0.054,0.057,0.061
4,AL,Montgomery,0.084,0.077,0.080,0.071,0.072,0.069,0.072,0.074,...,0.061,0.060,0.061,0.058,0.055,0.060,0.059,0.048,0.054,0.058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,WI,Wausau,0.073,0.072,0.073,0.074,0.065,0.075,0.066,0.072,...,0.063,0.064,0.063,0.068,0.060,0.065,0.059,0.055,0.060,0.057
171,WI,Whitewater,0.076,0.088,0.084,0.081,0.069,0.083,0.072,0.075,...,0.067,0.073,0.067,0.072,0.065,0.070,0.062,0.074,0.069,0.070
172,WV,Charleston,0.085,0.083,0.087,0.088,0.069,0.079,0.077,0.082,...,0.067,0.067,0.067,0.069,0.067,0.066,0.065,0.057,0.065,0.061
173,WV,Parkersburg,0.087,0.084,0.095,0.083,0.069,0.084,0.080,0.084,...,0.064,0.067,0.071,0.067,0.058,0.061,0.061,0.058,0.057,0.062


In [14]:
O3_avg = state_O3.groupby('State').mean().reset_index()
O3_avg.head()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,0.0772,0.0696,0.0694,0.0674,0.0648,0.0662,0.072,0.069,0.066,...,0.0624,0.0636,0.0634,0.064,0.0606,0.0638,0.0628,0.0554,0.0582,0.0618
1,AR,0.0595,0.05,0.058,0.0535,0.0475,0.0555,0.0585,0.0435,0.0525,...,0.067,0.0625,0.0615,0.0595,0.059,0.064,0.058,0.057,0.0635,0.0615
2,AZ,0.0745,0.07025,0.0775,0.07525,0.07125,0.07575,0.07375,0.07,0.0715,...,0.0715,0.0685,0.068,0.06575,0.069,0.06925,0.06575,0.0695,0.06925,0.0675
3,CA,0.078476,0.080143,0.08281,0.081238,0.076476,0.073762,0.078476,0.073238,0.079952,...,0.068381,0.071905,0.071333,0.070095,0.07181,0.068476,0.065905,0.071667,0.069476,0.065905
4,CO,0.070667,0.070333,0.076667,0.080333,0.068333,0.075,0.078333,0.074333,0.072667,...,0.075333,0.070333,0.072,0.069333,0.070333,0.072333,0.066,0.073667,0.077667,0.071667


In [15]:
state_O3.set_index('City', inplace=True)

In [16]:
state_O3.head()

Unnamed: 0_level_0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Birmingham,AL,0.091,0.083,0.085,0.077,0.074,0.084,0.086,0.092,0.077,...,0.065,0.064,0.068,0.07,0.062,0.065,0.071,0.061,0.061,0.064
Fort Payne,AL,0.034,0.032,0.029,0.03,0.027,0.03,0.038,0.02,0.036,...,0.062,0.062,0.065,0.064,0.058,0.064,0.062,0.057,0.058,0.061
Huntsville,AL,0.088,0.08,0.078,0.079,0.077,0.075,0.079,0.082,0.073,...,0.064,0.064,0.063,0.066,0.063,0.065,0.063,0.057,0.061,0.065
Mobile,AL,0.089,0.076,0.075,0.08,0.074,0.073,0.085,0.077,0.076,...,0.06,0.068,0.06,0.062,0.065,0.065,0.059,0.054,0.057,0.061
Montgomery,AL,0.084,0.077,0.08,0.071,0.072,0.069,0.072,0.074,0.068,...,0.061,0.06,0.061,0.058,0.055,0.06,0.059,0.048,0.054,0.058


In [17]:
city_O3 = state_O3.transpose()


In [18]:
city_O3.reset_index(inplace=True)

In [20]:
city_O3 = city_O3.drop(index=0)

In [21]:
city_O3.head()

City,index,Birmingham,Fort Payne,Huntsville,Mobile,Montgomery,Arkadelphia,Little Rock,Flagstaff,Phoenix,...,Bellingham,Seattle,Spokane,Madison,Milwaukee,Wausau,Whitewater,Charleston,Parkersburg,Laramie
1,2000,0.091,0.034,0.088,0.089,0.084,0.029,0.09,0.071,0.082,...,0.052,0.056,0.068,0.072,0.082,0.073,0.076,0.085,0.087,0.043
2,2001,0.083,0.032,0.08,0.076,0.077,0.021,0.079,0.068,0.079,...,0.05,0.051,0.071,0.078,0.094,0.072,0.088,0.083,0.084,0.051
3,2002,0.085,0.029,0.078,0.075,0.08,0.031,0.085,0.079,0.082,...,0.053,0.054,0.071,0.081,0.096,0.073,0.084,0.087,0.095,0.044
4,2003,0.077,0.03,0.079,0.08,0.071,0.032,0.075,0.073,0.081,...,0.056,0.065,0.079,0.078,0.092,0.074,0.081,0.088,0.083,0.049
5,2004,0.074,0.027,0.077,0.074,0.072,0.022,0.073,0.072,0.074,...,0.062,0.062,0.069,0.065,0.073,0.065,0.069,0.069,0.069,0.048


In [63]:
#Creating a dash app to display a line chart that shows O3 pollution by year with a dropdown for each city

# Create a Dash app
appO3 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in city_O3.columns[1:]]

# Define app layout
appO3.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-O3',
        options=dropdown_options,
        value='Birmingham'
    ),
    dcc.Graph(id='line-plot2')
])

# Define callback to update the line plot
@appO3.callback(
    Output('line-plot2', 'figure'),
    [Input('dropdown-O3', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(city_O3, x='index', y=selected_column)

    title = f'{selected_column} O3'

    fig.update_layout(title=title, xaxis_title = 'Year' ,yaxis_title = 'O3 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appO3.run_server(debug=True, port=8052)

In [23]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level for O3 pollution betweeen 1990 and 2022.
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_O3['State'].unique():
    data_2000 = state_O3.loc[state_O3['State'] == state, '2000']
    data_2022 = state_O3.loc[state_O3['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in O3 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in O3 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in O3 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in O3 pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AL
t_statistic: 1.4088771121384105
p_value: 0.19653270107293333
No significant difference in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AR
t_statistic: -0.06549461226639136
p_value: 0.9537378997940775
No significant difference in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 1.9051586888313579
p_value: 0.10541314582633256
No significant difference in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 3.0086178285435383
p_value: 0.004524666556723182
There is a significant difference in O3 pollution between 2000 and 2022
There was a significant decrease in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: -0.4242640687119297
p_value: 0.6931965033474388
No significant difference in O3 pollution between 20


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [24]:
#Create a dataframe containing only PM2.5 and continue for each pollutant
pm_2 = ['PM2.5']
pm25_df = mean_2000[mean_2000['Pollutant'].isin(pm_2) == True]
state_pm25 = (pm25_df.groupby(['State', 'City', 'Trend Statistic'], sort=True, as_index=False)[years].sum())
state_pm25

Unnamed: 0,State,City,Trend Statistic,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,Anchorage,98th Percentile,20.0,25.0,25.0,23.0,30.0,22.0,33.0,...,22.0,29.0,28.0,23.0,28.0,18.0,35.0,23.0,20.0,23.0
1,AK,Anchorage,Weighted Annual Mean,5.8,6.2,5.8,6.6,6.9,6.7,6.9,...,5.6,7.0,6.5,6.1,5.6,5.0,7.4,5.3,5.2,4.7
2,AK,Juneau,98th Percentile,24.0,26.0,20.0,23.0,26.0,35.0,33.0,...,23.0,28.0,21.0,24.0,22.0,22.0,25.0,17.0,17.0,23.0
3,AK,Juneau,Weighted Annual Mean,6.6,5.6,6.2,6.4,6.6,8.1,8.5,...,5.9,7.7,6.6,6.0,5.6,6.9,6.8,4.8,4.7,5.3
4,AL,Birmingham,98th Percentile,47.0,39.0,36.0,36.0,39.0,43.0,38.0,...,21.0,22.0,20.0,17.0,20.0,19.0,18.0,19.0,23.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,WV,Parkersburg,Weighted Annual Mean,17.8,17.4,15.8,14.9,14.9,16.4,14.7,...,9.4,9.6,9.3,7.9,8.1,7.7,7.7,7.0,7.9,7.4
334,WY,Riverton,98th Percentile,33.0,33.0,33.0,26.0,40.0,30.0,23.0,...,29.0,26.0,20.0,22.0,27.0,22.0,26.0,37.0,29.0,15.0
335,WY,Riverton,Weighted Annual Mean,9.6,9.6,9.6,8.1,9.6,7.7,7.7,...,7.8,6.7,6.2,6.8,7.4,7.3,7.0,7.3,8.5,5.8
336,WY,Sheridan,98th Percentile,35.0,40.0,28.0,26.0,39.0,33.0,24.0,...,17.0,19.0,25.0,23.0,25.0,21.0,18.0,34.0,29.0,29.0


In [25]:
state_pm25 = state_pm25[state_pm25['Trend Statistic'] != '98th Percentile']
state_pm25

Unnamed: 0,State,City,Trend Statistic,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1,AK,Anchorage,Weighted Annual Mean,5.8,6.2,5.8,6.6,6.9,6.7,6.9,...,5.6,7.0,6.5,6.1,5.6,5.0,7.4,5.3,5.2,4.7
3,AK,Juneau,Weighted Annual Mean,6.6,5.6,6.2,6.4,6.6,8.1,8.5,...,5.9,7.7,6.6,6.0,5.6,6.9,6.8,4.8,4.7,5.3
5,AL,Birmingham,Weighted Annual Mean,20.0,17.3,16.4,15.7,16.0,18.0,17.3,...,10.3,10.9,10.3,9.2,9.5,9.0,9.2,8.8,9.8,9.2
7,AL,Daphne,Weighted Annual Mean,14.5,10.6,10.4,12.2,11.4,11.7,11.2,...,8.4,8.9,8.6,7.2,7.4,7.1,7.5,7.9,7.3,7.3
9,AL,Decatur,Weighted Annual Mean,13.1,13.1,13.1,13.7,12.2,13.7,13.2,...,8.6,9.3,8.8,7.4,7.7,7.4,7.6,7.1,9.0,8.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,WV,Fairmont,Weighted Annual Mean,16.0,15.9,15.3,15.0,14.1,15.9,14.6,...,9.3,9.5,9.5,8.6,7.7,7.4,7.6,7.4,7.3,7.1
331,WV,Morgantown,Weighted Annual Mean,15.0,14.9,15.2,14.6,13.8,15.0,13.5,...,8.9,8.6,8.2,7.4,7.3,6.9,7.2,6.2,7.7,7.0
333,WV,Parkersburg,Weighted Annual Mean,17.8,17.4,15.8,14.9,14.9,16.4,14.7,...,9.4,9.6,9.3,7.9,8.1,7.7,7.7,7.0,7.9,7.4
335,WY,Riverton,Weighted Annual Mean,9.6,9.6,9.6,8.1,9.6,7.7,7.7,...,7.8,6.7,6.2,6.8,7.4,7.3,7.0,7.3,8.5,5.8


In [26]:
city_pm25 = state_pm25.set_index('City')

In [27]:
city_pm25.head()


Unnamed: 0_level_0,State,Trend Statistic,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anchorage,AK,Weighted Annual Mean,5.8,6.2,5.8,6.6,6.9,6.7,6.9,5.2,...,5.6,7.0,6.5,6.1,5.6,5.0,7.4,5.3,5.2,4.7
Juneau,AK,Weighted Annual Mean,6.6,5.6,6.2,6.4,6.6,8.1,8.5,6.6,...,5.9,7.7,6.6,6.0,5.6,6.9,6.8,4.8,4.7,5.3
Birmingham,AL,Weighted Annual Mean,20.0,17.3,16.4,15.7,16.0,18.0,17.3,17.2,...,10.3,10.9,10.3,9.2,9.5,9.0,9.2,8.8,9.8,9.2
Daphne,AL,Weighted Annual Mean,14.5,10.6,10.4,12.2,11.4,11.7,11.2,10.4,...,8.4,8.9,8.6,7.2,7.4,7.1,7.5,7.9,7.3,7.3
Decatur,AL,Weighted Annual Mean,13.1,13.1,13.1,13.7,12.2,13.7,13.2,14.4,...,8.6,9.3,8.8,7.4,7.7,7.4,7.6,7.1,9.0,8.5


In [28]:
city_pm25.drop(columns=['Trend Statistic'], inplace=True)

In [29]:
city_pm25.set_index('City', inplace=True)

KeyError: "None of ['City'] are in the columns"

In [30]:
city_25 = city_pm25.transpose()

In [31]:
city_25.reset_index(inplace=True)

In [32]:
city_25 = city_25.drop(index=0)

In [33]:
city_25.head()

City,index,Anchorage,Juneau,Birmingham,Daphne,Decatur,Fort Payne,Gadsden,Huntsville,Little Rock,...,Madison,Milwaukee,Platteville,Charleston,Clarksburg,Fairmont,Morgantown,Parkersburg,Riverton,Sheridan
1,2000,5.8,6.6,20.0,14.5,13.1,17.2,19.5,16.3,15.5,...,12.8,13.1,12.3,18.1,14.9,16.0,15.0,17.8,9.6,12.1
2,2001,6.2,5.6,17.3,10.6,13.1,14.7,17.2,14.6,14.7,...,13.3,13.8,11.9,18.1,14.4,15.9,14.9,17.4,9.6,11.0
3,2002,5.8,6.2,16.4,10.4,13.1,14.4,14.8,13.8,13.2,...,12.3,12.7,10.9,17.1,14.0,15.3,15.2,15.8,9.6,10.1
4,2003,6.6,6.4,15.7,12.2,13.7,15.0,14.3,13.8,12.9,...,11.9,12.3,11.3,16.1,13.4,15.0,14.6,14.9,8.1,10.1
5,2004,6.9,6.6,16.0,11.4,12.2,14.1,14.3,13.4,12.2,...,11.3,12.0,10.8,15.9,13.3,14.1,13.8,14.9,9.6,10.0


In [64]:
#Creating a dash app to display a line chart that shows PM2.5 pollution by year with a dropdown for each city

# Create a Dash app
appPM25 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in city_25.columns[2:]]

# Define app layout
appPM25.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-PM25',
        options=dropdown_options,
        value='Anchorage'
    ),
    dcc.Graph(id='line-plot3')
])

# Define callback to update the line plot
@appPM25.callback(
    Output('line-plot3', 'figure'),
    [Input('dropdown-PM25', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(city_25, x='index', y=selected_column)

    title = f'{selected_column} PM2.5'

    fig.update_layout(title = title, xaxis_title = 'Year', yaxis_title = 'PM2.5 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appPM25.run_server(debug=True, port=8053)

In [65]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level of PM2.5 pollution between 2000 and 2022.
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_pm25['State'].unique():
    data_2000 = state_pm25.loc[state_pm25['State'] == state, '2000']
    data_2022 = state_pm25.loc[state_pm25['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in PM2.5 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in PM2.5 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in PM2.5 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in PM2.5 pollution between 2000 and 2022')


    print('-------------------------------')

State: AK
t_statistic: 2.3999999999999995
p_value: 0.1384502096587143
No significant difference in PM2.5 pollution between 2000 and 2022
-------------------------------
State: AL
t_statistic: 7.601710220633044
p_value: 1.8364299284931228e-05
There is a significant difference in PM2.5 pollution between 2000 and 2022
There was a significant decrease in PM2.5 pollution between 2000 and 2022
-------------------------------
State: AR
t_statistic: nan
p_value: nan
No significant difference in PM2.5 pollution between 2000 and 2022
-------------------------------
State: AZ
t_statistic: 1.1214296958515246
p_value: 0.32487611072332695
No significant difference in PM2.5 pollution between 2000 and 2022
-------------------------------
State: CA
t_statistic: 2.8265393387646105
p_value: 0.007175150014392773
There is a significant difference in PM2.5 pollution between 2000 and 2022
There was a significant decrease in PM2.5 pollution between 2000 and 2022
-------------------------------
State: CO
t_sta


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [36]:
SO2 = ['SO2']
SO2_df = mean_2000[mean_2000['Pollutant'].isin(SO2) == True]
state_SO2 = (SO2_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_SO2.head()

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,Birmingham,149.0,63.0,50.0,53.0,56.0,70.0,70.0,50.0,...,22.0,17.0,17.0,11.0,12.0,11.0,10.0,5.0,7.0,6.0
1,CA,San Francisco,20.0,21.0,20.0,15.0,20.0,16.0,22.0,18.0,...,9.0,12.0,8.0,9.0,9.0,10.0,10.0,8.0,9.0,7.0
2,CA,Santa Maria,4.0,8.0,7.0,5.0,11.0,7.0,5.0,5.0,...,3.0,4.0,2.0,2.0,1.0,2.0,3.0,3.0,2.0,2.0
3,CO,Denver,65.0,73.0,73.0,58.0,40.0,39.0,33.0,30.0,...,34.0,16.0,15.0,14.0,13.0,7.0,6.0,6.0,6.0,5.0
4,FL,Jacksonville,107.0,99.0,100.0,68.0,80.0,90.0,94.0,53.0,...,33.0,37.0,27.0,25.0,20.0,22.0,24.0,28.0,20.0,20.0


In [37]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between SO2 pollution in 1990 and 2022. 
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_SO2['State'].unique():
    data_2000 = state_SO2.loc[state_SO2['State'] == state, '2000']
    data_2022 = state_SO2.loc[state_SO2['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in SO2 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in SO2 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in SO2 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in SO2 pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AL
t_statistic: nan
p_value: nan
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 0.8948249794078403
p_value: 0.4653077456503607
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: nan
p_value: nan
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: FL
t_statistic: 2.3408832387529346
p_value: 0.04127736090068851
There is a significant difference in SO2 pollution between 2000 and 2022
There was a significant decrease in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: GA
t_statistic: nan
p_value: nan
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: HI
t_st


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [38]:
NO2 = ['NO2']
NO2_df = mean_2000[mean_2000['Pollutant'].isin(NO2) == True]
NO2_df

Unnamed: 0,Primary Key,CBSA,City,State,Pollutant,Trend Statistic,Number of Sites,2000,2001,2002,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
10,11,10740,Albuquerque,NM,NO2,Annual Mean,1,17.0,17.0,19.0,...,12.0,12.0,11.0,10.0,10.0,10.0,9.0,8.0,8.0,9.0
11,12,10740,Albuquerque,NM,NO2,98th Percentile,1,55.0,54.0,57.0,...,45.0,42.0,43.0,44.0,45.0,45.0,44.0,41.0,44.0,44.0
35,36,12060,Atlanta,GA,NO2,Annual Mean,1,18.0,17.0,15.0,...,9.0,11.0,10.0,11.0,9.0,8.0,9.0,8.0,8.0,8.0
44,45,12540,Bakersfield,CA,NO2,Annual Mean,2,15.0,16.0,16.0,...,10.0,9.0,8.0,8.0,7.0,8.0,7.0,7.0,7.0,7.0
45,46,12540,Bakersfield,CA,NO2,98th Percentile,2,52.0,56.0,55.0,...,42.0,41.0,35.0,32.0,33.0,33.0,34.0,30.0,30.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,696,49180,Winston,NC,NO2,98th Percentile,1,59.0,59.0,52.0,...,37.0,41.0,39.0,36.0,38.0,35.0,34.0,30.0,32.0,35.0
702,703,49620,York,PA,NO2,Annual Mean,1,18.0,20.0,17.0,...,10.0,11.0,10.0,10.0,8.0,6.0,7.0,7.0,8.0,9.0
703,704,49620,York,PA,NO2,98th Percentile,1,65.0,66.0,63.0,...,39.0,45.0,52.0,47.0,42.0,41.0,40.0,39.0,40.0,39.0
708,709,49700,Yuba City,CA,NO2,Annual Mean,1,13.0,14.0,15.0,...,10.0,8.0,7.0,7.0,7.0,7.0,6.0,6.0,5.0,7.0


In [39]:
CO = ['CO']
CO_df = mean_2000[mean_2000['Pollutant'].isin(CO) == True]
state_CO = (CO_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_CO

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,Anchorage,5.4,5.7,4.7,5.7,6.4,4.8,4.3,3.6,...,3.1,2.5,2.8,3.0,3.5,2.7,2.4,3.0,2.2,2.4
1,AL,Birmingham,3.7,6.3,3.7,3.1,3.9,2.5,3.1,2.0,...,1.7,2.4,1.9,1.4,1.0,1.2,1.2,1.1,1.0,0.8
2,AZ,Phoenix,5.2,4.5,4.4,3.8,3.4,3.6,3.3,3.0,...,2.0,2.1,1.9,1.9,2.2,2.0,1.7,1.9,1.9,1.7
3,AZ,Tucson,4.7,2.9,2.5,2.6,2.0,2.1,1.8,1.9,...,1.1,1.0,1.0,0.9,1.0,0.8,0.6,0.8,0.7,0.7
4,CA,Fresno,3.3,2.8,2.5,1.8,1.7,1.8,2.1,1.7,...,1.4,1.1,1.0,1.3,1.2,1.3,1.0,2.2,1.1,1.0
5,CA,Los Angeles,5.3,3.9,3.5,3.4,2.9,2.6,2.5,2.1,...,1.7,1.6,1.6,1.3,1.5,1.4,1.3,1.5,1.4,1.2
6,CA,Riverside,2.6,2.2,2.1,2.4,1.8,1.6,1.5,1.5,...,1.2,1.6,1.1,1.3,1.0,1.0,0.7,1.1,1.0,0.9
7,CA,Sacramento,4.6,5.0,3.5,3.8,2.8,3.3,3.0,2.8,...,2.1,1.7,1.9,1.7,1.6,3.3,1.2,2.0,1.1,1.1
8,CA,San Francisco,2.7,2.8,2.2,2.1,1.8,1.7,1.7,1.6,...,1.2,1.2,1.1,1.0,1.2,1.6,0.9,1.6,0.9,0.9
9,CA,Santa Maria,1.1,1.1,1.2,1.0,0.9,0.8,1.3,1.1,...,0.6,0.6,0.5,3.2,1.3,0.7,0.5,0.7,0.6,0.6


In [40]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between CO pollution in 1990 and 2022. 
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_CO['State'].unique():
    data_2000 = state_CO.loc[state_CO['State'] == state, '2000']
    data_2022 = state_CO.loc[state_CO['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in CO pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in CO pollution between 2000 and 2022')
        else:
            print('There was a significant increase in CO pollution between 2000 and 2022')
            
    else:
        print('No significant difference in CO pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AK
t_statistic: nan
p_value: nan
No significant difference in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AL
t_statistic: nan
p_value: nan
No significant difference in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 6.7082039324993685
p_value: 0.02150789041983671
There is a significant difference in CO pollution between 2000 and 2022
There was a significant decrease in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 4.2024635912800745
p_value: 0.0012265180738523498
There is a significant difference in CO pollution between 2000 and 2022
There was a significant decrease in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: 10.999999999999998
p_value: 0.008163401865824484
There is a significant difference 


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [41]:
pm10 = ['PM10']
pm10_df = mean_2000[mean_2000['Pollutant'].isin(pm10) == True]
pm10_df.head()

Unnamed: 0,Primary Key,CBSA,City,State,Pollutant,Trend Statistic,Number of Sites,2000,2001,2002,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
13,14,10740,Albuquerque,NM,PM10,2nd Max,1,35.0,41.0,63.0,...,92.0,106.0,44.0,63.0,46.0,60.0,49.0,67.0,85.0,100.0
73,74,13820,Birmingham,AL,PM10,2nd Max,1,101.0,64.0,42.0,...,32.0,32.0,64.0,35.0,28.0,29.0,28.0,28.0,28.0,28.0
84,85,14500,Boulder,CO,PM10,2nd Max,1,68.0,47.0,41.0,...,44.0,55.0,42.0,41.0,38.0,57.0,50.0,65.0,51.0,39.0
113,114,16700,Charleston,SC,PM10,2nd Max,1,52.0,63.0,39.0,...,34.0,30.0,39.0,66.0,29.0,31.0,54.0,45.0,40.0,53.0
122,123,17460,Cleveland,OH,PM10,2nd Max,4,63.0,66.5,58.8,...,37.3,33.5,35.8,44.5,35.3,38.3,37.3,30.9,33.5,38.5


In [42]:

#create a dataframe summing the total pollutants in each city by year
city_total_df = (mean_2000.groupby('City', sort=True, as_index=False)[years].sum())


In [43]:
#Convert list of cities to be used in a dropdown menu for future plots
city_list = city_total_df['City'].tolist()

In [44]:
#Set index to prepare for dataframe inversion
city_total_df.set_index('City', inplace=True)

In [45]:
city_total_df.head()

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Akron,216.285,192.296,201.1,215.19,220.879,198.791,181.874,91.187,79.975,68.767,...,56.759,52.96,47.465,33.26,28.966,31.968,32.262,32.963,33.067,30.17
Albany,97.07,93.989,89.991,86.381,93.175,98.282,90.27,99.178,80.378,75.166,...,63.163,59.561,57.862,53.068,56.361,56.966,54.355,61.954,65.661,49.059
Albuquerque,136.972,140.371,166.174,167.076,126.071,154.073,133.671,140.369,131.164,114.663,...,171.667,185.662,127.066,141.665,122.669,136.174,124.067,141.271,160.571,174.371
Altoona,62.08,68.083,51.089,62.083,61.073,72.077,48.071,56.071,55.075,51.065,...,29.065,30.06,31.069,13.062,7.063,8.064,8.064,4.063,4.061,6.062
Americus,0.092,0.082,0.07,0.072,0.07,0.071,0.077,0.076,0.068,0.06,...,0.06,0.059,0.057,0.065,0.058,0.062,0.062,0.054,0.059,0.059


In [46]:
#Transposing the dataframe
cities_df = city_total_df.transpose()
cities_df.head()


City,Akron,Albany,Albuquerque,Altoona,Americus,Anchorage,Ann Arbor,Appleton,Arkadelphia,Asheville,...,Washington Court House,Watertown,Wausau,Whitewater,Wichita,Wilmington,Winston,Yakima,York,Yuba City
2000,216.285,97.07,136.972,62.08,0.092,31.2,44.335,43.5,0.029,49.49,...,0.024,0.072,0.073,0.076,36.28,0.08,196.489,48.5,218.79,128.679
2001,192.296,93.989,140.371,68.083,0.082,36.9,54.431,44.1,0.021,44.576,...,0.022,0.102,0.072,0.088,33.984,0.078,201.194,48.5,191.787,143.981
2002,201.1,89.991,166.174,51.089,0.07,35.5,45.93,39.8,0.031,44.89,...,0.02,0.1,0.073,0.084,36.379,0.08,221.093,45.2,208.901,124.18
2003,215.19,86.381,167.076,62.083,0.072,35.3,53.634,36.4,0.032,43.67,...,0.02,0.089,0.074,0.081,34.68,0.076,191.581,45.2,189.481,114.576
2004,220.879,93.175,126.071,61.073,0.07,43.3,44.724,38.7,0.022,39.373,...,0.024,0.071,0.065,0.069,34.368,0.07,203.078,54.9,230.477,112.073


In [47]:
#reset the index
cities_df.reset_index(inplace=True)

In [48]:
cities_df.head()

City,index,Akron,Albany,Albuquerque,Altoona,Americus,Anchorage,Ann Arbor,Appleton,Arkadelphia,...,Washington Court House,Watertown,Wausau,Whitewater,Wichita,Wilmington,Winston,Yakima,York,Yuba City
0,2000,216.285,97.07,136.972,62.08,0.092,31.2,44.335,43.5,0.029,...,0.024,0.072,0.073,0.076,36.28,0.08,196.489,48.5,218.79,128.679
1,2001,192.296,93.989,140.371,68.083,0.082,36.9,54.431,44.1,0.021,...,0.022,0.102,0.072,0.088,33.984,0.078,201.194,48.5,191.787,143.981
2,2002,201.1,89.991,166.174,51.089,0.07,35.5,45.93,39.8,0.031,...,0.02,0.1,0.073,0.084,36.379,0.08,221.093,45.2,208.901,124.18
3,2003,215.19,86.381,167.076,62.083,0.072,35.3,53.634,36.4,0.032,...,0.02,0.089,0.074,0.081,34.68,0.076,191.581,45.2,189.481,114.576
4,2004,220.879,93.175,126.071,61.073,0.07,43.3,44.724,38.7,0.022,...,0.024,0.071,0.065,0.069,34.368,0.07,203.078,54.9,230.477,112.073


In [49]:
##Creating a dash app to display a line chart that shows overall pollution by year with a dropdown for each state

# Create a Dash app
app = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in cities_df.columns[1:]]

# Define app layout
app.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-column',
        options=dropdown_options,
        value='Akron'
    ),
    dcc.Graph(id='line-plot')
])

# Define callback to update the line plot
@app.callback(
    Output('line-plot', 'figure'),
    [Input('dropdown-column', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(cities_df, x='index', y=selected_column)

    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True, port = 8050)

The following will include some statistics found from these above data starting with the state level.

In [50]:
state_avg.reset_index(inplace=True)


In [51]:
#What are the best and worst states for pollution?
lowest_pollutant = state_avg['2022'].min()
best_state = state_avg.loc[state_avg['2022'].idxmin(), 'State']
print(f'{best_state} is the state with the lowest overall pollution concentration with a pollutant density value of {lowest_pollutant}.')

most_pollutant = state_avg['2022'].max()
worst_state = state_avg.loc[state_avg['2022'].idxmax(), 'State']
print(f'{worst_state} is the state with the highest overall pollution concentration with a pollutant density value of {most_pollutant}.')

ME is the state with the lowest overall pollution concentration with a pollutant density value of 0.057666666666666665.
OR is the state with the highest overall pollution concentration with a pollutant density value of 145.52075.


In [52]:
#Which states improved and deproved over time the most in pollution concentration?

state_avg['Change_Over_Time'] = state_avg['2022'] - state_avg['2000']

best_change = state_avg['Change_Over_Time'].min()
best_change_state = state_avg.loc[state_avg['Change_Over_Time'].idxmin(), 'State']
print(f'{best_change_state} is the state with the best overall pollution change over time with a pollutant density change of {best_change}.')


worst_change = state_avg['Change_Over_Time'].max()
worst_change_state = state_avg.loc[state_avg['Change_Over_Time'].idxmax(), 'State']
print(f'{worst_change_state} is the state with the worst overall pollution change over time with a pollutant density change of {worst_change}.')



WV is the state with the best overall pollution change over time with a pollutant density change of -99.66980000000001.
OR is the state with the worst overall pollution change over time with a pollutant density change of 91.85024999999999.


The following will take a look at the same statistics at the city level.

In [53]:
city_total_df.reset_index(inplace=True)

In [54]:
#What are the best and worst cities for pollution?
low_pollutant = city_total_df['2022'].min()
best_city = city_total_df.loc[city_total_df['2022'].idxmin(), 'City']
print(f'{best_city} is the city with the lowest overall pollution concentration with a value of {low_pollutant}.')

high_pollutant = city_total_df['2022'].max()
worst_city = city_total_df.loc[city_total_df['2022'].idxmax(), 'City']
print(f'{worst_city} is the city with the highest pollution concentration with a value of {high_pollutant}.')

Bellingham is the city with the lowest overall pollution concentration with a value of 0.048.
Eugene is the city with the highest pollution concentration with a value of 506.758.


In [55]:
#Which cities improved and deproved over time the most in pollution concentration?

city_total_df['Change_Over_Time'] = city_total_df['2022'] - city_total_df['2000']

best_change_c = city_total_df['Change_Over_Time'].min()
best_change_city = city_total_df.loc[city_total_df['Change_Over_Time'].idxmin(), 'City']
print(f'{best_change_city} is the city with the best overall pollution change over time with a pollutant density change of {best_change_c}.')


worst_change_c = city_total_df['Change_Over_Time'].max()
worst_change_city = city_total_df.loc[city_total_df['Change_Over_Time'].idxmax(), 'City']
print(f'{worst_change_city} is the city with the worst overall pollution change over time with a pollutant density change of {worst_change_c}.')

Birmingham is the city with the best overall pollution change over time with a pollutant density change of -257.727.
Eugene is the city with the worst overall pollution change over time with a pollutant density change of 380.402.


Now we will take a look at some statistics at the individual pollutant level.

In [71]:
#Summary statistics by each year for O3 concentration by state.
O3_avg.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
count,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0
mean,0.073771,0.074851,0.0786,0.073609,0.067585,0.072209,0.07119,0.070748,0.067098,0.063244,...,0.064828,0.064436,0.065622,0.065211,0.06529,0.066836,0.062243,0.061393,0.064952,0.063329
std,0.012556,0.013507,0.014898,0.010678,0.009215,0.012046,0.010641,0.0125,0.009695,0.007225,...,0.006249,0.004877,0.005427,0.006716,0.005681,0.00589,0.00534,0.007371,0.006631,0.006182
min,0.043,0.042,0.043,0.038,0.046,0.038,0.04,0.033,0.041,0.0465,...,0.047,0.052,0.048,0.047,0.046,0.046,0.051,0.044,0.045,0.044
25%,0.067167,0.070116,0.072875,0.070813,0.064429,0.0669,0.066625,0.065411,0.063354,0.061232,...,0.062025,0.061714,0.0631,0.063625,0.0625,0.063725,0.059083,0.056542,0.061083,0.061
50%,0.075125,0.078875,0.079833,0.075292,0.069,0.074,0.07175,0.074381,0.068333,0.064,...,0.0655,0.064417,0.065,0.065417,0.064583,0.067125,0.06265,0.059714,0.0635,0.062625
75%,0.0835,0.082625,0.08781,0.081179,0.07295,0.07975,0.0785,0.07825,0.073292,0.067562,...,0.068202,0.067938,0.068375,0.069,0.067944,0.069875,0.064917,0.065812,0.067667,0.067125
max,0.095,0.099333,0.106333,0.093,0.082,0.093333,0.092,0.091333,0.084,0.07475,...,0.082667,0.074,0.079667,0.079,0.078333,0.08,0.079,0.08,0.082,0.075333


In [56]:
# Which states have the lowest and highest O3 concentration?
low_O3 = O3_avg['2022'].min()
best_state_O3 = O3_avg.loc[O3_avg['2022'].idxmin(), 'State']
print(f'{best_state_O3} is the state with the lowest concentration of O3 with a value of {low_O3}')

high_O3 = O3_avg['2022'].max()
worst_state_O3 = O3_avg.loc[O3_avg['2022'].idxmax(), 'State']
print(f'{worst_state_O3} is the state with the highest concentration of O3 with a value of {high_O3}')

HI is the state with the lowest concentration of O3 with a value of 0.044
CT is the state with the highest concentration of O3 with a value of 0.07533333333333332


In [72]:
#Summary statistics of PM25 concentration for each year by city.
city_pm25.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
count,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,...,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0
mean,13.040828,12.67574,12.399408,11.859172,11.466864,12.362722,11.404142,11.579882,10.651479,9.688757,...,8.839645,8.730769,8.389941,7.676923,8.018935,8.252071,7.585799,8.207101,8.569231,7.898225
std,3.607275,3.372605,3.397539,2.993453,2.873353,3.314969,2.80606,3.140662,2.649688,2.354894,...,2.360721,2.180842,1.972177,1.800793,1.986082,2.519427,1.566779,2.77387,2.322048,2.065749
min,4.2,4.2,4.0,3.7,3.4,3.8,3.5,4.0,4.4,3.8,...,4.7,3.9,4.6,2.9,3.7,3.2,3.0,2.9,2.5,2.5
25%,10.9,10.7,10.3,9.7,9.7,10.3,9.3,9.6,9.1,8.3,...,7.5,7.5,7.3,6.8,7.1,7.1,6.8,7.0,7.4,6.9
50%,13.1,12.7,12.7,12.0,11.6,12.8,11.7,12.0,10.7,9.7,...,8.7,8.9,8.4,7.6,7.8,7.9,7.6,7.8,8.3,7.7
75%,15.6,14.8,14.5,14.0,13.5,14.6,13.5,14.0,12.2,11.1,...,9.7,9.8,9.3,8.6,8.6,8.8,8.5,8.4,9.4,8.5
max,23.9,22.5,23.2,18.2,18.1,18.9,19.0,21.9,22.7,20.7,...,21.4,20.1,17.1,15.3,17.1,18.6,12.9,20.0,20.7,20.7


In [69]:
# Which states have the lowest and highest PM2.5 concentration?
low_PM25 = city_pm25['2022'].min()
best_state_PM25 = city_pm25.loc[city_pm25['2022'].idxmin(), 'State']
print(f'{best_state_PM25} is the state with the lowest concentration of PM2.5 with an average value of {low_PM25}')

high_PM25 = city_pm25['2022'].max()
worst_state_PM25 = city_pm25.loc[city_pm25['2022'].idxmax(), 'State']
print(f'{worst_state_PM25} is the state with the highest concentration of PM2.5 with an average value of {high_PM25}')

HI is the state with the lowest concentration of PM2.5 with an average value of 2.5
CA is the state with the highest concentration of PM2.5 with an average value of 20.7
