In [1]:
import pandas as pd
import sqlalchemy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import TextBox
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pathlib import Path
import sqlite3
import plotly.graph_objects as go
import plotly.express as px
from ipywidgets import widgets, interactive
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output


In [2]:
#Connecting to the historical sqlite server
conn = sqlite3.connect('data/historical.sqlite')

In [3]:
#Saving the historic table as a dataframe to be explored
historic_df = pd.read_sql_query("SELECT * FROM historical;", conn)

In [4]:
#Looking at the historic dataframe
historic_df.head()

Unnamed: 0,Primary Key,CBSA,City,State,Pollutant,Trend Statistic,Number of Sites,1990,1991,1992,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1,10420,Akron,OH,O3,4th Max,2,0.09,0.101,0.087,...,0.059,0.06,0.065,0.06,0.066,0.068,0.062,0.063,0.067,0.07
1,2,10420,Akron,OH,PM2.5,Weighted Annual Mean,3,,,,...,9.7,9.9,10.4,8.2,7.9,7.9,8.2,7.9,8.0,7.1
2,3,10420,Akron,OH,PM2.5,98th Percentile,3,,,,...,24.0,22.0,23.0,17.0,18.0,18.0,21.0,21.0,20.0,17.0
3,4,10420,Akron,OH,SO2,99th Percentile,1,161.0,183.0,181.0,...,23.0,21.0,14.0,8.0,3.0,6.0,3.0,4.0,5.0,6.0
4,5,10500,Albany,GA,PM2.5,Weighted Annual Mean,1,,,,...,10.0,10.3,9.0,8.7,9.4,8.4,9.3,9.1,10.1,8.3


In [5]:
#Dropping the years 1990 - 1999 because most of the pollutant data is missing from these years. We will get more accurate data to compare looking at years 2000-2022

drop_col = ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
mean_2000 = historic_df.drop(columns=drop_col)
mean_2000

Unnamed: 0,Primary Key,CBSA,City,State,Pollutant,Trend Statistic,Number of Sites,2000,2001,2002,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1,10420,Akron,OH,O3,4th Max,2,0.085,0.096,0.10,...,0.059,0.060,0.065,0.060,0.066,0.068,0.062,0.063,0.067,0.070
1,2,10420,Akron,OH,PM2.5,Weighted Annual Mean,3,16.200,16.200,16.00,...,9.700,9.900,10.400,8.200,7.900,7.900,8.200,7.900,8.000,7.100
2,3,10420,Akron,OH,PM2.5,98th Percentile,3,37.000,44.000,40.00,...,24.000,22.000,23.000,17.000,18.000,18.000,21.000,21.000,20.000,17.000
3,4,10420,Akron,OH,SO2,99th Percentile,1,163.000,132.000,145.00,...,23.000,21.000,14.000,8.000,3.000,6.000,3.000,4.000,5.000,6.000
4,5,10500,Albany,GA,PM2.5,Weighted Annual Mean,1,16.600,14.600,13.80,...,10.000,10.300,9.000,8.700,9.400,8.400,9.300,9.100,10.100,8.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421,709,49700,Yuba City,CA,NO2,Annual Mean,1,13.000,14.000,15.00,...,10.000,8.000,7.000,7.000,7.000,7.000,6.000,6.000,5.000,7.000
1422,710,49700,Yuba City,CA,NO2,98th Percentile,1,67.000,64.000,62.00,...,52.000,44.000,39.000,40.000,42.000,41.000,40.000,38.000,34.000,37.000
1423,711,49700,Yuba City,CA,O3,4th Max,1,0.079,0.081,0.08,...,0.060,0.069,0.064,0.063,0.067,0.065,0.061,0.066,0.072,0.058
1424,712,49700,Yuba City,CA,PM2.5,Weighted Annual Mean,1,10.600,11.900,13.10,...,8.200,9.400,9.600,8.100,9.300,10.300,8.400,16.400,14.500,10.700


In [6]:
#store all the years as a list to call easier
years = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']

In [7]:
#Create a dataframe containing state, city, and year total pollutiion

state_df = (mean_2000.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_df


Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,Anchorage,62.400,73.800,71.000,70.600,86.600,67.000,88.400,51.600,...,61.400,77.000,74.600,64.200,74.200,51.40,89.600,62.600,54.800,60.200
1,AK,Juneau,61.200,63.200,52.400,58.800,65.200,86.200,83.000,93.200,...,57.800,71.400,55.200,60.000,55.200,57.80,63.600,43.600,43.400,56.600
2,AL,Birmingham,641.582,379.366,296.370,315.754,325.948,361.168,336.972,328.584,...,174.130,168.728,226.536,147.340,141.124,138.53,132.942,123.922,137.722,126.128
3,AL,Daphne,101.000,65.200,66.800,82.400,78.800,75.400,74.400,64.800,...,50.800,51.800,53.200,42.400,52.800,48.20,45.000,49.800,44.600,44.600
4,AL,Decatur,88.200,88.200,88.200,87.400,82.400,99.400,84.400,108.800,...,49.200,60.600,51.600,40.800,47.400,46.80,43.200,46.200,66.000,59.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,WY,Gillette,146.000,130.000,138.000,132.000,126.000,112.000,176.000,188.000,...,128.000,94.000,156.000,100.000,190.000,78.00,78.000,122.000,168.000,144.000
241,WY,Laramie,0.086,0.102,0.088,0.098,0.096,0.076,0.094,0.084,...,0.138,0.130,0.128,0.128,0.132,0.14,0.130,0.132,0.144,0.134
242,WY,Riverton,183.200,171.200,199.200,168.200,199.200,177.400,155.400,145.000,...,151.600,167.400,122.400,117.600,146.800,142.60,134.000,210.600,197.000,163.600
243,WY,Rock Springs,64.400,65.000,84.000,64.400,64.000,62.400,73.000,80.400,...,74.000,58.600,70.600,61.400,91.600,90.00,70.000,139.600,94.600,92.400


In [8]:
#Making a new dataframe from the above containing the averages for a future line graph
state_avg = state_df.groupby('State').mean().reset_index()
state_avg.head()

  state_avg = state_df.groupby('State').mean().reset_index()


Unnamed: 0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,61.8,68.5,61.7,64.7,75.9,76.6,85.7,72.4,70.8,...,59.6,74.2,64.9,62.1,64.7,54.6,76.6,53.1,49.1,58.4
1,AL,167.3465,113.962,102.41175,104.03425,106.581,116.88275,105.89,111.83625,94.2325,...,61.028,64.2545,72.22925,54.505,55.00075,54.22975,51.9285,54.74425,57.49775,54.72725
2,AR,49.619,46.8,45.316,39.007,40.295,53.811,38.317,42.887,37.705,...,31.534,32.325,29.823,28.519,27.318,30.428,28.616,26.314,30.427,29.023
3,AZ,149.4392,136.6324,126.364,150.5604,100.394,118.0412,142.678,122.352,123.1144,...,140.2344,143.0296,94.5088,115.6252,125.8304,156.0308,93.7852,119.3912,142.0308,114.628
4,CA,215.974857,228.705929,212.931357,183.257571,173.293286,177.746357,181.124857,180.338429,188.112786,...,159.352571,152.122143,142.914143,131.176571,169.186286,225.781286,131.448857,216.321786,166.682786,142.791714


In [9]:
state_avg.set_index('State', inplace=True)

In [10]:
#Transpose the data to build a line graph
state_avg_pose = state_avg.transpose()
state_avg_pose.head()

State,AK,AL,AR,AZ,CA,CO,CT,DE,FL,GA,...,SC,TN,TX,UT,VA,VT,WA,WI,WV,WY
2000,61.8,167.3465,49.619,149.4392,215.974857,146.604,145.371333,94.4,192.231333,107.89525,...,103.702667,9.6855,145.9194,126.463333,200.461,75.16,49.288,90.743714,254.3888,97.5772
2001,68.5,113.962,46.8,136.6324,228.705929,152.770333,153.332,94.0,179.510333,99.36375,...,100.360667,9.1725,135.8856,161.454,209.478,68.839333,46.686,86.380571,259.5468,93.6604
2002,61.7,102.41175,45.316,126.364,212.931357,144.71,145.279333,120.2,160.082,92.31275,...,86.436,8.19,135.4768,147.926,213.29,79.513333,45.439,77.209714,272.6328,99.4976
2003,64.7,104.03425,39.007,150.5604,183.257571,134.113667,148.986,87.4,146.472333,91.136,...,85.753333,4.164,134.1726,130.519333,209.257,77.031333,43.1,76.721429,220.4684,87.3796
2004,75.9,106.581,40.295,100.394,173.293286,137.968333,129.492,87.2,154.388,98.93525,...,95.553333,8.1495,129.5066,172.644667,197.946,74.763333,48.6965,76.563429,238.8552,97.4592


In [11]:
state_avg_pose.reset_index(inplace=True)

In [12]:
#Renaming the column to year
state_avg_pose.rename(columns={'index':'Year'})

State,Year,AK,AL,AR,AZ,CA,CO,CT,DE,FL,...,SC,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,2000,61.8,167.3465,49.619,149.4392,215.974857,146.604,145.371333,94.4,192.231333,...,103.702667,9.6855,145.9194,126.463333,200.461,75.16,49.288,90.743714,254.3888,97.5772
1,2001,68.5,113.962,46.8,136.6324,228.705929,152.770333,153.332,94.0,179.510333,...,100.360667,9.1725,135.8856,161.454,209.478,68.839333,46.686,86.380571,259.5468,93.6604
2,2002,61.7,102.41175,45.316,126.364,212.931357,144.71,145.279333,120.2,160.082,...,86.436,8.19,135.4768,147.926,213.29,79.513333,45.439,77.209714,272.6328,99.4976
3,2003,64.7,104.03425,39.007,150.5604,183.257571,134.113667,148.986,87.4,146.472333,...,85.753333,4.164,134.1726,130.519333,209.257,77.031333,43.1,76.721429,220.4684,87.3796
4,2004,75.9,106.581,40.295,100.394,173.293286,137.968333,129.492,87.2,154.388,...,95.553333,8.1495,129.5066,172.644667,197.946,74.763333,48.6965,76.563429,238.8552,97.4592
5,2005,76.6,116.88275,53.811,118.0412,177.746357,122.575,147.186667,96.4,151.656667,...,102.360667,9.1625,131.6658,116.728,213.658,75.161333,46.9875,96.865429,226.4252,87.3752
6,2006,85.7,105.89,38.317,142.678,181.124857,115.411667,132.184,85.8,153.409833,...,83.693333,8.662,145.5458,115.521333,201.757,63.222,44.247,86.364571,210.0628,94.3788
7,2007,72.4,111.83625,42.887,122.352,180.338429,134.974333,125.049333,86.2,148.204333,...,96.152667,9.1695,129.1342,172.588,206.458,65.169333,42.837,90.088286,231.6664,97.2568
8,2008,70.8,94.2325,37.705,123.1144,188.112786,128.306,114.229333,78.6,128.499,...,73.421333,7.6535,119.09,175.652,147.654,55.694,41.7805,85.531143,196.3788,85.2176
9,2009,76.4,74.17075,33.893,119.746,154.961,111.001,109.343333,61.0,110.105833,...,74.062,6.1345,106.2706,182.647333,130.028,51.485333,45.2835,84.389143,191.1692,91.6196


In [13]:
#Creating a dash app to display a line chart that shows overall pollution by year with a dropdown for each state

# Create a Dash app
appState = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in state_avg_pose.columns[1:]]

# Define app layout
appState.layout = html.Div([
    dcc.Dropdown(
        id='dropdown',
        options=[{'label': state, 'value': state} for state in state_avg_pose.columns[1:]],
        value='AK'
    ),
    dcc.Graph(id='line-plot1')
])

# Define callback to update the line plot
@appState.callback(
    Output('line-plot1', 'figure'),
    [Input('dropdown', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(state_avg_pose, x='index', y=selected_column)

    title = f'{selected_column} pollution'

    return fig

# Run the app
if __name__ == '__main__':
    appState.run_server(debug=True, port=8051)

In [14]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between overall pollution in 2000 and 2022.

for state in state_df['State'].unique():
    data_2000 = state_df.loc[state_df['State'] == state, '2000']
    data_2022 = state_df.loc[state_df['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference between pollution in 2000 and 2022')

        if p_value > 0:
            print('There is a significant decrease in overall pollution')

            if t_statistic > 0:
                print('There was a significant decrease in pollution between 2000 and 2022')
            else:
                print('There was a significant increase in pollution between 2000 and 2022')

        else:
            print('There is a significant increase in ovall pollution')

    else:
        print('No significant difference between pollution in 2000 and 2022')

    print('-------------------------------')
    print('-------------------------------')



State: AK
t_statistic: 1.7919573407620772
p_value: 0.21501293588266346
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: AL
t_statistic: 1.509119356832565
p_value: 0.15350272780958907
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: AR
t_statistic: 0.35898337317828977
p_value: 0.7539633195973889
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 0.37094151853907703
p_value: 0.7203114747322883
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 1.6670329313806622
p_value: 0.10129822141399748
No significant difference between pollution in 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



For the following we will look at each pollutant seperately.

In [15]:
#Create a dataframe containing only the pollutant O3
O3 = ['O3']
O3_df = mean_2000[mean_2000['Pollutant'].isin(O3) == True]
state_O3 = (O3_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_O3

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,Birmingham,0.182,0.166,0.170,0.154,0.148,0.168,0.172,0.184,...,0.130,0.128,0.136,0.140,0.124,0.130,0.142,0.122,0.122,0.128
1,AL,Fort Payne,0.068,0.064,0.058,0.060,0.054,0.060,0.076,0.040,...,0.124,0.124,0.130,0.128,0.116,0.128,0.124,0.114,0.116,0.122
2,AL,Huntsville,0.176,0.160,0.156,0.158,0.154,0.150,0.158,0.164,...,0.128,0.128,0.126,0.132,0.126,0.130,0.126,0.114,0.122,0.130
3,AL,Mobile,0.178,0.152,0.150,0.160,0.148,0.146,0.170,0.154,...,0.120,0.136,0.120,0.124,0.130,0.130,0.118,0.108,0.114,0.122
4,AL,Montgomery,0.168,0.154,0.160,0.142,0.144,0.138,0.144,0.148,...,0.122,0.120,0.122,0.116,0.110,0.120,0.118,0.096,0.108,0.116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,WI,Wausau,0.146,0.144,0.146,0.148,0.130,0.150,0.132,0.144,...,0.126,0.128,0.126,0.136,0.120,0.130,0.118,0.110,0.120,0.114
171,WI,Whitewater,0.152,0.176,0.168,0.162,0.138,0.166,0.144,0.150,...,0.134,0.146,0.134,0.144,0.130,0.140,0.124,0.148,0.138,0.140
172,WV,Charleston,0.170,0.166,0.174,0.176,0.138,0.158,0.154,0.164,...,0.134,0.134,0.134,0.138,0.134,0.132,0.130,0.114,0.130,0.122
173,WV,Parkersburg,0.174,0.168,0.190,0.166,0.138,0.168,0.160,0.168,...,0.128,0.134,0.142,0.134,0.116,0.122,0.122,0.116,0.114,0.124


In [16]:
O3_avg = state_O3.groupby('State').mean().reset_index()
O3_avg.head()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,0.1544,0.1392,0.1388,0.1348,0.1296,0.1324,0.144,0.138,0.132,...,0.1248,0.1272,0.1268,0.128,0.1212,0.1276,0.1256,0.1108,0.1164,0.1236
1,AR,0.119,0.1,0.116,0.107,0.095,0.111,0.117,0.087,0.105,...,0.134,0.125,0.123,0.119,0.118,0.128,0.116,0.114,0.127,0.123
2,AZ,0.149,0.1405,0.155,0.1505,0.1425,0.1515,0.1475,0.14,0.143,...,0.143,0.137,0.136,0.1315,0.138,0.1385,0.1315,0.139,0.1385,0.135
3,CA,0.156952,0.160286,0.165619,0.162476,0.152952,0.147524,0.156952,0.146476,0.159905,...,0.136762,0.14381,0.142667,0.14019,0.143619,0.136952,0.13181,0.143333,0.138952,0.13181
4,CO,0.141333,0.140667,0.153333,0.160667,0.136667,0.15,0.156667,0.148667,0.145333,...,0.150667,0.140667,0.144,0.138667,0.140667,0.144667,0.132,0.147333,0.155333,0.143333


In [17]:
state_O3.set_index('City', inplace=True)

In [18]:
state_O3.head()

Unnamed: 0_level_0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Birmingham,AL,0.182,0.166,0.17,0.154,0.148,0.168,0.172,0.184,0.154,...,0.13,0.128,0.136,0.14,0.124,0.13,0.142,0.122,0.122,0.128
Fort Payne,AL,0.068,0.064,0.058,0.06,0.054,0.06,0.076,0.04,0.072,...,0.124,0.124,0.13,0.128,0.116,0.128,0.124,0.114,0.116,0.122
Huntsville,AL,0.176,0.16,0.156,0.158,0.154,0.15,0.158,0.164,0.146,...,0.128,0.128,0.126,0.132,0.126,0.13,0.126,0.114,0.122,0.13
Mobile,AL,0.178,0.152,0.15,0.16,0.148,0.146,0.17,0.154,0.152,...,0.12,0.136,0.12,0.124,0.13,0.13,0.118,0.108,0.114,0.122
Montgomery,AL,0.168,0.154,0.16,0.142,0.144,0.138,0.144,0.148,0.136,...,0.122,0.12,0.122,0.116,0.11,0.12,0.118,0.096,0.108,0.116


In [19]:
city_O3 = state_O3.transpose()


In [20]:
city_O3.reset_index(inplace=True)

In [21]:
city_O3 = city_O3.drop(index=0)

In [22]:
city_O3.head()

City,index,Birmingham,Fort Payne,Huntsville,Mobile,Montgomery,Arkadelphia,Little Rock,Flagstaff,Phoenix,...,Bellingham,Seattle,Spokane,Madison,Milwaukee,Wausau,Whitewater,Charleston,Parkersburg,Laramie
1,2000,0.182,0.068,0.176,0.178,0.168,0.058,0.18,0.142,0.164,...,0.104,0.112,0.136,0.144,0.164,0.146,0.152,0.17,0.174,0.086
2,2001,0.166,0.064,0.16,0.152,0.154,0.042,0.158,0.136,0.158,...,0.1,0.102,0.142,0.156,0.188,0.144,0.176,0.166,0.168,0.102
3,2002,0.17,0.058,0.156,0.15,0.16,0.062,0.17,0.158,0.164,...,0.106,0.108,0.142,0.162,0.192,0.146,0.168,0.174,0.19,0.088
4,2003,0.154,0.06,0.158,0.16,0.142,0.064,0.15,0.146,0.162,...,0.112,0.13,0.158,0.156,0.184,0.148,0.162,0.176,0.166,0.098
5,2004,0.148,0.054,0.154,0.148,0.144,0.044,0.146,0.144,0.148,...,0.124,0.124,0.138,0.13,0.146,0.13,0.138,0.138,0.138,0.096


In [23]:
O3_test = city_O3

In [24]:
O3_test['index'] = pd.to_numeric(O3_test['index'].str.replace('[^\d.]', '', regex=True), errors='coerce')

In [25]:
O3_test = O3_test.apply(pd.to_numeric, errors='coerce')
O3_test.head()

City,index,Birmingham,Fort Payne,Huntsville,Mobile,Montgomery,Arkadelphia,Little Rock,Flagstaff,Phoenix,...,Bellingham,Seattle,Spokane,Madison,Milwaukee,Wausau,Whitewater,Charleston,Parkersburg,Laramie
1,2000,0.182,0.068,0.176,0.178,0.168,0.058,0.18,0.142,0.164,...,0.104,0.112,0.136,0.144,0.164,0.146,0.152,0.17,0.174,0.086
2,2001,0.166,0.064,0.16,0.152,0.154,0.042,0.158,0.136,0.158,...,0.1,0.102,0.142,0.156,0.188,0.144,0.176,0.166,0.168,0.102
3,2002,0.17,0.058,0.156,0.15,0.16,0.062,0.17,0.158,0.164,...,0.106,0.108,0.142,0.162,0.192,0.146,0.168,0.174,0.19,0.088
4,2003,0.154,0.06,0.158,0.16,0.142,0.064,0.15,0.146,0.162,...,0.112,0.13,0.158,0.156,0.184,0.148,0.162,0.176,0.166,0.098
5,2004,0.148,0.054,0.154,0.148,0.144,0.044,0.146,0.144,0.148,...,0.124,0.124,0.138,0.13,0.146,0.13,0.138,0.138,0.138,0.096


In [26]:
x = O3_test['index']
y = O3_test['Birmingham']

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(r_value)

-0.8223616462784891


In [27]:
#Creating a dash app to display a line chart that shows O3 pollution by year with a dropdown for each city

# Create a Dash app
appO3 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in O3_test.columns[1:]]

# Define app layout
appO3.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-O3',
        options=dropdown_options,
        value='Birmingham'
    ),
    dcc.Graph(id='line-plot2')
])

# Define callback to update the line plot
@appO3.callback(
    Output('line-plot2', 'figure'),
    [Input('dropdown-O3', 'value')]
)
def update_line_plot(selected_column):

    #Perform linear regression
    x = O3_test['index']
    y = O3_test[selected_column]

    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    regression_line = slope * x + intercept


    fig = px.line(O3_test, x='index', y=selected_column)
    fig.add_trace(go.Scatter(x=x, y=regression_line, mode='lines', name='Linear Regression Line'))
    fig.add_annotation(
        text=f'p-value: {p_value:.3f}, r-value: {r_value:.3f}',
        xref='paper', yref='paper',
        x=1.185, y=0.1,
        showarrow=False
    )

    title = f'{selected_column} O3'

    fig.update_layout(title=title, xaxis_title = 'Year' ,yaxis_title = 'O3 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appO3.run_server(debug=True, port=8052)

In [28]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level for O3 pollution betweeen 2000 and 2022.
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_O3['State'].unique():
    data_2000 = state_O3.loc[state_O3['State'] == state, '2000']
    data_2022 = state_O3.loc[state_O3['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in O3 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in O3 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in O3 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in O3 pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AL
t_statistic: 1.4088771121384105
p_value: 0.19653270107293333
No significant difference in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AR
t_statistic: -0.06549461226639136
p_value: 0.9537378997940775
No significant difference in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 1.9051586888313579
p_value: 0.10541314582633256
No significant difference in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 3.0086178285435383
p_value: 0.004524666556723182
There is a significant difference in O3 pollution between 2000 and 2022
There was a significant decrease in O3 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: -0.4242640687119297
p_value: 0.6931965033474388
No significant difference in O3 pollution between 20


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [29]:
#Create a dataframe containing only PM2.5 and continue for each pollutant
pm_2 = ['PM2.5']
pm25_df = mean_2000[mean_2000['Pollutant'].isin(pm_2) == True]
state_pm25 = (pm25_df.groupby(['State', 'City', 'Trend Statistic'], sort=True, as_index=False)[years].sum())
state_pm25

Unnamed: 0,State,City,Trend Statistic,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,Anchorage,98th Percentile,40.0,50.0,50.0,46.0,60.0,44.0,66.0,...,44.0,58.0,56.0,46.0,56.0,36.0,70.0,46.0,40.0,46.0
1,AK,Anchorage,Weighted Annual Mean,11.6,12.4,11.6,13.2,13.8,13.4,13.8,...,11.2,14.0,13.0,12.2,11.2,10.0,14.8,10.6,10.4,9.4
2,AK,Juneau,98th Percentile,48.0,52.0,40.0,46.0,52.0,70.0,66.0,...,46.0,56.0,42.0,48.0,44.0,44.0,50.0,34.0,34.0,46.0
3,AK,Juneau,Weighted Annual Mean,13.2,11.2,12.4,12.8,13.2,16.2,17.0,...,11.8,15.4,13.2,12.0,11.2,13.8,13.6,9.6,9.4,10.6
4,AL,Birmingham,98th Percentile,94.0,78.0,72.0,72.0,78.0,86.0,76.0,...,42.0,44.0,40.0,34.0,40.0,38.0,36.0,38.0,46.0,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,WV,Parkersburg,Weighted Annual Mean,35.6,34.8,31.6,29.8,29.8,32.8,29.4,...,18.8,19.2,18.6,15.8,16.2,15.4,15.4,14.0,15.8,14.8
334,WY,Riverton,98th Percentile,66.0,66.0,66.0,52.0,80.0,60.0,46.0,...,58.0,52.0,40.0,44.0,54.0,44.0,52.0,74.0,58.0,30.0
335,WY,Riverton,Weighted Annual Mean,19.2,19.2,19.2,16.2,19.2,15.4,15.4,...,15.6,13.4,12.4,13.6,14.8,14.6,14.0,14.6,17.0,11.6
336,WY,Sheridan,98th Percentile,70.0,80.0,56.0,52.0,78.0,66.0,48.0,...,34.0,38.0,50.0,46.0,50.0,42.0,36.0,68.0,58.0,58.0


In [30]:
state_pm25 = state_pm25[state_pm25['Trend Statistic'] != '98th Percentile']
state_pm25

Unnamed: 0,State,City,Trend Statistic,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1,AK,Anchorage,Weighted Annual Mean,11.6,12.4,11.6,13.2,13.8,13.4,13.8,...,11.2,14.0,13.0,12.2,11.2,10.0,14.8,10.6,10.4,9.4
3,AK,Juneau,Weighted Annual Mean,13.2,11.2,12.4,12.8,13.2,16.2,17.0,...,11.8,15.4,13.2,12.0,11.2,13.8,13.6,9.6,9.4,10.6
5,AL,Birmingham,Weighted Annual Mean,40.0,34.6,32.8,31.4,32.0,36.0,34.6,...,20.6,21.8,20.6,18.4,19.0,18.0,18.4,17.6,19.6,18.4
7,AL,Daphne,Weighted Annual Mean,29.0,21.2,20.8,24.4,22.8,23.4,22.4,...,16.8,17.8,17.2,14.4,14.8,14.2,15.0,15.8,14.6,14.6
9,AL,Decatur,Weighted Annual Mean,26.2,26.2,26.2,27.4,24.4,27.4,26.4,...,17.2,18.6,17.6,14.8,15.4,14.8,15.2,14.2,18.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,WV,Fairmont,Weighted Annual Mean,32.0,31.8,30.6,30.0,28.2,31.8,29.2,...,18.6,19.0,19.0,17.2,15.4,14.8,15.2,14.8,14.6,14.2
331,WV,Morgantown,Weighted Annual Mean,30.0,29.8,30.4,29.2,27.6,30.0,27.0,...,17.8,17.2,16.4,14.8,14.6,13.8,14.4,12.4,15.4,14.0
333,WV,Parkersburg,Weighted Annual Mean,35.6,34.8,31.6,29.8,29.8,32.8,29.4,...,18.8,19.2,18.6,15.8,16.2,15.4,15.4,14.0,15.8,14.8
335,WY,Riverton,Weighted Annual Mean,19.2,19.2,19.2,16.2,19.2,15.4,15.4,...,15.6,13.4,12.4,13.6,14.8,14.6,14.0,14.6,17.0,11.6


In [31]:
city_pm25 = state_pm25.set_index('City')

In [32]:
city_pm25.head()


Unnamed: 0_level_0,State,Trend Statistic,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anchorage,AK,Weighted Annual Mean,11.6,12.4,11.6,13.2,13.8,13.4,13.8,10.4,...,11.2,14.0,13.0,12.2,11.2,10.0,14.8,10.6,10.4,9.4
Juneau,AK,Weighted Annual Mean,13.2,11.2,12.4,12.8,13.2,16.2,17.0,13.2,...,11.8,15.4,13.2,12.0,11.2,13.8,13.6,9.6,9.4,10.6
Birmingham,AL,Weighted Annual Mean,40.0,34.6,32.8,31.4,32.0,36.0,34.6,34.4,...,20.6,21.8,20.6,18.4,19.0,18.0,18.4,17.6,19.6,18.4
Daphne,AL,Weighted Annual Mean,29.0,21.2,20.8,24.4,22.8,23.4,22.4,20.8,...,16.8,17.8,17.2,14.4,14.8,14.2,15.0,15.8,14.6,14.6
Decatur,AL,Weighted Annual Mean,26.2,26.2,26.2,27.4,24.4,27.4,26.4,28.8,...,17.2,18.6,17.6,14.8,15.4,14.8,15.2,14.2,18.0,17.0


In [33]:
city_pm25.drop(columns=['Trend Statistic'], inplace=True)

In [34]:
#city_pm25.set_index('City', inplace=True)

In [35]:
city_25 = city_pm25.transpose()

In [36]:
city_25.reset_index(inplace=True)

In [37]:
city_25 = city_25.drop(index=0)

In [38]:
city_25.head()

City,index,Anchorage,Juneau,Birmingham,Daphne,Decatur,Fort Payne,Gadsden,Huntsville,Little Rock,...,Madison,Milwaukee,Platteville,Charleston,Clarksburg,Fairmont,Morgantown,Parkersburg,Riverton,Sheridan
1,2000,11.6,13.2,40.0,29.0,26.2,34.4,39.0,32.6,31.0,...,25.6,26.2,24.6,36.2,29.8,32.0,30.0,35.6,19.2,24.2
2,2001,12.4,11.2,34.6,21.2,26.2,29.4,34.4,29.2,29.4,...,26.6,27.6,23.8,36.2,28.8,31.8,29.8,34.8,19.2,22.0
3,2002,11.6,12.4,32.8,20.8,26.2,28.8,29.6,27.6,26.4,...,24.6,25.4,21.8,34.2,28.0,30.6,30.4,31.6,19.2,20.2
4,2003,13.2,12.8,31.4,24.4,27.4,30.0,28.6,27.6,25.8,...,23.8,24.6,22.6,32.2,26.8,30.0,29.2,29.8,16.2,20.2
5,2004,13.8,13.2,32.0,22.8,24.4,28.2,28.6,26.8,24.4,...,22.6,24.0,21.6,31.8,26.6,28.2,27.6,29.8,19.2,20.0


In [39]:
city25_test = city_25
city25_test['index'] = pd.to_numeric(city25_test['index'].str.replace('[^\d.]', '', regex=True), errors='coerce')
city25_test = city25_test.apply(pd.to_numeric, errors='coerce')

In [40]:
x5 = city25_test['index']
y5 = city25_test['Anchorage']

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(r_value)

-0.8223616462784891


In [41]:
#Creating a dash app to display a line chart that shows PM2.5 pollution by year with a dropdown for each city

# Create a Dash app
appPM25 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in city25_test.columns[2:]]

# Define app layout
appPM25.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-PM25',
        options=dropdown_options,
        value='Anchorage'
    ),
    dcc.Graph(id='line-plot3')
])

# Define callback to update the line plot
@appPM25.callback(
    Output('line-plot3', 'figure'),
    [Input('dropdown-PM25', 'value')]
)
def update_line_plot(selected_column):

    #Perform linear regression
    x25 = city25_test['index']
    y25 = city25_test[selected_column]

    slope, intercept, r_value, p_value, std_err = stats.linregress(x25, y25)
    regression_line = slope * x25 + intercept

    #Create the line plots and add the linear regression line with r-value and p-value
    fig = px.line(city25_test, x='index', y=selected_column)
    fig.add_trace(go.Scatter(x=x25, y=regression_line, mode='lines', name='Linear Regression Line'))
    fig.add_annotation(
        text=f'p-value: {p_value:.3f}, r-value: {r_value:.3f}',
        xref='paper', yref='paper',
        x=1.185, y=0.1,
        showarrow=False
    )


    title = f'{selected_column} PM2.5'

    fig.update_layout(title = title, xaxis_title = 'Year', yaxis_title = 'PM2.5 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appPM25.run_server(debug=True, port=8053)

In [42]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level of PM2.5 pollution between 2000 and 2022.
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_pm25['State'].unique():
    data_2000 = state_pm25.loc[state_pm25['State'] == state, '2000']
    data_2022 = state_pm25.loc[state_pm25['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in PM2.5 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in PM2.5 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in PM2.5 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in PM2.5 pollution between 2000 and 2022')


    print('-------------------------------')

State: AK
t_statistic: 2.3999999999999995
p_value: 0.1384502096587143
No significant difference in PM2.5 pollution between 2000 and 2022
-------------------------------
State: AL
t_statistic: 7.601710220633044
p_value: 1.8364299284931228e-05
There is a significant difference in PM2.5 pollution between 2000 and 2022
There was a significant decrease in PM2.5 pollution between 2000 and 2022
-------------------------------
State: AR
t_statistic: nan
p_value: nan
No significant difference in PM2.5 pollution between 2000 and 2022
-------------------------------
State: AZ
t_statistic: 1.1214296958515246
p_value: 0.32487611072332695
No significant difference in PM2.5 pollution between 2000 and 2022
-------------------------------
State: CA
t_statistic: 2.8265393387646105
p_value: 0.007175150014392773
There is a significant difference in PM2.5 pollution between 2000 and 2022
There was a significant decrease in PM2.5 pollution between 2000 and 2022
-------------------------------
State: CO
t_sta


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [43]:
#Create a dataframe containing only the SO2 pollutant
SO2 = ['SO2']
SO2_df = mean_2000[mean_2000['Pollutant'].isin(SO2) == True]
state_SO2 = (SO2_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_SO2.head()

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,Birmingham,298.0,126.0,100.0,106.0,112.0,140.0,140.0,100.0,...,44.0,34.0,34.0,22.0,24.0,22.0,20.0,10.0,14.0,12.0
1,CA,San Francisco,40.0,42.0,40.0,30.0,40.0,32.0,44.0,36.0,...,18.0,24.0,16.0,18.0,18.0,20.0,20.0,16.0,18.0,14.0
2,CA,Santa Maria,8.0,16.0,14.0,10.0,22.0,14.0,10.0,10.0,...,6.0,8.0,4.0,4.0,2.0,4.0,6.0,6.0,4.0,4.0
3,CO,Denver,130.0,146.0,146.0,116.0,80.0,78.0,66.0,60.0,...,68.0,32.0,30.0,28.0,26.0,14.0,12.0,12.0,12.0,10.0
4,FL,Jacksonville,214.0,198.0,200.0,136.0,160.0,180.0,188.0,106.0,...,66.0,74.0,54.0,50.0,40.0,44.0,48.0,56.0,40.0,40.0


In [44]:
#Mainuplating the dataset to make it useful to make line charts from
city_SO2 = state_SO2.set_index('City')

In [45]:
city_SO2 = city_SO2.transpose()

In [46]:
city_SO2.reset_index(inplace=True)

In [47]:
city_SO2.drop(index=0)

City,index,Birmingham,San Francisco,Santa Maria,Denver,Jacksonville,Miami,Orlando,Palatka,Pensacola,...,York,Beaumont,Corpus Christi,Houston,Richmond,Roanoke,Rutland,Green Bay,Morgantown,Parkersburg
1,2000,298.0,40.0,8.0,130.0,214.0,12.0,52.0,108.0,280.0,...,156.0,270.0,66.0,152.0,150.0,38.0,114.0,174.0,372.0,386.0
2,2001,126.0,42.0,16.0,146.0,198.0,20.0,58.0,120.0,200.0,...,96.0,182.0,56.0,128.0,160.0,36.0,86.0,134.0,394.0,326.0
3,2002,100.0,40.0,14.0,146.0,200.0,12.0,32.0,86.0,168.0,...,116.0,158.0,56.0,128.0,184.0,38.0,112.0,124.0,564.0,250.0
4,2003,106.0,30.0,10.0,116.0,136.0,10.0,26.0,106.0,152.0,...,104.0,156.0,78.0,130.0,168.0,34.0,110.0,116.0,352.0,262.0
5,2004,112.0,40.0,22.0,80.0,160.0,8.0,24.0,96.0,170.0,...,216.0,160.0,80.0,124.0,158.0,32.0,110.0,106.0,364.0,342.0
6,2005,140.0,32.0,14.0,78.0,180.0,4.0,24.0,120.0,138.0,...,224.0,158.0,66.0,124.0,188.0,30.0,108.0,156.0,280.0,334.0
7,2006,140.0,44.0,10.0,66.0,188.0,2.0,24.0,122.0,158.0,...,186.0,322.0,44.0,104.0,156.0,44.0,88.0,136.0,362.0,194.0
8,2007,100.0,36.0,10.0,60.0,106.0,6.0,20.0,68.0,162.0,...,214.0,174.0,34.0,90.0,188.0,34.0,94.0,154.0,386.0,256.0
9,2008,138.0,36.0,8.0,60.0,84.0,2.0,16.0,78.0,238.0,...,192.0,144.0,48.0,62.0,116.0,24.0,66.0,156.0,260.0,298.0
10,2009,82.0,26.0,6.0,66.0,74.0,6.0,16.0,88.0,148.0,...,178.0,164.0,24.0,62.0,94.0,20.0,54.0,134.0,364.0,238.0


In [48]:
#Creating a dash app to display a line chart that shows SO2 pollution by year with a dropdown for each city

# Create a Dash app
appSO2 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in city_SO2.columns[1:]]

# Define app layout
appSO2.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-SO2',
        options=dropdown_options,
        value='Birmingham'
    ),
    dcc.Graph(id='line-plot4')
])

# Define callback to update the line plot
@appSO2.callback(
    Output('line-plot4', 'figure'),
    [Input('dropdown-SO2', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(city_SO2, x='index', y=selected_column)

    title = f'{selected_column} SO2'

    fig.update_layout(title = title, xaxis_title = 'Year', yaxis_title = 'SO2 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appSO2.run_server(debug=True, port=8057)

In [49]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between SO2 pollution in 2000 and 2022. 
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_SO2['State'].unique():
    data_2000 = state_SO2.loc[state_SO2['State'] == state, '2000']
    data_2022 = state_SO2.loc[state_SO2['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in SO2 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in SO2 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in SO2 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in SO2 pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AL
t_statistic: nan
p_value: nan
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 0.8948249794078403
p_value: 0.4653077456503607
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: nan
p_value: nan
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: FL
t_statistic: 2.3408832387529346
p_value: 0.04127736090068851
There is a significant difference in SO2 pollution between 2000 and 2022
There was a significant decrease in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: GA
t_statistic: nan
p_value: nan
No significant difference in SO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: HI
t_st


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [50]:
#Create a dataframe containing only NO2 pollutant
NO2 = ['NO2']
NO2_df = mean_2000[mean_2000['Pollutant'].isin(NO2) == True]
state_NO2 = (NO2_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_NO2.head()

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AZ,Phoenix,58.0,52.0,58.0,52.0,48.0,48.0,48.0,42.0,...,36.0,36.0,32.0,32.0,34.0,32.0,28.0,26.0,30.0,28.0
1,AZ,Tucson,154.0,136.0,144.0,142.0,134.0,124.0,122.0,124.0,...,108.0,106.0,98.0,84.0,90.0,98.0,92.0,90.0,92.0,88.0
2,CA,Bakersfield,134.0,144.0,142.0,140.0,130.0,128.0,146.0,132.0,...,104.0,100.0,86.0,80.0,80.0,82.0,82.0,74.0,74.0,72.0
3,CA,Fresno,190.0,168.0,172.0,182.0,148.0,154.0,152.0,146.0,...,132.0,130.0,112.0,118.0,128.0,136.0,126.0,122.0,104.0,114.0
4,CA,Los Angeles,286.0,266.0,248.0,258.0,224.0,198.0,204.0,206.0,...,152.0,152.0,146.0,140.0,136.0,134.0,124.0,128.0,128.0,122.0


In [51]:
#Manipulate the data to make it usable for line graphs
city_NO2 = state_NO2.set_index('City')

In [52]:
city_NO2.drop(columns={'State'}, inplace=True)

In [53]:
cityNO2 = city_NO2.transpose()

In [54]:
cityNO2.reset_index(inplace=True)

In [55]:
cityNO2.head()

City,index,Phoenix,Tucson,Bakersfield,Fresno,Los Angeles,Merced,Modesto,Oxnard,Riverside,...,Scranton,York,Charleston,Nashville,Beaumont,Dallas,El Paso,Houston,Roanoke,Rutland
0,2000,58.0,154.0,134.0,190.0,286.0,24.0,32.0,156.0,234.0,...,130.0,166.0,22.0,38.0,98.0,144.0,32.0,168.0,120.0,22.0
1,2001,52.0,136.0,144.0,168.0,266.0,24.0,34.0,140.0,238.0,...,146.0,172.0,22.0,36.0,116.0,140.0,36.0,178.0,130.0,26.0
2,2002,58.0,144.0,142.0,172.0,248.0,24.0,34.0,128.0,220.0,...,146.0,160.0,20.0,32.0,100.0,146.0,36.0,168.0,120.0,26.0
3,2003,52.0,142.0,140.0,182.0,258.0,24.0,30.0,130.0,216.0,...,128.0,146.0,20.0,16.0,102.0,136.0,36.0,160.0,120.0,26.0
4,2004,48.0,134.0,130.0,148.0,224.0,22.0,28.0,118.0,200.0,...,116.0,140.0,20.0,32.0,104.0,136.0,34.0,156.0,122.0,24.0


In [56]:
#Creating a dash app to display a line chart that shows SO2 pollution by year with a dropdown for each city

# Create a Dash app
appNO2 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in cityNO2.columns[1:]]

# Define app layout
appNO2.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-NO2',
        options=dropdown_options,
        value='Phoenix'
    ),
    dcc.Graph(id='line-plot5')
])

# Define callback to update the line plot
@appNO2.callback(
    Output('line-plot5', 'figure'),
    [Input('dropdown-NO2', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(cityNO2, x='index', y=selected_column)

    title = f'{selected_column} NO2'

    fig.update_layout(title = title, xaxis_title = 'Year', yaxis_title = 'NO2 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appNO2.run_server(debug=True, port=8060)

In [57]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between NO2 pollution in 2000 and 2022. 
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_NO2['State'].unique():
    data_2000 = state_NO2.loc[state_NO2['State'] == state, '2000']
    data_2022 = state_NO2.loc[state_NO2['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in NO2 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in NO2 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in NO2 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in NO2 pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AZ
t_statistic: 0.847998304005088
p_value: 0.48574052277341995
No significant difference in NO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 2.8663865148272745
p_value: 0.00779617713821866
There is a significant difference in NO2 pollution between 2000 and 2022
There was a significant decrease in NO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: 0.48586579810734964
p_value: 0.6750817833556706
No significant difference in NO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CT
t_statistic: nan
p_value: nan
No significant difference in NO2 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: FL
t_statistic: 1.1663763400809437
p_value: 0.28772140440979
No significant difference in NO2 pollution between 2000 and 2022
-------------------


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [58]:
CO = ['CO']
CO_df = mean_2000[mean_2000['Pollutant'].isin(CO) == True]
state_CO = (CO_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_CO.head()

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AK,Anchorage,10.8,11.4,9.4,11.4,12.8,9.6,8.6,7.2,...,6.2,5.0,5.6,6.0,7.0,5.4,4.8,6.0,4.4,4.8
1,AL,Birmingham,7.4,12.6,7.4,6.2,7.8,5.0,6.2,4.0,...,3.4,4.8,3.8,2.8,2.0,2.4,2.4,2.2,2.0,1.6
2,AZ,Phoenix,10.4,9.0,8.8,7.6,6.8,7.2,6.6,6.0,...,4.0,4.2,3.8,3.8,4.4,4.0,3.4,3.8,3.8,3.4
3,AZ,Tucson,9.4,5.8,5.0,5.2,4.0,4.2,3.6,3.8,...,2.2,2.0,2.0,1.8,2.0,1.6,1.2,1.6,1.4,1.4
4,CA,Fresno,6.6,5.6,5.0,3.6,3.4,3.6,4.2,3.4,...,2.8,2.2,2.0,2.6,2.4,2.6,2.0,4.4,2.2,2.0


In [59]:
#Dataframe manipulation for later graph making
city_CO = state_CO.set_index('City')

In [60]:
city_COpose = city_CO.transpose()

In [61]:
city_COpose.reset_index(inplace=True)

In [62]:
cityCO = city_COpose.drop(index=0)

In [63]:
cityCO.head()

City,index,Anchorage,Birmingham,Phoenix,Tucson,Fresno,Los Angeles,Riverside,Sacramento,San Francisco,...,Albuquerque,Reno,Buffalo,New York,Cleveland,Portland,Johnstown,Scranton,Houston,Rutland
1,2000,10.8,7.4,10.4,9.4,6.6,10.6,5.2,9.2,5.4,...,6.6,10.4,4.0,8.6,15.8,7.6,4.0,4.2,5.2,5.0
2,2001,11.4,12.6,9.0,5.8,5.6,7.8,4.4,10.0,5.6,...,5.8,9.2,3.8,7.8,5.6,6.4,4.2,3.6,4.6,4.4
3,2002,9.4,7.4,8.8,5.0,5.0,7.0,4.2,7.0,4.4,...,5.6,8.8,3.6,7.2,4.0,6.0,5.2,3.2,5.6,4.6
4,2003,11.4,6.2,7.6,5.2,3.6,6.8,4.8,7.6,4.2,...,4.2,7.6,4.8,6.2,5.4,6.2,4.4,3.0,8.4,3.8
5,2004,12.8,7.8,6.8,4.0,3.4,5.8,3.6,5.6,3.6,...,4.4,7.8,2.8,6.6,10.8,7.4,4.2,3.6,3.6,3.6


In [64]:
#Creating a dash app to display a line chart that shows CO pollution by year with a dropdown for each city

# Create a Dash app
appCO = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in cityCO.columns[1:]]

# Define app layout
appCO.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-CO',
        options=dropdown_options,
        value='Anchorage'
    ),
    dcc.Graph(id='line-plot6')
])

# Define callback to update the line plot
@appCO.callback(
    Output('line-plot6', 'figure'),
    [Input('dropdown-CO', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(cityCO, x='index', y=selected_column)

    title = f'{selected_column} CO-pollutant'

    fig.update_layout(title = title, xaxis_title = 'Year', yaxis_title = 'CO Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appCO.run_server(debug=True, port=8062)

In [65]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level between CO pollution in 1990 and 2022. 
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_CO['State'].unique():
    data_2000 = state_CO.loc[state_CO['State'] == state, '2000']
    data_2022 = state_CO.loc[state_CO['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in CO pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in CO pollution between 2000 and 2022')
        else:
            print('There was a significant increase in CO pollution between 2000 and 2022')
            
    else:
        print('No significant difference in CO pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AK
t_statistic: nan
p_value: nan
No significant difference in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AL
t_statistic: nan
p_value: nan
No significant difference in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 6.7082039324993685
p_value: 0.02150789041983671
There is a significant difference in CO pollution between 2000 and 2022
There was a significant decrease in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 4.2024635912800745
p_value: 0.0012265180738523498
There is a significant difference in CO pollution between 2000 and 2022
There was a significant decrease in CO pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: 10.999999999999998
p_value: 0.008163401865824484
There is a significant difference 


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [66]:
#Create a PM10 dataframe
pm10 = ['PM10']
pm10_df = mean_2000[mean_2000['Pollutant'].isin(pm10) == True]
state_PM10 = (pm10_df.groupby(['State', 'City'], sort=True, as_index=False)[years].sum())
state_PM10.head()

Unnamed: 0,State,City,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,Birmingham,202.0,128.0,84.0,100.0,96.0,94.0,80.0,100.0,...,64.0,64.0,128.0,70.0,56.0,58.0,56.0,56.0,56.0,56.0
1,AL,Huntsville,113.4,79.4,81.4,86.0,81.4,86.6,78.0,100.0,...,52.0,58.6,85.4,78.0,45.4,61.4,57.4,59.4,54.0,66.6
2,AZ,Phoenix,214.0,126.4,160.4,252.4,96.8,152.0,155.6,158.4,...,315.6,275.6,124.4,234.0,229.2,426.8,96.0,170.0,291.6,147.6
3,AZ,Tucson,100.0,170.0,60.0,94.0,48.0,62.0,140.0,96.0,...,68.0,114.0,46.0,58.0,84.0,62.0,112.0,122.0,120.0,136.0
4,CA,Fresno,223.0,241.0,173.0,163.0,132.0,178.0,197.0,167.0,...,170.0,164.0,160.0,150.0,190.0,220.0,194.0,283.0,268.0,268.0


In [67]:
city_PM10 = state_PM10.set_index('City')

In [68]:
city_PM10pose = city_PM10.transpose()

In [69]:
city_PM10pose.reset_index(inplace=True)

In [70]:
cityPM10 = city_PM10pose.drop(index=0)

In [71]:
cityPM10.head()

City,index,Birmingham,Huntsville,Phoenix,Tucson,Fresno,Los Angeles,Redding,Riverside,Sacramento,...,Cleveland,Portsmouth,Eugene,Pittsburgh,Charleston,Houston,Provo,Gillette,Riverton,Rock Springs
1,2000,202.0,113.4,214.0,100.0,223.0,172.0,76.0,187.2,132.0,...,126.0,118.0,138.0,122.0,104.0,136.0,178.0,146.0,98.0,64.4
2,2001,128.0,79.4,126.4,170.0,241.0,174.0,88.0,203.6,154.0,...,133.0,104.0,150.0,143.6,126.0,114.6,202.0,130.0,86.0,65.0
3,2002,84.0,81.4,160.4,60.0,173.0,140.0,100.0,170.4,124.0,...,117.6,78.0,158.0,127.0,78.0,135.4,210.0,138.0,114.0,84.0
4,2003,100.0,86.0,252.4,94.0,163.0,172.0,62.0,215.0,104.0,...,121.0,70.0,120.0,132.6,86.0,132.6,236.0,132.0,100.0,64.4
5,2004,96.0,81.4,96.8,48.0,132.0,142.0,80.0,157.6,94.0,...,111.6,62.0,100.0,133.6,90.0,134.0,222.0,126.0,100.0,64.0


In [72]:
#Creating a dash app to display a line chart that shows PM10 pollution by year with a dropdown for each city

# Create a Dash app
appPM10 = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in cityPM10.columns[1:]]

# Define app layout
appPM10.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-PM10',
        options=dropdown_options,
        value='Birmingham'
    ),
    dcc.Graph(id='line-plot7')
])

# Define callback to update the line plot
@appPM10.callback(
    Output('line-plot7', 'figure'),
    [Input('dropdown-PM10', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(cityPM10, x='index', y=selected_column)

    title = f'{selected_column} PM10'

    fig.update_layout(title = title, xaxis_title = 'Year', yaxis_title = 'PM10 Concentration')

    return fig

# Run the app
if __name__ == '__main__':
    appPM10.run_server(debug=True, port=8064)

In [73]:
#Using the above table, perform a t-test for each state checking for significant difference at the 0.05 significance level of PM10 pollution in 1990 and 2022. 
#If values return "nan" it means there was only one record for that state, which does not allow for a t-test.

for state in state_PM10['State'].unique():
    data_2000 = state_PM10.loc[state_PM10['State'] == state, '2000']
    data_2022 = state_PM10.loc[state_PM10['State'] == state, '2022']

    t_statistic, p_value = ttest_ind(data_2000, data_2022)

    print(f'State: {state}')
    print(f't_statistic: {t_statistic}')
    print(f'p_value: {p_value}')
    if p_value < 0.05:
        print('There is a significant difference in PM10 pollution between 2000 and 2022')
        if t_statistic > 0:
            print('There was a significant decrease in PM10 pollution between 2000 and 2022')
        else:
            print('There was a significant increase in PM10 pollution between 2000 and 2022')
            
    else:
        print('No significant difference in PM10 pollution between 2000 and 2022')


    print('-------------------------------')
    print('-------------------------------')

State: AL
t_statistic: 2.160663899492874
p_value: 0.16329153226024773
No significant difference in PM10 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: AZ
t_statistic: 0.26529676809462893
p_value: 0.8156230279812167
No significant difference in PM10 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CA
t_statistic: 0.3308922656626504
p_value: 0.7450196635641749
No significant difference in PM10 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: CO
t_statistic: 1.2535422782033492
p_value: 0.27827697859581185
No significant difference in PM10 pollution between 2000 and 2022
-------------------------------
-------------------------------
State: FL
t_statistic: 0.17755520605710876
p_value: 0.8754274845413441
No significant difference in PM10 pollution between 2000 and 2022
-------------------------------
-----------------------------


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


divide by zero encountered in divide


invalid value encountered in scalar multiply



In [74]:

#create a dataframe summing the total pollutants in each city by year
city_total_df = (mean_2000.groupby('City', sort=True, as_index=False)[years].sum())


In [75]:
#Convert list of cities to be used in a dropdown menu for future plots
city_list = city_total_df['City'].tolist()

In [76]:
#Set index to prepare for dataframe inversion
city_total_df.set_index('City', inplace=True)

In [77]:
city_total_df.head()

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Akron,432.57,384.592,402.2,430.38,441.758,397.582,363.748,182.374,159.95,137.534,...,113.518,105.92,94.93,66.52,57.932,63.936,64.524,65.926,66.134,60.34
Albany,194.14,187.978,179.982,172.762,186.35,196.564,180.54,198.356,160.756,150.332,...,126.326,119.122,115.724,106.136,112.722,113.932,108.71,123.908,131.322,98.118
Albuquerque,273.944,280.742,332.348,334.152,252.142,308.146,267.342,280.738,262.328,229.326,...,343.334,371.324,254.132,283.33,245.338,272.348,248.134,282.542,321.142,348.742
Altoona,124.16,136.166,102.178,124.166,122.146,144.154,96.142,112.142,110.15,102.13,...,58.13,60.12,62.138,26.124,14.126,16.128,16.128,8.126,8.122,12.124
Americus,0.184,0.164,0.14,0.144,0.14,0.142,0.154,0.152,0.136,0.12,...,0.12,0.118,0.114,0.13,0.116,0.124,0.124,0.108,0.118,0.118


In [78]:
#Transposing the dataframe
cities_df = city_total_df.transpose()
cities_df.head()


City,Akron,Albany,Albuquerque,Altoona,Americus,Anchorage,Ann Arbor,Appleton,Arkadelphia,Asheville,...,Washington Court House,Watertown,Wausau,Whitewater,Wichita,Wilmington,Winston,Yakima,York,Yuba City
2000,432.57,194.14,273.944,124.16,0.184,62.4,88.67,87.0,0.058,98.98,...,0.048,0.144,0.146,0.152,72.56,0.16,392.978,97.0,437.58,257.358
2001,384.592,187.978,280.742,136.166,0.164,73.8,108.862,88.2,0.042,89.152,...,0.044,0.204,0.144,0.176,67.968,0.156,402.388,97.0,383.574,287.962
2002,402.2,179.982,332.348,102.178,0.14,71.0,91.86,79.6,0.062,89.78,...,0.04,0.2,0.146,0.168,72.758,0.16,442.186,90.4,417.802,248.36
2003,430.38,172.762,334.152,124.166,0.144,70.6,107.268,72.8,0.064,87.34,...,0.04,0.178,0.148,0.162,69.36,0.152,383.162,90.4,378.962,229.152
2004,441.758,186.35,252.142,122.146,0.14,86.6,89.448,77.4,0.044,78.746,...,0.048,0.142,0.13,0.138,68.736,0.14,406.156,109.8,460.954,224.146


In [79]:
#reset the index
cities_df.reset_index(inplace=True)

In [80]:
cities_df.head()

City,index,Akron,Albany,Albuquerque,Altoona,Americus,Anchorage,Ann Arbor,Appleton,Arkadelphia,...,Washington Court House,Watertown,Wausau,Whitewater,Wichita,Wilmington,Winston,Yakima,York,Yuba City
0,2000,432.57,194.14,273.944,124.16,0.184,62.4,88.67,87.0,0.058,...,0.048,0.144,0.146,0.152,72.56,0.16,392.978,97.0,437.58,257.358
1,2001,384.592,187.978,280.742,136.166,0.164,73.8,108.862,88.2,0.042,...,0.044,0.204,0.144,0.176,67.968,0.156,402.388,97.0,383.574,287.962
2,2002,402.2,179.982,332.348,102.178,0.14,71.0,91.86,79.6,0.062,...,0.04,0.2,0.146,0.168,72.758,0.16,442.186,90.4,417.802,248.36
3,2003,430.38,172.762,334.152,124.166,0.144,70.6,107.268,72.8,0.064,...,0.04,0.178,0.148,0.162,69.36,0.152,383.162,90.4,378.962,229.152
4,2004,441.758,186.35,252.142,122.146,0.14,86.6,89.448,77.4,0.044,...,0.048,0.142,0.13,0.138,68.736,0.14,406.156,109.8,460.954,224.146


In [81]:
##Creating a dash app to display a line chart that shows overall pollution by year with a dropdown for each state

# Create a Dash app
app = dash.Dash(__name__)


dropdown_options = [{'label': col, 'value': col} for col in cities_df.columns[1:]]

# Define app layout
app.layout = html.Div([
    dcc.Dropdown(
        id='dropdown-column',
        options=dropdown_options,
        value='Akron'
    ),
    dcc.Graph(id='line-plot')
])

# Define callback to update the line plot
@app.callback(
    Output('line-plot', 'figure'),
    [Input('dropdown-column', 'value')]
)
def update_line_plot(selected_column):
    fig = px.line(cities_df, x='index', y=selected_column)

    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True, port = 8050)

The following will include some statistics found from these above data starting with the state level.

In [82]:
state_avg.reset_index(inplace=True)


In [83]:
#What are the best and worst states for pollution?
lowest_pollutant = state_avg['2022'].min()
best_state = state_avg.loc[state_avg['2022'].idxmin(), 'State']
print(f'{best_state} is the state with the lowest overall pollution concentration with a pollutant density value of {lowest_pollutant}.')

most_pollutant = state_avg['2022'].max()
worst_state = state_avg.loc[state_avg['2022'].idxmax(), 'State']
print(f'{worst_state} is the state with the highest overall pollution concentration with a pollutant density value of {most_pollutant}.')

ME is the state with the lowest overall pollution concentration with a pollutant density value of 0.11533333333333333.
OR is the state with the highest overall pollution concentration with a pollutant density value of 291.0415.


In [84]:
#Which states improved and deproved over time the most in pollution concentration?

state_avg['Change_Over_Time'] = state_avg['2022'] - state_avg['2000']

best_change = state_avg['Change_Over_Time'].min()
best_change_state = state_avg.loc[state_avg['Change_Over_Time'].idxmin(), 'State']
print(f'{best_change_state} is the state with the best overall pollution change over time with a pollutant density change of {best_change}.')


worst_change = state_avg['Change_Over_Time'].max()
worst_change_state = state_avg.loc[state_avg['Change_Over_Time'].idxmax(), 'State']
print(f'{worst_change_state} is the state with the worst overall pollution change over time with a pollutant density change of {worst_change}.')



WV is the state with the best overall pollution change over time with a pollutant density change of -199.33960000000002.
OR is the state with the worst overall pollution change over time with a pollutant density change of 183.70049999999998.


The following will take a look at the same statistics at the city level.

In [85]:
city_total_df.reset_index(inplace=True)

In [86]:
#What are the best and worst cities for pollution?
low_pollutant = city_total_df['2022'].min()
best_city = city_total_df.loc[city_total_df['2022'].idxmin(), 'City']
print(f'{best_city} is the city with the lowest overall pollution concentration with a value of {low_pollutant}.')

high_pollutant = city_total_df['2022'].max()
worst_city = city_total_df.loc[city_total_df['2022'].idxmax(), 'City']
print(f'{worst_city} is the city with the highest pollution concentration with a value of {high_pollutant}.')

Bellingham is the city with the lowest overall pollution concentration with a value of 0.096.
Eugene is the city with the highest pollution concentration with a value of 1013.516.


In [87]:
#Which cities improved and deproved over time the most in pollution concentration?

city_total_df['Change_Over_Time'] = city_total_df['2022'] - city_total_df['2000']

best_change_c = city_total_df['Change_Over_Time'].min()
best_change_city = city_total_df.loc[city_total_df['Change_Over_Time'].idxmin(), 'City']
print(f'{best_change_city} is the city with the best overall pollution change over time with a pollutant density change of {best_change_c}.')


worst_change_c = city_total_df['Change_Over_Time'].max()
worst_change_city = city_total_df.loc[city_total_df['Change_Over_Time'].idxmax(), 'City']
print(f'{worst_change_city} is the city with the worst overall pollution change over time with a pollutant density change of {worst_change_c}.')

Birmingham is the city with the best overall pollution change over time with a pollutant density change of -515.454.
Eugene is the city with the worst overall pollution change over time with a pollutant density change of 760.804.


Now we will take a look at some statistics at the individual pollutant level.

In [88]:
#Summary statistics by each year for O3 concentration by state.
O3_avg.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
count,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0
mean,0.147543,0.149701,0.1572,0.147218,0.13517,0.144417,0.142379,0.141496,0.134195,0.126487,...,0.129656,0.128872,0.131245,0.130421,0.13058,0.133673,0.124486,0.122787,0.129904,0.126658
std,0.025112,0.027015,0.029795,0.021356,0.018429,0.024093,0.021282,0.025,0.019391,0.014451,...,0.012497,0.009754,0.010853,0.013433,0.011361,0.01178,0.010679,0.014742,0.013262,0.012364
min,0.086,0.084,0.086,0.076,0.092,0.076,0.08,0.066,0.082,0.093,...,0.094,0.104,0.096,0.094,0.092,0.092,0.102,0.088,0.09,0.088
25%,0.134333,0.140232,0.14575,0.141625,0.128857,0.1338,0.13325,0.130821,0.126708,0.122464,...,0.12405,0.123429,0.1262,0.12725,0.125,0.12745,0.118167,0.113083,0.122167,0.122
50%,0.15025,0.15775,0.159667,0.150583,0.138,0.148,0.1435,0.148762,0.136667,0.128,...,0.131,0.128833,0.13,0.130833,0.129167,0.13425,0.1253,0.119429,0.127,0.12525
75%,0.167,0.16525,0.175619,0.162357,0.1459,0.1595,0.157,0.1565,0.146583,0.135125,...,0.136405,0.135875,0.13675,0.138,0.135889,0.13975,0.129833,0.131625,0.135333,0.13425
max,0.19,0.198667,0.212667,0.186,0.164,0.186667,0.184,0.182667,0.168,0.1495,...,0.165333,0.148,0.159333,0.158,0.156667,0.16,0.158,0.16,0.164,0.150667


In [89]:
# Which states have the lowest and highest O3 concentration?
low_O3 = O3_avg['2022'].min()
best_state_O3 = O3_avg.loc[O3_avg['2022'].idxmin(), 'State']
print(f'{best_state_O3} is the state with the lowest concentration of O3 with a value of {low_O3}')

high_O3 = O3_avg['2022'].max()
worst_state_O3 = O3_avg.loc[O3_avg['2022'].idxmax(), 'State']
print(f'{worst_state_O3} is the state with the highest concentration of O3 with a value of {high_O3}')

HI is the state with the lowest concentration of O3 with a value of 0.088
CT is the state with the highest concentration of O3 with a value of 0.15066666666666664


In [90]:
#Summary statistics of PM25 concentration for each year by city.
city_pm25.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
count,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,...,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0
mean,26.081657,25.351479,24.798817,23.718343,22.933728,24.725444,22.808284,23.159763,21.302959,19.377515,...,17.67929,17.461538,16.779882,15.353846,16.03787,16.504142,15.171598,16.414201,17.138462,15.79645
std,7.214549,6.745211,6.795079,5.986907,5.746706,6.629938,5.612119,6.281325,5.299376,4.709789,...,4.721441,4.361684,3.944354,3.601587,3.972163,5.038853,3.133558,5.54774,4.644095,4.131498
min,8.4,8.4,8.0,7.4,6.8,7.6,7.0,8.0,8.8,7.6,...,9.4,7.8,9.2,5.8,7.4,6.4,6.0,5.8,5.0,5.0
25%,21.8,21.4,20.6,19.4,19.4,20.6,18.6,19.2,18.2,16.6,...,15.0,15.0,14.6,13.6,14.2,14.2,13.6,14.0,14.8,13.8
50%,26.2,25.4,25.4,24.0,23.2,25.6,23.4,24.0,21.4,19.4,...,17.4,17.8,16.8,15.2,15.6,15.8,15.2,15.6,16.6,15.4
75%,31.2,29.6,29.0,28.0,27.0,29.2,27.0,28.0,24.4,22.2,...,19.4,19.6,18.6,17.2,17.2,17.6,17.0,16.8,18.8,17.0
max,47.8,45.0,46.4,36.4,36.2,37.8,38.0,43.8,45.4,41.4,...,42.8,40.2,34.2,30.6,34.2,37.2,25.8,40.0,41.4,41.4


In [91]:
# Which states have the lowest and highest PM2.5 concentration?
low_PM25 = city_pm25['2022'].min()
best_state_PM25 = city_pm25.loc[city_pm25['2022'].idxmin(), 'State']
print(f'{best_state_PM25} is the state with the lowest concentration of PM2.5 with an average value of {low_PM25}')

high_PM25 = city_pm25['2022'].max()
worst_state_PM25 = city_pm25.loc[city_pm25['2022'].idxmax(), 'State']
print(f'{worst_state_PM25} is the state with the highest concentration of PM2.5 with an average value of {high_PM25}')

HI is the state with the lowest concentration of PM2.5 with an average value of 5.0
CA is the state with the highest concentration of PM2.5 with an average value of 41.4
