In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go

import plotly
plotly.offline.init_notebook_mode(connected=True)

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 10px; background: white;"><b style="color:#DC143C;">Particulate Matter</b></h1></center>

Particulate Matter (PM) Basics

What is PM, and how does it get into the air?

"PM stands for particulate matter (also called particle pollution): the term for a mixture of solid particles and liquid droplets found in the air. Some particles, such as dust, dirt, soot, or smoke, are large or dark enough to be seen with the naked eye. Others are so small they can only be detected using an electron microscope."

Particle pollution includes:

"PM10 : inhalable particles, with diameters that are generally 10 micrometers and smaller; and
PM2.5 : fine inhalable particles, with diameters that are generally 2.5 micrometers and smaller."

"How small is 2.5 micrometers? Think about a single hair from your head. The average human hair is about 70 micrometers in diameter – making it 30 times larger than the largest fine particle."

https://www.epa.gov/pm-pollution/particulate-matter-pm-basics

![](https://www.epa.gov/sites/default/files/2016-09/pm2.5_scale_graphic-color_2.jpg)epa.gov

In [None]:
!pip install openpyxl

In [None]:
# Read the 2020 workbook (year taken from the file name) and preview the first rows.
df = pd.read_excel(
    io='/kaggle/input/phase-ii-widsdatathon2022/epa/epa/Datathon_EPA_Air_Quality_Demographics_Meteorology_2020.xlsx'
)
df.head(5)

In [None]:
# Read the 2019 workbook (year taken from the file name) and preview the first rows.
# NOTE(review): df1 is not referenced again in the visible cells - confirm it is still needed.
df1 = pd.read_excel(
    io='/kaggle/input/phase-ii-widsdatathon2022/epa/epa/Datathon_EPA_Air_Quality_Demographics_Meteorology_2019.xlsx'
)
df1.head(5)

In [None]:
# Show every column name of the 2020 frame as a plain Python list.
list(df.columns)

In [None]:
# Number of missing values per column.
df.isna().sum()

# Benzene

To avoid overlapping axis labels, I commented out some of the column names.

In [None]:
#Code by Prem Chotepanit https://www.kaggle.com/code/batprem/more-feature-selection-and-model-back-testing

# Parallel-coordinates view coloured by benzene concentration.
# The demographic fraction columns, wind speed and calendar features are
# deliberately left out of `dimensions` so the long axis labels do not overlap.
fig = px.parallel_coordinates(
    data_frame=df,
    dimensions=[
        'DATE',
        'RELATIVE_HUMIDITY',
        'PM25_UG_PER_CUBIC_METER',
        'OZONE_PPM',
        'BENZENE_PPBC',
    ],
    color='BENZENE_PPBC',
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=2,
    # Friendlier axis / colour-bar names for the pollutant columns
    labels={
        'BENZENE_PPBC': 'Benzene',
        'NO2_PPB': 'Nitrogen Dioxide',
        "CO_PPM": "Carbon monoxide",
    },
)
fig.show()

# Nitrogen Dioxide

In [None]:
#Code by Prem Chotepanit https://www.kaggle.com/code/batprem/more-feature-selection-and-model-back-testing

# Parallel-coordinates view coloured by nitrogen dioxide concentration.
# Humidity, wind speed and the demographic fraction columns are deliberately
# left out of `dimensions` so the long axis labels do not overlap.
fig = px.parallel_coordinates(
    df,
    color='NO2_PPB',
    dimensions=[
        'DATE',
        'PM25_UG_PER_CUBIC_METER',
        'OZONE_PPM',
        'BENZENE_PPBC',
        "CO_PPM",
        'NO2_PPB',
    ],
    # Friendlier axis / colour-bar names for the pollutant columns
    labels={
        'BENZENE_PPBC': 'Benzene',
        'NO2_PPB': 'Nitrogen Dioxide',
        "CO_PPM": "Carbon monoxide"
    },
    color_continuous_scale=px.colors.diverging.delta,
    color_continuous_midpoint=2,
    # Series.min()/max() skip NaN; the Python builtins min()/max() do not, and
    # missing values have not been imputed yet at this point in the notebook.
    range_color=[
        df['NO2_PPB'].min(),
        df['NO2_PPB'].max()
    ]
)
fig.show()

# Carbon monoxide

Some columns are commented out to avoid a cluttered chart, since the column names are too long.

In [None]:
#Code by Prem Chotepanit https://www.kaggle.com/code/batprem/more-feature-selection-and-model-back-testing

# Parallel-coordinates view coloured by carbon monoxide concentration.
# Humidity, wind speed and most demographic fraction columns are deliberately
# left out of `dimensions` so the long axis labels do not overlap.
fig = px.parallel_coordinates(
    df,
    color='CO_PPM',
    dimensions=[
        'DATE',
        'PM25_UG_PER_CUBIC_METER',
        'OZONE_PPM',
        'LOW_INCOME_FRACTION',
        'CO_PPM',
    ],
    # Friendlier axis / colour-bar names for the pollutant columns
    labels={
        'BENZENE_PPBC': 'Benzene',
        'NO2_PPB': 'Nitrogen Dioxide',
        "CO_PPM": "Carbon monoxide"
    },
    color_continuous_scale=px.colors.diverging.Spectral,
    color_continuous_midpoint=2,
    # Series.min()/max() skip NaN; the Python builtins min()/max() do not, and
    # missing values have not been imputed yet at this point in the notebook.
    range_color=[
        df['CO_PPM'].min(),
        df['CO_PPM'].max()
    ]
)
fig.show()

# Dealing with missing values to make some charts

In [None]:
# Non-object columns that contain at least one missing value.
# The threshold is "> 0" (not "> 1") so a column with a single NaN is also
# imputed, matching the rule used for the categorical columns.
numerical_nan = [
    feature
    for feature in df.columns
    if df[feature].dtypes != 'O' and df[feature].isna().sum() > 0
]
numerical_nan

In [None]:
# Missing-value count for each numeric column flagged above.
df.loc[:, numerical_nan].isna().sum()

In [None]:
## Replacing the numerical missing values

for feature in numerical_nan:
    # The median is used instead of the mean because the columns contain outliers
    median_value = df[feature].median()

    # Assign the filled column back: `df[feature].fillna(..., inplace=True)` acts
    # on an intermediate Series, which pandas warns about and which leaves `df`
    # unchanged under copy-on-write.
    df[feature] = df[feature].fillna(median_value)

# Sanity check: every flagged column should now report zero missing values
df[numerical_nan].isnull().sum()

In [None]:
# Object-dtype columns that contain at least one missing value
categorical_nan = [
    column
    for column in df.columns
    if df[column].dtypes == 'O' and df[column].isna().sum() > 0
]
print(categorical_nan)

In [None]:
# Fill missing entries of each flagged categorical column with the literal string 'None'
for column in categorical_nan:
    filled = df[column].fillna(value='None')
    df[column] = filled

In [None]:
# Missing-value count per flagged categorical column after the fill.
df.loc[:, categorical_nan].isna().sum()

# Parts per Billion Concentration (ppbC)

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

# Pollutant columns plotted as targets in the cells below
targets = ["BENZENE_PPBC", "NO2_PPB", "CO_PPM"]
# Full column index of the frame
features = df.columns

In [None]:
# Rows whose STATE is California, re-indexed from zero
is_california = df['STATE'].eq('California')
ca = df.loc[is_california].reset_index(drop=True)
ca.head(2)

<h1><span class="label label-default" style="background-color:white;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#DC143C; padding:10px">California PM 2.5 pcm</span></h1><br>

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

# One scatter plot per pollutant: PM2.5 on x, the pollutant on y, coloured by county
for pollutant in targets:
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.scatterplot(x="PM25_UG_PER_CUBIC_METER", y=pollutant, hue="COUNTY", data=ca, ax=ax)
    ax.set_title('California Counties PM 2.5 pcm')
    plt.show()

<h1><span class="label label-default" style="background-color:white;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#DC143C; padding:10px">Harmful Effects of PM</span></h1><br>

"Most particles form in the atmosphere as a result of complex reactions of chemicals such as sulfur dioxide and nitrogen oxides, which are pollutants emitted from power plants, industries and automobiles."

"Particulate matter contains microscopic solids or liquid droplets that are so small that they can be inhaled and cause serious health problems. Some particles less than 10 micrometers in diameter can get deep into your lungs and some may even get into your bloodstream. Of these, particles less than 2.5 micrometers in diameter, also known as fine particles or PM2.5, pose the greatest risk to health."

"Fine particles are also the main cause of reduced visibility (haze) in parts of the United States, including many of our treasured national parks and wilderness areas."

https://www.epa.gov/pm-pollution/particulate-matter-pm-basics

<h1><span class="label label-default" style="background-color:white;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#DC143C; padding:10px">Michigan PM25 pcm</span></h1><br>

In [None]:
# Rows whose STATE is Michigan, re-indexed from zero
is_michigan = df['STATE'].eq('Michigan')
mi = df.loc[is_michigan].reset_index(drop=True)
mi.head(2)

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

# One scatter plot per pollutant: PM2.5 on x, the pollutant on y, coloured by county
for pollutant in targets:
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x="PM25_UG_PER_CUBIC_METER", y=pollutant, hue="COUNTY", data=mi, ax=ax)
    ax.set_title('Michigan Counties PM 2.5 pcm')
    plt.show()

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

from catboost import CatBoostRegressor
from scipy.stats import randint
from sklearn.mixture import GaussianMixture
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

# Two-cluster k-means on standardized PM2.5 vs. benzene for the whole frame
X = df[["PM25_UG_PER_CUBIC_METER", "BENZENE_PPBC"]]
sc = preprocessing.StandardScaler()
X_norm = sc.fit_transform(X)  # zero mean / unit variance per column
# Clustering: a fixed seed and explicit n_init keep labels and centres
# identical across reruns and scikit-learn versions
cls = KMeans(n_clusters=2, n_init=10, random_state=42)
result = cls.fit(X_norm)
# Plot the result: points coloured by cluster, centres as red stars
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=result.labels_)
plt.scatter(result.cluster_centers_[:, 0], result.cluster_centers_[:, 1], s=250, marker='*', c='red')
plt.title('US Particulate Matter 2.5 pcm vs. Benzene ppb Concentration')
plt.show()

<h1><span class="label label-default" style="background-color:white;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#DC143C; padding:10px">California PM25 pcm </span></h1><br>

NO2_PPB = Nitrogen dioxide parts per billion

PM25 = Particulate Matter 2.5 micrometers

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

# Two-cluster k-means on standardized PM2.5 vs. NO2 for the California subset
X = ca[["PM25_UG_PER_CUBIC_METER", "NO2_PPB"]]
sc = preprocessing.StandardScaler()
X_norm = sc.fit_transform(X)  # zero mean / unit variance per column
# Clustering: a fixed seed and explicit n_init keep labels and centres
# identical across reruns and scikit-learn versions
cls = KMeans(n_clusters=2, n_init=10, random_state=42)
result = cls.fit(X_norm)
# Plot the result: points coloured by cluster, centres as red stars
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=result.labels_)
plt.scatter(result.cluster_centers_[:, 0], result.cluster_centers_[:, 1], s=250, marker='*', c='red')
plt.title('California Particulate Matter 2.5 pcm vs. Nitrogen dioxide ppb')
plt.show()

In [None]:
#Code by Youko___4444  https://www.kaggle.com/code/e34444/notebook08734402d2

# Two-cluster k-means on standardized PM2.5 vs. CO for the Michigan subset
X = mi[["PM25_UG_PER_CUBIC_METER", "CO_PPM"]]
sc = preprocessing.StandardScaler()
X_norm = sc.fit_transform(X)  # zero mean / unit variance per column
# Clustering: a fixed seed and explicit n_init keep labels and centres
# identical across reruns and scikit-learn versions
cls = KMeans(n_clusters=2, n_init=10, random_state=42)
result = cls.fit(X_norm)
# Plot the result: points coloured by cluster, centres as red stars
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=result.labels_)
plt.scatter(result.cluster_centers_[:, 0], result.cluster_centers_[:, 1], s=250, marker='*', c='red')
plt.title('Michigan PM 2.5 pcm vs. Carbon monoxide ppm')
plt.show()

In [None]:
import datetime
import calendar

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Parse DATE as datetimes and derive calendar features from it
df['DATE'] = pd.to_datetime(df['DATE'], format='%Y-%m-%d %H:%M:%S')
df['month'] = df['DATE'].dt.month


def findDay(date):
    """Return the English weekday name (e.g. 'Monday') for a date-like value.

    Accepts anything ``pd.Timestamp`` can parse (Timestamp, datetime, string).
    Unlike a ``strptime`` round-trip on ``str(date)``, this does not fail on
    fractional seconds or on missing dates (NaT).
    """
    return pd.Timestamp(date).day_name()


# Vectorized weekday name; gives the same value as applying findDay row by row
df['weekday'] = df['DATE'].dt.day_name()

df['hour'] = df['DATE'].dt.hour

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from plotnine import *  #allows ggplot
import plotnine as pn

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Carbon monoxide over time, with a dashed horizontal reference line at 7.5
(
    ggplot(df, aes(x='DATE', y='CO_PPM'))
    + geom_line(color='blue', size=0.2)
    + labs(x='Date', y='Carbon Monoxide')
    + theme(figure_size=(16, 4))
    + ggtitle('Carbon Monoxide - Timeseries')
    + geom_hline(yintercept=7.5, linetype="dashed", color="black", size=0.5)
)

#Keep aes. Don't change it.

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Benzene over time, with a dashed horizontal reference line at 40
(
    ggplot(df, aes(x='DATE', y='BENZENE_PPBC'))
    + geom_line(color='red', size=0.2)
    + labs(x='Date', y='Benzene')
    + theme(figure_size=(16, 4))
    + ggtitle('Benzene - Timeseries')
    + geom_hline(yintercept=40, linetype="dashed", color="black", size=0.5)
)

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Nitrogen dioxide over time, with a dashed horizontal reference line at 750
(
    ggplot(df, aes(x='DATE', y='NO2_PPB'))
    + geom_line(color='green', size=0.2)
    + labs(x='Date', y='Nitrogen Dioxide')
    + theme(figure_size=(16, 4))
    + ggtitle('Nitrogen Dioxide - Timeseries')
    + geom_hline(yintercept=750, linetype="dashed", color="black", size=0.5)
)

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Month number -> short label shown on the x axis
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep',10:'Oct',11:'Nov', 12:'Dec'}

# Per-month mean and median of CO, each tagged with the statistic it holds
co_monthly_avg = (
    df.groupby('month')['CO_PPM']
    .mean()
    .reset_index(name='values')
    .assign(type='Average')
)
co_monthly_median = (
    df.groupby('month')['CO_PPM']
    .median()
    .reset_index(name='values')
    .assign(type='Median')
)

# Long-format frame: one row per (month, statistic)
co_df = pd.concat([co_monthly_avg, co_monthly_median])
co_df['month_text'] = co_df['month'].map(month_map)

# Side-by-side bars per month; labels are ordered by the numeric month
(
    ggplot(co_df, aes(x='reorder(month_text,month)', y='values', fill='type'))
    + geom_bar(position="dodge", stat='identity')
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Carbon Monoxide - Monthly Median & Average')
    + scale_fill_manual(values=("#21618c", "#5dade2"))
)

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Month number -> short label shown on the x axis
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep',10:'Oct',11:'Nov', 12:'Dec'}

# Per-month mean and median of benzene, each tagged with the statistic it holds
ben_monthly_avg = (
    df.groupby('month')['BENZENE_PPBC']
    .mean()
    .reset_index(name='values')
    .assign(type='Average')
)
ben_monthly_median = (
    df.groupby('month')['BENZENE_PPBC']
    .median()
    .reset_index(name='values')
    .assign(type='Median')
)

# Long-format frame: one row per (month, statistic)
ben_df = pd.concat([ben_monthly_avg, ben_monthly_median])
ben_df['month_text'] = ben_df['month'].map(month_map)

# Side-by-side bars per month; labels are ordered by the numeric month
(
    ggplot(ben_df, aes(x='reorder(month_text,month)', y='values', fill='type'))
    + geom_bar(position="dodge", stat='identity')
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Benzene - Monthly Median & Average')
    + scale_fill_manual(values=("#943126", "#ec7063"))
)

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Month number -> short label shown on the x axis
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep',10:'Oct',11:'Nov', 12:'Dec'}

# Per-month mean and median of NO2, each tagged with the statistic it holds
nit_monthly_avg = (
    df.groupby('month')['NO2_PPB']
    .mean()
    .reset_index(name='values')
    .assign(type='Average')
)
nit_monthly_median = (
    df.groupby('month')['NO2_PPB']
    .median()
    .reset_index(name='values')
    .assign(type='Median')
)

# Long-format frame: one row per (month, statistic)
nit_df = pd.concat([nit_monthly_avg, nit_monthly_median])
nit_df['month_text'] = nit_df['month'].map(month_map)

# Side-by-side bars per month; labels are ordered by the numeric month
(
    ggplot(nit_df, aes(x='reorder(month_text,month)', y='values', fill='type'))
    + geom_bar(position="dodge", stat='identity')
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Nitrogen Dioxides - Monthly Median & Average')
    + scale_fill_manual(values=("#117864", "#48c9b0"))
)

# The snippet below allocated too much memory, so I gave up on it. It requires Google Cloud.

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Feature columns whose pairwise correlations are examined
column_list = [
    'PM25_UG_PER_CUBIC_METER',
    'RELATIVE_HUMIDITY',
    'WIND_SPEED_METERS_PER_SECOND',
    'OZONE_PPM',
    'LESS_THAN_HS_ED_FRACTION',
    'PEOPLE_OF_COLOR_FRACTION',
    'LOW_INCOME_FRACTION',
    'LINGUISTICALLY_ISOLATED_FRACTION',
]

# Correlation matrix of those features (rows and columns are both the feature names)
corr_df = df[column_list].corr()

def ind_corr(df, col):
    """Return one feature's correlations with every other feature, in long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Square correlation matrix whose index and columns are feature names.
    col : str
        Feature whose row of correlations is extracted.

    Returns
    -------
    pandas.DataFrame
        One row per other feature with columns ``[col, 'variable', 'value']``:
        ``col`` holds the feature's self-correlation, ``variable`` the other
        feature's name and ``value`` the correlation between the two.
    """
    row = df[df.index == col]
    # Build the list of other columns explicitly: list.remove() returns None,
    # so passing its result as value_vars only worked through melt's default.
    # Reading the names from the frame also avoids relying on a global list.
    other_cols = [name for name in row.columns if name != col]
    return pd.melt(row, id_vars=col, value_vars=other_cols)

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

# Correlation of PM2.5 with each of the other features, in long form
pm25_c_corr = ind_corr(corr_df, 'PM25_UG_PER_CUBIC_METER')

# One grey bar per feature showing its correlation with PM2.5
(
    ggplot(pm25_c_corr, aes(x='variable', y='value'))
    + geom_bar(position="dodge", stat='identity', fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - PM25_UG pcm with other features')
)

In [None]:
#Code by Ajay Sampath https://www.kaggle.com/code/ajaysamp/tab-july-full-eda

f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (y column, y-axis label, marker colour) for each panel, left to right
panel_specs = [
    ('CO_PPM', 'Carbon Monoxide', 'blue'),
    ('BENZENE_PPBC', 'Benzene', 'red'),
    ('NO2_PPB', 'Nitrogen Dioxide', 'green'),
]

panels = []
for axis, (y_col, y_label, colour) in zip(axes, panel_specs):
    # PM2.5 is always on the x axis; only the pollutant on y changes
    panel = sns.scatterplot(data=df, x='PM25_UG_PER_CUBIC_METER', y=y_col, ax=axis, color=colour)
    axis.set(xlabel='PM25_UG_PER_CUBIC_METER', ylabel=y_label)
    panels.append(panel)

# Keep the individual handles available under their original names
p1, p2, p3 = panels

plt.suptitle('PM25_UG pcm vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

<h1><span class="label label-default" style="background-color:white;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#DC143C; padding:10px">What is Being Done to Reduce Particle Pollution?</span></h1><br>

"EPA regulates inhalable particles. Particles of sand and large dust, which are larger than 10 micrometers, are not regulated by EPA." 

"EPA’s national and regional rules to reduce emissions of pollutants that form PM will help state and local governments meet the Agency’s national air quality standards."

https://www.epa.gov/pm-pollution/particulate-matter-pm-basics

# LAWA - Land, Air, Water Aotearoa

![](https://www.lawa.org.nz/media/5260240/health-effects-of-pm.png?width=1038&height=743&mode=max)lawa.org.nz

https://www.lawa.org.nz/learn/factsheets/air-quality-topic/air-pollutants-particulate-matter/