# Analysis of Health Impacts and Mortality Risk of Air Pollution in Different Countries 

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy import desc

In [2]:
!pip install psycopg2



In [3]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [4]:
# Connect to local database

engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/air_quality')
connection = engine.connect()

In [5]:
# reflect an existing database into a new model
Base = automap_base()

# reflect the tables
Base.prepare(autoload_with=engine)

In [6]:
# View all of the classes that automap found
Base.classes.keys()

['countries_codes_and_coordinates',
 'ambient_air_quality_data',
 'death_rates_from_air_pollution',
 'disease_burden_by_risk_factor',
 'number_of_deaths_by_risk_factor',
 'outdoor_air_death_rates_by_age',
 'aq_pollution_mortality_data']

In [7]:
# Save references to each table
Countries_codes_and_coordinates = Base.classes.countries_codes_and_coordinates
Ambient_air_quality_data = Base.classes.ambient_air_quality_data
Death_rates_from_air_pollution = Base.classes.death_rates_from_air_pollution
Disease_burden_by_risk_factor = Base.classes.disease_burden_by_risk_factor
Number_of_deaths_by_risk_factor = Base.classes.number_of_deaths_by_risk_factor
Outdoor_air_death_rates_by_age = Base.classes.outdoor_air_death_rates_by_age
Aq_pollution_mortality_data = Base.classes.aq_pollution_mortality_data



In [8]:
# Create our session (link) from Python to the DB
session = Session(engine)
session

<sqlalchemy.orm.session.Session at 0x20dd076c9c8>

# Ambient Air Quality Data


FROM: 'Ambient Air Quality Data' table

Data can be updated by changing the year that you want to check # input year ("2010 to 2019"))and/or by selecting the country

Top countries with the highest concentration of PM25, PM10, and NO2 in the air quality combined in the past 10 years 
(our complete data here covered only 2010-2019, 2020-2022 are incomplete)

Top countries with the highest concentration of PM25 in 2019

Top countries with the highest concentration of PM10  in  2019

Top countries with the highest concentration of NO2  in 2019


In [9]:
Ambient_Air_Quality_Data = pd.read_sql('select * from Ambient_Air_Quality_Data', connection)
# Ambient_Air_Quality_Data

In [26]:
# unique_countries = top_100_countries['country'].unique()
# unique_countries

Top 10 countries in 2019 with the highest concentartion of PM25, PM10, and NO2 in the air quality combined

In [27]:
# Change the year that you want to check ( 2010 to 2019)
# Define the date range for the year 2019
year = 2019

# Create the SQLAlchemy query
result = (session.query(Ambient_air_quality_data.country,
            func.sum(Ambient_air_quality_data.pm25).label('pm25_sum'),
            func.sum(Ambient_air_quality_data.pm10).label('pm10_sum'),
            func.sum(Ambient_air_quality_data.no2).label('no2_sum'))
          .filter(Ambient_air_quality_data.year == year)
          .group_by(Ambient_air_quality_data.country)
          .order_by((func.sum(Ambient_air_quality_data.pm25) +
                     func.sum(Ambient_air_quality_data.pm10) +
                     func.sum(Ambient_air_quality_data.no2)).desc())
          .limit(10))

# Convert the result to a pandas DataFrame
top_10_countries = pd.read_sql(result.statement, session.bind)

# Print the result
top_10_countries


Unnamed: 0,country,pm25_sum,pm10_sum,no2_sum
0,India,5231.42,32467.79,7350.89
1,China,37430.51,310.35,0.0
2,Italy,3246.84,8385.76,8191.51
3,Germany,1616.68,4109.01,7232.68
4,France,1280.87,4954.21,4964.15
5,Spain,843.15,4093.52,4120.49
6,Poland,1648.66,4983.4,1767.99
7,Turkey,962.13,4860.04,1997.51
8,Switzerland,580.2,1302.62,1968.48
9,United Kingdom,585.12,898.41,2319.16


Top 10 countries in 2019 with the highest concentration of PM2.5

In [34]:
# Change the year that you want to check ( 2010 to 2019)
# Define the date range for the year 2019
year = 2019
   
# Create the SQLAlchemy query
result = (session.query(Ambient_air_quality_data.country,
            func.sum(Ambient_air_quality_data.pm25).label('pm25_sum'))
          .filter(Ambient_air_quality_data.year == year)
          .group_by(Ambient_air_quality_data.country)
          .order_by((func.sum(Ambient_air_quality_data.pm25).desc()))
          .limit(10))
# Convert the result to a pandas DataFrame\n",
top_10_countries = pd.read_sql(result.statement, session.bind)
# Print the result
top_10_countries

Unnamed: 0,country,pm25_sum
0,China,37430.51
1,India,5231.42
2,Italy,3246.84
3,Poland,1648.66
4,Germany,1616.68
5,France,1280.87
6,Canada,994.77
7,Turkey,962.13
8,Czechia,871.23
9,Spain,843.15


Top 10 countries in 2019 with the highest concentration of PM10

In [36]:
# Change the year that you want to check ( 2010 to 2019)
# Define the date range for the year 2019,
year = 2019
   
# Create the SQLAlchemy query
result = (session.query(Ambient_air_quality_data.country,
            func.sum(Ambient_air_quality_data.pm10).label('pm10_sum'))
          .filter(Ambient_air_quality_data.year == year)
          .group_by(Ambient_air_quality_data.country)
          .order_by((func.sum(Ambient_air_quality_data.pm10).desc()))
          .limit(10))
# Convert the result to a pandas DataFrame\n",
top_10_countries = pd.read_sql(result.statement, session.bind)
# Print the result
top_10_countries

Unnamed: 0,country,pm10_sum
0,India,32467.79
1,Italy,8385.76
2,Poland,4983.4
3,France,4954.21
4,Turkey,4860.04
5,Germany,4109.01
6,Spain,4093.52
7,United States of America,1844.52
8,Czechia,1786.96
9,Austria,1503.26


Top countries with the highest concentration of NO2  in 2019

In [37]:
# Change the year that you want to check ( 2010 to 2019)
# Define the date range for the year 2019,
year = 2019
   
# Create the SQLAlchemy query
result = (session.query(Ambient_air_quality_data.country,
            func.sum(Ambient_air_quality_data.no2).label('no2_sum'))
          .filter(Ambient_air_quality_data.year == year)
          .group_by(Ambient_air_quality_data.country)
          .order_by((func.sum(Ambient_air_quality_data.no2).desc()))
          .limit(10))
# Convert the result to a pandas DataFrame\n",
top_10_countries = pd.read_sql(result.statement, session.bind)
# Print the result
top_10_countries

Unnamed: 0,country,no2_sum
0,Italy,8191.51
1,India,7350.89
2,Germany,7232.68
3,France,4964.15
4,Spain,4120.49
5,United Kingdom,2319.16
6,Turkey,1997.51
7,Switzerland,1968.48
8,Austria,1796.69
9,Poland,1767.99


# Outdoor Pollution Rates by Age

FROM: 'Outdoor_Air_Death_Rates_by_Age' table

Data can be updated by changing the year that you want to check # input year ("1990 to 2019")) and/or by selecting the country

Top 10 countries with the highest death rates in 2019 age groups combined

Top 10 countries with highest death rates in 2019 for the 'under_5' age group

Top 10 countries with highest death rates in 2019 for the 'age_5_to_14_years' age group

Top 10 countries with highest death rates in 2019 for the 'age_15_to_49_years' age group

Top 10 countries with highest death rates in 2019 for the 'age_50_to_69_years' age group

Top 10 countries with highest death rates in 2019 for the 'age_70plus_years' age group

In [13]:
Outdoor_Air_Death_Rates_by_Age = pd.read_sql('select * from Outdoor_Air_Death_Rates_by_Age', connection)


Top 10 countries with the highest death rates in 2019 age groups combined

In [40]:
import pycountry
import pandas as pd
from sqlalchemy import func

# Define a function to check if a country name is valid
def is_valid_country(country_name):
    try:
        pycountry.countries.search_fuzzy(country_name)
        return True
    except LookupError:
        return False

# Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for the specified age groups
age_pollution_results = session.query(
    Outdoor_air_death_rates_by_age.country,
    Outdoor_air_death_rates_by_age.year,
    func.sum(Outdoor_air_death_rates_by_age.under_5).label('under_5'),
    func.sum(Outdoor_air_death_rates_by_age.age_5_to_14_years).label('age_5_to_14_years'),
    func.sum(Outdoor_air_death_rates_by_age.age_70plus_years).label('age_70plus_years'),
    func.sum(Outdoor_air_death_rates_by_age.age_15_to_49_years).label('age_15_to_49_years'),
    func.sum(Outdoor_air_death_rates_by_age.age_50_to_69_years).label('age_50_to_69_years')
).filter(
    Outdoor_air_death_rates_by_age.year == 2019,
    func.lower(Outdoor_air_death_rates_by_age.country).in_([c.name.lower() for c in pycountry.countries if is_valid_country(c.name)]),
    Outdoor_air_death_rates_by_age.year >= 2015
).group_by(
    Outdoor_air_death_rates_by_age.country,
    Outdoor_air_death_rates_by_age.year
).order_by(
    func.sum(Outdoor_air_death_rates_by_age.under_5).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_5_to_14_years).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_70plus_years).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_15_to_49_years).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_50_to_69_years).desc()
).limit(10).all()

# Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=[
    'country', 'year', 'under_5', 'age_5_to_14_years', 'age_15_to_49_years', 'age_50_to_69_years', 'age_70plus_years'
])

# Display the DataFrame
df.head(10)


Unnamed: 0,country,year,under_5,age_5_to_14_years,age_15_to_49_years,age_50_to_69_years,age_70plus_years
0,Pakistan,2019,101.894562,1.084323,864.855474,13.276061,190.377088
1,Nigeria,2019,98.994145,0.610762,555.734909,4.124567,77.070225
2,Cameroon,2019,79.508952,1.265684,628.605675,7.844955,129.009571
3,Djibouti,2019,74.300174,1.171645,618.384502,8.842384,127.89238
4,India,2019,68.945005,0.858983,1011.86621,12.257924,201.43284
5,Mauritania,2019,64.312576,0.862998,609.23861,5.090131,100.280224
6,Lesotho,2019,62.60748,1.03434,530.967309,9.464656,150.617248
7,Botswana,2019,55.61023,0.963583,607.016358,9.812889,146.446376
8,Ghana,2019,52.974236,0.796439,699.954729,9.767004,146.759379
9,Equatorial Guinea,2019,49.894763,0.530833,791.090141,5.505915,130.307988


Top 10 countries with highest death rates in 2019 for the 'under_5' age group

In [43]:
# Define a function to check if a country name is valid
def is_valid_country(country_name):
    try:
        pycountry.countries.search_fuzzy(country_name)
        return True
    except LookupError:
        return False

#Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for children under 5
age_pollution_results = session.query(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year,
        func.sum(Outdoor_air_death_rates_by_age.under_5).label('under_5')
    ).filter(
        Outdoor_air_death_rates_by_age.year == 2019,
        func.lower(Outdoor_air_death_rates_by_age.country).in_([c.name.lower() for c in pycountry.countries if is_valid_country(c.name)]),
        Outdoor_air_death_rates_by_age.year >= 2015
    ).group_by(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year
    ).order_by(
        func.sum(Outdoor_air_death_rates_by_age.under_5).desc()
    ).limit(10).all()

#Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=['country', 'year', 'under_5'])

#Display the DataFrame
df.head(10)

Unnamed: 0,country,year,under_5
0,Pakistan,2019,101.894562
1,Nigeria,2019,98.994145
2,Cameroon,2019,79.508952
3,Djibouti,2019,74.300174
4,India,2019,68.945005
5,Mauritania,2019,64.312576
6,Lesotho,2019,62.60748
7,Botswana,2019,55.61023
8,Ghana,2019,52.974236
9,Equatorial Guinea,2019,49.894763


Top 10 countries with highest death rates in 2019 for the 'age_5_to_14_years' age group

In [45]:
# Define a function to check if a country name is valid
def is_valid_country(country_name):
    try:
        pycountry.countries.search_fuzzy(country_name)
        return True
    except LookupError:
        return False

#Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for age_5_to_14_years
age_pollution_results = session.query(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year,
        func.sum(Outdoor_air_death_rates_by_age.age_5_to_14_years).label('age_5_to_14_years')
    ).filter(
        Outdoor_air_death_rates_by_age.year == 2019,
        func.lower(Outdoor_air_death_rates_by_age.country).in_([c.name.lower() for c in pycountry.countries if is_valid_country(c.name)]),
        Outdoor_air_death_rates_by_age.year >= 2015
    ).group_by(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year
    ).order_by(
        func.sum(Outdoor_air_death_rates_by_age.age_5_to_14_years).desc()
    ).limit(10).all()

#Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=['country', 'year', 'age_5_to_14_years'])

#Display the DataFrame
df.head(10)

Unnamed: 0,country,year,age_5_to_14_years
0,Uzbekistan,2019,1.79576
1,Egypt,2019,1.365701
2,Cameroon,2019,1.265684
3,Tajikistan,2019,1.245731
4,Djibouti,2019,1.171645
5,Pakistan,2019,1.084323
6,Bangladesh,2019,1.057153
7,Lesotho,2019,1.03434
8,Guinea,2019,1.026235
9,Eritrea,2019,1.016561


Top 10 countries with highest death rates in 2019 for the 'age_15_to_49_years' age group

In [46]:
# Define a function to check if a country name is valid
def is_valid_country(country_name):
    try:
        pycountry.countries.search_fuzzy(country_name)
        return True
    except LookupError:
        return False

#Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for age_15_to_49_years
age_pollution_results = session.query(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year,
        func.sum(Outdoor_air_death_rates_by_age.age_15_to_49_years).label('age_15_to_49_years')
    ).filter(
        Outdoor_air_death_rates_by_age.year == 2019,
        func.lower(Outdoor_air_death_rates_by_age.country).in_([c.name.lower() for c in pycountry.countries if is_valid_country(c.name)]),
        Outdoor_air_death_rates_by_age.year >= 2015
    ).group_by(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year
    ).order_by(
        func.sum(Outdoor_air_death_rates_by_age.age_15_to_49_years).desc()
    ).limit(10).all()

#Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=['country', 'year', 'age_15_to_49_years'])

#Display the DataFrame
df.head(10)

Unnamed: 0,country,year,age_15_to_49_years
0,Mongolia,2019,22.902026
1,Egypt,2019,22.109721
2,Saudi Arabia,2019,20.43216
3,United Arab Emirates,2019,19.927865
4,Uzbekistan,2019,17.254512
5,Turkmenistan,2019,17.196562
6,Libya,2019,14.660402
7,Iraq,2019,13.851077
8,Ukraine,2019,13.632345
9,Pakistan,2019,13.276061


Top 10 countries with highest death rates in 2019 for the 'age_50_to_69_years' age group

In [49]:
# Define a function to check if a country name is valid
def is_valid_country(country_name):
    try:
        pycountry.countries.search_fuzzy(country_name)
        return True
    except LookupError:
        return False

#Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for age_50_to_69_years
age_pollution_results = session.query(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year,
        func.sum(Outdoor_air_death_rates_by_age.age_50_to_69_years).label('age_50_to_69_years')
    ).filter(
        Outdoor_air_death_rates_by_age.year == 2019,
        func.lower(Outdoor_air_death_rates_by_age.country).in_([c.name.lower() for c in pycountry.countries if is_valid_country(c.name)]),
        Outdoor_air_death_rates_by_age.year >= 2015
    ).group_by(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year
    ).order_by(
        func.sum(Outdoor_air_death_rates_by_age.age_50_to_69_years).desc()
    ).limit(10).all()

#Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=['country', 'year', 'age_50_to_69_years'])

#Display the DataFrame
df.head(10)

Unnamed: 0,country,year,age_50_to_69_years
0,Egypt,2019,354.465382
1,Uzbekistan,2019,281.995798
2,Iraq,2019,249.905151
3,Saudi Arabia,2019,221.519942
4,Mongolia,2019,219.160752
5,Turkmenistan,2019,213.625206
6,India,2019,201.43284
7,Morocco,2019,197.195141
8,Nepal,2019,196.347995
9,Tajikistan,2019,192.164736


Top 10 countries with highest death rates in 2019 for the 'age_70plus_years' age group

In [50]:
# Define a function to check if a country name is valid
def is_valid_country(country_name):
    try:
        pycountry.countries.search_fuzzy(country_name)
        return True
    except LookupError:
        return False

#Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for age_70plus_years
age_pollution_results = session.query(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year,
        func.sum(Outdoor_air_death_rates_by_age.age_70plus_years).label('age_70plus_years')
    ).filter(
        Outdoor_air_death_rates_by_age.year == 2019,
        func.lower(Outdoor_air_death_rates_by_age.country).in_([c.name.lower() for c in pycountry.countries if is_valid_country(c.name)]),
        Outdoor_air_death_rates_by_age.year >= 2015
    ).group_by(
        Outdoor_air_death_rates_by_age.country,
        Outdoor_air_death_rates_by_age.year
    ).order_by(
        func.sum(Outdoor_air_death_rates_by_age.age_70plus_years).desc()
    ).limit(10).all()

#Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=['country', 'year', 'age_70plus_years'])

#Display the DataFrame
df.head(10)

Unnamed: 0,country,year,age_70plus_years
0,Uzbekistan,2019,1587.262141
1,Egypt,2019,1303.048885
2,Nepal,2019,1261.0903
3,Oman,2019,1253.966342
4,Tajikistan,2019,1171.260097
5,Bahrain,2019,1087.470906
6,Iraq,2019,1075.839887
7,Azerbaijan,2019,1062.319935
8,Qatar,2019,1032.71167
9,India,2019,1011.86621


# Death_Rates_from_Air_Pollution

FROM: 'Death_Rates_from_Air_Pollution' table

Data can be updated by changing the year that you want to check # input year ("1990 to 2019")) and/or by selecting the country

Top 10 countries in year 2019 who have the highest death rate of 'air_pollution_deaths'

Top 10 countries in year 2019 who have the highest number of 'ambient_particulate_matter_pollution_deaths'

Top 10 countries in year 2019 who have the highest death rate of 'household_air_pollution_deaths'

Top 10 countries in year 2019 who have the highest death rate of 'ambient_ozone_pollution_deaths'

In [51]:
Death_Rates_from_Air_Pollution = pd.read_sql('select * from Death_Rates_from_Air_Pollution', connection)
# Death_Rates_from_Air_Pollution

Top 10 countries in year 2019 who have the highest death rate of 'air_pollution_deaths'

In [60]:
# Query the top 10 countries with highest air pollution deaths in 2019
top10_countries = session.query(
        Death_rates_from_air_pollution.country,
        Death_rates_from_air_pollution.air_pollution_deaths
    ).filter(
        Death_rates_from_air_pollution.year == 2019
    ).order_by(
        Death_rates_from_air_pollution.air_pollution_deaths.desc()
    ).limit(10).all()


# Convert the query result to a pandas dataframe
top10_countries_df = pd.DataFrame(
        top10_countries, 
        columns=['Country', 'Air_Pollution_Deaths']
    )
top10_countries_df


Unnamed: 0,Country,Air_Pollution_Deaths
0,Solomon Islands,432.927795
1,Central African Republic,287.261837
2,Somalia,280.003604
3,Papua New Guinea,254.161654
4,Vanuatu,250.74625
5,Guinea-Bissau,243.934563
6,Afghanistan,238.330404
7,Chad,224.693109
8,Niger,223.494654
9,Nepal,221.997435


Top 10 countries in year 2019 who have the highest number of 'household_air_pollution_deaths'

In [61]:
# Query the top 10 countries with highest household_air_pollution_deaths in 2019
top10_countries = session.query(
        Death_rates_from_air_pollution.country,
        Death_rates_from_air_pollution.household_air_pollution_deaths
    ).filter(
        Death_rates_from_air_pollution.year == 2019
    ).order_by(
        Death_rates_from_air_pollution.household_air_pollution_deaths.desc()
    ).limit(10).all()

# Convert the query result to a pandas dataframe
top10_countries_df = pd.DataFrame(
        top10_countries, 
        columns=['Country', 'household_air_pollution_deaths']
    )
top10_countries_df


Unnamed: 0,Country,household_air_pollution_deaths
0,Solomon Islands,397.259518
1,Somalia,272.016627
2,Central African Republic,251.240382
3,Papua New Guinea,229.517215
4,Vanuatu,217.726538
5,Niger,199.514586
6,Guinea-Bissau,198.904015
7,Chad,195.550233
8,Kiribati,193.456736
9,Burundi,186.151162


Top 10 countries in year 2019 who have the highest death rate of 'ambient_ozone_pollution_deaths'

In [62]:
# Query the top 10 countries with highest ambient_ozone_pollution_deaths in 2019
top10_countries = session.query(
        Death_rates_from_air_pollution.country,
        Death_rates_from_air_pollution.ambient_ozone_pollution_deaths
    ).filter(
        Death_rates_from_air_pollution.year == 2019
    ).order_by(
        Death_rates_from_air_pollution.ambient_ozone_pollution_deaths.desc()
    ).limit(10).all()

# Convert the query result to a pandas dataframe
top10_countries_df = pd.DataFrame(
        top10_countries, 
        columns=['Country', 'ambient_ozone_pollution_deaths']
    )
top10_countries_df

Unnamed: 0,Country,ambient_ozone_pollution_deaths
0,Nepal,34.929455
1,India,18.304516
2,South Asia (WB),16.908039
3,Bhutan,14.53259
4,South-East Asia Region (WHO),13.948752
5,Pakistan,13.93471
6,North Korea,13.737307
7,World Bank Lower Middle Income,10.88162
8,Bangladesh,8.848887
9,Central African Republic,6.462195


Top 10 countries in year 2019 who have the highest number of 'ambient_particulate_matter_pollution_deaths'

In [63]:
# Query the top 10 countries with highest ambient_particulate_matter_pollution_deaths in 2019
top10_countries = session.query(
        Death_rates_from_air_pollution.country,
        Death_rates_from_air_pollution.ambient_particulate_matter_pollution_deaths
    ).filter(
        Death_rates_from_air_pollution.year == 2019
    ).order_by(
        Death_rates_from_air_pollution.ambient_particulate_matter_pollution_deaths.desc()
    ).limit(10).all()

# Convert the query result to a pandas dataframe
top10_countries_df = pd.DataFrame(
        top10_countries, 
        columns=['Country', 'ambient_particulate_matter_pollution_deaths']
    )
top10_countries_df

Unnamed: 0,Country,ambient_particulate_matter_pollution_deaths
0,Uzbekistan,176.445651
1,Egypt,157.467515
2,Qatar,128.860605
3,Oman,127.022083
4,Iraq,121.619637
5,Tajikistan,115.84625
6,Saudi Arabia,109.837659
7,Azerbaijan,108.391364
8,Mongolia,106.759508
9,Bahrain,103.714155


# Disease Burden by Risk Factor

FROM: 'Disease_Burden_by_Risk_Factor' table

Data can be updated by changing the year that you want to check # input year ("1990 to 2019"))

Value of each risk factor DALYs in descending order for year 2019

It will show the ranking of air pollution DALYSs compared to other risk factors


In [68]:
Disease_Burden_by_Risk_Factor = pd.read_sql('select * from Disease_Burden_by_Risk_Factor', connection)
Disease_Burden_by_Risk_Factor.columns

Index(['dalys_id', 'country', 'iso3', 'year', 'dalys_low_physical_activity',
       'dalys_non_exclusive_breastfeeding', 'dalys_air_pollution',
       'dalys_child_wasting', 'dalys_high_systolic_bp',
       'dalys_high_fasting_glucose', 'dalys_child_stunting',
       'dalys_high_body_mass_index', 'dalys_secondhand_smoke',
       'dalys_unsafe_sanitation', 'dalys_unsafe_water_source',
       'dalys_diet_low_in_vegetables', 'dalys_diet_low_in_fruits',
       'dalys_diet_high_in_sodium', 'dalys_drug_use',
       'dalys_household_air_pollution_from_solid_fuels',
       'dalys_high_ldl_cholesterol', 'dalys_iron_deficiency',
       'dalys_zinc_deficiency', 'dalys_smoking', 'dalys_vitamina_deficiency',
       'dalys_particulate_matter_pollution'],
      dtype='object')

In [78]:
Disease_Burden_by_Risk_Factor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 26 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   dalys_id                                        6840 non-null   object 
 1   country                                         6840 non-null   object 
 2   iso3                                            6840 non-null   object 
 3   year                                            6840 non-null   int64  
 4   dalys_low_physical_activity                     6840 non-null   float64
 5   dalys_non_exclusive_breastfeeding               6840 non-null   float64
 6   dalys_air_pollution                             6840 non-null   float64
 7   dalys_child_wasting                             6840 non-null   float64
 8   dalys_high_systolic_bp                          6840 non-null   float64
 9   dalys_high_fasting_glucose               

In [92]:
# Select the desired columns and filter by year 2019
results = session.query(
    Disease_burden_by_risk_factor.country,
    Disease_burden_by_risk_factor.dalys_low_physical_activity,
    Disease_burden_by_risk_factor.dalys_non_exclusive_breastfeeding,
    Disease_burden_by_risk_factor.dalys_air_pollution,
    Disease_burden_by_risk_factor.dalys_child_wasting,
    Disease_burden_by_risk_factor.dalys_high_systolic_bp,
    Disease_burden_by_risk_factor.dalys_high_fasting_glucose,
    Disease_burden_by_risk_factor.dalys_child_stunting,
    Disease_burden_by_risk_factor.dalys_high_body_mass_index,
    Disease_burden_by_risk_factor.dalys_secondhand_smoke,
    Disease_burden_by_risk_factor.dalys_unsafe_sanitation,
    Disease_burden_by_risk_factor.dalys_unsafe_water_source,
    Disease_burden_by_risk_factor.dalys_diet_low_in_vegetables,
    Disease_burden_by_risk_factor.dalys_diet_low_in_fruits,
    Disease_burden_by_risk_factor.dalys_diet_high_in_sodium,
    Disease_burden_by_risk_factor.dalys_drug_use,
    Disease_burden_by_risk_factor.dalys_household_air_pollution_from_solid_fuels,
    Disease_burden_by_risk_factor.dalys_high_ldl_cholesterol,
    Disease_burden_by_risk_factor.dalys_iron_deficiency,
    Disease_burden_by_risk_factor.dalys_zinc_deficiency,
    Disease_burden_by_risk_factor.dalys_smoking,
    Disease_burden_by_risk_factor.dalys_vitamina_deficiency,
    Disease_burden_by_risk_factor.dalys_particulate_matter_pollution
    ).filter(
        Disease_burden_by_risk_factor.year == 2019
    ).all()

# Create a Pandas DataFrame from the results
df = pd.DataFrame(results, columns=[
    'country', 'dalys_low_physical_activity',
       'dalys_non_exclusive_breastfeeding', 'dalys_air_pollution',
       'dalys_child_wasting', 'dalys_high_systolic_bp',
       'dalys_high_fasting_glucose', 'dalys_child_stunting',
       'dalys_high_body_mass_index', 'dalys_secondhand_smoke',
       'dalys_unsafe_sanitation', 'dalys_unsafe_water_source',
       'dalys_diet_low_in_vegetables', 'dalys_diet_low_in_fruits',
       'dalys_diet_high_in_sodium', 'dalys_drug_use',
       'dalys_household_air_pollution_from_solid_fuels',
       'dalys_high_ldl_cholesterol', 'dalys_iron_deficiency',
       'dalys_zinc_deficiency', 'dalys_smoking', 'dalys_vitamina_deficiency',
       'dalys_particulate_matter_pollution'
])

# Select only the columns containing numeric data
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

# Rank the columns by their values in descending order
ranked_cols = df[numeric_cols].mean().sort_values(ascending=False).index.tolist()

# Convert the query results to a pandas DataFrame
df = pd.DataFrame(ranked_cols, columns=['Disease_burden_by_risk_factor by Rank'])

# Display the DataFrame
df.head(10)


Unnamed: 0,Disease_burden_by_risk_factor by Rank
0,dalys_high_systolic_bp
1,dalys_smoking
2,dalys_air_pollution
3,dalys_high_fasting_glucose
4,dalys_high_body_mass_index
5,dalys_particulate_matter_pollution
6,dalys_high_ldl_cholesterol
7,dalys_household_air_pollution_from_solid_fuels
8,dalys_child_wasting
9,dalys_unsafe_water_source


# Number of Deaths by Risk Factor

FROM: 'Number_of_Deaths_by_Risk_Factor' table

Data can be updated by changing the year that you want to check # input year ("1990 to 2019"))

Value of each risk factor mortality rate in descending order for year 2019

It will show the ranking of health risk factors compared to other each other that causes death

In [91]:
Number_of_Deaths_by_Risk_Factor = pd.read_sql('select * from Number_of_Deaths_by_Risk_Factor', connection)
# Number_of_Deaths_by_Risk_Factor.columns

In [90]:
# Select the desired columns and filter by year 2019
results = session.query(
       Number_of_deaths_by_risk_factor.country,
       Number_of_deaths_by_risk_factor.deaths_from_outdoor_air_pollution,
       Number_of_deaths_by_risk_factor.deaths_from_high_systolic_blood_pressure,
       Number_of_deaths_by_risk_factor.deaths_from_high_sodium_diet,
       Number_of_deaths_by_risk_factor.deaths_from_low_whole_grains_diet,
       Number_of_deaths_by_risk_factor.deaths_from_alcohol_use,
       Number_of_deaths_by_risk_factor.deaths_from_low_fruits_diet,
       Number_of_deaths_by_risk_factor.deaths_from_unsafe_water_source,
       Number_of_deaths_by_risk_factor.deaths_from_secondhand_smoke,
       Number_of_deaths_by_risk_factor.death_from_low_birth_weight,
       Number_of_deaths_by_risk_factor.deaths_from_child_wasting,
       Number_of_deaths_by_risk_factor.deaths_from_unsafe_sex,
       Number_of_deaths_by_risk_factor.deaths_from_diet_low_in_nuts_and_seeds,
       Number_of_deaths_by_risk_factor.death_from_household_air_pollution_from_solid_fuels,
       Number_of_deaths_by_risk_factor.deaths_from_diet_low_in_vegetables,
       Number_of_deaths_by_risk_factor.deaths_low_physical_activity,
       Number_of_deaths_by_risk_factor.deaths_from__smoking,
       Number_of_deaths_by_risk_factor.deaths_from_high_fasting_plasma_glucose,
       Number_of_deaths_by_risk_factor.deaths_from_air_pollution,
       Number_of_deaths_by_risk_factor.deaths_from_high_body_mass_index,
       Number_of_deaths_by_risk_factor.deaths_from_unsafe_sanitation,
       Number_of_deaths_by_risk_factor.deaths_from_no_access_to_handwashing_facility, 
       Number_of_deaths_by_risk_factor.deaths_from_drug_use,
       Number_of_deaths_by_risk_factor.deaths_from_low_bone_mineral_density,
       Number_of_deaths_by_risk_factor.deaths_from_vitamina_deficiency, 
       Number_of_deaths_by_risk_factor.deaths_from_child_stunting,
       Number_of_deaths_by_risk_factor.deaths_from_discontinued_breastfeeding,
       Number_of_deaths_by_risk_factor.deaths_from_non_exclusive_breastfeeding,
       Number_of_deaths_by_risk_factor.deaths_from_iron_deficiency
).filter(
       Number_of_deaths_by_risk_factor.year == 2019
).all()

# Create a Pandas DataFrame from the results
df = pd.DataFrame(results, columns=[
       'country',
       'deaths_from_outdoor_air_pollution',
       'deaths_from_high_systolic_blood_pressure',
       'deaths_from_high_sodium_diet', 'deaths_from_low_whole_grains_diet',
       'deaths_from_alcohol_use', 'deaths_from_low_fruits_diet',
       'deaths_from_unsafe_water_source', 'deaths_from_secondhand_smoke',
       'death_from_low_birth_weight', 'deaths_from_child_wasting',
       'deaths_from_unsafe_sex', 'deaths_from_diet_low_in_nuts_and_seeds',
       'death_from_household_air_pollution_from_solid_fuels',
       'deaths_from_diet_low_in_vegetables', 'deaths_low_physical_activity',
       'deaths_from__smoking', 'deaths_from_high_fasting_plasma_glucose',
       'deaths_from_air_pollution', 'deaths_from_high_body_mass_index',
       'deaths_from_unsafe_sanitation',
       'deaths_from_no_access_to_handwashing_facility', 'deaths_from_drug_use',
       'deaths_from_low_bone_mineral_density',
       'deaths_from_vitamina_deficiency', 'deaths_from_child_stunting',
       'deaths_from_discontinued_breastfeeding',
       'deaths_from_non_exclusive_breastfeeding',
       'deaths_from_iron_deficiency'])
                  

  # Select only the columns containing numeric data
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

# Rank the columns by their values in descending order
ranked_cols = df[numeric_cols].mean().sort_values(ascending=False).index.tolist()

# Convert the query results to a pandas DataFrame
df = pd.DataFrame(ranked_cols, columns=['Death Risk Factor by Rank'])

# Display the DataFrame
df.head(10)  
    
    
    
    
    # # Rank the columns by their values in descending order
# ranked_cols = df.mean().sort_values(ascending=False)


# # # Convert the query results to a pandas DataFrame
# # df = pd.DataFrame(ranked_cols, columns=['Mortality Rate' ])
# # # Display the DataFrame
# # df.head(30)

Unnamed: 0,Death Risk Factor by Rank
0,deaths_from_high_systolic_blood_pressure
1,deaths_from__smoking
2,deaths_from_high_fasting_plasma_glucose
3,deaths_from_air_pollution
4,deaths_from_high_body_mass_index
5,deaths_from_outdoor_air_pollution
6,deaths_from_alcohol_use
7,death_from_household_air_pollution_from_solid_...
8,deaths_from_high_sodium_diet
9,deaths_from_low_whole_grains_diet


# Pollution Mortality Data 

FROM: AQ_Pollution_Mortality_Data  

Data can be updated by country name input, then values of pollution deaths and their ranks for each country will show up

Values of pollution deaths and their ranks for country in 2019

Different types of pollutions were ranked from highest to lowest

Values of pollution deaths and their ranks for country in 2019

In [93]:
AQ_Pollution_Mortality_Data = pd.read_sql('select * from AQ_Pollution_Mortality_Data', connection)
AQ_Pollution_Mortality_Data.columns

Index(['mor_id', 'country', 'iso3', 'death_rate_ranking',
       'total_pollution_deaths', 'air_pollution_deaths',
       'water_pollution_deaths', 'occupational_pollution_deaths',
       'lead_deaths'],
      dtype='object')

Different types of pollutions were ranked from highest to lowest

In [105]:
# Select the desired columns and filter by year 2019
results = session.query(
        Aq_pollution_mortality_data.death_rate_ranking,
        Aq_pollution_mortality_data.total_pollution_deaths,
        Aq_pollution_mortality_data.air_pollution_deaths,
        Aq_pollution_mortality_data.water_pollution_deaths,
        Aq_pollution_mortality_data.occupational_pollution_deaths,
        Aq_pollution_mortality_data.lead_deaths).filter().all()

# Create a Pandas DataFrame from the results
df = pd.DataFrame(results, columns=['death_rate_ranking',
       'total_pollution_deaths', 'air_pollution_deaths',
       'water_pollution_deaths', 'occupational_pollution_deaths',
       'lead_deaths'])

# Rank the columns by their values in descending order
ranked_cols = df.mean().sort_values(ascending=False)

# Create a new DataFrame from the ranked columns
df_ranked = pd.DataFrame(ranked_cols, columns=['Mortality Rate'])

# Display the top 10 columns
df_ranked.head(10)



Unnamed: 0,Mortality Rate
total_pollution_deaths,44400.909574
air_pollution_deaths,25905.537234
water_pollution_deaths,8578.117021
lead_deaths,5573.276596
occupational_pollution_deaths,4343.930851
death_rate_ranking,94.5


In [24]:
# Query to get the values of pollution deaths and their rank
pollution_deaths = session.query(
                        Aq_pollution_mortality_data.country,
                        Aq_pollution_mortality_data.total_pollution_deaths,
                        func.rank().over(order_by=Aq_pollution_mortality_data.total_pollution_deaths.desc()).label('total_pollution_deaths_rank'),
                        Aq_pollution_mortality_data.air_pollution_deaths,
                        func.rank().over(order_by=Aq_pollution_mortality_data.air_pollution_deaths.desc()).label('air_pollution_deaths_rank'),
                        Aq_pollution_mortality_data.water_pollution_deaths,
                        func.rank().over(order_by=Aq_pollution_mortality_data.water_pollution_deaths.desc()).label('water_pollution_deaths_rank'),
                        Aq_pollution_mortality_data.occupational_pollution_deaths,
                        func.rank().over(order_by=Aq_pollution_mortality_data.occupational_pollution_deaths.desc()).label('occupational_pollution_deaths_rank'),
                        Aq_pollution_mortality_data.lead_deaths,
                        func.rank().over(order_by=Aq_pollution_mortality_data.lead_deaths.desc()).label('lead_deaths_rank')
                    )\
                    .all()

# Convert the query result to a pandas dataframe
df = pd.DataFrame(pollution_deaths, columns=['Country', 'Total_Pollution_Deaths', 'Total_Pollution_Deaths_Rank', 'Air_Pollution_Deaths', 'Air_Pollution_Deaths_Rank', 'Water_Pollution_Deaths', 'Water_Pollution_Deaths_Rank', 'Occupational_Pollution_Deaths', 'Occupational_Pollution_Deaths_Rank', 'Lead_Deaths', 'Lead_Deaths_Rank'])

df.head(10)


Unnamed: 0,Country,Total_Pollution_Deaths,Total_Pollution_Deaths_Rank,Air_Pollution_Deaths,Air_Pollution_Deaths_Rank,Water_Pollution_Deaths,Water_Pollution_Deaths_Rank,Occupational_Pollution_Deaths,Occupational_Pollution_Deaths_Rank,Lead_Deaths,Lead_Deaths_Rank
0,India,2326771,1,1240529,2,698597,1,153528,2,234117,2
1,China,1865566,2,1242987,1,9585,25,255580,1,357414,1
2,Nigeria,279318,3,114115,6,159777,2,2088,36,3338,35
3,Indonesia,232974,4,123753,4,60040,5,16331,9,32850,4
4,Pakistan,223836,5,128005,3,60213,4,8787,15,26831,6
5,Bangladesh,207922,6,122734,5,33583,7,13558,11,38048,3
6,United States of America,196930,7,107507,7,1628,59,59536,3,28260,5
7,Russian Federation,118687,8,99392,8,685,72,9634,14,8976,13
8,Ethiopia,110787,9,40614,19,63454,3,1931,38,4788,24
9,Brazil,109438,10,66245,9,7152,35,14462,10,21580,8
