# Analysis of Health Impacts and Mortality Risk of Air Pollution in Different Countries 

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy import desc

In [2]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [3]:
# Connect to local database
DATABASE_URI = 'postgresql://postgres:1372@localhost:5432/air_qualities'
engine = create_engine(DATABASE_URI)

In [4]:
# reflect an existing database into a new model
Base = automap_base()

# reflect the tables
Base.prepare(autoload_with=engine)

In [5]:
# View all of the classes that automap found
Base.classes.keys()

['number_of_deaths_by_risk_factor',
 'countries_codes_and_coordinates',
 'ambient_air_quality_data',
 'death_rates_from_air_pollution',
 'disease_burden_by_risk_factor',
 'outdoor_air_death_rates_by_age',
 'aq_pollution_mortality_data']

In [6]:
# Save references to each table
Countries_codes_and_coordinates = Base.classes.countries_codes_and_coordinates
Ambient_air_quality_data = Base.classes.ambient_air_quality_data
Death_rates_from_air_pollution = Base.classes.death_rates_from_air_pollution
Disease_burden_by_risk_factor = Base.classes.disease_burden_by_risk_factor
Number_of_deaths_by_risk_factor = Base.classes.number_of_deaths_by_risk_factor
Outdoor_air_death_rates_by_age = Base.classes.outdoor_air_death_rates_by_age
Aq_pollution_mortality_data = Base.classes.aq_pollution_mortality_data


In [7]:
# Create our session (link) from Python to the DB
session = Session(engine)
session

<sqlalchemy.orm.session.Session at 0x22cf2697bb0>

In [8]:
connection = engine.connect()

# Ambient Air Quality Data


In [9]:
CCC = pd.read_sql('select * from Countries_codes_and_coordinates', connection)
CCC

Unnamed: 0,coor_id,country,alpha_2_code,alpha_3_code,numeric_code,latitude,longitude
0,LL1,Afghanistan,AF,AFG,4,33.0000,65.0
1,LL2,Albania,AL,ALB,8,41.0000,20.0
2,LL3,Algeria,DZ,DZA,12,28.0000,3.0
3,LL4,American Samoa,AS,ASM,16,-14.3333,-170.0
4,LL5,Andorra,AD,AND,20,42.5000,1.6
...,...,...,...,...,...,...,...
240,LL252,Western Sahara,EH,ESH,732,24.5000,-13.0
241,LL253,Yemen,YE,YEM,887,15.0000,48.0
242,LL254,Zambia,ZM,ZMB,894,-15.0000,30.0
243,LL255,Zimbabwe,ZW,ZWE,716,-20.0000,30.0


In [10]:
country_informaiton=session.query(Countries_codes_and_coordinates.country,
                                  (Countries_codes_and_coordinates.latitude).label("lat"),
                                  (Countries_codes_and_coordinates.longitude).label("lng")
                                 ).distinct().all()
country_informaiton

[('Tonga', -20.0, -175.0),
 ('Azerbaijan', 40.5, 47.5),
 ('Mozambique', -18.25, 35.0),
 ('Kiribati', 1.4167, 173.0),
 ('Panama', 9.0, -80.0),
 ('French Polynesia', -15.0, -140.0),
 ('American Samoa', -14.3333, -170.0),
 ('Marshall Islands', 9.0, 168.0),
 ('Tanzania, United Republic of', -6.0, 35.0),
 ('Italy', 42.8333, 12.8333),
 ('San Marino', 43.7667, 12.4167),
 ('El Salvador', 13.8333, -88.9167),
 ('Antigua and Barbuda', 17.05, -61.8),
 ('Dominican Republic', 19.0, -70.6667),
 ('Saint Pierre and Miquelon', 46.8333, -56.3333),
 ('Jamaica', 18.25, -77.5),
 ('Egypt', 27.0, 30.0),
 ('Suriname', 4.0, -56.0),
 ('Bolivia', -17.0, -65.0),
 ('United States', 38.0, -97.0),
 ('French Southern Territories', -43.0, 67.0),
 ('Cook Islands', -21.2333, -159.7667),
 ('Ukraine', 49.0, 32.0),
 ('Jordan', 31.0, 36.0),
 ('Poland', 52.0, 20.0),
 ('Anguilla', 18.25, -63.1667),
 ('Madagascar', -20.0, 47.0),
 ('Armenia', 40.0, 45.0),
 ('Malta', 35.8333, 14.5833),
 ('India', 20.0, 77.0),
 ('Latvia', 57.0, 25

In [11]:
source={}
country_source=[]
country_name =[]
for row in country_informaiton:
    country = row.country
    lat = row.lat
    lng = row.lng
    
    source = {
        'country': country,
        'lat': lat,
        'lng': lng
    }
    country_source.append(source)
    country_name.append(country)
country = {
    'name': country_name,
    'metadata': country_source
}
country

{'name': ['Tonga',
  'Azerbaijan',
  'Mozambique',
  'Kiribati',
  'Panama',
  'French Polynesia',
  'American Samoa',
  'Marshall Islands',
  'Tanzania, United Republic of',
  'Italy',
  'San Marino',
  'El Salvador',
  'Antigua and Barbuda',
  'Dominican Republic',
  'Saint Pierre and Miquelon',
  'Jamaica',
  'Egypt',
  'Suriname',
  'Bolivia',
  'United States',
  'French Southern Territories',
  'Cook Islands',
  'Ukraine',
  'Jordan',
  'Poland',
  'Anguilla',
  'Madagascar',
  'Armenia',
  'Malta',
  'India',
  'Latvia',
  'Montserrat',
  'Eritrea',
  'Netherlands',
  'Ghana',
  'Morocco',
  'Bouvet Island',
  'Holy See (Vatican City State)',
  'Equatorial Guinea',
  'Luxembourg',
  'Liberia',
  'Cambodia',
  'Belize',
  'Hong Kong',
  'Guernsey',
  'Turkey',
  'Sri Lanka',
  'Burundi',
  'British Indian Ocean Territory',
  'Trinidad and Tobago',
  'Sweden',
  'Andorra',
  'South Korea',
  'Zimbabwe',
  'Vanuatu',
  'Pitcairn',
  'Botswana',
  'Turks and Caicos Islands',
  'Aust

Top countries with the highest concentartion of PM25, PM10, and NO2 in the air quality

In [None]:
Ambient_Air_Quality_Data = pd.read_sql('select * from Ambient_Air_Quality_Data', connection)
Ambient_Air_Quality_Data

In [None]:
unique_regin=Ambient_Air_Quality_Data["region"].unique()
unique_regin

In [26]:
# Define the date range for the past 10 years
current_year = 2019
past_years = 10
start_year = current_year - past_years

# Create the SQLAlchemy query
result = (session.query(Ambient_air_quality_data.country, Ambient_air_quality_data.year,
            func.sum(Ambient_air_quality_data.pm25).label('pm25_sum'),
            func.sum(Ambient_air_quality_data.pm10).label('pm10_sum'),
            func.sum(Ambient_air_quality_data.no2).label('no2_sum'))
          .filter(Ambient_air_quality_data.year ==2016)
          .group_by(Ambient_air_quality_data.country, Ambient_air_quality_data.year)
          .order_by(func.sum(Ambient_air_quality_data.pm25).desc()))

# # Convert the result to a pandas DataFrame

# # Print the result

In [27]:
result.all()[0][0]

'China'

In [28]:
top_100_countries.head()

Unnamed: 0,country,year,pm25_sum,pm10_sum,no2_sum
0,Portugal,2015,137.89,789.71,588.71
1,Lithuania,2015,74.88,368.84,204.76
2,Georgia,2015,24.5,47.5,0.0
3,Italy,2015,3381.85,8904.21,8643.32
4,Mongolia,2015,75.0,141.0,37.0


In [29]:
result_list = []
for row in result:
    country = row.country
    year = row.year
    pm25_sum = row.pm25_sum
    pm10_sum = row.pm10_sum
    no2_sum = row.no2_sum
    
    result_list.append({
        'country': country,
        'year': year,
        'pm25_sum': pm25_sum,
        'pm10_sum': pm10_sum,
        'no2_sum': no2_sum
    })

In [30]:
result_dict = {}
for row in result:
    country = row.country
    year = row.year
    pm25_sum = row.pm25_sum
    pm10_sum = row.pm10_sum
    no2_sum = row.no2_sum
    
    if country not in result_dict:
        result_dict[country] = {}
    
    result_dict[country][year] = {
        'pm25_sum': pm25_sum,
        'pm10_sum': pm10_sum,
        'no2_sum': no2_sum
    }
    

In [31]:
result_dict

{'China': {2016: {'pm25_sum': 25091.690000000017,
   'pm10_sum': 1024.27,
   'no2_sum': 4273.01}},
 'Italy': {2016: {'pm25_sum': 3061.810000000002,
   'pm10_sum': 7989.459999999998,
   'no2_sum': 8564.269999999993}},
 'India': {2016: {'pm25_sum': 2875.1899999999996,
   'pm10_sum': 24413.759999999995,
   'no2_sum': 5491.049999999999}},
 'Iran (Islamic Republic of)': {2016: {'pm25_sum': 1817.2800000000002,
   'pm10_sum': 71.51,
   'no2_sum': 184.15}},
 'Germany': {2016: {'pm25_sum': 1659.0000000000007,
   'pm10_sum': 4438.290000000003,
   'no2_sum': 7893.100000000001}},
 'Poland': {2016: {'pm25_sum': 1590.2600000000004,
   'pm10_sum': 5090.700000000001,
   'no2_sum': 1784.0099999999998}},
 'France': {2016: {'pm25_sum': 1330.9599999999998,
   'pm10_sum': 4845.130000000002,
   'no2_sum': 5802.269999999996}},
 'Chile': {2016: {'pm25_sum': 958.7000000000003,
   'pm10_sum': 1721.8000000000006,
   'no2_sum': 50.61}},
 'Czechia': {2016: {'pm25_sum': 941.6800000000002,
   'pm10_sum': 1984.189999

In [25]:
result_list

[{'country': 'China',
  'year': 2015,
  'pm25_sum': 26969.68999999999,
  'pm10_sum': 925.0,
  'no2_sum': 3672.0},
 {'country': 'Italy',
  'year': 2015,
  'pm25_sum': 3381.850000000001,
  'pm10_sum': 8904.210000000001,
  'no2_sum': 8643.319999999998},
 {'country': 'Germany',
  'year': 2015,
  'pm25_sum': 1724.8199999999997,
  'pm10_sum': 4799.550000000003,
  'no2_sum': 7981.280000000001},
 {'country': 'Poland',
  'year': 2015,
  'pm25_sum': 1630.6599999999999,
  'pm10_sum': 5350.6399999999985,
  'no2_sum': 1782.3600000000001},
 {'country': 'France',
  'year': 2015,
  'pm25_sum': 1580.45,
  'pm10_sum': 5691.839999999997,
  'no2_sum': 6290.4299999999985},
 {'country': 'Canada',
  'year': 2015,
  'pm25_sum': 1033.3700000000001,
  'pm10_sum': 329.43999999999994,
  'no2_sum': 1248.5},
 {'country': 'Chile',
  'year': 2015,
  'pm25_sum': 973.1699999999998,
  'pm10_sum': 2012.19,
  'no2_sum': 21.009999999999998},
 {'country': 'Spain',
  'year': 2015,
  'pm25_sum': 826.22,
  'pm10_sum': 4347.249

Top 10 countries in 2019 with the highest concentartion of PM25, PM10, and NO2 in the air quality

In [None]:
# Define the date range for the year 2019
year = 2019

# Create the SQLAlchemy query
result = (session.query(Ambient_air_quality_data.country,
            func.sum(Ambient_air_quality_data.pm25).label('pm25_sum'),
            func.sum(Ambient_air_quality_data.pm10).label('pm10_sum'),
            func.sum(Ambient_air_quality_data.no2).label('no2_sum'))
          .filter(Ambient_air_quality_data.year == year)
          .group_by(Ambient_air_quality_data.country)
          .order_by((func.sum(Ambient_air_quality_data.pm25) +
                     func.sum(Ambient_air_quality_data.pm10) +
                     func.sum(Ambient_air_quality_data.no2)).desc())
          .limit(10))

# Convert the result to a pandas DataFrame
top_10_countries = pd.read_sql(result.statement, session.bind)

# Print the result
top_10_countries


# Outdoor Pollution Rates by Ages


In [None]:
Outdoor_Air_Death_Rates_by_Age = pd.read_sql('select * from Outdoor_Air_Death_Rates_by_Age', connection)
Outdoor_Air_Death_Rates_by_Age

Top 10 countries with the highest death rates in 2019

In [None]:
# Query the Outdoor_air_death_rates_by_age table to get the top 10 countries with highest death rates in 2019 for the specified age groups
age_pollution_results = session.query(
    Outdoor_air_death_rates_by_age.country,
    Outdoor_air_death_rates_by_age.year,
    func.sum(Outdoor_air_death_rates_by_age.under_5).label('under_5'),
    func.sum(Outdoor_air_death_rates_by_age.age_5_to_14_years).label('age_5_to_14_years'),
    func.sum(Outdoor_air_death_rates_by_age.age_70plus_years).label('age_70plus_years'),
    func.sum(Outdoor_air_death_rates_by_age.age_15_to_49_years).label('age_15_to_49_years'),
    func.sum(Outdoor_air_death_rates_by_age.age_50_to_69_years).label('age_50_to_69_years')
).filter(
    Outdoor_air_death_rates_by_age.year >= 2015
).group_by(
    Outdoor_air_death_rates_by_age.country,
     Outdoor_air_death_rates_by_age.year
).order_by(
    func.sum(Outdoor_air_death_rates_by_age.under_5).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_5_to_14_years).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_70plus_years).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_15_to_49_years).desc(),
    func.sum(Outdoor_air_death_rates_by_age.age_50_to_69_years).desc()
).all()

# Convert the query results to a pandas DataFrame
df = pd.DataFrame(age_pollution_results, columns=[
    'country',"year", 'under_5', 'age_5_to_14_years', 'age_70plus_years', 'age_15_to_49_years', 'age_50_to_69_years'
])

# Display the DataFrame
age_pollution_results


In [None]:
age_pollution_results_dict = {}
for row in age_pollution_results:
    country = row.country
    year = row.year
    under5 = row.under_5
    age5to14 = row.age_5_to_14_years
    age15to49 = row.age_15_to_49_years
    age50to69 = row.age_50_to_69_years
    age70plus = row.age_70plus_years 
    
    if country not in age_pollution_results_dict:
        age_pollution_results_dict[country] = {}
    
    age_pollution_results_dict[country][year] = {
        'under5': under5,
        'age5to14': age5to14,
        'age15to49': age15to49,
        'age50to69': age50to69,
        'age70plus': age70plus
    }
age_pollution_results_dict

In [None]:
four_air_pollution = session.query(
    Death_rates_from_air_pollution.country,
    Death_rates_from_air_pollution.year,
    (Death_rates_from_air_pollution.household_air_pollution_deaths).label('HAP'),
    (Death_rates_from_air_pollution.ambient_particulate_matter_pollution_deaths).label('APM'),
    (Death_rates_from_air_pollution.air_pollution_deaths).label('AP'),
    (Death_rates_from_air_pollution.ambient_ozone_pollution_deaths).label('AOP'))\
    .filter(Death_rates_from_air_pollution.year >= 2015)

four_air_pollution.all()

In [None]:
four_air_pollution_dict = {}
for row in four_air_pollution:
    country = row.country
    year = row.year
    HAP = row.HAP
    APM = row.APM
    AP = row.AP
    AOP = row.AOP
    
    if country not in four_air_pollution_dict:
        four_air_pollution_dict[country] = {}
    
    four_air_pollution_dict[country][year] = {
        'HAP': HAP,
        'APM': APM,
        'AP': AP,
        'AOP': AOP,
    }
four_air_pollution_dict

Top 10 countries in year 2019 who have the highest value of DALYs dalys_air_pollution,
dalys_household_air_pollution_from_solid_fuels & dalys_particulate_matter_pollution

In [None]:
Disease_Burden_by_Risk_Factor = pd.read_sql('select * from Disease_Burden_by_Risk_Factor', connection)
Disease_Burden_by_Risk_Factor

In [None]:
# Query to filter the top 10 countries in year 2019 who have the highest value of DALYs
top_countries = session.query(Disease_burden_by_risk_factor.country,
                              Disease_burden_by_risk_factor.dalys_air_pollution,
                              Disease_burden_by_risk_factor.dalys_household_air_pollution_from_solid_fuels,
                              Disease_burden_by_risk_factor.dalys_particulate_matter_pollution)\
                        .filter(Disease_burden_by_risk_factor.year>=2015)\
                        .order_by(Disease_burden_by_risk_factor.dalys_id.desc())\
                        .limit(10)\
                        .all()

# Convert the query result to a pandas dataframe
df = pd.DataFrame(top_countries, columns=['Country', 'DALYs_Air_Pollution', 'DALYs_Household_Air_Pollution_from_Solid_Fuels', 'DALYs_Particulate_Matter_Pollution'])

df



In [None]:
Number_of_Deaths_by_Risk_Factor = pd.read_sql('select * from Number_of_Deaths_by_Risk_Factor', connection)
Number_of_Deaths_by_Risk_Factor

Countries ranked by the highest count of mortality death related to pollution

In [None]:
AQ_Pollution_Mortality_Data = pd.read_sql('select * from AQ_Pollution_Mortality_Data', connection)
AQ_Pollution_Mortality_Data

In [None]:
# Query to get the values of pollution deaths and their rank
pollution_deaths = session.query(
                        Aq_pollution_mortality_data.country,
                        (Aq_pollution_mortality_data.total_pollution_deaths).label("TP"),
                        func.rank().over(order_by=Aq_pollution_mortality_data.total_pollution_deaths.desc()).label('TP_Rank'),
                        (Aq_pollution_mortality_data.air_pollution_deaths).label("AP"),
                        func.rank().over(order_by=Aq_pollution_mortality_data.air_pollution_deaths.desc()).label('AP_Rank'),
                        (Aq_pollution_mortality_data.water_pollution_deaths).label("WP"),
                        func.rank().over(order_by=Aq_pollution_mortality_data.water_pollution_deaths.desc()).label('WP_Rank'),
                        (Aq_pollution_mortality_data.occupational_pollution_deaths).label("OP"),
                        func.rank().over(order_by=Aq_pollution_mortality_data.occupational_pollution_deaths.desc()).label('OP_Rank'),
                        (Aq_pollution_mortality_data.lead_deaths).label("LP"),
                        func.rank().over(order_by=Aq_pollution_mortality_data.lead_deaths.desc()).label('LP_Rank')
                    )\
                    .all()

# Convert the query result to a pandas dataframe
df = pd.DataFrame(pollution_deaths, columns=['Country', 'Total_Pollution_Deaths', 'Total_Pollution_Deaths_Rank', 'Air_Pollution_Deaths', 'Air_Pollution_Deaths_Rank', 'Water_Pollution_Deaths', 'Water_Pollution_Deaths_Rank', 'Occupational_Pollution_Deaths', 'Occupational_Pollution_Deaths_Rank', 'Lead_Deaths', 'Lead_Deaths_Rank'])

df.head(10)


In [None]:
pollution_deaths_dict = {}
for row in pollution_deaths:
    country = row.country
    TP = row.TP
    TP_Rank = row.TP_Rank
    AP = row.AP
    AP_Rank = row.AP_Rank
    WP = row.WP
    WP_Rank = row.WP_Rank
    OP = row.OP
    OP_Rank = row.OP_Rank
    LP = row.LP
    LP_Rank = row.LP_Rank
    
    if country not in pollution_deaths_dict:
        pollution_deaths_dict[country] = {}
    
    pollution_deaths_dict[country] = {
        'TP': TP,
        'TP_Rank': TP_Rank,
        'AP': AP,
        'AP_Rank': AP_Rank,
        'WP': WP,
        'WP_Rank': WP_Rank,
        'OP': OP,
        'OP_Rank': OP_Rank,
        'LP': LP,
        'LP_Rank': LP_Rank,
    }
pollution_deaths_dict