# ETL Project - Data Extraction

## Air Pollution Effects by OECD country

### by Team Omicron

#### Phase 1 - Framework definition

In [1]:
# Import of modules
import pandas as pd
import csv
import os

#### Phase 2 - Dataframe creation and cleaning

In [2]:
path = os.path.join("Data", "air_pollution_eff_by_country_oecd.csv")

In [3]:
Effect_DB = pd.read_csv(path)

In [4]:
Effect_DB

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,Australia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2010,217.448,
1,Australia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2011,223.709,
2,Australia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2012,209.101,
3,Australia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2013,198.205,
4,Australia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2014,193.408,
...,...,...,...,...,...,...,...,...
283,Slovenia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2013,375.722,
284,Slovenia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2014,350.257,
285,Slovenia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2015,371.517,
286,Slovenia,POLLUTIONEFFECT,MORTALITY,1000000HAB,A,2016,361.266,


In [5]:
NAmerica_DF = Effect_DB.loc[Effect_DB["LOCATION"].isin(["Mexico", "Canada", "United States"])]

In [6]:
NAmerica_DF = NAmerica_DF.reset_index()
NAmerica_DF = NAmerica_DF[["LOCATION", "INDICATOR", "TIME", "Value"]]
NAmerica_DF

Unnamed: 0,LOCATION,INDICATOR,TIME,Value
0,Canada,POLLUTIONEFFECT,2010,229.879
1,Canada,POLLUTIONEFFECT,2011,230.249
2,Canada,POLLUTIONEFFECT,2012,220.521
3,Canada,POLLUTIONEFFECT,2013,209.496
4,Canada,POLLUTIONEFFECT,2014,205.924
5,Canada,POLLUTIONEFFECT,2015,201.792
6,Canada,POLLUTIONEFFECT,2016,182.572
7,Canada,POLLUTIONEFFECT,2017,180.705
8,Mexico,POLLUTIONEFFECT,2010,235.823
9,Mexico,POLLUTIONEFFECT,2011,242.456


In [7]:
# Dataframe 2

In [8]:
path = os.path.join("Data", "air_pollution_exp_by_country_oecd.csv")

In [9]:
Exposure_DB = pd.read_csv(path)

In [10]:
Exposure_DB

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,Australia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2010,10.53559,
1,Australia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2011,10.88267,
2,Australia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2012,10.40148,
3,Australia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2013,9.84329,
4,Australia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2014,9.36117,
...,...,...,...,...,...,...,...,...
283,Slovenia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2013,17.82317,
284,Slovenia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2014,17.02368,
285,Slovenia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2015,17.52007,
286,Slovenia,POLLUTIONEXP,EXPOS2PM25,MICGRCUBM,A,2016,16.14072,


In [11]:
NAmerica_Exposure_DB = Exposure_DB.loc[Exposure_DB["LOCATION"].isin(["Mexico", "Canada", "United States"])]

In [12]:
NAmerica_Exposure_DB = NAmerica_Exposure_DB[["LOCATION", "INDICATOR", "TIME", "Value"]]
NAmerica_Exposure_DB = NAmerica_Exposure_DB.reset_index()
NAmerica_Exposure_DB

Unnamed: 0,index,LOCATION,INDICATOR,TIME,Value
0,24,Canada,POLLUTIONEXP,2010,8.37601
1,25,Canada,POLLUTIONEXP,2011,8.57013
2,26,Canada,POLLUTIONEXP,2012,8.10123
3,27,Canada,POLLUTIONEXP,2013,7.75003
4,28,Canada,POLLUTIONEXP,2014,7.30653
5,29,Canada,POLLUTIONEXP,2015,7.17206
6,30,Canada,POLLUTIONEXP,2016,6.48028
7,31,Canada,POLLUTIONEXP,2017,6.45931
8,136,Mexico,POLLUTIONEXP,2010,26.75453
9,137,Mexico,POLLUTIONEXP,2011,28.21291


Source = https://www.kaggle.com/marprezd/air-pollution-exposure-and-effects?select=air_pollution_exp_by_country_oecd.csv

### Import Death Rates

In [13]:
path = os.path.join("Data", "death-rates-from-air-pollution.csv")

In [15]:
DeathRate = pd.read_csv(path)

In [16]:
DeathRate 

Unnamed: 0,Entity,Code,Year,"Air pollution (total) (deaths per 100,000)","Indoor air pollution (deaths per 100,000)","Outdoor particulate matter (deaths per 100,000)","Outdoor ozone pollution (deaths per 100,000)"
0,Afghanistan,AFG,1990,299.477309,250.362910,46.446589,5.616442
1,Afghanistan,AFG,1991,291.277967,242.575125,46.033841,5.603960
2,Afghanistan,AFG,1992,278.963056,232.043878,44.243766,5.611822
3,Afghanistan,AFG,1993,278.790815,231.648134,44.440148,5.655266
4,Afghanistan,AFG,1994,287.162923,238.837177,45.594328,5.718922
...,...,...,...,...,...,...,...
6463,Zimbabwe,ZWE,2013,143.850145,113.456097,27.589603,4.426291
6464,Zimbabwe,ZWE,2014,138.200536,108.703566,26.760618,4.296971
6465,Zimbabwe,ZWE,2015,132.752553,104.340506,25.715415,4.200907
6466,Zimbabwe,ZWE,2016,128.692138,100.392287,25.643570,4.117173


In [17]:
NAmericaDeath_DF = DeathRate.loc[DeathRate["Entity"].isin(["Mexico", "Canada", "United States"])]

In [20]:
NAmericaDeath_DF

Unnamed: 0,Entity,Code,Year,"Air pollution (total) (deaths per 100,000)","Indoor air pollution (deaths per 100,000)","Outdoor particulate matter (deaths per 100,000)","Outdoor ozone pollution (deaths per 100,000)"
952,Canada,CAN,1990,23.748444,0.146160,21.821096,2.024766
953,Canada,CAN,1991,23.340363,0.134791,21.405468,2.046623
954,Canada,CAN,1992,23.009471,0.124798,21.063923,2.069720
955,Canada,CAN,1993,23.032934,0.119108,21.034445,2.135114
956,Canada,CAN,1994,22.602876,0.110767,20.595466,2.152504
...,...,...,...,...,...,...,...
6099,United States,USA,2013,21.278553,0.098260,17.610370,4.091789
6100,United States,USA,2014,20.270392,0.095735,16.577096,4.092842
6101,United States,USA,2015,19.951715,0.092759,16.232600,4.115665
6102,United States,USA,2016,18.805666,0.090756,15.041788,4.122649
