Ensure the conda environment has wbgapi installed: `pip install wbgapi`

In [32]:
# Import dependencies
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import wbgapi as wb
from sqlalchemy import create_engine

from config import db_password

### Needed Series from WDI Database 
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD - CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE - Energy use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield (kg per hectare)
- EG.ELC.ACCS.ZS - Access to Electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total Population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS - Urban population (% of total population)

## Extract emissions data from World Bank database via API (wbgapi)

In [33]:
# WDI indicator codes to request from the World Bank API.
series_list = [
    # Emissions
    "EN.ATM.CO2E.KT",        # CO2 emissions (kt)
    "EN.ATM.CO2E.PC",        # CO2 emissions (metric tons per capita)
    "EN.ATM.CO2E.PP.GD.KD",  # CO2 emissions (kg per 2017 PPP dollar of GDP)
    # Energy, agriculture and land
    "EG.USE.PCAP.KG.OE",     # energy use (kg of oil equivalent per capita)
    "EG.FEC.RNEW.ZS",        # renewable energy consumption (% of total)
    "AG.YLD.CREL.KG",        # cereal yield
    "EG.ELC.ACCS.ZS",        # access to electricity (% of population)
    "AG.LND.FRST.ZS",        # forest area (% of land area)
    # Economy
    "NY.GDP.MKTP.KD.ZG",     # GDP growth (annual %)
    "NY.GDP.PCAP.CD",        # GDP per capita (current USD)
    "NY.GNP.PCAP.CD",        # GNI per capita, Atlas method (current USD)
    # Population
    "SP.POP.TOTL",           # total population
    "SP.POP.GROW",           # population growth (annual %)
    "SP.URB.GROW",           # urban population growth (annual %)
    "SP.URB.TOTL",           # urban population
    "SP.URB.TOTL.IN.ZS",     # urban population (% of total population)
]

In [34]:
# Pull every requested series for the years 1990-2017 from the World Bank API
# into a single pandas data frame (one column per series).
# Takes approx 4-5 mins to load.
raw_df = wb.data.DataFrame(
    series_list,
    time=range(1990, 2018),  # the end of the range is exclusive, so 2017 is the last year
    numericTimeKeys=True,
    labels=True,
    columns='series',
)
# Turn the index levels into ordinary columns
raw_df = raw_df.reset_index()
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,2016,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,2015,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,2014,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,2013,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


In [35]:
# Number of (rows, columns) in the raw data set
raw_df.shape

(7448, 20)

In [36]:
# Data type of each column in the raw data set
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [37]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.625391,2894.700192,79.933424,31.159401,2270.160374,992709.5,4.21034,0.26854,3.581076,11171.752248,9479.314154,1.478696,258308400.0,2.22649,121029400.0,55.266415
std,8.07829,23.388023,2336.051682,29.463801,30.118135,2669.532102,3204385.0,5.229999,0.215092,5.78717,18975.239473,15075.519703,1.650633,819608100.0,2.092501,380606500.0,23.529375
min,1990.0,0.0,0.1,0.533899,0.0,9.579196,0.0,0.0,0.0,-64.047107,22.850371,40.0,-27.722225,9182.0,-27.707932,3733.0,5.416
25%,1996.75,12.51395,1404.05,65.926689,4.785963,603.049194,2230.0,0.639336,0.138454,1.545476,1007.129242,910.0,0.54664,1330466.0,0.774823,651572.2,35.29575
50%,2003.5,30.855176,2388.7,98.300003,20.946516,1238.114597,23740.0,2.421594,0.215263,3.712218,3322.03311,3020.0,1.408416,8483160.0,2.199913,4041233.0,54.165
75%,2010.25,47.617367,3796.25,100.0,54.76,3025.736971,246490.0,6.218103,0.330033,5.902814,13096.144542,10400.0,2.416364,55932340.0,3.5037,31827580.0,74.308513
max,2017.0,98.574551,36761.9,100.0,98.34,21420.628504,33514540.0,47.651306,2.085052,149.972963,203266.913745,122130.0,19.360429,7578158000.0,31.143425,4147419000.0,100.0


## Data cleaning starts here

In [38]:
# Work on an independent copy of the raw data so that the cleaning steps
# cannot alter raw_df. A plain assignment (emissions_df = raw_df) would only
# create a second name for the same object.
emissions_df = raw_df.copy()
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
1877,PAK,2016,Pakistan,2016,5.017954,3020.7,71.550224,43.69,,181110.0,0.848192,0.17869,5.526736,1468.821421,1300.0,1.204056,213524800.0,1.779757,77368590.0,36.234
6505,MIC,2008,Middle income,2008,34.074941,3327.709855,83.971217,21.919641,1177.776764,15750240.0,3.085498,0.396711,5.902866,3264.379336,2918.222964,1.177836,5104602000.0,2.577235,2347353000.0,46.00078
2813,LUX,2004,Luxembourg,2004,33.98771,6396.6,100.0,1.35,9387.437104,11660.0,25.453235,0.236478,4.231891,76544.917087,62680.0,1.421333,458095.0,1.926368,394773.0,86.177
5894,ATG,2003,Antigua and Barbuda,2003,21.020455,1571.4,97.956825,0.0,,410.0,5.251361,0.287947,6.076544,10968.892684,10700.0,1.133522,78075.0,-0.924867,23785.0,30.464
4460,ECU,2009,Ecuador,2009,52.739656,2965.5,96.470863,12.87,789.774727,34500.0,2.340131,0.230087,0.566492,4240.702593,4070.0,1.682479,14742770.0,1.994991,9213492.0,62.495


In [39]:
# Remove the Time column as it duplicates the 'time' column.
# The result is assigned back instead of using inplace=True, so the frame that
# emissions_df was created from is left untouched and the cleaning cells can be
# re-run from the top without a KeyError.
emissions_df = emissions_df.drop(columns=['Time'])

In [40]:
# Mapping from the World Bank column headers to readable snake_case names
column_names = {
    # Land and agriculture
    "AG.LND.FRST.ZS": "forest_area_percent",
    "AG.YLD.CREL.KG": "cereal_yield",
    # Energy
    "EG.ELC.ACCS.ZS": "electricity_access_percent",
    "EG.FEC.RNEW.ZS": "renew_energy_percent",
    "EG.USE.PCAP.KG.OE": "energy_use_per_capita",
    # Emissions
    "EN.ATM.CO2E.KT": "emissions_total",
    "EN.ATM.CO2E.PC": "emissions_per_capita",
    "EN.ATM.CO2E.PP.GD.KD": "emissions_per_gdp",
    # Economy
    "NY.GDP.MKTP.KD.ZG": "gdp_growth_percent",
    "NY.GDP.PCAP.CD": "gdp_per_capita",
    "NY.GNP.PCAP.CD": "gni_per_capita",
    # Population
    "SP.POP.GROW": "pop_growth_percent",
    "SP.POP.TOTL": "pop_total",
    "SP.URB.GROW": "urb_pop_growth_percent",
    "SP.URB.TOTL": "urban_pop_total",
    "SP.URB.TOTL.IN.ZS": "urban_pop_percent",
    # Identifier columns
    "economy": "country_code",
    "time": "year",
    "Country": "country_name",
}

In [41]:
# Apply the readable column headers defined in column_names
emissions_df = emissions_df.rename(column_names, axis='columns')
emissions_df.sample(10)

Unnamed: 0,country_code,year,country_name,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
6035,ALB,2002,Albania,28.169854,3291.9,100.0,35.82,660.047984,3760.0,1.232379,0.182437,4.536524,1425.124219,1370.0,-0.299877,3051010.0,2.181209,1327220.0,43.501
3919,GRL,1990,Greenland,0.000644,,100.0,0.0,,,,,-11.719533,18326.805123,15690.0,0.541029,55600.0,0.966016,44312.0,79.698
4386,GNQ,1999,Equatorial Guinea,93.544991,,,61.819019,,3230.0,4.931079,0.563589,25.664015,948.229598,720.0,3.9382,655029.0,7.669115,310156.0,47.35
6923,IBT,2010,IDA & IBRD total,32.360664,3135.194542,80.241094,22.621168,1243.243991,18194930.0,3.116442,0.388071,7.702184,3731.641865,3454.478433,1.328636,5838367000.0,2.590963,2687692000.0,46.048999
2551,MRT,2014,Mauritania,0.33519,1678.8,38.799999,30.92,,2900.0,0.754585,0.150312,4.274823,1715.388838,1790.0,2.642211,3843174.0,4.44515,1929389.0,50.203
1636,PRI,2005,Puerto Rico,51.894025,1915.6,100.0,0.56,,,,,-1.986939,21959.322697,14750.0,-0.144242,3821362.0,-0.203721,3596360.0,94.112
5606,BRB,2011,Barbados,14.651163,2988.9,100.0,4.66,,1760.0,6.388709,0.402703,-0.687779,16907.209259,15450.0,0.281717,275486.0,-0.252746,87329.0,31.7
417,UGA,1992,Uganda,17.481437,1530.1,,96.11768,,820.0,0.043612,0.047371,3.418357,151.976546,190.0,3.408307,18801970.0,6.299607,2216752.0,11.79
1407,SAU,2010,Saudi Arabia,0.454484,5507.8,100.0,0.01,6306.593695,446130.0,15.168335,0.36945,5.039484,17958.948991,17480.0,3.206494,29411930.0,3.471209,24142490.0,82.084
6739,TLA,1998,Latin America & the Caribbean (IDA & IBRD coun...,51.372493,2853.953284,90.543871,28.521916,1158.076811,1180520.0,2.408508,0.197744,2.616,4488.976142,4282.004215,1.605642,490145700.0,2.26624,364930300.0,74.453426


In [42]:
# Put country_name directly after country_code, ahead of year.
# The leading columns are listed explicitly rather than swapped by position,
# so re-running this cell keeps the same order instead of swapping the two
# columns back again.
leading_cols = ['country_code', 'country_name', 'year']
col_list = leading_cols + [col for col in emissions_df.columns if col not in leading_cols]
emissions_df = emissions_df[col_list]
list(emissions_df.columns)

['country_code',
 'country_name',
 'year',
 'forest_area_percent',
 'cereal_yield',
 'electricity_access_percent',
 'renew_energy_percent',
 'energy_use_per_capita',
 'emissions_total',
 'emissions_per_capita',
 'emissions_per_gdp',
 'gdp_growth_percent',
 'gdp_per_capita',
 'gni_per_capita',
 'pop_growth_percent',
 'pop_total',
 'urb_pop_growth_percent',
 'urban_pop_total',
 'urban_pop_percent']

In [43]:
# Check the number of rows and columns
emissions_df.shape

(7448, 19)

In [44]:
# Build the country lookup table: one row per country_code with its name.
# keep='last' retains the last row found for each code.
country_emissions_df = emissions_df.drop_duplicates(subset='country_code', keep='last')
countries_df = (
    country_emissions_df
    .loc[:, ['country_code', 'country_name']]
    .reset_index(drop=True)
)
countries_df

Unnamed: 0,country_code,country_name
0,ZWE,Zimbabwe
1,ZMB,Zambia
2,YEM,"Yemen, Rep."
3,PSE,West Bank and Gaza
4,VIR,Virgin Islands (U.S.)
...,...,...
261,CEB,Central Europe and the Baltics
262,CSS,Caribbean small states
263,ARB,Arab World
264,AFW,Africa Western and Central


In [45]:
# The country names are kept in countries_df, so the emissions table only
# needs country_code to identify a country
emissions_df = emissions_df.drop('country_name', axis='columns')
emissions_df

Unnamed: 0,country_code,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,2017,45.451183,1202.700000,44.178635,82.460000,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.000000,2.043620,14751101.0,1.860765,4755312.0,32.237000
1,ZWE,2016,45.570273,435.100000,42.561729,81.900000,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.000000,2.081806,14452704.0,1.806610,4667645.0,32.296000
2,ZWE,2015,45.689363,557.500000,33.700001,80.820000,,12430.000305,0.878139,0.379509,2.023650,1410.329174,1220.000000,2.136294,14154937.0,1.769505,4584076.0,32.385000
3,ZWE,2014,45.808453,831.400000,32.299999,80.270000,,12079.999924,0.871840,0.376287,1.484543,1407.034293,1210.000000,2.191391,13855753.0,1.730983,4503674.0,32.504000
4,ZWE,2013,45.927543,668.500000,40.498375,78.870000,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.367810,1200.000000,2.163267,13555422.0,1.613531,4426387.0,32.654000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7443,AFE,1994,35.624993,1374.695849,,65.379764,743.131843,312160.000000,0.906340,0.333888,2.024488,697.183018,685.512668,2.619955,344418362.0,4.318885,92816300.0,26.948709
7444,AFE,1993,35.776246,1427.680683,,65.291676,742.813558,303600.000000,0.904581,0.330497,-0.387733,704.742868,695.723966,2.658526,335625136.0,4.374956,88973631.0,26.509823
7445,AFE,1992,35.927499,949.672903,,64.636541,732.667501,295090.000000,0.902599,0.319724,-1.963818,728.765774,702.790250,2.633682,326933522.0,4.349481,85244233.0,26.073874
7446,AFE,1991,36.078752,1364.936487,,62.876319,773.326417,298735.435602,0.937815,0.317398,0.114030,858.297836,750.500284,2.792410,318544083.0,4.596574,81691094.0,25.645146


In [46]:
# Keep only the rows in which every column has a value
emissions_df = emissions_df.dropna(axis='index', how='any')
emissions_df.shape

(3552, 18)

In [47]:
# Count the missing values left in each column (all should be 0)
emissions_df.isna().sum()

country_code                  0
year                          0
forest_area_percent           0
cereal_yield                  0
electricity_access_percent    0
renew_energy_percent          0
energy_use_per_capita         0
emissions_total               0
emissions_per_capita          0
emissions_per_gdp             0
gdp_growth_percent            0
gdp_per_capita                0
gni_per_capita                0
pop_growth_percent            0
pop_total                     0
urb_pop_growth_percent        0
urban_pop_total               0
urban_pop_percent             0
dtype: int64

In [48]:
# Renumber the rows from 0 now that rows have been dropped
emissions_df = emissions_df.reset_index(drop=True)
emissions_df.head()

Unnamed: 0,country_code,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654
1,ZWE,2012,46.046633,695.7,44.0,77.5,814.910235,12010.000229,0.905368,0.391797,15.744877,1290.193956,1120.0,1.822309,13265331.0,1.272568,4355539.0,32.834
2,ZWE,2011,46.165723,587.4,36.900002,79.27,787.030033,11409.999847,0.875955,0.43083,14.620207,1082.615774,950.0,1.438339,13025785.0,0.891612,4300463.0,33.015
3,ZWE,2010,46.284813,733.4,38.782551,82.27,736.691254,9600.000381,0.747677,0.415482,21.452061,937.840338,650.0,1.25365,12839771.0,0.706879,4262290.0,33.196
4,ZWE,2009,46.403903,452.4,43.369083,82.09,720.587138,7750.0,0.611208,0.407369,12.01956,762.297957,440.0,1.026265,12679810.0,0.482488,4232267.0,33.378


In [50]:
# Write both tables to CSV files, without the row index
for frame, csv_path in (
    (emissions_df, '../Resources/ghg_emissions.csv'),
    (countries_df, '../Resources/countries.csv'),
):
    frame.to_csv(csv_path, index=False)

### Load our dataframes into SQL

In [51]:
# Create the connection string for the PostgreSQL database.
# The password is URL-escaped so that characters such as '@', ':' or '/' in it
# cannot be mistaken for URL delimiters.
# NOTE(review): assumes config.db_password holds the raw (unescaped) password - confirm.
db_string = f"postgresql://postgres:{quote_plus(db_password)}@127.0.0.1:5432/GHG_emissions"

# Create the db engine
engine = create_engine(db_string)

# Load our dataframes into SQL; if_exists='replace' overwrites an existing
# table of the same name
emissions_df.to_sql(name='ghg_emissions', con=engine, index=False, if_exists='replace')
countries_df.to_sql(name='country', con=engine, index=False, if_exists='replace')