Ensure the conda environment has wbgapi installed: `pip install wbgapi`

In [15]:
# Import dependencies
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import wbgapi as wb
from sqlalchemy import create_engine

from config import db_password

### Needed Series from WDI Database 
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD - CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE - Energy use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield (kg per hectare)
- EG.ELC.ACCS.ZS - Access to Electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total Population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS - Urban population (% of total population)

## Extract emissions data from World Bank database via API (wbgapi)

In [16]:
# WDI indicator codes to request from the World Bank API
series_list = [
    'EN.ATM.CO2E.KT',        # CO2 emissions (kt)
    'EN.ATM.CO2E.PC',        # CO2 emissions (metric tons per capita)
    'EN.ATM.CO2E.PP.GD.KD',  # CO2 emissions (kg per 2017 PPP dollar of GDP)
    'EG.USE.PCAP.KG.OE',     # Energy use (kg of oil equivalent per capita)
    'EG.FEC.RNEW.ZS',        # Renewable energy consumption (% of total)
    'AG.YLD.CREL.KG',        # Cereal yield
    'EG.ELC.ACCS.ZS',        # Access to electricity (% of population)
    'AG.LND.FRST.ZS',        # Forest area (% of land area)
    'NY.GDP.MKTP.KD.ZG',     # GDP growth (annual %)
    'NY.GDP.PCAP.CD',        # GDP per capita (current USD)
    'NY.GNP.PCAP.CD',        # GNI per capita, Atlas method (current USD)
    'SP.POP.TOTL',           # Total population
    'SP.POP.GROW',           # Population growth (annual %)
    'SP.URB.GROW',           # Urban population growth (annual %)
    'SP.URB.TOTL',           # Urban population
    'SP.URB.TOTL.IN.ZS',     # Urban population (% of total population)
]

In [17]:
# Download every indicator in series_list from the World Bank API (wbgapi)
# into a pandas DataFrame. The request takes approx 4-5 mins to complete.
raw_df = wb.data.DataFrame(
    series_list,
    time=range(1990, 2018),  # upper bound is exclusive, so 2017 is the last year
    numericTimeKeys=True,
    labels=True,
    columns='series',
).reset_index()
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,2016,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,2015,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,2014,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,2013,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


In [18]:
# Dimensions of the raw extract as (rows, columns)
raw_df.shape

(7448, 20)

In [19]:
# Data type of each column in the raw extract
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [20]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.625391,2894.700192,79.933424,31.159401,2270.160374,992709.5,4.21034,0.26854,3.581076,11171.752248,9479.314154,1.478696,258308400.0,2.22649,121029400.0,55.266415
std,8.07829,23.388023,2336.051682,29.463801,30.118135,2669.532102,3204385.0,5.229999,0.215092,5.78717,18975.239473,15075.519703,1.650633,819608100.0,2.092501,380606500.0,23.529375
min,1990.0,0.0,0.1,0.533899,0.0,9.579196,0.0,0.0,0.0,-64.047107,22.850371,40.0,-27.722225,9182.0,-27.707932,3733.0,5.416
25%,1996.75,12.51395,1404.05,65.926689,4.785963,603.049194,2230.0,0.639336,0.138454,1.545476,1007.129242,910.0,0.54664,1330466.0,0.774823,651572.2,35.29575
50%,2003.5,30.855176,2388.7,98.300003,20.946516,1238.114597,23740.0,2.421594,0.215263,3.712218,3322.03311,3020.0,1.408416,8483160.0,2.199913,4041233.0,54.165
75%,2010.25,47.617367,3796.25,100.0,54.76,3025.736971,246490.0,6.218103,0.330033,5.902814,13096.144542,10400.0,2.416364,55932340.0,3.5037,31827580.0,74.308513
max,2017.0,98.574551,36761.9,100.0,98.34,21420.628504,33514540.0,47.651306,2.085052,149.972963,203266.913745,122130.0,19.360429,7578158000.0,31.143425,4147419000.0,100.0


## Data cleaning starts here

In [21]:
# Work on an independent copy so the cleaning steps never modify raw_df.
# (Plain assignment would only create a second name for the same object.)
emissions_df = raw_df.copy()
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
5818,ABW,1995,Aruba,1995,2.333333,,100.0,0.173372,,,,,2.547144,17140.433369,16930.0,3.084061,77050.0,2.247286,37575.0,48.767
3995,GHA,1998,Ghana,1998,39.833612,1333.8,42.6,73.578071,395.139053,6100.0,0.325947,0.12202,4.700391,399.737408,380.0,2.415665,18714710.0,4.233212,7938405.0,42.418
2976,LBN,2009,Lebanon,2009,13.434311,2618.9,99.234489,4.31,1342.79958,21900.0,4.423228,0.246018,10.232156,7149.791498,6680.0,1.29128,4951135.0,1.457728,4316746.0,87.187
2264,NAM,1993,Namibia,1993,10.392625,356.1,27.0,39.241399,527.076487,1410.0,0.933877,0.145029,-1.579539,2153.36959,2270.0,3.240846,1509834.0,4.944712,435179.0,28.823
3566,IND,2007,India,2007,23.182003,2583.3,71.119865,41.53,482.45644,1336740.0,1.123602,0.308512,7.660815,1022.731629,910.0,1.466372,1189692000.0,2.599633,355789232.0,29.906


In [22]:
# Remove the Time label column: it duplicates the values already held in `time`.
# Reassigning (rather than dropping in place) leaves any other reference to the
# frame untouched, and errors='ignore' lets this cell be re-run without a KeyError.
emissions_df = emissions_df.drop(columns=['Time'], errors='ignore')

In [23]:
# Mapping from World Bank indicator codes / wbgapi column labels
# to the readable snake_case headers used in the cleaned data set
column_names = {
    'AG.LND.FRST.ZS': 'forest_area_percent',
    'AG.YLD.CREL.KG': 'cereal_yield',
    'EG.ELC.ACCS.ZS': 'electricity_access_percent',
    'EG.FEC.RNEW.ZS': 'renew_energy_percent',
    'EG.USE.PCAP.KG.OE': 'energy_use_per_capita',
    'EN.ATM.CO2E.KT': 'emissions_total',
    'EN.ATM.CO2E.PC': 'emissions_per_capita',
    'EN.ATM.CO2E.PP.GD.KD': 'emissions_per_gdp',
    'NY.GDP.MKTP.KD.ZG': 'gdp_growth_percent',
    'NY.GDP.PCAP.CD': 'gdp_per_capita',
    'NY.GNP.PCAP.CD': 'gni_per_capita',
    'SP.POP.GROW': 'pop_growth_percent',
    'SP.POP.TOTL': 'pop_total',
    'SP.URB.GROW': 'urb_pop_growth_percent',
    'SP.URB.TOTL': 'urban_pop_total',
    'SP.URB.TOTL.IN.ZS': 'urban_pop_percent',
    # Identifier columns
    'economy': 'country_code',
    'time': 'year',
    'Country': 'country_name',
}

In [24]:
# Apply the readable headers defined in column_names
emissions_df = emissions_df.rename(column_names, axis='columns')
emissions_df.sample(10)

Unnamed: 0,country_code,year,country_name,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
6902,IDB,2003,IDA blend,31.022825,1762.648972,58.295942,59.582488,640.91566,360400.0,0.90101,0.26135,5.113633,640.786015,582.698036,2.378026,399995500.0,3.670985,140867968.0,35.217388
4684,CUB,2009,Cuba,27.079106,2068.8,97.383095,18.97,1011.566504,26980.0,2.391169,,1.451305,5094.437562,5090.0,0.058298,11283180.0,0.178631,8632201.0,76.505
6245,SAS,2016,South Asia,18.628349,3127.45906,86.227074,,,2491070.0,1.386354,0.250354,7.682241,1675.575511,1594.291263,1.220816,1796850000.0,2.473388,595910612.0,33.16418
767,SYR,2006,Syrian Arab Republic,2.549039,2002.3,99.50576,1.82,1113.08347,60210.0,3.098496,,5.046196,7914.771494,7000.0,4.46445,19432010.0,5.138976,10521656.0,54.146
727,TZA,1990,Tanzania,64.788903,1506.5,,94.784909,371.410118,1890.0,0.072121,0.055399,7.045072,167.144562,190.0,2.641284,26206010.0,4.34497,4948743.0,18.884
6290,SST,1999,Small states,41.35952,1339.118134,,27.405689,,128891.3,4.35689,0.350339,,3951.634756,,1.636951,29583320.0,2.536102,14400445.0,48.677581
6004,DZA,2005,Algeria,0.734127,1500.8,98.88961,0.58,983.68216,94190.0,2.857993,0.27048,5.9,3131.328532,2730.0,1.364082,32956690.0,2.576162,21036255.0,63.83
4091,GAB,2014,Gabon,91.597322,1592.5,86.96431,81.3,2580.438314,5960.0,3.030218,0.204947,4.314964,9255.368597,8930.0,3.341104,1966855.0,3.907446,1723968.0,87.651
1584,ROU,2001,Romania,27.778068,3029.4,100.0,13.39,1663.739288,95230.0,4.302825,0.331131,5.218136,1825.179805,1760.0,-1.39543,22131970.0,-1.682608,11697189.0,52.852
6564,MNA,2005,Middle East & North Africa (excluding high inc...,2.358692,2136.136536,93.163353,3.610744,1209.097924,1038968.0,3.330575,0.377195,4.533387,2571.02295,2364.208149,1.962045,311948600.0,2.6538,178039708.0,57.073416


In [25]:
# Put country_name ahead of year by swapping their positions in the column order.
# The swap is applied only while year still precedes country_name, so running
# this cell a second time does not switch the two columns back.
col_list = list(emissions_df.columns)
year_pos, name_pos = col_list.index('year'), col_list.index('country_name')
if year_pos < name_pos:
    col_list[year_pos], col_list[name_pos] = col_list[name_pos], col_list[year_pos]
emissions_df = emissions_df[col_list]
list(emissions_df.columns)

['country_code',
 'country_name',
 'year',
 'forest_area_percent',
 'cereal_yield',
 'electricity_access_percent',
 'renew_energy_percent',
 'energy_use_per_capita',
 'emissions_total',
 'emissions_per_capita',
 'emissions_per_gdp',
 'gdp_growth_percent',
 'gdp_per_capita',
 'gni_per_capita',
 'pop_growth_percent',
 'pop_total',
 'urb_pop_growth_percent',
 'urban_pop_total',
 'urban_pop_percent']

In [26]:
# Check the number of rows and columns before removing incomplete rows
emissions_df.shape

(7448, 19)

In [27]:
# Keep only the rows in which every column has a value
emissions_df = emissions_df.dropna(axis='index', how='any')
emissions_df.shape

(3552, 19)

In [28]:
# Count the remaining missing values per column
emissions_df.isna().sum()

country_code                  0
country_name                  0
year                          0
forest_area_percent           0
cereal_yield                  0
electricity_access_percent    0
renew_energy_percent          0
energy_use_per_capita         0
emissions_total               0
emissions_per_capita          0
emissions_per_gdp             0
gdp_growth_percent            0
gdp_per_capita                0
gni_per_capita                0
pop_growth_percent            0
pop_total                     0
urb_pop_growth_percent        0
urban_pop_total               0
urban_pop_percent             0
dtype: int64

In [29]:
# Renumber the rows from 0 after dropping rows, discarding the old index
emissions_df = emissions_df.reset_index(drop=True)
emissions_df.head()

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654
1,ZWE,Zimbabwe,2012,46.046633,695.7,44.0,77.5,814.910235,12010.000229,0.905368,0.391797,15.744877,1290.193956,1120.0,1.822309,13265331.0,1.272568,4355539.0,32.834
2,ZWE,Zimbabwe,2011,46.165723,587.4,36.900002,79.27,787.030033,11409.999847,0.875955,0.43083,14.620207,1082.615774,950.0,1.438339,13025785.0,0.891612,4300463.0,33.015
3,ZWE,Zimbabwe,2010,46.284813,733.4,38.782551,82.27,736.691254,9600.000381,0.747677,0.415482,21.452061,937.840338,650.0,1.25365,12839771.0,0.706879,4262290.0,33.196
4,ZWE,Zimbabwe,2009,46.403903,452.4,43.369083,82.09,720.587138,7750.0,0.611208,0.407369,12.01956,762.297957,440.0,1.026265,12679810.0,0.482488,4232267.0,33.378


In [30]:
# Write the cleaned data to a CSV file, leaving out the index column
emissions_df.to_csv(path_or_buf='../Resources/ghg_emissions.csv', index=False)

### Load our DataFrame into PostgreSQL

In [31]:
# Build the connection URL for the local PostgreSQL database.
# The password is percent-encoded so that characters such as '@', ':', '/' or '%'
# cannot be misread as URL delimiters when the string is parsed.
db_string = f"postgresql://postgres:{quote(str(db_password), safe='')}@127.0.0.1:5432/GHG_emissions"

# Create the database engine
engine = create_engine(db_string)

# Load our DataFrame into SQL, replacing the ghg_emissions table if it already exists
emissions_df.to_sql(name='ghg_emissions', con=engine, index=False, if_exists='replace')