Ensure the conda environment has `wbgapi` installed: `pip install wbgapi`

In [1]:
# Import dependencies
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import wbgapi as wb
from sqlalchemy import create_engine

from config import db_password

### Needed Series from WDI Database 
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD - CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE - Energy Use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield (kg per hectare)
- EG.ELC.ACCS.ZS - Access to Electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total Population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS - Urban population (% of total population)

## Extract emissions data from World Bank database via API (wbgapi)

In [2]:
# World Bank indicator codes to request, one per line, grouped by theme
series_list = [
    # Emissions
    'EN.ATM.CO2E.KT',        # CO2 emissions (kt)
    'EN.ATM.CO2E.PC',        # CO2 emissions (metric tons per capita)
    'EN.ATM.CO2E.PP.GD.KD',  # CO2 emissions (kg per 2017 PPP dollar of GDP)
    # Energy, agriculture and land
    'EG.USE.PCAP.KG.OE',     # Energy use (kg of oil equivalent per capita)
    'EG.FEC.RNEW.ZS',        # Renewable energy consumption (% of total final energy consumption)
    'AG.YLD.CREL.KG',        # Cereal yield
    'EG.ELC.ACCS.ZS',        # Access to electricity (% of population)
    'AG.LND.FRST.ZS',        # Forest area (% of land area)
    # Economy
    'NY.GDP.MKTP.KD.ZG',     # GDP growth (annual %)
    'NY.GDP.PCAP.CD',        # GDP per capita (current USD)
    'NY.GNP.PCAP.CD',        # GNI per capita, Atlas method (current USD)
    # Population
    'SP.POP.TOTL',           # Total population
    'SP.POP.GROW',           # Population growth (annual %)
    'SP.URB.GROW',           # Urban population growth (annual %)
    'SP.URB.TOTL',           # Urban population
    'SP.URB.TOTL.IN.ZS',     # Urban population (% of total population)
]

In [3]:
# Download every series in series_list through wbgapi as one pandas DataFrame,
# with one column per series.
# NOTE(review): the result also contains aggregate groupings (e.g. "Caribbean small
# states"), not only individual countries - confirm whether these should be kept.
raw_df = wb.data.DataFrame(
    series_list,
    time=range(1990, 2018),  # range() excludes its end value, so the last year is 2017
    numericTimeKeys=True,
    labels=True,
    columns='series',
)
# Turn the index levels into ordinary columns
raw_df = raw_df.reset_index()
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,2016,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,2015,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,2014,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,2013,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


In [4]:
# Size of the raw extract as (number of rows, number of columns)
raw_df.shape

(7448, 20)

In [5]:
# Data type of each column in the raw extract
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.625391,2894.700192,79.933424,31.159401,2270.160374,992709.5,4.21034,0.26854,3.581076,11171.752248,9479.314154,1.478696,258308400.0,2.22649,121029400.0,55.266415
std,8.07829,23.388023,2336.051682,29.463801,30.118135,2669.532102,3204385.0,5.229999,0.215092,5.78717,18975.239473,15075.519703,1.650633,819608100.0,2.092501,380606500.0,23.529375
min,1990.0,0.0,0.1,0.533899,0.0,9.579196,0.0,0.0,0.0,-64.047107,22.850371,40.0,-27.722225,9182.0,-27.707932,3733.0,5.416
25%,1996.75,12.51395,1404.05,65.926689,4.785963,603.049194,2230.0,0.639336,0.138454,1.545476,1007.129242,910.0,0.54664,1330466.0,0.774823,651572.2,35.29575
50%,2003.5,30.855176,2388.7,98.300003,20.946516,1238.114597,23740.0,2.421594,0.215263,3.712218,3322.03311,3020.0,1.408416,8483160.0,2.199913,4041233.0,54.165
75%,2010.25,47.617367,3796.25,100.0,54.76,3025.736971,246490.0,6.218103,0.330033,5.902814,13096.144542,10400.0,2.416364,55932340.0,3.5037,31827580.0,74.308513
max,2017.0,98.574551,36761.9,100.0,98.34,21420.628504,33514540.0,47.651306,2.085052,149.972963,203266.913745,122130.0,19.360429,7578158000.0,31.143425,4147419000.0,100.0


## Data cleaning starts here

In [7]:
# Work on an independent copy of the raw extract.
# Plain assignment (emissions_df = raw_df) would only give the same object a second
# name, so in-place cleaning steps would silently modify raw_df as well.
emissions_df = raw_df.copy()
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
2930,LBR,1999,Liberia,1999,85.682101,1277.2,,90.050951,,400.0,0.143382,,,,,5.546454,2789743.0,6.326939,1227152.0,43.988
4577,DNK,2004,Denmark,2004,14.4389,6013.1,100.0,14.44,3597.17296,53729.999542,9.941673,0.197073,2.668219,46511.598332,42640.0,0.258432,5404523.0,0.499468,4624434.0,85.566
2396,MNG,2001,Mongolia,2001,9.212578,708.4,71.076576,6.09,976.601158,8970.0,3.627759,0.803407,2.952711,512.81947,470.0,0.87831,2472601.0,2.766455,1439598.0,58.222
2547,MUS,1990,Mauritius,1990,20.231527,4190.8,99.041565,47.067827,629.382069,1160.0,1.095606,0.13306,7.186737,2506.179312,2430.0,0.712313,1058775.0,1.434721,464802.0,43.9
2531,MUS,2006,Mauritius,2006,19.608867,7793.1,99.050987,16.41,1006.501642,3299.999952,2.674239,0.183176,4.865545,5695.969327,5910.0,0.466404,1233996.0,0.200093,518278.0,42.0


In [8]:
# Remove the 'Time' label column; it duplicates the numeric 'time' column.
# The result is rebound instead of using inplace=True so that any other reference
# to the same frame is left untouched, and errors='ignore' keeps the cell safe to
# re-run after the column has already been removed.
emissions_df = emissions_df.drop(columns=['Time'], errors='ignore')

In [9]:
# Mapping from World Bank series codes / wbgapi column labels to readable snake_case headers
column_names = {
    # Land, agriculture and energy
    'AG.LND.FRST.ZS': 'forest_area_percent',
    'AG.YLD.CREL.KG': 'cereal_yield',
    'EG.ELC.ACCS.ZS': 'electricity_access_percent',
    'EG.FEC.RNEW.ZS': 'renew_energy_percent',
    'EG.USE.PCAP.KG.OE': 'energy_use_per_capita',
    # Emissions
    'EN.ATM.CO2E.KT': 'emissions_total',
    'EN.ATM.CO2E.PC': 'emissions_per_capita',
    'EN.ATM.CO2E.PP.GD.KD': 'emissions_per_gdp',
    # Economy
    'NY.GDP.MKTP.KD.ZG': 'gdp_growth_percent',
    'NY.GDP.PCAP.CD': 'gdp_per_capita',
    'NY.GNP.PCAP.CD': 'gni_per_capita',
    # Population
    'SP.POP.GROW': 'pop_growth_percent',
    'SP.POP.TOTL': 'pop_total',
    'SP.URB.GROW': 'urb_pop_growth_percent',
    'SP.URB.TOTL': 'urban_pop_total',
    'SP.URB.TOTL.IN.ZS': 'urban_pop_percent',
    # Identifier columns
    'economy': 'country_code',
    'time': 'year',
    'Country': 'country_name',
}

In [10]:
# Apply the header mapping; column labels that are not keys of column_names stay as they are
emissions_df = emissions_df.rename(column_names, axis='columns')
emissions_df.sample(10)

Unnamed: 0,country_code,year,country_name,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
4221,FJI,1996,Fiji,53.626601,2105.1,66.9,59.714129,,770.0,0.966683,0.105055,4.8,2672.43577,2620.0,0.540289,796538.0,2.253303,368654.0,46.282
137,VIR,1992,Virgin Islands (U.S.),67.788571,,100.0,0.0,,,,,,,,0.859785,105712.0,1.509159,93936.0,88.86
3542,IDN,2003,Indonesia,53.684549,4248.1,87.940002,43.0,742.771661,333890.014648,1.496727,0.244497,4.780369,1052.413199,890.0,1.338044,223080121.0,3.130549,98949418.0,44.356
4827,COD,2006,"Congo, Dem. Rep.",61.692949,771.6,10.292965,97.33,296.233627,2279.999971,0.039053,0.050726,5.32098,247.541949,220.0,3.187172,58381630.0,4.48603,22168089.0,37.971
4463,ECU,2006,Ecuador,53.587993,2830.0,96.315262,14.19,669.50019,30129.999161,2.150751,0.219634,4.403526,3340.840903,3110.0,1.721115,14009061.0,2.038218,8672589.0,61.907
2901,LBY,2000,Libya,0.123328,637.7,99.800003,2.04,3070.475034,44720.0,8.675426,0.319767,3.67927,7424.202916,,1.895563,5154790.0,2.030487,3937589.0,76.387
7359,CSS,1994,Caribbean small states,91.164263,3439.166338,80.659451,10.738155,,25290.0,4.038811,0.363336,2.787061,3226.768475,3018.319883,0.885629,6261744.0,1.177307,3099843.0,49.504475
735,TJK,2010,Tajikistan,2.929408,3117.2,98.671501,61.83,285.493321,2450.000048,0.321447,0.136206,6.499999,740.270556,910.0,2.030278,7621779.0,2.037844,2021296.0,26.52
1439,STP,2006,Sao Tome and Principe,60.629167,2454.5,56.942719,47.33,265.500075,79.999998,0.482727,0.158908,8.866108,861.521178,910.0,2.471072,165725.0,4.451085,100012.0,60.348
4434,EGY,2007,"Egypt, Arab Rep.",0.064002,7418.5,98.46563,6.05,846.264663,183399.993896,2.230635,0.252756,7.087827,1586.472921,1460.0,1.951674,82218755.0,1.96328,35418195.0,43.078


In [11]:
# Put the identifier columns first, in a fixed order, followed by every other
# column in its existing order.
# Stating the target order explicitly (instead of swapping two positions) gives the
# same layout on every run, so re-executing this cell cannot swap 'year' and
# 'country_name' back again.
id_cols = ['country_code', 'country_name', 'year']
col_list = id_cols + [col for col in emissions_df.columns if col not in id_cols]
emissions_df = emissions_df[col_list]
list(emissions_df.columns)

['country_code',
 'country_name',
 'year',
 'forest_area_percent',
 'cereal_yield',
 'electricity_access_percent',
 'renew_energy_percent',
 'energy_use_per_capita',
 'emissions_total',
 'emissions_per_capita',
 'emissions_per_gdp',
 'gdp_growth_percent',
 'gdp_per_capita',
 'gni_per_capita',
 'pop_growth_percent',
 'pop_total',
 'urb_pop_growth_percent',
 'urban_pop_total',
 'urban_pop_percent']

In [12]:
# Number of rows and columns in the renamed, reordered frame
emissions_df.shape

(7448, 19)

In [13]:
# Keep only the rows in which every column holds a value, then report the new size
emissions_df = emissions_df.dropna(axis='index', how='any')
emissions_df.shape

(3552, 19)

In [14]:
# Count the missing values remaining in each column
emissions_df.isna().sum()

country_code                  0
country_name                  0
year                          0
forest_area_percent           0
cereal_yield                  0
electricity_access_percent    0
renew_energy_percent          0
energy_use_per_capita         0
emissions_total               0
emissions_per_capita          0
emissions_per_gdp             0
gdp_growth_percent            0
gdp_per_capita                0
gni_per_capita                0
pop_growth_percent            0
pop_total                     0
urb_pop_growth_percent        0
urban_pop_total               0
urban_pop_percent             0
dtype: int64

In [15]:
# Renumber the rows from 0 after dropping rows; the previous index is discarded
emissions_df = emissions_df.reset_index(drop=True)
emissions_df.head()

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654
1,ZWE,Zimbabwe,2012,46.046633,695.7,44.0,77.5,814.910235,12010.000229,0.905368,0.391797,15.744877,1290.193956,1120.0,1.822309,13265331.0,1.272568,4355539.0,32.834
2,ZWE,Zimbabwe,2011,46.165723,587.4,36.900002,79.27,787.030033,11409.999847,0.875955,0.43083,14.620207,1082.615774,950.0,1.438339,13025785.0,0.891612,4300463.0,33.015
3,ZWE,Zimbabwe,2010,46.284813,733.4,38.782551,82.27,736.691254,9600.000381,0.747677,0.415482,21.452061,937.840338,650.0,1.25365,12839771.0,0.706879,4262290.0,33.196
4,ZWE,Zimbabwe,2009,46.403903,452.4,43.369083,82.09,720.587138,7750.0,0.611208,0.407369,12.01956,762.297957,440.0,1.026265,12679810.0,0.482488,4232267.0,33.378


In [16]:
# Write the cleaned data to a CSV file, leaving the row index out of the file
emissions_df.to_csv(path_or_buf='../Resources/ghg_emissions.csv', index=False)

### Load the DataFrame into PostgreSQL

In [17]:
# Build the connection URL for the local PostgreSQL database.
# The password is percent-encoded first: characters such as '@', '/' or ':' would
# otherwise be read as URL delimiters and produce a broken connection string.
# str() keeps non-string passwords working as they did with plain f-string formatting.
db_string = f"postgresql://postgres:{quote_plus(str(db_password))}@127.0.0.1:5432/GHG_emissions"

# Create the database engine
engine = create_engine(db_string)

# Load the DataFrame into the 'ghg_emissions' table, replacing the table if it already
# exists; the row index is not written as a column
emissions_df.to_sql(name='ghg_emissions', con=engine, index=False, if_exists='replace')