In [1]:
import pandas as pd
import wbgapi as wb

### Series needed from the WDI database
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD - CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE - Energy use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield (kg per hectare)
- EG.ELC.ACCS.ZS - Access to Electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total Population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS - Urban population (% of total population)

## Extract emissions data from World Bank database via API (wbgapi)
### Ensure the conda environment has wbgapi installed (`pip install wbgapi`)

In [2]:
# WDI indicator codes to request from the World Bank API, one per line,
# each annotated with the indicator it identifies.
series_list = [
    'EN.ATM.CO2E.KT',        # CO2 emissions (kt)
    'EN.ATM.CO2E.PC',        # CO2 emissions (metric tons per capita)
    'EN.ATM.CO2E.PP.GD.KD',  # CO2 emissions (kg per 2017 PPP dollar of GDP)
    'EG.USE.PCAP.KG.OE',     # Energy use (kg of oil equivalent per capita)
    'EG.FEC.RNEW.ZS',        # Renewable energy consumption (% of total final energy consumption)
    'AG.YLD.CREL.KG',        # Cereal yield
    'EG.ELC.ACCS.ZS',        # Access to electricity (% of population)
    'AG.LND.FRST.ZS',        # Forest area (% of land area)
    'NY.GDP.MKTP.KD.ZG',     # GDP growth (annual %)
    'NY.GDP.PCAP.CD',        # GDP per capita (current USD)
    'NY.GNP.PCAP.CD',        # GNI per capita, Atlas method (current USD)
    'SP.POP.TOTL',           # Total population
    'SP.POP.GROW',           # Population growth (annual %)
    'SP.URB.GROW',           # Urban population growth (annual %)
    'SP.URB.TOTL',           # Urban population
    'SP.URB.TOTL.IN.ZS',     # Urban population (% of total population)
]

In [3]:
# Download every indicator in series_list from the World Bank API (wbgapi)
# as a single pandas DataFrame, then turn the index levels into plain columns.
raw_df = wb.data.DataFrame(
    series_list,
    time=range(1990, 2018),   # range end is exclusive, so years 1990 through 2017
    numericTimeKeys=True,     # year keys returned as numbers
    labels=True,              # also return the Country / Time label columns
    columns='series',         # one column per indicator code
)
raw_df = raw_df.reset_index()
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,2016,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,2015,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,2014,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,2013,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


In [4]:
# Size of the raw data set as a (rows, columns) tuple
raw_df.shape

(7448, 20)

In [5]:
# Data type of each column
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.625391,2894.700192,79.933424,31.159401,2270.160374,992709.5,4.21034,0.26854,3.581076,11171.752248,9479.314154,1.478696,258308400.0,2.22649,121029400.0,55.266415
std,8.07829,23.388023,2336.051682,29.463801,30.118135,2669.532102,3204385.0,5.229999,0.215092,5.78717,18975.239473,15075.519703,1.650633,819608100.0,2.092501,380606500.0,23.529375
min,1990.0,0.0,0.1,0.533899,0.0,9.579196,0.0,0.0,0.0,-64.047107,22.850371,40.0,-27.722225,9182.0,-27.707932,3733.0,5.416
25%,1996.75,12.51395,1404.05,65.926689,4.785963,603.049194,2230.0,0.639336,0.138454,1.545476,1007.129242,910.0,0.54664,1330466.0,0.774823,651572.2,35.29575
50%,2003.5,30.855176,2388.7,98.300003,20.946516,1238.114597,23740.0,2.421594,0.215263,3.712218,3322.03311,3020.0,1.408416,8483160.0,2.199913,4041233.0,54.165
75%,2010.25,47.617367,3796.25,100.0,54.76,3025.736971,246490.0,6.218103,0.330033,5.902814,13096.144542,10400.0,2.416364,55932340.0,3.5037,31827580.0,74.308513
max,2017.0,98.574551,36761.9,100.0,98.34,21420.628504,33514540.0,47.651306,2.085052,149.972963,203266.913745,122130.0,19.360429,7578158000.0,31.143425,4147419000.0,100.0


## Data cleaning and preparation

In [9]:
# Work on an independent copy of the raw data so that cleaning steps applied to
# emissions_df can never alter raw_df. A plain assignment (emissions_df = raw_df)
# would only bind a second name to the same DataFrame object.
emissions_df = raw_df.copy()
# Preview five randomly chosen rows (the selection differs on every run).
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
1659,PRT,2010,Portugal,2010,35.50606,3351.3,100.0,27.83,2222.629692,50939.998627,4.817887,0.151515,1.737625,22520.642312,23030.0,0.04591,10573100.0,1.046478,6403809.0,60.567
7020,HPC,1997,Heavily indebted poor countries (HIPC),1997,30.505149,1014.54831,,86.329242,,72309.738053,0.165196,0.092688,6.2005,349.13065,326.907644,2.790146,437721228.0,3.86046,121696838.0,27.802361
782,SYR,1991,Syrian Arab Republic,1991,2.057242,922.4,,2.107307,892.028426,35770.0,2.798405,,7.666468,2171.463634,,2.963822,12782281.0,3.388006,6281085.0,49.139
4740,CIV,2009,Cote d'Ivoire,2009,12.825368,1711.5,57.315105,73.71,453.477992,6030.00021,0.291618,0.083051,3.603322,1638.804766,1150.0,2.117819,20677762.0,3.015028,9699938.0,46.91
6541,TMN,2000,Middle East & North Africa (IDA & IBRD countries),2000,2.245404,1770.512593,91.372682,2.945707,1001.590501,812780.0,2.892693,0.363702,6.197073,1878.091589,1747.982853,1.984886,280976957.0,2.664402,154878632.0,55.121471


In [10]:
# Number of missing values in each column
emissions_df.isnull().sum()

economy                    0
time                       0
Country                    0
Time                       0
AG.LND.FRST.ZS           331
AG.YLD.CREL.KG          1273
EG.ELC.ACCS.ZS          1143
EG.FEC.RNEW.ZS           359
EG.USE.PCAP.KG.OE       2697
EN.ATM.CO2E.KT           779
EN.ATM.CO2E.PC           779
EN.ATM.CO2E.PP.GD.KD    1242
NY.GDP.MKTP.KD.ZG        666
NY.GDP.PCAP.CD           504
NY.GNP.PCAP.CD          1055
SP.POP.GROW               30
SP.POP.TOTL               28
SP.URB.GROW               85
SP.URB.TOTL               84
SP.URB.TOTL.IN.ZS         84
dtype: int64

In [11]:
# Rename the columns headers to meaningful names
column_names = {'AG.LND.FRST.ZS':'forest_area_percent',
             'AG.YLD.CREL.KG':'cereal_yield',
             'EG.ELC.ACCS.ZS':'electricity_access_percent',
             'EG.FEC.RNEW.ZS':'renew_energy_percent',
             'EG.USE.PCAP.KG.OE':'energy_use_per_capita',
             'EN.ATM.CO2E.KT':'emissions_total',
             'EN.ATM.CO2E.PC':'emissions_per_capita',
             'EN.ATM.CO2E.PP.GD.KD':'emissions_per_gdp',
             'NY.GDP.MKTP.KD.ZG':'gdp_growth_percent',
             'NY.GDP.PCAP.CD':'gdp_per_capita',
             'NY.GNP.PCAP.CD':'gni_per_capita',
             'SP.POP.GROW':'pop_growth_percent',
             'SP.POP.TOTL':'pop_total',
             'SP.URB.GROW':'urb_pop_growth_percent',
             'SP.URB.TOTL':'urban_pop_total',
             'SP.URB.TOTL.IN.ZS':'urban_pop_percent'
            }

In [12]:
# Swap the WDI indicator codes for the readable headers in column_names;
# columns with no entry in the mapping keep their current header.
emissions_df = emissions_df.rename(column_names, axis='columns')
# Show ten random rows to check the new headers.
emissions_df.sample(10)

Unnamed: 0,economy,time,Country,Time,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
4440,EGY,2001,"Egypt, Arab Rep.",2001,0.060127,7099.4,97.344078,7.82,625.647744,126700.0,1.739088,0.22956,3.535252,1327.096518,1380.0,2.05642,72854260.0,2.163845,31212950.0,42.843
3326,JPN,1995,Japan,1995,68.329676,6003.2,100.0,3.917189,3934.959561,1171010.0,9.332839,0.267665,2.631,44197.619101,42570.0,0.23459,125472000.0,0.407782,97888240.0,78.016
3452,IRL,2009,Ireland,2009,10.326288,6887.2,100.0,5.3,3149.565141,40560.0,8.943031,0.169046,-5.095783,52133.090616,46810.0,1.015663,4535375.0,1.343886,2782090.0,61.342
4514,DMA,2011,Dominica,2011,63.826667,1400.0,95.536674,13.8,,180.0,2.618487,0.212715,-0.223551,7288.497948,7140.0,-0.01891,68742.0,0.413514,47012.0,68.389
7225,EAP,2016,East Asia & Pacific (excluding high income),2016,30.282198,5361.116973,96.904829,,,11308840.0,5.475843,0.446529,6.585886,6591.282343,6652.983041,0.751984,2065223000.0,2.655708,1109067000.0,53.702045
1816,PNG,1993,Papua New Guinea,1993,80.296705,2680.0,,71.249161,,2210.0,0.514961,0.133411,18.202286,1159.13976,1130.0,3.6772,4291588.0,2.411408,619920.0,14.445
4995,TCD,2006,Chad,2006,4.653113,766.8,5.196557,83.74,,510.0,0.049201,0.031015,0.648262,716.667803,540.0,3.540781,10365610.0,3.692014,2263228.0,21.834
1879,PAK,2014,Pakistan,2014,5.143247,3001.2,71.224739,47.22,431.626863,154240.0,0.740642,0.168187,4.674708,1173.392454,1160.0,1.409183,208251600.0,1.985957,74593650.0,35.819
419,UGA,1990,Uganda,1990,17.89435,1497.6,,96.018513,,790.0,0.04492,0.04982,6.47414,244.754047,320.0,3.224802,17586630.0,7.006126,1947895.0,11.076
6763,LAC,2002,Latin America & Caribbean (excluding high income),2002,51.404326,2899.918782,92.062502,29.80025,1059.568185,1037720.0,2.152437,0.182599,0.428243,3568.863649,3640.149698,1.374544,482114000.0,1.856167,362317900.0,75.151906
