In [1]:
import pandas as pd
import wbgapi as wb

### Series needed from the World Bank WDI database
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD - CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE - Energy use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield (kg per hectare)
- EG.ELC.ACCS.ZS - Access to electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS - Urban population (% of total population)

## Extract emissions data from the World Bank database via its API (wbgapi)
### Ensure the conda environment has wbgapi installed (`pip install wbgapi`)

In [2]:
# World Bank (WDI) series codes to download, in request order.
series_list = [
    # CO2 emissions
    'EN.ATM.CO2E.KT',
    'EN.ATM.CO2E.PC',
    'EN.ATM.CO2E.PP.GD.KD',
    # Energy
    'EG.USE.PCAP.KG.OE',
    'EG.FEC.RNEW.ZS',
    # Agriculture, electricity access and land
    'AG.YLD.CREL.KG',
    'EG.ELC.ACCS.ZS',
    'AG.LND.FRST.ZS',
    # Economy
    'NY.GDP.MKTP.KD.ZG',
    'NY.GDP.PCAP.CD',
    'NY.GNP.PCAP.CD',
    # Population and urbanisation
    'SP.POP.TOTL',
    'SP.POP.GROW',
    'SP.URB.GROW',
    'SP.URB.TOTL',
    'SP.URB.TOTL.IN.ZS',
]

In [3]:
# Download every series in series_list from the World Bank API (wbgapi) as a
# pandas DataFrame, then flatten the resulting index into ordinary columns.
# NOTE(review): the result contains regional aggregates (e.g. 'ECS',
# 'AFE') alongside individual countries — confirm this is intended; wbgapi
# appears to offer a skipAggs option for excluding them.
raw_df = wb.data.DataFrame(
    series_list,
    time=range(1990, 2018),  # years 1990 through 2017 (range end is exclusive)
    numericTimeKeys=True,    # 'time' comes back as an integer year
    labels=True,             # adds the 'Country' and 'Time' label columns
    columns='series',        # one column per series code
).reset_index()
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,2016,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,2015,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,2014,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,2013,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


In [4]:
# Dimensions of the raw data set as (number of rows, number of columns)
raw_df.shape

(7448, 20)

In [5]:
# Data type of each column in the raw data set
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.625391,2894.700192,79.933424,31.159401,2270.160374,992709.5,4.21034,0.26854,3.581076,11171.752248,9479.314154,1.478696,258308400.0,2.22649,121029400.0,55.266415
std,8.07829,23.388023,2336.051682,29.463801,30.118135,2669.532102,3204385.0,5.229999,0.215092,5.78717,18975.239473,15075.519703,1.650633,819608100.0,2.092501,380606500.0,23.529375
min,1990.0,0.0,0.1,0.533899,0.0,9.579196,0.0,0.0,0.0,-64.047107,22.850371,40.0,-27.722225,9182.0,-27.707932,3733.0,5.416
25%,1996.75,12.51395,1404.05,65.926689,4.785963,603.049194,2230.0,0.639336,0.138454,1.545476,1007.129242,910.0,0.54664,1330466.0,0.774823,651572.2,35.29575
50%,2003.5,30.855176,2388.7,98.300003,20.946516,1238.114597,23740.0,2.421594,0.215263,3.712218,3322.03311,3020.0,1.408416,8483160.0,2.199913,4041233.0,54.165
75%,2010.25,47.617367,3796.25,100.0,54.76,3025.736971,246490.0,6.218103,0.330033,5.902814,13096.144542,10400.0,2.416364,55932340.0,3.5037,31827580.0,74.308513
max,2017.0,98.574551,36761.9,100.0,98.34,21420.628504,33514540.0,47.651306,2.085052,149.972963,203266.913745,122130.0,19.360429,7578158000.0,31.143425,4147419000.0,100.0


## Data cleaning and preparation

In [7]:
# Bind a second name, emissions_df, to the same DataFrame object as raw_df.
# NOTE: plain assignment does not copy the data — any in-place change made
# through either name is visible through both.
emissions_df = raw_df
# Spot-check 5 randomly selected rows
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
5883,ATG,2014,Antigua and Barbuda,2014,19.354545,1565.2,100.0,0.14,,479.999989,5.378995,0.286429,3.795728,14004.811212,13230.0,0.831589,89236.0,0.138088,22465.0,25.175
3814,GTM,2011,Guatemala,2011,34.483389,2000.4,84.026413,67.19,750.609079,11739.999771,0.808456,0.107741,4.163907,3228.045741,2910.0,1.819488,14521515.0,2.433269,7072123.0,48.701
3496,IRQ,1993,Iraq,1993,1.847863,706.1,,0.38934,1599.145836,89560.0,4.64142,1.096164,30.289829,53.480235,40.0,4.831667,19295818.0,4.564498,13343444.0,69.152
2081,NIC,2008,Nicaragua,2008,36.815556,1811.0,77.039612,53.77,512.332306,4500.0,0.791175,0.169166,3.436374,1493.904544,1390.0,1.421708,5687744.0,1.77083,3214997.0,56.525
1868,PLW,1997,Palau,1997,85.11087,,97.562309,0.0,,200.0,10.930754,,,,,3.13662,18297.0,2.799731,13003.0,71.066


In [8]:
# Number of missing (null) values in each column
emissions_df.isnull().sum()

economy                    0
time                       0
Country                    0
Time                       0
AG.LND.FRST.ZS           331
AG.YLD.CREL.KG          1273
EG.ELC.ACCS.ZS          1143
EG.FEC.RNEW.ZS           359
EG.USE.PCAP.KG.OE       2697
EN.ATM.CO2E.KT           779
EN.ATM.CO2E.PC           779
EN.ATM.CO2E.PP.GD.KD    1242
NY.GDP.MKTP.KD.ZG        666
NY.GDP.PCAP.CD           504
NY.GNP.PCAP.CD          1055
SP.POP.GROW               30
SP.POP.TOTL               28
SP.URB.GROW               85
SP.URB.TOTL               84
SP.URB.TOTL.IN.ZS         84
dtype: int64

In [9]:
# Remove the 'Time' label column: it repeats the year already held in the
# numeric 'time' column, so it is not needed.
# drop() returns a new frame, so raw_df itself is left untouched and this cell
# gives the same result every time it is run.
emissions_df = raw_df.drop(columns=['Time'])

In [10]:
# Mapping from World Bank series codes / wbgapi column labels to the
# descriptive column names used in the rest of the notebook.
column_names = {
    # Indicator series
    'AG.LND.FRST.ZS': 'forest_area_percent',
    'AG.YLD.CREL.KG': 'cereal_yield',
    'EG.ELC.ACCS.ZS': 'electricity_access_percent',
    'EG.FEC.RNEW.ZS': 'renew_energy_percent',
    'EG.USE.PCAP.KG.OE': 'energy_use_per_capita',
    'EN.ATM.CO2E.KT': 'emissions_total',
    'EN.ATM.CO2E.PC': 'emissions_per_capita',
    'EN.ATM.CO2E.PP.GD.KD': 'emissions_per_gdp',
    'NY.GDP.MKTP.KD.ZG': 'gdp_growth_percent',
    'NY.GDP.PCAP.CD': 'gdp_per_capita',
    'NY.GNP.PCAP.CD': 'gni_per_capita',
    'SP.POP.GROW': 'pop_growth_percent',
    'SP.POP.TOTL': 'pop_total',
    'SP.URB.GROW': 'urb_pop_growth_percent',
    'SP.URB.TOTL': 'urban_pop_total',
    'SP.URB.TOTL.IN.ZS': 'urban_pop_percent',
    # Identifier columns
    'economy': 'country_code',
    'time': 'year',
    'Country': 'country_name',
}

In [11]:
# Apply the column_names mapping to the column labels; labels that are not
# keys of the mapping are left as they are.
emissions_df = emissions_df.rename(column_names, axis='columns')
# Spot-check 10 randomly selected rows under the new headers
emissions_df.sample(10)

Unnamed: 0,country_code,year,country_name,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
765,SYR,2008,Syrian Arab Republic,2.614245,896.7,91.284866,1.25,1075.741852,65280.0,3.039947,,4.476673,10155.973667,8630.0,3.656679,21474059.0,4.322216,11783890.0,54.875
2147,NCL,1998,New Caledonia,45.760722,3716.1,100.0,8.033848,,,,,-3.200276,17324.252831,14520.0,1.898768,205279.0,2.490377,125629.0,61.199
5734,AZE,1995,Azerbaijan,11.607965,1507.0,,1.36684,1809.189639,32470.0,4.225196,1.419282,-11.799999,314.561226,180.0,1.155666,7684850.0,0.565536,4011953.0,52.206
1809,PNG,2000,Papua New Guinea,80.108568,4024.2,7.577014,66.38,,2830.0,0.51377,0.168564,-2.494842,639.2792,600.0,3.452133,5508297.0,2.165399,727316.0,13.204
3767,GNB,2002,Guinea-Bissau,75.817568,1067.3,1.253706,91.04,,150.0,0.11667,0.071632,-0.985178,324.969912,300.0,2.225601,1285678.0,3.263163,475752.0,37.004
7142,ECS,2015,Europe & Central Asia,38.423074,3789.970574,99.25631,13.006583,,6089356.0,6.705429,0.205294,2.08236,22556.573644,24657.595524,0.500452,908123143.0,0.757433,649265373.0,71.636361
3268,KAZ,1997,Kazakhstan,1.169953,876.1,99.26947,1.671015,2575.552687,132240.0,8.623745,0.956493,1.7,1445.503237,1390.0,-1.577524,15334405.0,-1.540003,8581746.0,55.964
715,TZA,2002,Tanzania,59.74939,1899.5,10.797707,92.46,410.907293,3570.0,0.098202,0.068506,7.093195,400.427551,410.0,2.617087,36353531.0,4.235741,8377308.0,23.044
1630,PRI,2011,Puerto Rico,55.460654,2649.4,100.0,0.66,,,,,-0.358511,27278.88305,17670.0,-1.15654,3678732.0,-1.211993,3449657.0,93.773
2531,MUS,2006,Mauritius,19.608867,7793.1,99.050987,16.41,1006.501642,3300.0,2.674239,0.183176,4.865545,5695.969327,5910.0,0.466404,1233996.0,0.200093,518278.0,42.0


In [12]:
# Put the identifier columns first, as country_code, country_name, year,
# followed by every other column in its existing order.
# The target order is stated explicitly (instead of swapping two positions)
# so that re-running this cell leaves the column order unchanged.
leading_cols = ['country_code', 'country_name', 'year']
col_list = leading_cols + [col for col in emissions_df.columns if col not in leading_cols]
emissions_df = emissions_df[col_list]
list(emissions_df.columns)

['country_code',
 'country_name',
 'year',
 'forest_area_percent',
 'cereal_yield',
 'electricity_access_percent',
 'renew_energy_percent',
 'energy_use_per_capita',
 'emissions_total',
 'emissions_per_capita',
 'emissions_per_gdp',
 'gdp_growth_percent',
 'gdp_per_capita',
 'gni_per_capita',
 'pop_growth_percent',
 'pop_total',
 'urb_pop_growth_percent',
 'urban_pop_total',
 'urban_pop_percent']

In [13]:
# Keep only the rows that have a value in every column, then renumber the
# index from 0. The re-indexed frame is assigned back to emissions_df so the
# stored frame is the same one that is displayed.
emissions_df = emissions_df.dropna().reset_index(drop=True)
emissions_df

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2013,45.927543,668.500000,40.498375,78.870000,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.367810,1200.000000,2.163267,13555422.0,1.613531,4426387.0,32.654000
1,ZWE,Zimbabwe,2012,46.046633,695.700000,44.000000,77.500000,814.910235,12010.000229,0.905368,0.391797,15.744877,1290.193956,1120.000000,1.822309,13265331.0,1.272568,4355539.0,32.834000
2,ZWE,Zimbabwe,2011,46.165723,587.400000,36.900002,79.270000,787.030033,11409.999847,0.875955,0.430830,14.620207,1082.615774,950.000000,1.438339,13025785.0,0.891612,4300463.0,33.015000
3,ZWE,Zimbabwe,2010,46.284813,733.400000,38.782551,82.270000,736.691254,9600.000381,0.747677,0.415482,21.452061,937.840338,650.000000,1.253650,12839771.0,0.706879,4262290.0,33.196000
4,ZWE,Zimbabwe,2009,46.403903,452.400000,43.369083,82.090000,720.587138,7750.000000,0.611208,0.407369,12.019560,762.297957,440.000000,1.026265,12679810.0,0.482488,4232267.0,33.378000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3547,AFE,Africa Eastern and Southern,2004,33.654274,1394.049728,23.815880,63.279292,751.434195,457599.991269,1.027664,0.334259,5.507315,985.530235,810.760703,2.644968,445281555.0,3.736205,133647466.0,30.014148
3548,AFE,Africa Eastern and Southern,2003,34.177441,1296.064498,22.548307,64.165735,733.782234,426379.996829,0.982878,0.329285,3.096045,812.946404,666.312226,2.617764,433807484.0,3.708082,128833965.0,29.698419
3549,AFE,Africa Eastern and Southern,2002,34.357452,1453.381135,21.601503,64.622779,720.036948,404210.007127,0.956164,0.322486,3.905250,626.559857,615.771592,2.606598,422741118.0,3.716958,124227507.0,29.386190
3550,AFE,Africa Eastern and Southern,2001,34.537463,1442.820237,19.986220,65.586773,732.364434,393150.000000,0.954243,0.326004,3.653224,628.204191,629.894844,2.589961,412001885.0,3.655377,119775502.0,29.071591


In [None]:
# Write emissions_df to a CSV file; index=False leaves the row index out of the file
emissions_df.to_csv('../Resources/emissions.csv',index= False)