In [1]:
import pandas as pd
import wbgapi as wb

### Needed Series from WDI Database 
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD - CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE - Energy use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield (kg per hectare)
- EG.ELC.ACCS.ZS - Access to electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS - Urban population (% of total population)

## Extract emissions data from World Bank database via API (wbgapi)
### Ensure conda environment has wbgapi installed (pip install wbgapi)

In [2]:
# WDI indicator codes to request, in the order used throughout the notebook
series_list = [
    # CO2 emissions
    'EN.ATM.CO2E.KT',        # total (kt)
    'EN.ATM.CO2E.PC',        # metric tons per capita
    'EN.ATM.CO2E.PP.GD.KD',  # kg per 2017 PPP dollar of GDP
    # energy, agriculture and land use
    'EG.USE.PCAP.KG.OE',     # energy use per capita
    'EG.FEC.RNEW.ZS',        # renewable energy consumption (%)
    'AG.YLD.CREL.KG',        # cereal yield
    'EG.ELC.ACCS.ZS',        # access to electricity (%)
    'AG.LND.FRST.ZS',        # forest area (%)
    # economy
    'NY.GDP.MKTP.KD.ZG',     # GDP growth (annual %)
    'NY.GDP.PCAP.CD',        # GDP per capita
    'NY.GNP.PCAP.CD',        # GNI per capita
    # population
    'SP.POP.TOTL',           # total population
    'SP.POP.GROW',           # population growth (annual %)
    'SP.URB.GROW',           # urban population growth (annual %)
    'SP.URB.TOTL',           # urban population
    'SP.URB.TOTL.IN.ZS',     # urban population (% of total)
]

In [3]:
# Download every indicator in series_list for 1990-2017 through wbgapi,
# then move the index levels into ordinary columns with reset_index()
raw_df = (
    wb.data.DataFrame(
        series_list,
        time=range(1990, 2018),
        numericTimeKeys=True,
        labels=True,
        columns='series',
    )
    .reset_index()
)
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,2016,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,2015,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,2014,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,2013,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


In [4]:
# Size of the downloaded data set as (number of rows, number of columns)
raw_df.shape

(7448, 20)

In [5]:
# Data type of each column; every indicator column should be numeric
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [6]:
# Descriptive statistics of the numeric columns (count, mean, std, min,
# quartiles, max); `count` below the row total signals missing values
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.625391,2894.700192,79.933424,31.159401,2270.160374,992709.5,4.21034,0.26854,3.581076,11171.752248,9479.314154,1.478696,258308400.0,2.22649,121029400.0,55.266415
std,8.07829,23.388023,2336.051682,29.463801,30.118135,2669.532102,3204385.0,5.229999,0.215092,5.78717,18975.239473,15075.519703,1.650633,819608100.0,2.092501,380606500.0,23.529375
min,1990.0,0.0,0.1,0.533899,0.0,9.579196,0.0,0.0,0.0,-64.047107,22.850371,40.0,-27.722225,9182.0,-27.707932,3733.0,5.416
25%,1996.75,12.51395,1404.05,65.926689,4.785963,603.049194,2230.0,0.639336,0.138454,1.545476,1007.129242,910.0,0.54664,1330466.0,0.774823,651572.2,35.29575
50%,2003.5,30.855176,2388.7,98.300003,20.946516,1238.114597,23740.0,2.421594,0.215263,3.712218,3322.03311,3020.0,1.408416,8483160.0,2.199913,4041233.0,54.165
75%,2010.25,47.617367,3796.25,100.0,54.76,3025.736971,246490.0,6.218103,0.330033,5.902814,13096.144542,10400.0,2.416364,55932340.0,3.5037,31827580.0,74.308513
max,2017.0,98.574551,36761.9,100.0,98.34,21420.628504,33514540.0,47.651306,2.085052,149.972963,203266.913745,122130.0,19.360429,7578158000.0,31.143425,4147419000.0,100.0


## Data clean and prep

In [7]:
# Work on an independent copy of the download. A plain assignment would only
# create a second name for the same object, so any in-place cleaning step
# would silently alter raw_df as well.
emissions_df = raw_df.copy()
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
3161,KOR,1992,"Korea, Rep.",1992,67.758656,5937.6,99.942215,0.75292,2534.341028,297050.0,6.790031,0.465372,6.198643,8126.67039,8310.0,1.039161,43747962.0,2.163903,33169705.0,75.82
4094,GAB,2011,Gabon,2011,91.735662,1596.1,88.859566,79.71,2308.796615,5780.00021,3.260931,0.230525,7.091753,10273.799012,7720.0,3.525162,1772500.0,4.176634,1525981.0,86.092
784,CHE,2017,Switzerland,2017,31.855174,6800.2,100.0,24.99,,38700.000763,4.578885,0.066261,1.3628,82254.376927,81870.0,0.933156,8451840.0,0.962986,6234162.0,73.761
4979,CHI,1994,Channel Islands,1994,,,100.0,,,,,,,,,0.122912,142466.0,-0.47662,43746.0,30.706
6293,SST,1996,Small states,1996,41.623449,1470.168398,,27.709437,,114323.477719,4.0548,,,3613.328087,,1.568983,28194606.0,2.384903,13377765.0,47.447964


In [8]:
# Remove the Time column: it is not necessary because it repeats the year
# already held in the numeric `time` column.
# The result is assigned back instead of using inplace=True so that no other
# reference to the frame is modified, and errors='ignore' keeps the cell safe
# to re-run once the column is gone.
emissions_df = emissions_df.drop(columns=['Time'], errors='ignore')

In [9]:
# Mapping from the World Bank column headers to readable snake_case names
column_names = {
    # indicator series
    'AG.LND.FRST.ZS': 'forest_area_percent',
    'AG.YLD.CREL.KG': 'cereal_yield',
    'EG.ELC.ACCS.ZS': 'electricity_access_percent',
    'EG.FEC.RNEW.ZS': 'renew_energy_percent',
    'EG.USE.PCAP.KG.OE': 'energy_use_per_capita',
    'EN.ATM.CO2E.KT': 'emissions_total',
    'EN.ATM.CO2E.PC': 'emissions_per_capita',
    'EN.ATM.CO2E.PP.GD.KD': 'emissions_per_gdp',
    'NY.GDP.MKTP.KD.ZG': 'gdp_growth_percent',
    'NY.GDP.PCAP.CD': 'gdp_per_capita',
    'NY.GNP.PCAP.CD': 'gni_per_capita',
    'SP.POP.GROW': 'pop_growth_percent',
    'SP.POP.TOTL': 'pop_total',
    'SP.URB.GROW': 'urb_pop_growth_percent',
    'SP.URB.TOTL': 'urban_pop_total',
    'SP.URB.TOTL.IN.ZS': 'urban_pop_percent',
    # identifier columns
    'economy': 'country_code',
    'time': 'year',
    'Country': 'country_name',
}

In [10]:
# Apply the readable headers from column_names, then spot-check random rows
emissions_df = emissions_df.rename(column_names, axis='columns')
emissions_df.sample(10)

Unnamed: 0,country_code,year,country_name,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
2713,MYS,1992,Malaysia,62.19166,2953.1,,10.266217,1603.110493,67170.0,3.625577,0.303059,8.885118,3193.635381,2950.0,2.787186,18526708.0,5.205499,9599428.0,51.814
2749,MDG,2012,Madagascar,21.546325,3348.9,18.700001,83.4,,2740.00001,0.119306,0.079696,3.011148,504.17373,480.0,2.728141,22966240.0,4.694565,7630993.0,33.227
1965,MNP,2012,Northern Mariana Islands,65.191304,,100.0,0.0,,,,,0.810811,14247.789301,,-0.307021,52359.0,-0.21987,47703.0,91.108
5865,ARG,2004,Argentina,11.734029,3658.8,96.653503,9.29,1720.311256,141380.004883,3.656178,0.203685,9.029573,4258.160261,3360.0,1.015337,38668796.0,1.209159,34747780.0,89.86
1497,WSM,2004,Samoa,59.852297,,90.728996,48.82,297.756722,189.999998,1.010246,0.201011,3.08726,2168.038807,1830.0,0.337139,188073.0,-0.78,40355.0,21.457
4187,FIN,2002,Finland,73.557609,3295.0,100.0,29.52,6728.581405,63439.998627,12.198597,0.294375,1.707149,26997.75299,25620.0,0.242381,5200598.0,0.406147,4290649.0,82.503
1810,PNG,1999,Papua New Guinea,80.135445,3981.1,6.426855,68.71084,,2600.0,0.488594,0.151001,1.855554,653.408134,680.0,3.490894,5321388.0,2.19839,711736.0,13.375
5451,BTN,1998,Bhutan,64.978492,1559.5,,91.531471,,200.0,0.358974,0.09299,5.914031,652.350989,530.0,1.861997,557143.0,6.126642,130243.0,23.377
2752,MDG,2009,Madagascar,21.681826,3183.4,17.4,87.07,,1690.000057,0.08003,0.051753,-3.978709,455.407381,460.0,2.899473,21117092.0,4.935977,6609861.0,31.301
3612,HUN,2017,Hungary,22.542954,5835.5,100.0,14.54,,46439.998627,4.744602,0.160828,4.271976,14621.239596,13180.0,-0.265861,9787966.0,0.133177,6955524.0,71.062


In [11]:
# Swap year and country_name so that country_name sits next to country_code.
# The swap is only performed while year still precedes country_name, so
# running this cell a second time does not undo the reordering.
col_list = list(emissions_df.columns)
year_pos, name_pos = col_list.index('year'), col_list.index('country_name')
if year_pos < name_pos:
    col_list[year_pos], col_list[name_pos] = col_list[name_pos], col_list[year_pos]
emissions_df = emissions_df[col_list]
list(emissions_df.columns)

['country_code',
 'country_name',
 'year',
 'forest_area_percent',
 'cereal_yield',
 'electricity_access_percent',
 'renew_energy_percent',
 'energy_use_per_capita',
 'emissions_total',
 'emissions_per_capita',
 'emissions_per_gdp',
 'gdp_growth_percent',
 'gdp_per_capita',
 'gni_per_capita',
 'pop_growth_percent',
 'pop_total',
 'urb_pop_growth_percent',
 'urban_pop_total',
 'urban_pop_percent']

In [12]:
# Shape after the cleanup so far, as (number of rows, number of columns)
emissions_df.shape

(7448, 19)

In [13]:
# Data types under the renamed column headers
emissions_df.dtypes

country_code                   object
country_name                   object
year                            int64
forest_area_percent           float64
cereal_yield                  float64
electricity_access_percent    float64
renew_energy_percent          float64
energy_use_per_capita         float64
emissions_total               float64
emissions_per_capita          float64
emissions_per_gdp             float64
gdp_growth_percent            float64
gdp_per_capita                float64
gni_per_capita                float64
pop_growth_percent            float64
pop_total                     float64
urb_pop_growth_percent        float64
urban_pop_total               float64
urban_pop_percent             float64
dtype: object

In [14]:
# Number of missing values in each column
emissions_df.isnull().sum()

country_code                     0
country_name                     0
year                             0
forest_area_percent            331
cereal_yield                  1273
electricity_access_percent    1143
renew_energy_percent           359
energy_use_per_capita         2697
emissions_total                779
emissions_per_capita           779
emissions_per_gdp             1242
gdp_growth_percent             666
gdp_per_capita                 504
gni_per_capita                1055
pop_growth_percent              30
pop_total                       28
urb_pop_growth_percent          85
urban_pop_total                 84
urban_pop_percent               84
dtype: int64

Objective: Remove as many empty cells as possible while preserving as many columns and rows as possible. To do so, check how the missing data is distributed across years, countries, and features (columns).

In [15]:
# Copy into a new df for dealing with NaN; a plain assignment would make
# clean_df just another name for emissions_df rather than a separate frame
clean_df = emissions_df.copy()
clean_df.head()

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2017,45.451183,1202.7,44.178635,82.46,,10340.000153,0.700965,0.300613,4.080264,1192.107012,1170.0,2.04362,14751101.0,1.860765,4755312.0,32.237
1,ZWE,Zimbabwe,2016,45.570273,435.1,42.561729,81.9,,11020.000458,0.762487,0.333455,0.900955,1421.787789,1200.0,2.081806,14452704.0,1.80661,4667645.0,32.296
2,ZWE,Zimbabwe,2015,45.689363,557.5,33.700001,80.82,,12430.000305,0.878139,0.379509,2.02365,1410.329174,1220.0,2.136294,14154937.0,1.769505,4584076.0,32.385
3,ZWE,Zimbabwe,2014,45.808453,831.4,32.299999,80.27,,12079.999924,0.87184,0.376287,1.484543,1407.034293,1210.0,2.191391,13855753.0,1.730983,4503674.0,32.504
4,ZWE,Zimbabwe,2013,45.927543,668.5,40.498375,78.87,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.36781,1200.0,2.163267,13555422.0,1.613531,4426387.0,32.654


### Filter years by missing values

In [16]:
# Count the missing cells of every row, then add those counts up per year.
# sort=False keeps the years in order of first appearance, which is the order
# a row-by-row tally keyed on clean_df['year'].unique() would produce.
missing_by_year = (
    clean_df.isnull()
    .sum(axis=1)
    .groupby(clean_df['year'], sort=False)
    .sum()
)

# dictionary {year: number of missing values}
years_count_missing = missing_by_year.to_dict()

# sort the years by missing values; mergesort is stable, so years with the
# same count stay in their order of first appearance
years_missing_sorted = missing_by_year.sort_values(kind='mergesort').to_dict()

# print the missing values for each year
print("missing values by year:")
for key, val in years_missing_sorted.items():
    print(key, ":", val)


missing values by year:
2007 : 262
2006 : 264
2005 : 271
2013 : 274
2004 : 274
2014 : 277
2012 : 279
2011 : 283
2010 : 289
2009 : 293
2008 : 297
2003 : 308
2002 : 314
2001 : 334
2000 : 342
2015 : 417
1999 : 430
1998 : 442
1997 : 454
1996 : 474
1995 : 499
2017 : 508
2016 : 508
1994 : 539
1993 : 553
1992 : 587
1991 : 680
1990 : 687


Keep the years 1995–2015, each of which has fewer than 500 missing values.

In [17]:
print("number of missing values in the whole dataset before filtering the years:")
print(clean_df.isnull().sum().sum())
print("number of rows before filtering the years:")
print(clean_df.shape[0])

# keep only the rows for the years 1995 to 2015, both ends included
# (the years with the fewest missing values)
in_year_range = clean_df['year'].between(1995, 2015)
clean_df = clean_df[in_year_range]

print("number of missing values in the whole dataset after filtering the years:")
print(clean_df.isnull().sum().sum())
print("number of rows after filtering the years:")
print(clean_df.shape[0])

number of missing values in the whole dataset before filtering the years:
11139
number of rows before filtering the years:
7448
number of missing values in the whole dataset after filtering the years:
7077
number of rows after filtering the years:
5586


### Filter countries by missing value

In [18]:
# check the amount of missing values by country

# Count the missing cells of every row, then add those counts up per country.
# sort=False keeps the countries in order of first appearance, which is the
# order a row-by-row tally keyed on clean_df['country_code'].unique() gives.
missing_by_country = (
    clean_df.isnull()
    .sum(axis=1)
    .groupby(clean_df['country_code'], sort=False)
    .sum()
)

# dictionary {country_code: number of missing values}
countries_count_missing = missing_by_country.to_dict()

# sort the countries by missing values; mergesort is stable, so countries with
# the same count stay in their order of first appearance
countries_missing_sorted = missing_by_country.sort_values(kind='mergesort').to_dict()

# print the missing values for each country
print("missing values by country:")
for key, val in countries_missing_sorted.items():
    print(key, ":", val)

missing values by country:
USA : 0
GBR : 0
CHE : 0
SWE : 0
ESP : 0
SVK : 0
PRT : 0
POL : 0
NOR : 0
NZL : 0
NLD : 0
MEX : 0
KOR : 0
JPN : 0
ITA : 0
IRL : 0
HUN : 0
DEU : 0
FRA : 0
FIN : 0
DNK : 0
CZE : 0
CHL : 0
AUT : 0
AUS : 0
PST : 0
OED : 0
NAC : 0
HIC : 0
EUU : 0
EMU : 0
URY : 1
UKR : 1
TUN : 1
TZA : 1
SEN : 1
SAU : 1
RUS : 1
ROU : 1
PHL : 1
PER : 1
PRY : 1
PAN : 1
OMN : 1
MKD : 1
NGA : 1
NIC : 1
NAM : 1
MAR : 1
MUS : 1
MLT : 1
KWT : 1
KEN : 1
KAZ : 1
JOR : 1
JAM : 1
IDN : 1
IND : 1
HND : 1
HTI : 1
GTM : 1
GHA : 1
SLV : 1
EGY : 1
ECU : 1
DOM : 1
CYP : 1
CIV : 1
COL : 1
CMR : 1
BGR : 1
BRN : 1
BRA : 1
BWA : 1
BOL : 1
BLR : 1
BGD : 1
ARG : 1
ALB : 1
TSA : 1
SAS : 1
LMC : 1
TLA : 1
LAC : 1
LCN : 1
TEC : 1
ECA : 1
ECS : 1
EAR : 1
CEB : 1
AFW : 1
ZWE : 2
ZMB : 2
ZAF : 2
NPL : 2
BIH : 2
BEN : 2
TSS : 2
SSA : 2
SSF : 2
ARB : 2
UZB : 3
SVN : 3
MOZ : 3
KGZ : 3
ISR : 3
PRE : 3
VNM : 4
TGO : 4
PAK : 4
MDA : 4
LTU : 4
LVA : 4
HRV : 4
KHM : 4
WLD : 4
IDA : 4
IDX : 4
IDB : 4
TUR : 5
TJK : 5
BEL :

Remove countries with 100 or more missing values

In [19]:
print("number of missing values in the whole dataset before filtering the countries:")
print(clean_df.isnull().sum().sum())
print("number of rows before filtering the countries:")
print(clean_df.shape[0])


# keep only the rows of countries with fewer than 100 missing values
countries_filter = [
    code
    for code, n_missing in countries_missing_sorted.items()
    if n_missing < 100
]

keep_rows = clean_df['country_code'].isin(countries_filter)
clean_df = clean_df[keep_rows]

print("number of missing values in the whole dataset after filtering the countries:")
print(clean_df.isnull().sum().sum())
print("number of rows after filtering the countries:")
print(clean_df.shape[0])

number of missing values in the whole dataset before filtering the countries:
7077
number of rows before filtering the countries:
5586
number of missing values in the whole dataset after filtering the countries:
2941
number of rows after filtering the countries:
5019


### Filter features by missing values

In [20]:
# Missing values left in each column after the year and country filters
clean_df.isnull().sum()

country_code                     0
country_name                     0
year                             0
forest_area_percent             27
cereal_yield                   384
electricity_access_percent     407
renew_energy_percent            30
energy_use_per_capita         1264
emissions_total                 51
emissions_per_capita            51
emissions_per_gdp              289
gdp_growth_percent             133
gdp_per_capita                  74
gni_per_capita                 231
pop_growth_percent               0
pop_total                        0
urb_pop_growth_percent           0
urban_pop_total                  0
urban_pop_percent                0
dtype: int64

The four features with the most missing values are energy_use_per_capita, electricity_access_percent, cereal_yield and emissions_per_gdp.

In [21]:
# remove features with more than 25 missing values

# boolean Series indexed by column name: True where the column has more than
# 25 missing values
vars_bad = clean_df.isnull().sum() > 25

# vars_bad[vars_bad].index holds the names of the flagged columns; dropping
# them returns a new frame and leaves clean_df untouched
# NOTE(review): with this threshold every emissions_* column is dropped too
# (51 or more missing values each) - confirm that this is intended.
clean_df2 = clean_df.drop(columns=vars_bad[vars_bad].index)

print("Remaining missing values per column:")
print(clean_df2.isnull().sum())

Remaining missing values per column:
country_code              0
country_name              0
year                      0
pop_growth_percent        0
pop_total                 0
urb_pop_growth_percent    0
urban_pop_total           0
urban_pop_percent         0
dtype: int64


In [22]:
# Shape of the filtered frame as (number of rows, number of columns)
clean_df2.shape

(5019, 8)

In [23]:
# Save the filtered frame as CSV; index=False leaves the row index out of the file
clean_df2.to_csv('../Resources/emissions_2.csv',index= False)

### Compare to DropNa on emissions_df

In [24]:
# Drop the rows that contain null values and renumber the index.
# reset_index returns a new frame rather than changing emissions_df, so its
# result is assigned back; otherwise the stored frame would keep the gaps
# left in the index by the removed rows.
emissions_df = emissions_df.dropna().reset_index(drop=True)
emissions_df

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2013,45.927543,668.500000,40.498375,78.870000,832.572236,12279.999733,0.905911,0.388196,3.196731,1408.367810,1200.000000,2.163267,13555422.0,1.613531,4426387.0,32.654000
1,ZWE,Zimbabwe,2012,46.046633,695.700000,44.000000,77.500000,814.910235,12010.000229,0.905368,0.391797,15.744877,1290.193956,1120.000000,1.822309,13265331.0,1.272568,4355539.0,32.834000
2,ZWE,Zimbabwe,2011,46.165723,587.400000,36.900002,79.270000,787.030033,11409.999847,0.875955,0.430830,14.620207,1082.615774,950.000000,1.438339,13025785.0,0.891612,4300463.0,33.015000
3,ZWE,Zimbabwe,2010,46.284813,733.400000,38.782551,82.270000,736.691254,9600.000381,0.747677,0.415482,21.452061,937.840338,650.000000,1.253650,12839771.0,0.706879,4262290.0,33.196000
4,ZWE,Zimbabwe,2009,46.403903,452.400000,43.369083,82.090000,720.587138,7750.000000,0.611208,0.407369,12.019560,762.297957,440.000000,1.026265,12679810.0,0.482488,4232267.0,33.378000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3547,AFE,Africa Eastern and Southern,2004,33.654274,1394.049728,23.815880,63.279292,751.434195,457599.991269,1.027664,0.334259,5.507315,985.530235,810.760703,2.644968,445281555.0,3.736205,133647466.0,30.014148
3548,AFE,Africa Eastern and Southern,2003,34.177441,1296.064498,22.548307,64.165735,733.782234,426379.996829,0.982878,0.329285,3.096045,812.946404,666.312226,2.617764,433807484.0,3.708082,128833965.0,29.698419
3549,AFE,Africa Eastern and Southern,2002,34.357452,1453.381135,21.601503,64.622779,720.036948,404210.007127,0.956164,0.322486,3.905250,626.559857,615.771592,2.606598,422741118.0,3.716958,124227507.0,29.386190
3550,AFE,Africa Eastern and Southern,2001,34.537463,1442.820237,19.986220,65.586773,732.364434,393150.000000,0.954243,0.326004,3.653224,628.204191,629.894844,2.589961,412001885.0,3.655377,119775502.0,29.071591


In [25]:
# Save the dropna() result as CSV; index=False leaves the row index out of the file
emissions_df.to_csv('../Resources/emissions.csv',index= False)

### Shape of data frame:
- Filter and bin by missing values: (5019, 8)
- Drop NA: (3552, 19)