Ensure conda environment has wbgapi installed: pip install wbgapi

In [243]:
# Import dependencies
import pandas as pd
import wbgapi as wb
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from config import db_password

### Needed Series from WDI Database 
- EN.ATM.CO2E.KT - CO2 emissions (kt)
- EN.ATM.CO2E.PC - CO2 emissions (metric tons per capita)
- EN.ATM.CO2E.PP.GD.KD	- CO2 emissions (kg per 2017 PPP dollar of GDP)
- EG.USE.PCAP.KG.OE	 - Energy Use (kg of oil equivalent per capita)
- EG.FEC.RNEW.ZS - Renewable energy consumption (% of total final energy consumption)
- AG.YLD.CREL.KG - Cereal yield
- EG.ELC.ACCS.ZS - Access to Electricity (% of population)
- AG.LND.FRST.ZS - Forest area (% of land area)
- NY.GDP.MKTP.KD.ZG - GDP growth (annual %)
- NY.GDP.PCAP.CD - GDP per capita (current USD)
- NY.GNP.PCAP.CD - GNI per capita, Atlas method (current USD)
- SP.POP.TOTL - Total Population
- SP.POP.GROW - Population growth (annual %)
- SP.URB.GROW - Urban population growth (annual %)
- SP.URB.TOTL - Urban population
- SP.URB.TOTL.IN.ZS	- Urban population (% of total population)

## Extract emissions data from World Bank database via API (wbgapi)

In [244]:
series_list = ['EN.ATM.CO2E.KT',
               'EN.ATM.CO2E.PC',
               'EN.ATM.CO2E.PP.GD.KD',
               'EG.USE.PCAP.KG.OE',
               'EG.FEC.RNEW.ZS',
               'AG.YLD.CREL.KG',
               'EG.ELC.ACCS.ZS',
               'AG.LND.FRST.ZS',
               'NY.GDP.MKTP.KD.ZG',
               'NY.GDP.PCAP.CD',
               'NY.GNP.PCAP.CD',
               'SP.POP.TOTL',
               'SP.POP.GROW',
               'SP.URB.GROW',
               'SP.URB.TOTL',
               'SP.URB.TOTL.IN.ZS']

In [245]:
# Using wbgapi to extract World Bank data as Pandas data frame
raw_df = wb.data.DataFrame(series_list, time=range(1990, 2018), numericTimeKeys=True, labels=True, columns='series').reset_index()
raw_df.head()

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
0,ZWE,2017,Zimbabwe,2017,45.45,1202.7,44.18,82.46,,10340.0,0.7,0.3,4.08,1192.11,1170.0,2.04,14751101.0,1.86,4755312.0,32.24
1,ZWE,2016,Zimbabwe,2016,45.57,435.1,42.56,81.9,,11020.0,0.76,0.33,0.9,1421.79,1200.0,2.08,14452704.0,1.81,4667645.0,32.3
2,ZWE,2015,Zimbabwe,2015,45.69,557.5,33.7,80.82,,12430.0,0.88,0.38,2.02,1410.33,1220.0,2.14,14154937.0,1.77,4584076.0,32.38
3,ZWE,2014,Zimbabwe,2014,45.81,831.4,32.3,80.27,,12080.0,0.87,0.38,1.48,1407.03,1210.0,2.19,13855753.0,1.73,4503674.0,32.5
4,ZWE,2013,Zimbabwe,2013,45.93,668.5,40.5,78.87,832.57,12280.0,0.91,0.39,3.2,1408.37,1200.0,2.16,13555422.0,1.61,4426387.0,32.65


In [246]:
# Rows and columns of data set
raw_df.shape

(7448, 20)

In [247]:
# # Datatypes of columns
raw_df.dtypes

economy                  object
time                      int64
Country                  object
Time                     object
AG.LND.FRST.ZS          float64
AG.YLD.CREL.KG          float64
EG.ELC.ACCS.ZS          float64
EG.FEC.RNEW.ZS          float64
EG.USE.PCAP.KG.OE       float64
EN.ATM.CO2E.KT          float64
EN.ATM.CO2E.PC          float64
EN.ATM.CO2E.PP.GD.KD    float64
NY.GDP.MKTP.KD.ZG       float64
NY.GDP.PCAP.CD          float64
NY.GNP.PCAP.CD          float64
SP.POP.GROW             float64
SP.POP.TOTL             float64
SP.URB.GROW             float64
SP.URB.TOTL             float64
SP.URB.TOTL.IN.ZS       float64
dtype: object

In [248]:
# Descriptive statistics
raw_df.describe()

Unnamed: 0,time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
count,7448.0,7117.0,6175.0,6305.0,7089.0,4751.0,6669.0,6669.0,6206.0,6782.0,6944.0,6393.0,7418.0,7420.0,7363.0,7364.0,7364.0
mean,2003.5,32.63,2894.7,79.93,31.16,2270.16,992709.46,4.21,0.27,3.58,11171.75,9479.31,1.48,258308400.43,2.23,121029449.38,55.27
std,8.08,23.39,2336.05,29.46,30.12,2669.53,3204384.87,5.23,0.22,5.79,18975.24,15075.52,1.65,819608141.63,2.09,380606489.23,23.53
min,1990.0,0.0,0.1,0.53,0.0,9.58,0.0,0.0,0.0,-64.05,22.85,40.0,-27.72,9182.0,-27.71,3733.0,5.42
25%,1996.75,12.51,1404.05,65.93,4.79,603.05,2230.0,0.64,0.14,1.55,1007.13,910.0,0.55,1330466.0,0.77,651572.25,35.3
50%,2003.5,30.86,2388.7,98.3,20.95,1238.11,23740.0,2.42,0.22,3.71,3322.03,3020.0,1.41,8483160.5,2.2,4041233.0,54.16
75%,2010.25,47.62,3796.25,100.0,54.76,3025.74,246490.0,6.22,0.33,5.9,13096.14,10400.0,2.42,55932344.25,3.5,31827577.25,74.31
max,2017.0,98.57,36761.9,100.0,98.34,21420.63,33514537.91,47.65,2.09,149.97,203266.91,122130.0,19.36,7578157615.0,31.14,4147418821.0,100.0


## Data clean starts here

In [249]:
# Assign original dataframe to another that can be modified
emissions_df = raw_df
emissions_df.sample(5)

Unnamed: 0,economy,time,Country,Time,AG.LND.FRST.ZS,AG.YLD.CREL.KG,EG.ELC.ACCS.ZS,EG.FEC.RNEW.ZS,EG.USE.PCAP.KG.OE,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,EN.ATM.CO2E.PP.GD.KD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.CD,NY.GNP.PCAP.CD,SP.POP.GROW,SP.POP.TOTL,SP.URB.GROW,SP.URB.TOTL,SP.URB.TOTL.IN.ZS
4416,SLV,1997,El Salvador,1997,33.17,1736.3,80.03,53.06,608.04,5320.0,0.91,0.15,3.14,1749.5,1660.0,0.78,5842638.0,3.05,3303252.0,56.54
7026,HPC,1991,Heavily indebted poor countries (HIPC),1991,31.44,947.45,,86.9,,63398.98,0.17,0.1,0.85,475.42,414.94,2.63,367531126.0,4.16,94901917.0,25.82
2513,MEX,1996,Mexico,1996,35.63,2552.9,96.09,13.06,1470.57,322700.0,3.52,0.22,6.77,4487.29,4700.0,1.78,91586555.0,2.19,67471815.0,73.67
3363,ITA,2014,Italy,2014,31.26,5717.5,100.0,17.13,2414.48,327500.0,5.39,0.14,-0.0,35565.72,34910.0,0.92,60789140.0,1.34,42109853.0,69.27
4739,CIV,2010,Cote d'Ivoire,2010,12.47,2270.5,58.3,75.42,481.19,6350.0,0.3,0.08,6.85,1654.18,1190.0,2.12,21120042.0,3.01,9996116.0,47.33


In [250]:
# Remove Time column as it is a duplicate
emissions_df.drop(columns = ['Time'], axis = 1, inplace = True)

In [251]:
# Create a dictionary to rename the columns headers to meaningful names
column_names = {'AG.LND.FRST.ZS':'forest_area_percent',
             'AG.YLD.CREL.KG':'cereal_yield',
             'EG.ELC.ACCS.ZS':'electricity_access_percent',
             'EG.FEC.RNEW.ZS':'renew_energy_percent',
             'EG.USE.PCAP.KG.OE':'energy_use_per_capita',
             'EN.ATM.CO2E.KT':'emissions_total',
             'EN.ATM.CO2E.PC':'emissions_per_capita',
             'EN.ATM.CO2E.PP.GD.KD':'emissions_per_gdp',
             'NY.GDP.MKTP.KD.ZG':'gdp_growth_percent',
             'NY.GDP.PCAP.CD':'gdp_per_capita',
             'NY.GNP.PCAP.CD':'gni_per_capita',
             'SP.POP.GROW':'pop_growth_percent',
             'SP.POP.TOTL':'pop_total',
             'SP.URB.GROW':'urb_pop_growth_percent',
             'SP.URB.TOTL':'urban_pop_total',
             'SP.URB.TOTL.IN.ZS':'urban_pop_percent',
             'economy':'country_code',
             'time':'year',
             'Country':'country_name'    
            }

In [252]:
# Rename the columns headers
emissions_df = emissions_df.rename(columns=column_names)
emissions_df.sample(10)

Unnamed: 0,country_code,year,country_name,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
35,ZMB,2010,Zambia,62.81,2540.9,22.0,88.59,596.25,2660.0,0.19,0.06,10.3,1469.36,1330.0,3.5,13792086.0,4.76,5427875.0,39.35
134,VIR,1995,Virgin Islands (U.S.),64.3,,100.0,0.0,,,,,,,,0.46,107818.0,1.02,97507.0,90.44
4012,DEU,2009,Germany,32.71,7201.4,100.0,10.72,3790.5,734810.0,8.97,0.2,-5.69,41650.37,43660.0,-0.25,81902307.0,0.0,62877220.0,76.77
6600,MEA,1997,Middle East & North Africa,1.85,1901.29,,2.42,1490.3,1224087.41,4.06,0.36,3.7,2929.04,2863.11,2.14,301680248.0,2.76,173106008.0,57.38
5579,BLR,2010,Belarus,42.53,2808.3,100.0,7.3,2902.11,61670.0,6.5,0.38,7.8,6033.69,6150.0,-0.22,9483836.0,0.45,7081770.0,74.67
4132,PYF,2001,French Polynesia,42.82,,100.0,7.37,,,,,-0.34,14012.24,,1.63,255049.0,1.49,142743.0,55.97
1563,RUS,1994,Russian Federation,49.39,1457.8,100.0,3.87,4426.67,1685050.0,11.35,0.82,-12.57,2662.1,2650.0,-0.03,148407912.0,-0.04,108895790.0,73.38
1092,ZAF,2017,South Africa,14.15,5702.0,84.4,10.45,,435649.99,7.69,0.55,1.16,6734.48,5950.0,0.39,56641209.0,1.16,37298236.0,65.85
2395,MNG,2002,Mongolia,9.21,585.4,72.28,6.51,1036.48,9600.0,3.85,0.82,4.73,559.83,520.0,0.89,2494617.0,2.73,1479408.0,59.3
566,TTO,2011,Trinidad and Tobago,45.22,1639.6,100.0,0.38,13903.56,22260.0,15.68,0.58,-0.29,17910.32,14090.0,0.69,1420020.0,0.34,764482.0,53.84


In [253]:
# Swap year and country_name columns
col_list = list(emissions_df.columns)
x, y = col_list.index('year'), col_list.index('country_name')
col_list[y], col_list[x] = col_list[x], col_list[y]
emissions_df = emissions_df[col_list]
list(emissions_df.columns)

['country_code',
 'country_name',
 'year',
 'forest_area_percent',
 'cereal_yield',
 'electricity_access_percent',
 'renew_energy_percent',
 'energy_use_per_capita',
 'emissions_total',
 'emissions_per_capita',
 'emissions_per_gdp',
 'gdp_growth_percent',
 'gdp_per_capita',
 'gni_per_capita',
 'pop_growth_percent',
 'pop_total',
 'urb_pop_growth_percent',
 'urban_pop_total',
 'urban_pop_percent']

### Objective: Remove as many empty cells as possible while preserving the highest number of rows or columns to get a large and diverse quality dataset.

How to deal with NA?   
Approaches:
1. Drop all NA in the dataset.(Specifying threshold)
2. Filter dataset based on missing values 
    - Years, countries, features (columns)  
    - Check for null values in each of the above, manual decision to remove those with more missing values (~ bins)
    - Decide to drop NA or fill them 

### Approach 1: Drop all NA in dataset

In [254]:
emissions_df.shape

(7448, 19)

In [255]:
df_1 = emissions_df
df_1 = df_1.dropna(thresh=15) 
df_1.shape

(6530, 19)

### Approach 2: Filter dataset based on missing year values

In [256]:
# Make a copy of data frame to filter missing data based on year
filter_year_df = emissions_df
filter_year_df.shape

(7448, 19)

In [257]:
# Check for null values in dataset
filter_year_df.isnull().sum()

country_code                     0
country_name                     0
year                             0
forest_area_percent            331
cereal_yield                  1273
electricity_access_percent    1143
renew_energy_percent           359
energy_use_per_capita         2697
emissions_total                779
emissions_per_capita           779
emissions_per_gdp             1242
gdp_growth_percent             666
gdp_per_capita                 504
gni_per_capita                1055
pop_growth_percent              30
pop_total                       28
urb_pop_growth_percent          85
urban_pop_total                 84
urban_pop_percent               84
dtype: int64

In [258]:
# Define an array with the unique year values
years_count_missing = dict.fromkeys(filter_year_df['year'].unique(), 0)
for ind, row in filter_year_df.iterrows():
    years_count_missing[row['year']] += row.isnull().sum()

# Sort the years by missing values
years_missing_sorted = dict(sorted(years_count_missing.items(), key=lambda item: item[1]))

# Print the missing values for each year
print("Missing values by year:")
for key, val in years_missing_sorted.items():
    print(key, ":", val)

Missing values by year:
2007 : 262
2006 : 264
2005 : 271
2013 : 274
2004 : 274
2014 : 277
2012 : 279
2011 : 283
2010 : 289
2009 : 293
2008 : 297
2003 : 308
2002 : 314
2001 : 334
2000 : 342
2015 : 417
1999 : 430
1998 : 442
1997 : 454
1996 : 474
1995 : 499
2017 : 508
2016 : 508
1994 : 539
1993 : 553
1992 : 587
1991 : 680
1990 : 687


#### Filter criteria for years
- Discard data for years having >500 missing values.
- Consider years 1995 through 2015.

In [259]:
print(f"Number of missing values in dataset before filtering the years: {filter_year_df.isnull().sum().sum()}")
print(f"Number of rows before filtering the years: {filter_year_df.shape[0]}")

# Filter rows for years between 1995 and 2015
filter_year = (filter_year_df['year'] >= 1996) & (filter_year_df['year'] <= 2014)
filter_year_df = filter_year_df[filter_year]

print(f"Number of missing values in the dataset after filtering the years: {filter_year_df.isnull().sum().sum()}")
print(f"Number of rows after filtering the years: {filter_year_df.shape[0]}")

Number of missing values in dataset before filtering the years: 11139
Number of rows before filtering the years: 7448
Number of missing values in the dataset after filtering the years: 6161
Number of rows after filtering the years: 5054


### Approach 2: Filter missing data based on countries

In [260]:
# Create a df to filter by missing values of countries
filter_countries_df = filter_year_df
filter_countries_df.shape

(5054, 19)

In [261]:
# Find the number of nulls in df
filter_countries_df.isnull().sum()

country_code                     0
country_name                     0
year                             0
forest_area_percent            180
cereal_yield                   820
electricity_access_percent     372
renew_energy_percent           148
energy_use_per_capita         1440
emissions_total                521
emissions_per_capita           521
emissions_per_gdp              767
gdp_growth_percent             344
gdp_per_capita                 261
gni_per_capita                 578
pop_growth_percent              19
pop_total                       19
urb_pop_growth_percent          57
urban_pop_total                 57
urban_pop_percent               57
dtype: int64

In [262]:
# Check the numner of missing values by country

# Define an array with the unique country values
countries_count_missing = dict.fromkeys(filter_countries_df['country_code'].unique(), 0)

# Iterate through all rows and count the amount of NaN values for each country
for ind, row in filter_countries_df.iterrows():
    countries_count_missing[row['country_code']] += row.isnull().sum()

# sort the countries by missing values
countries_missing_sorted = dict(sorted(countries_count_missing.items(), key=lambda item: item[1]))

# print the missing values for each country
print("Missing values by country:")
for key, val in countries_missing_sorted.items():
    print(key, ":", val)

Missing values by country:
URY : 0
USA : 0
GBR : 0
UKR : 0
TUN : 0
TZA : 0
CHE : 0
SWE : 0
ESP : 0
ZAF : 0
SVK : 0
SEN : 0
SAU : 0
RUS : 0
ROU : 0
PRT : 0
POL : 0
PHL : 0
PER : 0
PRY : 0
PAN : 0
OMN : 0
NOR : 0
MKD : 0
NGA : 0
NIC : 0
NZL : 0
NLD : 0
NPL : 0
NAM : 0
MAR : 0
MEX : 0
MUS : 0
MLT : 0
KWT : 0
KOR : 0
KEN : 0
KAZ : 0
JOR : 0
JPN : 0
JAM : 0
ITA : 0
IRL : 0
IDN : 0
IND : 0
HUN : 0
HND : 0
HTI : 0
GTM : 0
GHA : 0
DEU : 0
FRA : 0
FIN : 0
SLV : 0
EGY : 0
ECU : 0
DOM : 0
DNK : 0
CZE : 0
CYP : 0
CIV : 0
COL : 0
CHL : 0
CMR : 0
BGR : 0
BRN : 0
BRA : 0
BWA : 0
BIH : 0
BOL : 0
BEN : 0
BLR : 0
BGD : 0
AUT : 0
AUS : 0
ARG : 0
ALB : 0
TSS : 0
SSA : 0
SSF : 0
TSA : 0
SAS : 0
PST : 0
OED : 0
NAC : 0
LMC : 0
TLA : 0
LAC : 0
LCN : 0
HIC : 0
EUU : 0
TEC : 0
ECA : 0
ECS : 0
EMU : 0
EAR : 0
CEB : 0
ARB : 0
AFW : 0
ZWE : 1
ZMB : 1
UZB : 1
SVN : 1
MOZ : 1
MDA : 1
LTU : 1
LVA : 1
KGZ : 1
ISR : 1
HRV : 1
PRE : 1
VNM : 2
TGO : 2
PAK : 2
KHM : 2
WLD : 2
IDA : 2
IDX : 2
IDB : 2
TJK : 3
AZE : 3
ARE :

#### Filter criteria for countries
- Discard countries having >90 missing values.

In [263]:
print(f"Number of missing values in the dataset before filtering the countries: {filter_countries_df.isnull().sum().sum()}")
print(f"Number of rows before filtering the countries: {filter_countries_df.shape[0]}")

# filter only rows for countries with less than 90 missing values
countries_filter = []
for key, val in countries_missing_sorted.items():
    if val<90:
        countries_filter.append(key)

filter_countries_df = filter_countries_df[filter_countries_df['country_code'].isin(countries_filter)]

print(f"Number of missing values in the dataset after filtering the countries: {filter_countries_df.isnull().sum().sum()}")
print(f"Number of rows after filtering the countries:{filter_countries_df.shape[0]}")

Number of missing values in the dataset before filtering the countries: 6161
Number of rows before filtering the countries: 5054
Number of missing values in the dataset after filtering the countries: 2418
Number of rows after filtering the countries:4541


### Approach 2:  Filter missing values based on features

In [264]:
# Assign a new data frame to clean features
filter_feature_df = filter_countries_df
filter_feature_df.shape

(4541, 19)

In [265]:
filter_feature_df.isnull().sum()

country_code                     0
country_name                     0
year                             0
forest_area_percent             24
cereal_yield                   348
electricity_access_percent     309
renew_energy_percent            25
energy_use_per_capita         1002
emissions_total                 46
emissions_per_capita            46
emissions_per_gdp              256
gdp_growth_percent             105
gdp_per_capita                  63
gni_per_capita                 194
pop_growth_percent               0
pop_total                        0
urb_pop_growth_percent           0
urban_pop_total                  0
urban_pop_percent                0
dtype: int64

- Label: emissions_total (46 null values)
- Essential features: GDP related
- Removing all features with >200 missing values

In [266]:
# TODO: move import to the top
from itertools import compress

# Create a boolean mapping of features with more missing values
features_to_remove = filter_feature_df.isnull().sum()>200

# Remove the corresponding columns
df_2 = filter_feature_df.drop(compress(data = filter_feature_df.columns, selectors = features_to_remove), axis='columns')

print("Remaining missing values per column:")
print(df_2.isnull().sum())

Remaining missing values per column:
country_code                0
country_name                0
year                        0
forest_area_percent        24
renew_energy_percent       25
emissions_total            46
emissions_per_capita       46
gdp_growth_percent        105
gdp_per_capita             63
gni_per_capita            194
pop_growth_percent          0
pop_total                   0
urb_pop_growth_percent      0
urban_pop_total             0
urban_pop_percent           0
dtype: int64


In [267]:
# Removing the rows with the missing values 
clean_df = df_2.dropna(axis='rows', how='any')

print(f"Remaining missing values per column: {clean_df.isnull().sum()}")
print(f"Final shape of the cleaned dataset: {clean_df.shape}")

Remaining missing values per column: country_code              0
country_name              0
year                      0
forest_area_percent       0
renew_energy_percent      0
emissions_total           0
emissions_per_capita      0
gdp_growth_percent        0
gdp_per_capita            0
gni_per_capita            0
pop_growth_percent        0
pop_total                 0
urb_pop_growth_percent    0
urban_pop_total           0
urban_pop_percent         0
dtype: int64
Final shape of the cleaned dataset: (4278, 15)


In [268]:
#Save data into csv file
clean_df.to_csv('../Resources/emissions.csv',index= False)

### Approach 2 : Filter countries by UN countries list 
- Check country name against UN country list.
- Drop countries not in UN list.  

(Works on emissions_df)

In [269]:
# Official countries list from https://unstats.un.org/unsd/methodology/m49/
countries = ["AFG","ALA","ALB","DZA","ASM","AND","AGO","AIA","ATA","ATG","ARG","ARM","ABW","AUS","AUT","AZE","BHS","BHR","BGD","BRB","BLR","BEL","BLZ","BEN","BMU","BTN","BOL","BES","BIH","BWA","BVT","BRA","IOT","VGB","BRN","BGR","BFA","BDI","CPV","KHM","CMR","CAN","CYM","CAF","TCD","CHL","CHN","HKG","MAC","CXR","CCK","COL","COM","COG","COK","CRI","CIV","HRV","CUB","CUW","CYP","CZE","PRK","COD","DNK","DJI","DMA","DOM","ECU","EGY","SLV","GNQ","ERI","EST","SWZ","ETH","FLK","FRO","FJI","FIN","FRA","GUF","PYF","ATF","GAB","GMB","GEO","DEU","GHA","GIB","GRC","GRL","GRD","GLP","GUM","GTM","GGY","GIN","GNB","GUY","HTI","HMD","VAT","HND","HUN","ISL","IND","IDN","IRN","IRQ","IRL","IMN","ISR","ITA","JAM","JPN","JEY","JOR","KAZ","KEN","KIR","KWT","KGZ","LAO","LVA","LBN","LSO","LBR","LBY","LIE","LTU","LUX","MDG","MWI","MYS","MDV","MLI","MLT","MHL","MTQ","MRT","MUS","MYT","MEX","FSM","MCO","MNG","MNE","MSR","MAR","MOZ","MMR","NAM","NRU","NPL","NLD","NCL","NZL","NIC","NER","NGA","NIU","NFK","MKD","MNP","NOR","OMN","PAK","PLW","PAN","PNG","PRY","PER","PHL","PCN","POL","PRT","PRI","QAT","KOR","MDA","REU","ROU","RUS","RWA","BLM","SHN","KNA","LCA","MAF","SPM","VCT","WSM","SMR","STP","","SAU","SEN","SRB","SYC","SLE","SGP","SXM","SVK","SVN","SLB","SOM","ZAF","SGS","SSD","ESP","LKA","PSE","SDN","SUR","SJM","SWE","CHE","SYR","TJK","THA","TLS","TGO","TKL","TON","TTO","TUN","TUR","TKM","TCA","TUV","UGA","UKR","ARE","GBR","TZA","UMI","USA","VIR","URY","UZB","VUT","VEN","VNM","WLF","ESH","YEM","ZMB","ZWE"]
len(countries) # 249 countries

249

In [270]:
#List of countries which are not in countries official list
countries_to_drop = emissions_df.loc[~emissions_df["country_code"].isin(countries)]
countries_to_drop['country_name'].unique()
# After filtering based on official list of countries we have data for 150 countries (removes 852 records)

array(['Kosovo', 'Channel Islands', 'World', 'Upper middle income',
       'Sub-Saharan Africa (IDA & IBRD countries)',
       'Sub-Saharan Africa (excluding high income)', 'Sub-Saharan Africa',
       'South Asia (IDA & IBRD)', 'South Asia', 'Small states',
       'Pre-demographic dividend', 'Post-demographic dividend',
       'Pacific island small states', 'Other small states',
       'OECD members', 'Not classified', 'North America', 'Middle income',
       'Middle East & North Africa (IDA & IBRD countries)',
       'Middle East & North Africa (excluding high income)',
       'Middle East & North Africa', 'Lower middle income', 'Low income',
       'Low & middle income',
       'Least developed countries: UN classification',
       'Latin America & the Caribbean (IDA & IBRD countries)',
       'Latin America & Caribbean (excluding high income)',
       'Latin America & Caribbean', 'Late-demographic dividend',
       'IDA total', 'IDA only', 'IDA blend', 'IDA & IBRD total',
       'I

In [271]:
#Number of missing values
countries_to_drop.isnull().sum().sum()

1745

Question:
Since the number/names of countries in the world keep changing over time, should we drop data for countries that are not existent (as of 2022) according to UN official list?
Yes, if countries_to_drop have lot of missing data.

Verify:
- Compare regions and economies with UN list 
    - Region and economies
    - The number of missing values in countries_to drop list (TODO)
- Decide to drop those countries (TODO)

In [272]:
# All regions in WBGAPI
wb.region.info()

code,name
AFE,Africa Eastern and Southern
AFR,Africa
AFW,Africa Western and Central
ARB,Arab World
CAA,Sub-Saharan Africa (IFC classification)
CEA,East Asia and the Pacific (IFC classification)
CEB,Central Europe and the Baltics
CEU,Europe and Central Asia (IFC classification)
CLA,Latin America and the Caribbean (IFC classification)
CME,Middle East and North Africa (IFC classification)


In [273]:
# Select countries which are in UN official list
emissions_df = emissions_df.loc[emissions_df["country_code"].isin(countries)]
emissions_df

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2017,45.45,1202.70,44.18,82.46,,10340.00,0.70,0.30,4.08,1192.11,1170.00,2.04,14751101.00,1.86,4755312.00,32.24
1,ZWE,Zimbabwe,2016,45.57,435.10,42.56,81.90,,11020.00,0.76,0.33,0.90,1421.79,1200.00,2.08,14452704.00,1.81,4667645.00,32.30
2,ZWE,Zimbabwe,2015,45.69,557.50,33.70,80.82,,12430.00,0.88,0.38,2.02,1410.33,1220.00,2.14,14154937.00,1.77,4584076.00,32.38
3,ZWE,Zimbabwe,2014,45.81,831.40,32.30,80.27,,12080.00,0.87,0.38,1.48,1407.03,1210.00,2.19,13855753.00,1.73,4503674.00,32.50
4,ZWE,Zimbabwe,2013,45.93,668.50,40.50,78.87,832.57,12280.00,0.91,0.39,3.20,1408.37,1200.00,2.16,13555422.00,1.61,4426387.00,32.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6071,AFG,Afghanistan,1994,1.85,1140.40,,32.80,,1290.00,0.08,,,,,9.86,15455555.00,10.28,3328199.00,21.53
6072,AFG,Afghanistan,1993,1.85,1132.90,,30.59,,1340.00,0.10,,,,,14.96,14003760.00,15.38,3002966.00,21.44
6073,AFG,Afghanistan,1992,1.85,1097.80,,26.52,,1390.00,0.12,,,,,11.52,12057433.00,11.94,2574865.00,21.36
6074,AFG,Afghanistan,1991,1.85,1160.40,,17.04,,2230.00,0.21,,,,,0.47,10745167.00,0.89,2285067.00,21.27


In [274]:
#Drop rows which has all missing values for all features
emissions_df.dropna(thresh = 16)

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2017,45.45,1202.70,44.18,82.46,,10340.00,0.70,0.30,4.08,1192.11,1170.00,2.04,14751101.00,1.86,4755312.00,32.24
1,ZWE,Zimbabwe,2016,45.57,435.10,42.56,81.90,,11020.00,0.76,0.33,0.90,1421.79,1200.00,2.08,14452704.00,1.81,4667645.00,32.30
2,ZWE,Zimbabwe,2015,45.69,557.50,33.70,80.82,,12430.00,0.88,0.38,2.02,1410.33,1220.00,2.14,14154937.00,1.77,4584076.00,32.38
3,ZWE,Zimbabwe,2014,45.81,831.40,32.30,80.27,,12080.00,0.87,0.38,1.48,1407.03,1210.00,2.19,13855753.00,1.73,4503674.00,32.50
4,ZWE,Zimbabwe,2013,45.93,668.50,40.50,78.87,832.57,12280.00,0.91,0.39,3.20,1408.37,1200.00,2.16,13555422.00,1.61,4426387.00,32.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6059,AFG,Afghanistan,2006,1.85,1551.70,30.72,31.89,,1760.00,0.07,0.05,5.36,274.00,,4.14,25442944.00,5.03,5828215.00,22.91
6060,AFG,Afghanistan,2005,1.85,1790.40,25.39,33.88,,1550.00,0.06,0.05,11.23,255.06,,3.58,24411191.00,4.47,5542073.00,22.70
6061,AFG,Afghanistan,2004,1.85,1334.80,20.06,44.24,,1030.00,0.04,0.03,1.41,221.66,,3.93,23553551.00,4.59,5299549.00,22.50
6062,AFG,Afghanistan,2003,1.85,1458.00,14.74,36.66,,1220.00,0.05,0.04,8.83,200.46,,7.54,22645130.00,7.95,5061866.00,22.35


In [275]:
pd.set_option("display.float_format", lambda x : f"{x:.2f}")
emissions_df = emissions_df.replace(np.nan, 0.0)
emissions_df

Unnamed: 0,country_code,country_name,year,forest_area_percent,cereal_yield,electricity_access_percent,renew_energy_percent,energy_use_per_capita,emissions_total,emissions_per_capita,emissions_per_gdp,gdp_growth_percent,gdp_per_capita,gni_per_capita,pop_growth_percent,pop_total,urb_pop_growth_percent,urban_pop_total,urban_pop_percent
0,ZWE,Zimbabwe,2017,45.45,1202.70,44.18,82.46,0.00,10340.00,0.70,0.30,4.08,1192.11,1170.00,2.04,14751101.00,1.86,4755312.00,32.24
1,ZWE,Zimbabwe,2016,45.57,435.10,42.56,81.90,0.00,11020.00,0.76,0.33,0.90,1421.79,1200.00,2.08,14452704.00,1.81,4667645.00,32.30
2,ZWE,Zimbabwe,2015,45.69,557.50,33.70,80.82,0.00,12430.00,0.88,0.38,2.02,1410.33,1220.00,2.14,14154937.00,1.77,4584076.00,32.38
3,ZWE,Zimbabwe,2014,45.81,831.40,32.30,80.27,0.00,12080.00,0.87,0.38,1.48,1407.03,1210.00,2.19,13855753.00,1.73,4503674.00,32.50
4,ZWE,Zimbabwe,2013,45.93,668.50,40.50,78.87,832.57,12280.00,0.91,0.39,3.20,1408.37,1200.00,2.16,13555422.00,1.61,4426387.00,32.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6071,AFG,Afghanistan,1994,1.85,1140.40,0.00,32.80,0.00,1290.00,0.08,0.00,0.00,0.00,0.00,9.86,15455555.00,10.28,3328199.00,21.53
6072,AFG,Afghanistan,1993,1.85,1132.90,0.00,30.59,0.00,1340.00,0.10,0.00,0.00,0.00,0.00,14.96,14003760.00,15.38,3002966.00,21.44
6073,AFG,Afghanistan,1992,1.85,1097.80,0.00,26.52,0.00,1390.00,0.12,0.00,0.00,0.00,0.00,11.52,12057433.00,11.94,2574865.00,21.36
6074,AFG,Afghanistan,1991,1.85,1160.40,0.00,17.04,0.00,2230.00,0.21,0.00,0.00,0.00,0.00,0.47,10745167.00,0.89,2285067.00,21.27


In [276]:
#Save data into csv file
emissions_df.to_csv('../Resources/emissions_2.csv',index= False)

#### Considering Approach 2 for the database

In [277]:
#create the connection to the PostgreSQL database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/emissions"

#create the db engine
engine = create_engine(db_string)
    
# Load our datafrme into sql
clean_df.to_sql(name='emissions', con=engine, index= False, if_exists='replace')