### Retrieve CO2 emissions by sector

In [1]:
# Import dependencies
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import wbgapi as wb
from sqlalchemy import create_engine

from config import db_password

In [2]:
# World Bank series codes for CO2 emissions by sector. Every code shares the
# 'CC.CO2.EMSE.' prefix and differs only in its two-letter suffix.
emissions_sector_list = [
    f'CC.CO2.EMSE.{suffix}'
    for suffix in ('BF', 'BL', 'EH', 'EL', 'EN', 'FE',
                   'IL', 'IP', 'LU', 'MC', 'OF', 'TR')
]

In [3]:
# Fetch the sector series through wbgapi.
# Database 87 is the Country Climate and Development Report (CCDR) source.
wb.db = 87
CO2_sectors_df = wb.data.DataFrame(
    emissions_sector_list,
    time=range(1990, 2018),  # years 1990 through 2017 inclusive
    numericTimeKeys=True,
    skipBlanks=True,
    labels=True,             # also returns the 'Country' and 'Time' label columns
    columns='series',        # one column per series code
).reset_index()
CO2_sectors_df

Unnamed: 0,economy,time,Country,Time,CC.CO2.EMSE.BF,CC.CO2.EMSE.BL,CC.CO2.EMSE.EH,CC.CO2.EMSE.EL,CC.CO2.EMSE.EN,CC.CO2.EMSE.FE,CC.CO2.EMSE.IL,CC.CO2.EMSE.IP,CC.CO2.EMSE.LU,CC.CO2.EMSE.MC,CC.CO2.EMSE.OF,CC.CO2.EMSE.TR
0,ZWE,2017,Zimbabwe,2017,0.52,0.22,5.41,10.23,9.65,,97.40,0.58,87.16,1.12,0.66,2.24
1,ZWE,2016,Zimbabwe,2016,0.52,0.23,6.22,10.99,10.41,,98.15,0.58,87.16,1.09,0.68,2.18
2,ZWE,2015,Zimbabwe,2015,0.54,0.22,7.21,12.40,11.90,,23.72,0.50,11.32,1.09,0.81,2.57
3,ZWE,2014,Zimbabwe,2014,0.54,0.19,6.85,12.15,11.64,0.0,23.47,0.50,11.32,1.07,0.89,2.64
4,ZWE,2013,Zimbabwe,2013,0.58,0.17,6.78,12.36,11.89,0.0,23.68,0.46,11.32,1.05,0.96,2.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,AND,1993,Andorra,1993,,,,0.41,0.41,,0.41,,0.00,,,
5400,AND,1992,Andorra,1992,,,,0.41,0.41,,0.41,,0.00,,,
5401,AND,1991,Andorra,1991,,,,0.41,0.41,,0.41,,0.00,,,
5402,AND,1990,Andorra,1990,,,,0.41,0.41,,0.41,,0.00,,,


In [4]:
# Mapping used to rename raw API headers to meaningful column names.
# The series codes all share the 'CC.CO2.EMSE.' prefix, so only the suffix is
# listed; the two non-series headers ('Time', 'economy') are appended last.
column_names = {
    **{
        f'CC.CO2.EMSE.{suffix}': readable_name
        for suffix, readable_name in (
            ('BF', 'bunker_fuels'),
            ('BL', 'building'),
            ('EH', 'electricity_heat'),
            ('EL', 'total_excluding_LUCF'),
            ('EN', 'energy'),
            ('FE', 'fugitive_emissions'),
            ('IL', 'total_including_LUCF'),
            ('IP', 'industrial_processes'),
            ('LU', 'land-use_change_and_forestry'),
            ('MC', 'manufacturing_construction'),
            ('OF', 'other_fuel_combustion'),
            ('TR', 'transportation'),
        )
    },
    'Time': 'year',
    'economy': 'country_code',
}
               

In [5]:
# Rename the column headers using the mapping defined above; rename() returns a
# new frame, so the raw CO2_sectors_df is left untouched.
clean_sectors_df = CO2_sectors_df.rename(columns=column_names)
# Preview ten random rows. The seed is fixed so the preview is the same on every
# Restart & Run All instead of changing with each execution.
clean_sectors_df.sample(10, random_state=42)

Unnamed: 0,country_code,time,Country,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,fugitive_emissions,total_including_LUCF,industrial_processes,land-use_change_and_forestry,manufacturing_construction,other_fuel_combustion,transportation
2502,LBR,1995,Liberia,1995,0.01,0.02,0.23,0.42,0.42,0.0,13.82,0.0,13.4,0.0,0.07,0.1
2364,MWI,1993,Malawi,1993,0.04,0.05,0.12,0.88,0.83,0.0,9.02,0.05,8.14,0.16,0.03,0.46
2479,LBY,1990,Libya,1990,0.64,1.31,16.26,36.09,34.98,9.14,36.09,1.11,0.0,2.08,0.0,6.19
4006,CUB,1991,Cuba,1991,1.95,2.2,11.06,27.43,26.56,0.0,16.6,0.86,-10.83,7.36,2.22,3.72
4799,BRB,2005,Barbados,2005,0.6,0.09,0.69,1.42,1.26,0.0,1.41,0.15,0.0,0.05,0.01,0.44
4547,BRA,2005,Brazil,2005,18.4,19.22,58.23,330.51,316.17,4.54,1431.63,14.35,1101.12,82.04,15.06,137.08
1367,ROU,1990,Romania,1990,6.22,8.65,79.4,172.63,168.19,0.0,173.34,4.45,0.7,60.4,8.01,11.73
1580,PAN,2001,Panama,2001,0.52,0.34,2.21,6.22,5.87,0.0,10.62,0.35,4.4,1.09,0.0,2.23
490,TTO,2003,Trinidad and Tobago,2003,0.24,0.46,10.12,15.32,15.0,0.0,15.45,0.32,0.14,2.74,0.0,1.69
3591,ETH,2016,Ethiopia,2016,2.37,0.75,0.0,14.95,11.75,,46.75,3.2,31.8,4.28,0.99,5.74


In [6]:
# Drop the columns that are no longer needed: 'time' repeats the value kept as
# 'year', and 'Country' is the display label for 'country_code'.
clean_sectors_df = clean_sectors_df.drop(columns=['time', 'Country'])
clean_sectors_df

Unnamed: 0,country_code,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,fugitive_emissions,total_including_LUCF,industrial_processes,land-use_change_and_forestry,manufacturing_construction,other_fuel_combustion,transportation
0,ZWE,2017,0.52,0.22,5.41,10.23,9.65,,97.40,0.58,87.16,1.12,0.66,2.24
1,ZWE,2016,0.52,0.23,6.22,10.99,10.41,,98.15,0.58,87.16,1.09,0.68,2.18
2,ZWE,2015,0.54,0.22,7.21,12.40,11.90,,23.72,0.50,11.32,1.09,0.81,2.57
3,ZWE,2014,0.54,0.19,6.85,12.15,11.64,0.0,23.47,0.50,11.32,1.07,0.89,2.64
4,ZWE,2013,0.58,0.17,6.78,12.36,11.89,0.0,23.68,0.46,11.32,1.05,0.96,2.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,AND,1993,,,,0.41,0.41,,0.41,,0.00,,,
5400,AND,1992,,,,0.41,0.41,,0.41,,0.00,,,
5401,AND,1991,,,,0.41,0.41,,0.41,,0.00,,,
5402,AND,1990,,,,0.41,0.41,,0.41,,0.00,,,


In [7]:
# fugitive_emissions is dropped because it is sparsely populated. Original
# author's tally: 3110 zero values, 770 blanks, total 4828, only 1400 records.
clean_sectors_df = clean_sectors_df.drop(columns='fugitive_emissions')

In [8]:
# Check the column data types; 'year' is still an object column at this point
# and is converted to a numeric dtype in the next cell.
clean_sectors_df.dtypes

country_code                     object
year                             object
bunker_fuels                    float64
building                        float64
electricity_heat                float64
total_excluding_LUCF            float64
energy                          float64
total_including_LUCF            float64
industrial_processes            float64
land-use_change_and_forestry    float64
manufacturing_construction      float64
other_fuel_combustion           float64
transportation                  float64
dtype: object

In [9]:
# Convert 'year' from object to a numeric dtype, then re-check the dtypes.
# assign() replaces the existing column in place, so column order is unchanged.
clean_sectors_df = clean_sectors_df.assign(
    year=lambda frame: pd.to_numeric(frame['year'])
)
clean_sectors_df.dtypes

country_code                     object
year                              int64
bunker_fuels                    float64
building                        float64
electricity_heat                float64
total_excluding_LUCF            float64
energy                          float64
total_including_LUCF            float64
industrial_processes            float64
land-use_change_and_forestry    float64
manufacturing_construction      float64
other_fuel_combustion           float64
transportation                  float64
dtype: object

In [10]:
# Row/column count before sparse rows are filtered out
clean_sectors_df.shape

(5404, 13)

In [11]:
# Treat reported zeros as missing so that sparsely populated rows can be removed.
#
# NOTE: `thresh` is the minimum number of NON-NULL values a row needs across ALL
# columns. The frame has 13 columns here (country_code, year and 11 sector
# columns), so thresh=11 keeps only rows with at most 2 missing sector values.
# It therefore drops more than just the rows where every sector is missing.
clean_sectors_df = (
    clean_sectors_df
    .replace(0, np.nan)
    .dropna(thresh=11)
)
clean_sectors_df

Unnamed: 0,country_code,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,total_including_LUCF,industrial_processes,land-use_change_and_forestry,manufacturing_construction,other_fuel_combustion,transportation
0,ZWE,2017,0.52,0.22,5.41,10.23,9.65,97.40,0.58,87.16,1.12,0.66,2.24
1,ZWE,2016,0.52,0.23,6.22,10.99,10.41,98.15,0.58,87.16,1.09,0.68,2.18
2,ZWE,2015,0.54,0.22,7.21,12.40,11.90,23.72,0.50,11.32,1.09,0.81,2.57
3,ZWE,2014,0.54,0.19,6.85,12.15,11.64,23.47,0.50,11.32,1.07,0.89,2.64
4,ZWE,2013,0.58,0.17,6.78,12.36,11.89,23.68,0.46,11.32,1.05,0.96,2.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5174,AFG,1994,0.02,0.02,0.16,1.30,1.26,-1.08,0.05,-2.39,0.36,,0.71
5175,AFG,1993,0.02,0.03,0.16,1.36,1.31,-1.03,0.05,-2.39,0.38,,0.74
5176,AFG,1992,0.02,0.03,0.16,1.43,1.38,-0.96,0.05,-2.39,0.39,,0.77
5177,AFG,1991,0.02,0.07,0.27,2.74,2.69,0.35,0.04,-2.39,0.53,,1.55


In [12]:
# Count the remaining missing values in each column
clean_sectors_df.isna().sum()

country_code                      0
year                              0
bunker_fuels                     44
building                        105
electricity_heat                 94
total_excluding_LUCF              0
energy                            0
total_including_LUCF              1
industrial_processes            698
land-use_change_and_forestry    355
manufacturing_construction      121
other_fuel_combustion           963
transportation                    0
dtype: int64

In [13]:
# Drop two sector columns with many missing values: other_fuel_combustion (963)
# and land-use_change_and_forestry (355).
# NOTE(review): industrial_processes has 698 missing values, more than LUCF,
# but is kept; confirm this is intentional.
clean_sectors_df = clean_sectors_df.drop(
    columns=['land-use_change_and_forestry', 'other_fuel_combustion']
)
clean_sectors_df

Unnamed: 0,country_code,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,total_including_LUCF,industrial_processes,manufacturing_construction,transportation
0,ZWE,2017,0.52,0.22,5.41,10.23,9.65,97.40,0.58,1.12,2.24
1,ZWE,2016,0.52,0.23,6.22,10.99,10.41,98.15,0.58,1.09,2.18
2,ZWE,2015,0.54,0.22,7.21,12.40,11.90,23.72,0.50,1.09,2.57
3,ZWE,2014,0.54,0.19,6.85,12.15,11.64,23.47,0.50,1.07,2.64
4,ZWE,2013,0.58,0.17,6.78,12.36,11.89,23.68,0.46,1.05,2.93
...,...,...,...,...,...,...,...,...,...,...,...
5174,AFG,1994,0.02,0.02,0.16,1.30,1.26,-1.08,0.05,0.36,0.71
5175,AFG,1993,0.02,0.03,0.16,1.36,1.31,-1.03,0.05,0.38,0.74
5176,AFG,1992,0.02,0.03,0.16,1.43,1.38,-0.96,0.05,0.39,0.77
5177,AFG,1991,0.02,0.07,0.27,2.74,2.69,0.35,0.04,0.53,1.55


In [14]:
# Fill the remaining gaps with zero so the rows that are otherwise populated stay
# usable. This also turns the single missing total_including_LUCF value into 0.
clean_sectors_df = clean_sectors_df.fillna(0)
clean_sectors_df.isna().sum()

country_code                  0
year                          0
bunker_fuels                  0
building                      0
electricity_heat              0
total_excluding_LUCF          0
energy                        0
total_including_LUCF          0
industrial_processes          0
manufacturing_construction    0
transportation                0
dtype: int64

In [15]:
# Reshape from wide to long: one row per (country_code, year, sector_name),
# with the sector's value in 'emissions'.
clean_sectors_df = clean_sectors_df.melt(
    id_vars=["country_code", "year"],
    var_name="sector_name",
    value_name="emissions",
)
clean_sectors_df

Unnamed: 0,country_code,year,sector_name,emissions
0,ZWE,2017,bunker_fuels,0.52
1,ZWE,2016,bunker_fuels,0.52
2,ZWE,2015,bunker_fuels,0.54
3,ZWE,2014,bunker_fuels,0.54
4,ZWE,2013,bunker_fuels,0.58
...,...,...,...,...
40504,AFG,1994,transportation,0.71
40505,AFG,1993,transportation,0.74
40506,AFG,1992,transportation,0.77
40507,AFG,1991,transportation,1.55


In [16]:
# Confirm the dtypes of the long-format frame before exporting it
clean_sectors_df.dtypes

country_code     object
year              int64
sector_name      object
emissions       float64
dtype: object

In [17]:
# Row/column count of the long-format frame
clean_sectors_df.shape

(40509, 4)

In [18]:
# Write the long-format frame to CSV, leaving out the positional index
clean_sectors_df.to_csv(
    path_or_buf='../Resources/sector_emissions.csv',
    index=False,
)

### Database Connection

In [19]:
# Create the database connection.
user = "postgres"
password = db_password
host = "emissions-db.cr5mfnfivfxl.us-east-1.rds.amazonaws.com"
port = '5432'
database = "GHG_emissions"

# Percent-encode the password before embedding it in the URL. Characters such as
# '@', ':', '/' or '%' would otherwise be read as URL delimiters or escapes and
# corrupt the connection string. safe='' makes sure '/' is encoded as well.
encoded_password = quote(str(password), safe='')

conn_string = f'postgresql://{user}:{encoded_password}@{host}:{port}/{database}'
engine = create_engine(conn_string)

### Load data into the database

In [20]:
# Load the sectors dataframe into the 'sector_emissions' SQL table.
# if_exists='replace' overwrites any existing table of that name.
clean_sectors_df.to_sql(
    name='sector_emissions',
    con=engine,
    if_exists='replace',
    index=False,
)