### Retrieve CO2 emissions by sector

In [7]:
# Import dependencies
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import wbgapi as wb
from sqlalchemy import create_engine

from config import db_password

In [8]:
# World Bank series codes for CO2 emissions, one per sector or aggregate.
# Every code shares the same prefix and differs only in its two-letter suffix.
_series_prefix = 'CC.CO2.EMSE.'
_series_suffixes = ['BF', 'BL', 'EH', 'EL', 'EN', 'FE',
                    'IL', 'IP', 'LU', 'MC', 'OF', 'TR']
emissions_sector_list = [_series_prefix + suffix for suffix in _series_suffixes]

In [9]:
# Select World Bank database 87 (Country Climate and Development Report, CCDR)
# and download the sector series for 1990-2017 (range() excludes its upper bound).
wb.db = 87
CO2_sectors_df = wb.data.DataFrame(
    emissions_sector_list,
    time=range(1990, 2018),
    numericTimeKeys=True,
    skipBlanks=True,
    labels=True,
    columns='series',
)
# Move the index levels into ordinary columns so they can be renamed/dropped later.
CO2_sectors_df = CO2_sectors_df.reset_index()
CO2_sectors_df

Unnamed: 0,economy,time,Country,Time,CC.CO2.EMSE.BF,CC.CO2.EMSE.BL,CC.CO2.EMSE.EH,CC.CO2.EMSE.EL,CC.CO2.EMSE.EN,CC.CO2.EMSE.FE,CC.CO2.EMSE.IL,CC.CO2.EMSE.IP,CC.CO2.EMSE.LU,CC.CO2.EMSE.MC,CC.CO2.EMSE.OF,CC.CO2.EMSE.TR
0,ZWE,2017,Zimbabwe,2017,0.52,0.22,5.41,10.23,9.65,,97.40,0.58,87.16,1.12,0.66,2.24
1,ZWE,2016,Zimbabwe,2016,0.52,0.23,6.22,10.99,10.41,,98.15,0.58,87.16,1.09,0.68,2.18
2,ZWE,2015,Zimbabwe,2015,0.54,0.22,7.21,12.40,11.90,,23.72,0.50,11.32,1.09,0.81,2.57
3,ZWE,2014,Zimbabwe,2014,0.54,0.19,6.85,12.15,11.64,0.0,23.47,0.50,11.32,1.07,0.89,2.64
4,ZWE,2013,Zimbabwe,2013,0.58,0.17,6.78,12.36,11.89,0.0,23.68,0.46,11.32,1.05,0.96,2.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,AND,1993,Andorra,1993,,,,0.41,0.41,,0.41,,0.00,,,
5400,AND,1992,Andorra,1992,,,,0.41,0.41,,0.41,,0.00,,,
5401,AND,1991,Andorra,1991,,,,0.41,0.41,,0.41,,0.00,,,
5402,AND,1990,Andorra,1990,,,,0.41,0.41,,0.41,,0.00,,,


In [27]:
# Readable names for each World Bank series code.
_series_labels = {
    'CC.CO2.EMSE.BF': 'bunker_fuels',
    'CC.CO2.EMSE.BL': 'building',
    'CC.CO2.EMSE.EH': 'electricity_heat',
    'CC.CO2.EMSE.EL': 'total_excluding_LUCF',
    'CC.CO2.EMSE.EN': 'energy',
    'CC.CO2.EMSE.FE': 'fugitive_emissions',
    'CC.CO2.EMSE.IL': 'total_including_LUCF',
    'CC.CO2.EMSE.IP': 'industrial_processes',
    'CC.CO2.EMSE.LU': 'land-use_change_and_forestry',
    'CC.CO2.EMSE.MC': 'manufacturing_construction',
    'CC.CO2.EMSE.OF': 'other_fuel_combustion',
    'CC.CO2.EMSE.TR': 'transportation',
}
# Identifier columns that also get friendlier names.
_key_labels = {'Time': 'year', 'economy': 'country_code'}

# Combined old-name -> new-name mapping used when renaming the column headers.
column_names = {**_series_labels, **_key_labels}
               

In [28]:
# Apply the readable headers. rename() returns a new frame, so the raw
# CO2_sectors_df download is left untouched.
clean_sectors_df = CO2_sectors_df.rename(column_names, axis='columns')
# Spot-check ten random rows to confirm the new headers.
clean_sectors_df.sample(n=10)

Unnamed: 0,country_code,time,Country,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,fugitive_emissions,total_including_LUCF,industrial_processes,land-use_change_and_forestry,manufacturing_construction,other_fuel_combustion,transportation
1970,MMR,1992,Myanmar,1992,0.18,0.03,1.71,4.29,4.1,0.01,146.26,0.19,141.98,1.13,0.18,1.04
2654,KWT,2001,Kuwait,2001,1.25,0.34,36.06,50.82,50.47,0.82,50.8,0.35,-0.02,7.12,0.0,6.13
4274,TCD,2003,Chad,2003,0.06,0.07,0.1,0.76,0.76,0.0,17.44,0.0,16.68,0.22,0.31,0.07
149,VUT,2008,Vanuatu,2008,0.02,0.01,0.04,0.1,0.1,0.0,0.1,0.0,,0.0,0.0,0.05
189,UZB,1996,Uzbekistan,1996,3.81,32.17,37.98,97.1,95.66,0.0,96.3,1.43,-0.79,10.53,5.62,9.37
4186,COL,2007,Colombia,2007,3.22,5.35,12.57,59.19,54.65,0.55,139.7,4.54,80.51,13.48,1.48,21.23
5390,AND,2002,Andorra,2002,,,,0.53,0.53,,0.53,,0.0,,,
1730,MKD,1991,North Macedonia,1991,0.16,0.39,5.67,8.38,8.38,0.0,8.38,,,1.41,0.14,0.77
3489,GAB,2006,Gabon,2006,0.22,0.17,0.61,4.04,3.94,2.13,7.23,0.1,3.19,0.63,0.02,0.37
3069,IND,2006,India,2006,34.48,81.35,638.85,1214.25,1150.02,1.64,1219.23,64.23,4.97,260.24,47.3,120.65


In [29]:
# Discard columns that carry no extra information: 'time' repeats the year and
# 'Country' is the display name for the economy already identified by country_code.
redundant_columns = ['time', 'Country']
clean_sectors_df = clean_sectors_df.drop(columns=redundant_columns)
clean_sectors_df

Unnamed: 0,country_code,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,fugitive_emissions,total_including_LUCF,industrial_processes,land-use_change_and_forestry,manufacturing_construction,other_fuel_combustion,transportation
0,ZWE,2017,0.52,0.22,5.41,10.23,9.65,,97.40,0.58,87.16,1.12,0.66,2.24
1,ZWE,2016,0.52,0.23,6.22,10.99,10.41,,98.15,0.58,87.16,1.09,0.68,2.18
2,ZWE,2015,0.54,0.22,7.21,12.40,11.90,,23.72,0.50,11.32,1.09,0.81,2.57
3,ZWE,2014,0.54,0.19,6.85,12.15,11.64,0.0,23.47,0.50,11.32,1.07,0.89,2.64
4,ZWE,2013,0.58,0.17,6.78,12.36,11.89,0.0,23.68,0.46,11.32,1.05,0.96,2.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,AND,1993,,,,0.41,0.41,,0.41,,0.00,,,
5400,AND,1992,,,,0.41,0.41,,0.41,,0.00,,,
5401,AND,1991,,,,0.41,0.41,,0.41,,0.00,,,
5402,AND,1990,,,,0.41,0.41,,0.41,,0.00,,,


In [30]:
# Drop fugitive_emissions: by the author's tally the column is mostly zeros or
# blanks (3110 zero values and 770 blanks were noted), leaving too few usable records.
# NOTE(review): those counts were recorded by hand - re-verify before relying on them.
clean_sectors_df = clean_sectors_df.drop(columns=['fugitive_emissions'])


In [31]:
# Inspect each column's dtype to spot any that need converting.
clean_sectors_df.dtypes

country_code                     object
year                             object
bunker_fuels                    float64
building                        float64
electricity_heat                float64
total_excluding_LUCF            float64
energy                          float64
total_including_LUCF            float64
industrial_processes            float64
land-use_change_and_forestry    float64
manufacturing_construction      float64
other_fuel_combustion           float64
transportation                  float64
dtype: object

In [32]:
# The year labels arrive as strings (object dtype); convert them to a numeric dtype.
numeric_year = pd.to_numeric(clean_sectors_df['year'])
clean_sectors_df = clean_sectors_df.assign(year=numeric_year)
# Re-check the dtypes to confirm the conversion took effect.
clean_sectors_df.dtypes

country_code                     object
year                              int64
bunker_fuels                    float64
building                        float64
electricity_heat                float64
total_excluding_LUCF            float64
energy                          float64
total_including_LUCF            float64
industrial_processes            float64
land-use_change_and_forestry    float64
manufacturing_construction      float64
other_fuel_combustion           float64
transportation                  float64
dtype: object

In [33]:
# Dimensions (rows, columns) of the table at this stage of cleaning.
clean_sectors_df.shape

(5404, 13)

In [34]:
# Treat zeros as missing, then keep only rows that are mostly populated.
# dropna(thresh=...) counts non-null cells across ALL columns (country_code and
# year included), so a row is kept only when it has at least this many non-null
# values. With 13 columns that allows at most 2 empty cells per row.
min_non_null_cells = 11
clean_sectors_df = (
    clean_sectors_df
    .replace(0, np.nan)
    .dropna(thresh=min_non_null_cells)
)
clean_sectors_df

Unnamed: 0,country_code,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,total_including_LUCF,industrial_processes,land-use_change_and_forestry,manufacturing_construction,other_fuel_combustion,transportation
0,ZWE,2017,0.52,0.22,5.41,10.23,9.65,97.40,0.58,87.16,1.12,0.66,2.24
1,ZWE,2016,0.52,0.23,6.22,10.99,10.41,98.15,0.58,87.16,1.09,0.68,2.18
2,ZWE,2015,0.54,0.22,7.21,12.40,11.90,23.72,0.50,11.32,1.09,0.81,2.57
3,ZWE,2014,0.54,0.19,6.85,12.15,11.64,23.47,0.50,11.32,1.07,0.89,2.64
4,ZWE,2013,0.58,0.17,6.78,12.36,11.89,23.68,0.46,11.32,1.05,0.96,2.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5174,AFG,1994,0.02,0.02,0.16,1.30,1.26,-1.08,0.05,-2.39,0.36,,0.71
5175,AFG,1993,0.02,0.03,0.16,1.36,1.31,-1.03,0.05,-2.39,0.38,,0.74
5176,AFG,1992,0.02,0.03,0.16,1.43,1.38,-0.96,0.05,-2.39,0.39,,0.77
5177,AFG,1991,0.02,0.07,0.27,2.74,2.69,0.35,0.04,-2.39,0.53,,1.55


In [35]:
# Count the remaining missing values in every column.
clean_sectors_df.isna().sum()

country_code                      0
year                              0
bunker_fuels                     44
building                        105
electricity_heat                 94
total_excluding_LUCF              0
energy                            0
total_including_LUCF              1
industrial_processes            698
land-use_change_and_forestry    355
manufacturing_construction      121
other_fuel_combustion           963
transportation                    0
dtype: int64

In [36]:
# Drop the land-use change/forestry and other-fuel-combustion columns because of
# their missing values.
# NOTE(review): in the null counts shown above, industrial_processes has more gaps
# than land-use_change_and_forestry yet is kept - confirm this selection is intended.
sparse_sectors = ['land-use_change_and_forestry', 'other_fuel_combustion']
clean_sectors_df = clean_sectors_df.drop(columns=sparse_sectors)
clean_sectors_df

Unnamed: 0,country_code,year,bunker_fuels,building,electricity_heat,total_excluding_LUCF,energy,total_including_LUCF,industrial_processes,manufacturing_construction,transportation
0,ZWE,2017,0.52,0.22,5.41,10.23,9.65,97.40,0.58,1.12,2.24
1,ZWE,2016,0.52,0.23,6.22,10.99,10.41,98.15,0.58,1.09,2.18
2,ZWE,2015,0.54,0.22,7.21,12.40,11.90,23.72,0.50,1.09,2.57
3,ZWE,2014,0.54,0.19,6.85,12.15,11.64,23.47,0.50,1.07,2.64
4,ZWE,2013,0.58,0.17,6.78,12.36,11.89,23.68,0.46,1.05,2.93
...,...,...,...,...,...,...,...,...,...,...,...
5174,AFG,1994,0.02,0.02,0.16,1.30,1.26,-1.08,0.05,0.36,0.71
5175,AFG,1993,0.02,0.03,0.16,1.36,1.31,-1.03,0.05,0.38,0.74
5176,AFG,1992,0.02,0.03,0.16,1.43,1.38,-0.96,0.05,0.39,0.77
5177,AFG,1991,0.02,0.07,0.27,2.74,2.69,0.35,0.04,0.53,1.55


In [37]:
# Replace the remaining gaps with 0 so the partially populated rows stay usable.
# After this step a missing value can no longer be told apart from a zero.
clean_sectors_df = clean_sectors_df.fillna(value=0)
# Confirm that no nulls are left.
clean_sectors_df.isna().sum()

country_code                  0
year                          0
bunker_fuels                  0
building                      0
electricity_heat              0
total_excluding_LUCF          0
energy                        0
total_including_LUCF          0
industrial_processes          0
manufacturing_construction    0
transportation                0
dtype: int64

In [38]:
# Reshape from wide to long: one row per (country_code, year, sector) with the
# sector's value in the 'emissions' column.
clean_sectors_df = clean_sectors_df.melt(
    id_vars=["country_code", "year"],
    var_name="sector_name",
    value_name="emissions",
)
clean_sectors_df

Unnamed: 0,country_code,year,sector_name,emissions
0,ZWE,2017,bunker_fuels,0.52
1,ZWE,2016,bunker_fuels,0.52
2,ZWE,2015,bunker_fuels,0.54
3,ZWE,2014,bunker_fuels,0.54
4,ZWE,2013,bunker_fuels,0.58
...,...,...,...,...
40504,AFG,1994,transportation,0.71
40505,AFG,1993,transportation,0.74
40506,AFG,1992,transportation,0.77
40507,AFG,1991,transportation,1.55


In [39]:
# Confirm the column dtypes of the reshaped (long-format) table.
clean_sectors_df.dtypes

country_code     object
year              int64
sector_name      object
emissions       float64
dtype: object

In [40]:
# Dimensions (rows, columns) of the long-format table.
clean_sectors_df.shape

(40509, 4)

In [42]:
# Persist the long-format table as CSV, leaving out the row index.
csv_path = '../Resources/sector_emissions.csv'
clean_sectors_df.to_csv(csv_path, index=False)

In [43]:
# Build the connection URL for the local PostgreSQL database.
# The password is percent-encoded (safe='' encodes "/" as well) so that characters
# such as "@", ":", "/", "%" or spaces cannot break or mis-parse the URL.
encoded_password = quote(str(db_password), safe='')
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/GHG_emissions"

# Create the database engine.
engine = create_engine(db_string)

# Load the dataframe into the sector_emissions table. if_exists='replace' drops
# and recreates the table, so re-running the cell does not duplicate rows.
clean_sectors_df.to_sql(name='sector_emissions', con=engine, index=False, if_exists='replace')