# Step 3. Integrate two other emission datasets focusing on developing countries
Datasets combined in this step: CDP (used as the base), carbonn, and PKU.

In [37]:
#conda install functools32
from matplotlib.pyplot import *
%matplotlib inline

In [38]:
import numpy as np
from matplotlib import pyplot as plt

In [39]:
#pandas
import pandas as pd
import collections
import os

import csv

In [40]:
# Do not truncate long cell contents when DataFrames are displayed.
# NOTE(review): -1 is the legacy "no limit" value; newer pandas releases expect
# None instead — confirm against the pandas version this notebook runs on.
pd.set_option('display.max_colwidth', -1)

# Variables

In [41]:
# Name of the city-name column. It is used throughout as the shared key column
# and is never dataset-tagged by tag_column_names_with_datasetname.
var_city = 'city name'

# Functions

In [42]:
def tag_column_names_with_datasetname(df, tagname, untagged=None):
    """Append ' (<tagname>)' to column names to record their source dataset.

    The frame is modified in place and also returned, so callers that ignore
    the return value keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed in place.
    tagname : str
        Dataset label placed in parentheses after each column name.
    untagged : iterable of str, optional
        Column names that are left unchanged. Defaults to the shared key
        columns ``var_city`` and ``'country'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with tagged column names.
    """
    if untagged is None:
        # Resolved at call time so the notebook-level ``var_city`` is honoured.
        untagged = (var_city, 'country')
    keep = set(untagged)

    # Build one mapping and rename once instead of one in-place rename per column.
    mapping = dict(
        (cname, cname + ' (' + tagname + ')')
        for cname in list(df)
        if cname not in keep
    )
    df.rename(columns=mapping, inplace=True)

    return df

# Read data

## Emissions datasets

### Read CDP dataset after QA/QC corrections and adjustments for CH<sub>4</sub> contributions

In [66]:
# Output file from Methods2_CH4corrections.ipynb
# NOTE(review): the file carries a .csv extension but is parsed as tab-separated —
# confirm this matches how Methods2 writes it.
fname = 'DATA/INTERMEDIATE_FILES/cdp_correctedCO2.csv'
df_cdp = pd.read_csv(fname, sep="\t")

In [67]:
# Drop the unnamed leading column (presumably a saved row index).
# `axis` is passed by keyword: the positional form is rejected by recent pandas.
df_cdp = df_cdp.drop('Unnamed: 0', axis=1)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_cdp.shape)
df_cdp.head(2)

(187, 41)


Unnamed: 0,city name,City Name (CDP),country,region (CDP),Boundary (CDP),Reporting Year (CDP),Measurement year only (CDP),Current Population (CDP),Current Population Year (CDP),scope fraction (CDP),...,corrected CH4_(waste+natgas)/person,Production (m3),Exports (m3),Export/Production,data from CIA World Fact Book for 2014,City Location (CDP),City GDP (CDP),GDP Currency (CDP),Year of GDP (CDP),GDP Source (CDP)
0,Pingtung,Pingtung County Government,Taiwan,East Asia,Administrative boundary of a local government,2017.0,2013.0,835792.0,2016.0,1.0,...,0.014925,1,0,0.0,,"(22.551976, 120.54876)",,,,
1,Lima,Metropolitan Municipality of Lima,Peru,Latin America & Caribbean,A metropolitan area,2016.0,2012.0,8755262.0,2014.0,1.0,...,0.009998,1,0,0.0,,"(-12.046374, -77.042793)",,,,INEI


In [68]:
# Rename 'London' to 'Greater London' to match Creutzig.
# The boolean mask restricts the assignment to rows whose city name is exactly 'London'.
df_cdp.loc[df_cdp[var_city]=='London', var_city] = 'Greater London'

### Read carbonn datasets

In [69]:
# 48 new cities NOT in CDP and NOT in GEA+
f_carbonn_new = 'DATA/ORIGINAL_EMISSIONS_SOURCEFILES/carbonn_selectedCities.tsv'

df_carbonn_new = pd.read_csv(f_carbonn_new, sep='\t')

# Drop the unnamed leading column (presumably a saved row index).
df_carbonn_new.drop('Unnamed: 0', axis=1, inplace=True)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_carbonn_new.shape)
df_carbonn_new.head(2)

(48, 16)


Unnamed: 0,city name,full city name,city_type,lat,lon,pop,pop_yr,country,region,most recent emissions,most recent emissions year,protocol,nGDP,GDP,GDP year,GDP unit
0,Aguascalientes,Municipality of Aguascalientes,Municipality,21.8897,-102.295,934424,2012.0,Mexico,Latin America & Caribbean,3839803.0,2010.0,Community,9736522.0,8321480.0,2013.0,USD
1,Balikpapan,City of Balikpapan,Municipality,-1.23793,116.852853,762492,2016.0,Indonesia,Southeast Asia,481003.0,2015.0,Community - GPC,6066.077,69874824.44,2015.0,IDR


In [70]:
# Read in 25 carbonn cities in GEA+ but NOT in CDP.

fname = 'DATA/ORIGINAL_EMISSIONS_SOURCEFILES/carbonn_Creutzig_notCDP.tsv'
df_carbonn_creutzig_notCDP = pd.read_csv(fname, sep="\t")

# Drop the unnamed leading column (presumably a saved row index).
# `axis` is passed by keyword: the positional form is rejected by recent pandas.
df_carbonn_creutzig_notCDP.drop('Unnamed: 0', axis=1, inplace=True)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_carbonn_creutzig_notCDP.shape)
df_carbonn_creutzig_notCDP.head(2)

(25, 16)


Unnamed: 0,city name,full city name,city_type,lat,lon,pop,pop_yr,country,region,most recent emissions,most recent emissions year,protocol,nGDP,GDP,GDP year,GDP unit
0,Ahmedabad,Ahmedabad Municipal Corporation,Municipality,23.03,72.58,5570585,2011.0,India,South Asia,13855311.0,2008.0,Community,10930380000.0,,,
1,Bangalore,Bengaluru Municipal Corporation,Municipality,12.97029,77.596152,9621551,2011.0,India,South Asia,5568706.7,2008.0,Community,16056970000.0,13627990000.0,2011.0,USD


### tag column names

In [71]:
# Tag both carbonn frames in place; the helper's return value is not needed here.
for carbonn_frame in (df_carbonn_new, df_carbonn_creutzig_notCDP):
    tag_column_names_with_datasetname(carbonn_frame, 'carbonn')

# Show the resulting column names of one of the tagged frames.
df_carbonn_creutzig_notCDP.columns.tolist()

['city name',
 'full city name (carbonn)',
 'city_type (carbonn)',
 'lat (carbonn)',
 'lon (carbonn)',
 'pop (carbonn)',
 'pop_yr (carbonn)',
 'country',
 'region (carbonn)',
 'most recent emissions (carbonn)',
 'most recent emissions year (carbonn)',
 'protocol (carbonn)',
 'nGDP (carbonn)',
 'GDP (carbonn)',
 'GDP year (carbonn)',
 'GDP unit (carbonn)']

### Read PKU dataset

In [72]:
fchina = 'DATA/ORIGINAL_EMISSIONS_SOURCEFILES/china_prepared.tsv'
df_china = pd.read_csv(fchina, sep="\t")

# Drop the unnamed leading column (presumably a saved row index).
# `axis` is passed by keyword: the positional form is rejected by recent pandas.
df_china.drop('Unnamed: 0', axis=1, inplace=True)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_china.shape)
df_china.head(2)

(83, 12)


Unnamed: 0,city name,full city name,Urban Population,Built-up Area (km2),Income Per Capita (RMB),GDP (10000 RMB),CO2 Emissions Per Capita (Ton),Scope-1 GHG emissions,Scope-1 source dataset,Scope-1 GHG emissions units,Year of Emission,Emissions Protocol
0,Beijing,Bejing,16446857,1733.8,66458.74,139044131,4.566519,75104880.0,PKU,tCO2,2010,Other
1,Tianjin,Tianjin,9562255,1294.1,53192.42,85614571,3.64,34806610.0,PKU,tCO2,2010,Other


In [73]:
# Column names of the PKU frame before tagging (list() over a DataFrame yields its column labels).
list(df_china)

['city name',
 'full city name',
 'Urban Population',
 'Built-up Area (km2)',
 'Income Per Capita (RMB)',
 'GDP (10000 RMB)',
 'CO2 Emissions Per Capita (Ton)',
 'Scope-1 GHG emissions',
 'Scope-1 source dataset',
 'Scope-1 GHG emissions units',
 'Year of Emission',
 'Emissions Protocol']

In [74]:
# Add country and region columns; the same constant is assigned to every row.
# No protocol column is added here: the file already provides 'Emissions Protocol'.
df_china['country'] = 'China'
df_china['region'] = 'East Asia'

In [75]:
# Tag the PKU columns in place; 'city name' and 'country' are left untagged by the helper.
tag_column_names_with_datasetname(df_china, 'PKU')

# Show the tagged column names.
list(df_china)

['city name',
 'full city name (PKU)',
 'Urban Population (PKU)',
 'Built-up Area (km2) (PKU)',
 'Income Per Capita (RMB) (PKU)',
 'GDP (10000 RMB) (PKU)',
 'CO2 Emissions Per Capita (Ton) (PKU)',
 'Scope-1 GHG emissions (PKU)',
 'Scope-1 source dataset (PKU)',
 'Scope-1 GHG emissions units (PKU)',
 'Year of Emission (PKU)',
 'Emissions Protocol (PKU)',
 'country',
 'region (PKU)']

# Concatenate emission datasets

In [76]:
# Strip the dataset tag from the columns that are shared across datasets, so the
# frames line up on identical column names when they are concatenated.
df_cdp.rename(columns={'region (CDP)': 'region'}, inplace=True)

# Both carbonn frames share the same tagged names.
carbonn_shared = {
    'region (carbonn)': 'region',
    'protocol (carbonn)': 'Emissions Protocol',
}
df_carbonn_creutzig_notCDP.rename(columns=carbonn_shared, inplace=True)
df_carbonn_new.rename(columns=carbonn_shared, inplace=True)

pku_shared = {
    'Scope-1 GHG emissions (PKU)': 'Scope-1 GHG emissions',
    'Scope-1 source dataset (PKU)': 'Scope-1 source dataset',
    'Scope-1 GHG emissions units (PKU)': 'Scope-1 GHG emissions units',
    'Year of Emission (PKU)': 'Year of Emission',
    'Emissions Protocol (PKU)': 'Emissions Protocol',
    'region (PKU)': 'region',
}
df_china.rename(columns=pku_shared, inplace=True)

## Master df
Start populating a master df with CDP data.

In [77]:
# Start from an empty frame; the following cells fill it column by column from df_cdp.
df_master = pd.DataFrame()

In [78]:
# Shared columns as (column in df_master, source column in df_cdp), in output order.
for master_col, cdp_col in [
    (var_city, var_city),
    ('country', 'country'),
    ('region', 'region'),
    ('Scope-1 GHG emissions', 'Total Scope 1 Emissions (metric ton CO2e) (CDP)'),
]:
    df_master[master_col] = df_cdp[cdp_col]

# Constants, assigned to every CDP row.
df_master['Scope-1 source dataset'] = 'CDP'
df_master['Scope-1 GHG emissions units'] = 'tCO2-eq'

for master_col, cdp_col in [
    ('Year of Emission', 'Measurement year only (CDP)'),
    ('Emissions Protocol', 'Primary Methodology (CDP)'),
    ('Scope 2 [metric ton CO2e] (CDP)', 'Total Scope 2 Emissions (metric ton CO2e) (CDP)'),
    ('Total Emissions [metric ton CO2e] (CDP)', 'Total City-wide Emissions (metric ton CO2e) (CDP)'),
]:
    df_master[master_col] = df_cdp[cdp_col]


In [79]:
# CDP columns copied into df_master unchanged (same name on both sides).
col_toadd = [
    'City Name (CDP)',
    'Boundary (CDP)',
    'City Location (CDP)',
    'Reporting Year (CDP)',
    'Current Population (CDP)',
    'Current Population Year (CDP)',
    'scope fraction (CDP)',
    'CDP2016 data edited (CDP)',
    'Emissions Quality Flag (CDP)',
    'Gases included (CDP)',
    'TOT lower bound [tCO2] (CDP)',
    'TOT upper bound [tCO2] (CDP)',
    'TOT mean [tCO2] (CDP)',
    'S1 lower bound [tCO2] (CDP)',
    'S1 upper bound [tCO2] (CDP)',
    'S1 mean [tCO2] (CDP)',
    'Methodology Details (CDP)',
    'Increase/Decrease from last year (CDP)',
    'Reason for increase/decrease in emissions (CDP)',
    'Average annual temperature (in Celsius) (CDP)',
    'Land area (in square km) (CDP)',
    'Average altitude (m) (CDP)',
    'City GDP (CDP)',
    'GDP Currency (CDP)',
    'Year of GDP (CDP)',
    'GDP Source (CDP)',
    'CH4_waste/person',
    'CH4_(waste+natgas)/person',
    'corrected CH4_(waste+natgas)/person',
    'Production (m3)',
    'Exports (m3)',
    'Export/Production',
    'data from CIA World Fact Book for 2014',
]

In [80]:
# Copy each listed CDP column into the master frame under the same name.
for cdp_col in col_toadd:
    df_master[cdp_col] = df_cdp[cdp_col]

**Add 25 carbonn cities in Creutzig but NOT in CDP (df_carbonn_creutzig_notCDP).**

In [81]:
# Map the carbonn emissions columns onto the shared (untagged) column names.
# The protocol column needs no handling here: it was already renamed from
# 'protocol (carbonn)' to 'Emissions Protocol' when tags were removed from the
# shared columns, so a rename of the bare name 'protocol' matched no column.
df_carbonn_creutzig_notCDP.rename(
    columns={
        'most recent emissions (carbonn)': 'Scope-1 GHG emissions',
        'most recent emissions year (carbonn)': 'Year of Emission',
    },
    inplace=True
)

# Provenance and units, assigned to every row of this frame.
df_carbonn_creutzig_notCDP['Scope-1 source dataset'] = 'carbonn'
df_carbonn_creutzig_notCDP['Scope-1 GHG emissions units'] = 'tCO2-eq'

In [82]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_master.shape)
print(df_carbonn_creutzig_notCDP.shape)

# Row-wise append with a fresh 0..n-1 index. Note that re-running this cell
# appends the same rows again.
df_master = pd.concat([df_master, df_carbonn_creutzig_notCDP], axis=0, ignore_index=True)

print(df_master.shape)

(187, 43)
(25, 18)
(212, 53)


**Add 48 unique carbonn cities (df_carbonn_new)**

In [83]:
# Map the carbonn emissions columns onto the shared (untagged) column names.
# The protocol column needs no handling here: it was already renamed from
# 'protocol (carbonn)' to 'Emissions Protocol' when tags were removed from the
# shared columns, so a rename of the bare name 'protocol' matched no column.
df_carbonn_new.rename(
    columns={
        'most recent emissions (carbonn)': 'Scope-1 GHG emissions',
        'most recent emissions year (carbonn)': 'Year of Emission',
    },
    inplace=True
)

# Provenance and units, assigned to every row of this frame.
df_carbonn_new['Scope-1 source dataset'] = 'carbonn'
df_carbonn_new['Scope-1 GHG emissions units'] = 'tCO2-eq'

In [84]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_master.shape)
print(df_carbonn_new.shape)

# Row-wise append with a fresh 0..n-1 index. Note that re-running this cell
# appends the same rows again.
df_master = pd.concat([df_master, df_carbonn_new], axis=0, ignore_index=True)

print(df_master.shape)

(212, 53)
(48, 18)
(260, 53)


**Add the 83 Chinese cities (df_china)**  

In [85]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_master.shape)
print(df_china.shape)

# Row-wise append with a fresh 0..n-1 index. Note that re-running this cell
# appends the same rows again.
df_master = pd.concat([df_master, df_china], axis=0, ignore_index=True)

print(df_master.shape)

(260, 53)
(83, 14)
(343, 59)


In [86]:
# Drop the '(CDP)' tag from the altitude column; col_order below lists it by the untagged name.
df_master.rename(columns = {'Average altitude (m) (CDP)':'Average altitude (m)'}, inplace = True)

In [87]:
# Sort by city name. The row labels from the concatenation are kept, so the
# index is no longer in ascending order after this step.
df_master = df_master.sort_values(var_city)
df_master[var_city].head(5)

129    Aarhus     
40     Abington   
147    Addis Ababa
91     Adelaide   
130    Aerøskøbing
Name: city name, dtype: object

# Change units from tCO2-eq to tCO2 for CDP cities that specify CO2 only

In [88]:
# Rows whose 'Gases included (CDP)' value is exactly 'CO2' get units 'tCO2';
# all other rows keep the units assigned earlier.
df_master.loc[df_master['Gases included (CDP)']=='CO2', 'Scope-1 GHG emissions units'] = 'tCO2'

# Reorder columns

In [89]:
# Final column order for the merged dataset.
col_order = [
 'city name',
 'Average altitude (m)',
 'Average annual temperature (in Celsius) (CDP)',
 'Boundary (CDP)',
 'Built-up Area (km2) (PKU)',
 'CDP2016 data edited (CDP)',
 'CH4_(waste+natgas)/person',
 'CH4_waste/person',
 'CO2 Emissions Per Capita (Ton) (PKU)',
 'City GDP (CDP)',
 'City Location (CDP)',
 'City Name (CDP)',
 'Current Population (CDP)',
 'Current Population Year (CDP)',
 'Emissions Quality Flag (CDP)',
 'Emissions Protocol',
 'Export/Production',
 'Exports (m3)',
 'GDP (10000 RMB) (PKU)',
 'GDP Currency (CDP)',
 'GDP Source (CDP)',
 'Gases included (CDP)',
 'Income Per Capita (RMB) (PKU)',
 'Increase/Decrease from last year (CDP)',
 'Land area (in square km) (CDP)',
 'Methodology Details (CDP)',
 'Production (m3)',
 'Reason for increase/decrease in emissions (CDP)',
 'Reporting Year (CDP)',
 'S1 lower bound [tCO2] (CDP)',
 'S1 mean [tCO2] (CDP)',
 'S1 upper bound [tCO2] (CDP)',
 'Scope-1 GHG emissions',
 'Scope-1 source dataset',
 'Scope-1 GHG emissions units',
 'Year of Emission',
 'Scope 2 [metric ton CO2e] (CDP)',
 'TOT lower bound [tCO2] (CDP)',
 'TOT mean [tCO2] (CDP)',
 'TOT upper bound [tCO2] (CDP)',
 'Total Emissions [metric ton CO2e] (CDP)',
 'Urban Population (PKU)',
 'Year of GDP (CDP)',
 'city_type (carbonn)',
 'corrected CH4_(waste+natgas)/person',
 'country',
 'data from CIA World Fact Book for 2014',
 'full city name (PKU)',
 'full city name (carbonn)',
 'GDP unit (carbonn)',
 'GDP (carbonn)',
 'GDP year (carbonn)',
 'lat (carbonn)',
 'lon (carbonn)',
 'nGDP (carbonn)',
 'pop (carbonn)',
 'pop_yr (carbonn)',
 'region',
 'scope fraction (CDP)'
]

# Selecting df_master[col_order] would silently drop any column missing from the
# list, so require an exact one-to-one match instead of only comparing counts by eye.
assert len(col_order) == len(set(col_order)), 'duplicate names in col_order'
assert set(col_order) == set(df_master.columns), (
    'col_order and df_master columns differ: %r'
    % (sorted(set(col_order) ^ set(df_master.columns)),)
)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(len(col_order))
df_master.shape

59


(343, 59)

In [90]:
# Reorder the columns; only the names listed in col_order are kept.
df_master = df_master[col_order]
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_master.shape)

(343, 59)



# Save data

In [91]:
# Write the merged dataset as tab-separated UTF-8 text. No `index` argument is
# given, so the row index is written as well (it presumably reappears as an
# 'Unnamed: 0' column when the file is read back — confirm in Methods4).
df_master.to_csv('DATA/INTERMEDIATE_FILES/merged_emissions_dataset.tsv', sep='\t' , encoding='utf-8')

# Continue to Methods4_addAncillaryData.ipynb