In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_rows=100

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset description and relation to competition
This dataset is available here: http://citycarbonfootprints.info/ and derives from a recent publication on carbon footprints of world cities: https://iopscience.iop.org/article/10.1088/1748-9326/aac72a "Carbon footprints of 13 000 cities", published in 2018. The data appear to predate 2018, although it seems much effort went into producing the output, which includes carbon footprints not only of 13,000 cities around the world, but also carbon footprints on a 250 m grid spanning the globe.

The study appears to be one of the most extensive efforts yet to characterize carbon footprints with as much spatial coverage as possible. This opens up numerous possibilities for analysis and incorporation into KPIs for the CDP: Unlocking Climate Solutions competition. The global extent should enable actual carbon footprint estimates to be spatially joined to the competition data, which include:
- City-level information about commitments to improving carbon budgets, as well as
- Fine-grained spatial analysis within cities at the zip code and census tract level

The data presented here could be used for both large, multi-city analysis, as well as within-city analysis due to the 250m spatial resolution.

Here I load the data and visualize it for an example city (Warsaw, Poland).

### Load Carbon Footprint Data

In [None]:
import rasterio
import geopandas as gpd

# Open the raw carbon-footprint raster only long enough to read its CRS, so the
# file handle is released instead of staying open for the rest of the session.
with rasterio.open('../input/global-gridded-model-of-carbon-footprints-ggmcf/GGMCF_v1.0.tif') as co2_df:
    co2_crs_wkt = co2_df.crs.wkt

# Display the CRS as WKT (last expression of the cell).
co2_crs_wkt

In [None]:
## Cities polygons
# Read the urban-area polygons shapefile into a GeoDataFrame and display its CRS.
# NOTE(review): the same shapefile name is read again later in this notebook from a
# different input path, overwriting `cities_poly_df` — confirm which path is correct.
cities_poly_df = gpd.read_file('/kaggle/input/geospatial-environmental-and-socioeconomic-data/1_CITIES_landscan/ne_10m_urban_areas_landscan/ne_10m_urban_areas_landscan.shp')
cities_poly_df.crs

In [None]:
# create an output directory
!mkdir /kaggle/working/global-gridded-model-of-carbon-footprints-ggmcf

In [None]:
%%time

# The coordinate reference system of these data  need to match other systems, to be spatially joined.
from rasterio.warp import calculate_default_transform, reproject, Resampling

# reprojection of Carbon Footprint data
dst_crs = 'EPSG:4326'

with rasterio.open('../input/global-gridded-model-of-carbon-footprints-ggmcf/GGMCF_v1.0.tif') as src:
    transform, width, height = calculate_default_transform(
        src.crs, dst_crs, src.width, src.height, *src.bounds)
    kwargs = src.meta.copy()
    kwargs.update({
        'crs': dst_crs,
        'transform': transform,
        'width': width,
        'height': height
    })

    with rasterio.open('/kaggle/working/global-gridded-model-of-carbon-footprints-ggmcf/GGMCF_v1.0.EPSG4326.tif', 'w', **kwargs) as dst:
        for i in range(1, src.count + 1):
            reproject(
                source=rasterio.band(src, i),
                destination=rasterio.band(dst, i),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                resampling=Resampling.nearest)

## Now we can combine an example city mask with the carbon footprint

In [None]:
from rasterio.mask import mask
import matplotlib.pyplot as plt
from rasterio.plot import show

city_name = 'Warsaw'
city_polygons = cities_poly_df.loc[cities_poly_df['name_conve'] == city_name]

# Read the EPSG:4326 raster written by the reprojection cell. The un-reprojected
# file name does not exist in the working directory, and the polygons must be in
# the raster's CRS for the mask to line up.
with rasterio.open('/kaggle/working/global-gridded-model-of-carbon-footprints-ggmcf/GGMCF_v1.0.EPSG4326.tif') as src:
    # filled=False returns a masked array: pixels outside the polygon are masked
    # instead of being overwritten with a fill value, so statistics such as
    # carbon_data.mean() only cover pixels inside the city.
    carbon_data, carbon_transform = mask(src, city_polygons['geometry'], crop=True, filled=False)
    carbon_meta = src.meta

# let's visualize this: polygon on the left, clipped carbon-footprint raster on the right
fig, axs = plt.subplots(1,2)
city_polygons.plot(ax=axs[0])
show(carbon_data, ax=axs[1])

## What could we calculate?

In [None]:
# Print the mean over all values of `carbon_data` for the selected city.
# NOTE(review): if `carbon_data` is a filled (non-masked) array, pixels outside the
# polygon contribute their fill value to this mean — confirm how it was produced.
print(f'Mean co2 footprint of {city_name}: {carbon_data.mean()}')

## Combine with CDP's list of cities

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
# Do not truncate long cell contents when displaying DataFrames.
pd.options.display.max_colwidth=None
# Display up to 200 rows (overrides the value of 100 set in the first cell).
pd.options.display.max_rows=200

In [None]:
PREFIX = '../input/cdp-unlocking-climate-solutions/Cities/Cities Disclosing/'

# Stack the 2018-2020 "Cities Disclosing" files into one frame.
# ignore_index=True gives the combined frame a unique RangeIndex; without it every
# yearly file contributes its own 0..n labels, and label-based assignment or
# alignment on the duplicated index becomes ambiguous.
cdp_cities_disclosing_df = pd.concat(
    [pd.read_csv(PREFIX + f'{year}_Cities_Disclosing_to_CDP.csv') for year in (2018, 2019, 2020)],
    ignore_index=True)
cdp_cities_disclosing_df.head()

In [None]:
!pip install geotext

In [None]:
from geotext import GeoText

In [None]:
# Extract recognisable city names from the free-text 'Organization' field with GeoText.
# Several detected names are comma-joined; rows where nothing is recognised become NaN.
organization_cities = cdp_cities_disclosing_df['Organization'].map(
    lambda organization: ','.join(GeoText(str(organization)).cities))
cdp_cities_disclosing_df['Organization_clean'] = organization_cities.replace('', np.nan)

print("cdp_cities_disclosing_df['Organization_clean']: ", cdp_cities_disclosing_df['Organization_clean'].dropna().shape)

# Show the rows in which more than one city was decoded from the field.
decoded_city_count = cdp_cities_disclosing_df['Organization_clean'].fillna('').str.split(',').str.len()
cdp_cities_disclosing_df.loc[decoded_city_count > 1]

In [None]:
# Distinct (City, Country, City Location) combinations across all disclosure years.
df_unique_cities = cdp_cities_disclosing_df[['City', 'Country', 'City Location']].drop_duplicates().reset_index(drop=True)

In [None]:
# Number of non-null values per column.
df_unique_cities.count()

In [None]:
# Same GeoText extraction as for 'Organization', applied to the 'City' field.
city_field_cities = cdp_cities_disclosing_df['City'].map(
    lambda city: ','.join(GeoText(str(city)).cities))
cdp_cities_disclosing_df['City_clean'] = city_field_cities.replace('', np.nan)

# Number of distinct decoded names, then the rows where more than one city was decoded.
print("cdp_cities_disclosing_df['City_clean']: ", cdp_cities_disclosing_df['City_clean'].dropna().unique().shape)
decoded_city_count = cdp_cities_disclosing_df['City_clean'].fillna('').str.split(',').str.len()
cdp_cities_disclosing_df.loc[decoded_city_count > 1]


In [None]:
# Spot check: rows whose decoded city name is 'Yaoundé'.
cdp_cities_disclosing_df.loc[cdp_cities_disclosing_df['City_clean']=='Yaoundé']

In [None]:
# 100 most frequent decoded city names.
cdp_cities_disclosing_df['City_clean'].value_counts()[:100]

In [None]:
# 100 most frequent raw 'City' values, for comparison with the decoded names.
cdp_cities_disclosing_df['City'].value_counts()[:100]

In [None]:
# Fill missing 'City_clean' values by forward-filling along each row, left to right:
# a null 'City_clean' takes the nearest non-null value to its left, i.e.
# 'Organization_clean', then 'City', then 'Organization'.
fallback_columns = ['Organization', 'City', 'Organization_clean', 'City_clean']
row_filled = cdp_cities_disclosing_df[fallback_columns].ffill(axis=1)
cdp_cities_disclosing_df['City_clean'] = row_filled['City_clean']

print("cdp_cities_disclosing_df['City_clean']: ", cdp_cities_disclosing_df['City_clean'].dropna().unique().shape)
decoded_city_count = cdp_cities_disclosing_df['City_clean'].fillna('').str.split(',').str.len()
cdp_cities_disclosing_df.loc[decoded_city_count > 1].head()

## Country

In [None]:
# Probe: which city names GeoText detects in the string 'Orange County'.
GeoText('Orange County').cities

In [None]:
# Decode country names from the 'Country' field with GeoText; multiple hits are comma-joined.
cdp_cities_disclosing_df['Country_clean'] = cdp_cities_disclosing_df['Country'].map(
    lambda country: ','.join(GeoText(str(country)).countries))

In [None]:
## Is the same method unambiguous for countries? (there should be zero rows below)
# Count how many countries GeoText detects per 'Country' value and show rows with more than one.
detected_country_count = cdp_cities_disclosing_df['Country'].map(
    lambda country: len(GeoText(str(country)).countries))
cdp_cities_disclosing_df.loc[detected_country_count > 1]

In [None]:
# Raw fields next to their cleaned counterparts, for visual comparison.
cdp_cities_disclosing_df[['Country', 'City', 'Organization', 'Country_clean', 'City_clean']]

In [None]:
# Number of rows with a non-null 'City_clean'.
cdp_cities_disclosing_df['City_clean'].dropna().shape

In [None]:
# Number of distinct (Country_clean, City_clean) pairs.
cdp_cities_disclosing_df[['Country_clean', 'City_clean']]\
.drop_duplicates()\
.sort_values(by=['Country_clean', 'City_clean'])\
.shape

In [None]:
## as temp df
# Distinct (Account Number, City, Country, City Location) rows; the cleaned
# 'Country_clean' / 'City_clean' columns are not carried over.
unique_cities_disclosing_df = cdp_cities_disclosing_df[['Account Number', 'City', 'Country', 'City Location']]\
.drop_duplicates()\
.reset_index(drop=True)
unique_cities_disclosing_df.shape

In [None]:
# Preview the first rows, including the derived *_clean columns.
cdp_cities_disclosing_df.head()

In [None]:
# Count the rows that have a 'City Location' value (687 have a geographical point assigned).
unique_cities_disclosing_df['City Location'].notnull().sum()

## Now get the city name from map by Point Coordinates

In [None]:
## as temp df
# Rebuilds `unique_cities_disclosing_df` with the same statement as the earlier cell:
# distinct (Account Number, City, Country, City Location) rows.
unique_cities_disclosing_df = cdp_cities_disclosing_df[['Account Number', 'City', 'Country', 'City Location']]\
.drop_duplicates()\
.reset_index(drop=True)
unique_cities_disclosing_df.shape

In [None]:
# Display the de-duplicated city table.
unique_cities_disclosing_df

In [None]:
## Cities polygons
# Re-reads the urban-area polygons, replacing the `cities_poly_df` loaded earlier.
# NOTE(review): this input path differs from the one used in the earlier load cell —
# confirm which dataset mount is intended.
cities_poly_df = gpd.read_file('../input/1-cities-landscan/1_CITIES_landscan/ne_10m_urban_areas_landscan/ne_10m_urban_areas_landscan.shp')
cities_poly_df.crs

In [None]:
import matplotlib.pyplot as plt

# Draw the Warsaw polygon twice: once in the CRS of the data and once in an
# equal-area projection, each in its own figure.
warsaw_polygon = cities_poly_df.loc[cities_poly_df['name_conve']=='Warsaw']

# WGS84 version
ax_wgs84 = warsaw_polygon.plot(facecolor='gray')
ax_wgs84.set_title("WGS84 (EPSG:4326) projection")
# Remove empty white space around the plot
ax_wgs84.figure.tight_layout()

# ETRS-LAEA version (more realistic for this region)
ax_laea = warsaw_polygon.to_crs(epsg=3035).plot(facecolor='blue')
ax_laea.set_title("ETRS Lambert Azimuthal Equal Area projection")
# Remove empty white space around the plot
ax_laea.figure.tight_layout()

In [None]:
# We use shapely.wkt sub-module to parse wkt format:
from shapely import wkt

# Parse the WKT strings in 'City Location' into shapely geometries.
# Only values that are still strings are parsed; missing values and already-parsed
# geometries pass through unchanged, so this cell can safely be executed more than once.
unique_cities_disclosing_df['City Location'] = unique_cities_disclosing_df['City Location'].apply(
    lambda value: wkt.loads(value) if isinstance(value, str) else value)

# cast Pandas DF into Geo Pandas DF, with 'City Location' as the geometry column
gdf = gpd.GeoDataFrame(unique_cities_disclosing_df, geometry='City Location', crs="EPSG:4326")

In [None]:
# Working copy of the polygons with a fresh 0..n-1 index.
poly_df = cities_poly_df.reset_index(drop=True).copy()
poly_df.shape

In [None]:
# Working copy of the city points with a fresh 0..n-1 index.
points_df = gdf.reset_index(drop=True).copy()
points_df.shape

In [None]:
# fig, ax = plt.subplots()
# poly_df.plot(ax=ax, facecolor='red');
# points_df.plot(ax=ax, color='blue', markersize=5);
# plt.tight_layout();

In [None]:
# The spatial join below needs both layers in the same CRS; stop here if they differ.
assert points_df.crs == poly_df.crs, 'Coordinate Systems do not match!!'

In [None]:
# Preview the city points.
points_df.head()

## Spatial Join - joining cities with polygons

In [None]:
# Report how many unique cities do / do not come with a 'City Location' point.
has_location = unique_cities_disclosing_df['City Location'].notnull()

print('We have coordinates for the following number of cities:')
print(int(has_location.sum()))

print("We don't have coordinates for the following number of cities:")
print(int((~has_location).sum()))

In [None]:
## Assigning polygons:

print(f'Initially we have {points_df.shape} unique cities')
# Left spatial join: every city point is kept and receives the attributes of each
# polygon it lies within. Known problem: some polygons overlap (e.g. Yokohama inside
# Tokyo), so one point can match several polygons and produce several rows.
# NOTE(review): `op=` was renamed to `predicate=` in newer GeoPandas releases —
# confirm against the installed version.
result = gpd.sjoin(points_df, poly_df, how="left", op="within")
print(f'after left joining: {result.shape}')
# sjoin keeps only the point geometry, so attach the matched polygon's geometry.
# The join uses the polygon row label that sjoin stored in 'index_right' rather
# than 'name_conve': merging on the name would duplicate rows whenever a name is
# shared by several polygons, and pandas matches NaN keys with each other, so
# cities without any polygon would be paired with every unnamed polygon.
result = result.merge(poly_df[['geometry']], left_on='index_right', right_index=True, how='left')\
               .reset_index(drop=True)
print(f'and then: {result.shape}')

In [None]:
# Problem: the point of account 31111 (Tokyo) was matched to both the Tokyo and the Yokohama polygon.
# Show its rows without the last column.
result.loc[result['Account Number']==31111].iloc[:,:-1]

In [None]:
# Several cities have overlapping polygons (like Tokyo and Yokohama), so one point can
# carry more than one match. Keep a single row per city: after sorting ascending by
# 'min_areakm', drop_duplicates retains the first row of each key group, i.e. the
# polygon with the smaller area (Yokohama in this case).
city_key_columns = ['Account Number', 'City', 'Country', 'City Location']
result_by_area = result.sort_values(by=['min_areakm'])
result = result_by_area.drop_duplicates(subset=city_key_columns)

print('Shape: ', result.shape)

# Check which polygon remained for account 31111 (last three columns hidden).
result.loc[result['Account Number']==31111].iloc[:,:-3]

In [None]:
## How many points have a location but no polygon assigned: print the shape, then show the rows.
print(result.loc[result['City Location'].notnull() & (result['geometry'].isnull())].shape)
result.loc[result['City Location'].notnull() & (result['geometry'].isnull())]

In [None]:
# Number of rows that ended up with a polygon geometry.
print(f'we have polygons for:', result.loc[(result['geometry'].notnull())].shape[0])

In [None]:
# Visual test: draw the Warsaw polygon (gray) and the city's point location (red) on the same axes.
fig, ax = plt.subplots()
result.loc[result['name_conve']=='Warsaw', 'geometry'].plot(ax=ax, facecolor='gray');
result.loc[result['name_conve']=='Warsaw', 'City Location'].plot(ax=ax, facecolor='red');
plt.tight_layout();

## Saving results

In [None]:
# create an output directory
!mkdir /kaggle/working/CDP

In [None]:
## Saving results
# Pickle the joined table into the CDP output directory.
result.to_pickle('/kaggle/working/CDP/CDP_cities_with_polygons.pkl')

## Geocoding - let's try to geocode more points (the rest of the cities)
This is needed because, for many cities, the provided coordinates fall outside the city polygons.

In [None]:
## Count of non-null values in the frame's geometry column.
result.geometry.notnull().sum()

In [None]:
!pip install geopy

In [None]:
import geopy
from geopy.geocoders import Nominatim
# `geocode` was called without ever being imported; it is GeoPandas' batch geocoding helper.
from geopandas.tools import geocode

# create an address "City,Country"; leave it empty when City is missing (because that would be misleading)
result['addr'] = result[['City', 'Country']].fillna('').apply(lambda x: str(x['City']) + ',' + str(x['Country']), axis=1)
result.loc[result['City'].isna(), 'addr'] = np.nan

# let's geocode
# (kept for interactive single-address lookups; the batch call below builds its own geocoder)
geolocator = Nominatim(user_agent="CDP Kaggle challenge")
result['addr_geocoded'] = None
## TODO: doesn't work for greater dataset -> only the first 10 addresses are geocoded
addresses_to_geocode = result.loc[result['addr'].notnull(), 'addr'][:10]
# Name the provider explicitly and pass the user agent Nominatim requires.
geocoded = geocode(addresses_to_geocode, provider='nominatim', user_agent="CDP Kaggle challenge")
# The geocoding result keeps the index of the input addresses, so write it back by label.
result.loc[geocoded.index, 'addr_geocoded'] = geocoded['geometry']

# cast to geometry type
result['addr_geocoded'] = gpd.GeoSeries(result['addr_geocoded'], crs='EPSG:4326')

In [None]:
import pyproj

# same Coordinate System? (not the last expression of the cell, so the value is not displayed)
result['City Location'].crs == result['addr_geocoded'].crs

# with pyproj
#
# Points are in a lon, lat coordinate system (EPSG:4326 or WGS 84). To calculate a distance in meters,
# we would need to either use the Great-circle distance or project them in a local coordinate system
# to approximate the distance with a good precision.
# compare with http://www.csgnetwork.com/lldistcalc.html
geod = pyproj.Geod(ellps='WGS84')
# Rows for which both the geocoded point and the provided city location are known.
both_points_known = (result['addr_geocoded'].notnull()) & (result['City Location'].notnull())
# geod.inv returns a 3-tuple per row; the last element is used below as the distance in metres.
dist_zip = result.loc[both_points_known]\
            .apply(lambda x: geod.inv(x['addr_geocoded'].x, x['addr_geocoded'].y, x['City Location'].x, x['City Location'].y), axis=1)
# Expand the tuples into three columns.
dist_zip = dist_zip.apply(pd.Series)
dist_zip.columns=['angle1','angle2','distance']
dist_zip.index.name=None
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
result['geocoder_dist_km'] = np.nan
result.loc[both_points_known, 'geocoder_dist_km'] = (dist_zip['distance']/1000).round(2)

In [None]:

# First ten geocoder-vs-provided-location distances (km).
result['geocoder_dist_km'][:10]

In [None]:
# Geocoded rows: geocoded point, address string, provided location, matched polygon name and distance.
result.loc[result['addr_geocoded'].notnull(), ['addr_geocoded', 'addr', 'City Location', 'name_conve', 'geocoder_dist_km']]

In [None]:
# Current size of the joined table.
result.shape

In [None]:
# Number of rows with a geocoded point. The column created by the geocoding cell is
# 'addr_geocoded'; there is no 'geocoded' column, so the old lookup raised a KeyError.
result.loc[result['addr_geocoded'].notnull()].shape

In [None]:
# create an output directory
!mkdir /kaggle/working/CDP/

In [None]:
# Check the object type before pickling.
type(result)

In [None]:
# Overwrite the earlier pickle with the table that now includes the geocoding columns.
result.to_pickle('/kaggle/working/CDP/CDP_cities_with_polygons.pkl')

In [None]:
# Read the pickle back to verify the round trip.
result_x = pd.read_pickle('/kaggle/working/CDP/CDP_cities_with_polygons.pkl')

In [None]:
# Type of the object restored from the pickle.
type(result_x)

In [None]:
## Writing results to a GeoJSON file in the current directory (the Shapefile export is disabled).
# NOTE(review): `result` holds several geometry-like columns ('City Location', 'geometry',
# 'addr_geocoded'); confirm the writer accepts that, and that "countries.geojson" is the
# intended name for a table of cities.
#result.to_file('/kaggle/working/CDP/CDP_cities_with_polygons.shp')
result.to_file("countries.geojson", driver='GeoJSON')

In [None]:
# Load the saved table from the directory it was written to; the bare file name
# pointed at the current directory, where no such pickle is created.
temp = pd.read_pickle('/kaggle/working/CDP/CDP_cities_with_polygons.pkl')

In [None]:
# Columns of the reloaded table.
temp.columns

## Assign values from different dataset to what we've got (688 for 1032 overall)

In [None]:
# How many rows have a provided point location:
result.loc[result['City Location'].notnull()].shape

In [None]:
# How many rows have a polygon geometry.
result.loc[result['geometry'].notnull()].shape

In [None]:
# Disclosure rows whose raw 'City' value is exactly 'Warsaw'.
cdp_cities_disclosing_df.loc[cdp_cities_disclosing_df['City']=='Warsaw']

In [None]:
## End!

In [None]:
# Same check plot as before: Warsaw polygon (gray) with the city's point location (red).
fig, ax = plt.subplots()
result.loc[result['name_conve']=='Warsaw', 'geometry'].plot(ax=ax, facecolor='gray');
result.loc[result['name_conve']=='Warsaw', 'City Location'].plot(ax=ax, facecolor='red');
plt.tight_layout();

In [None]:

# Polygon of rows matched to 'Tokyo' (yellow) with the point location of account 31111 (red).
fig, ax = plt.subplots()

result.loc[result['name_conve']=='Tokyo', 'geometry'].plot(ax=ax, facecolor='yellow');
result.loc[result['Account Number']==31111, 'City Location'].plot(ax=ax, facecolor='red');
plt.tight_layout();

In [None]:

# Polygon of rows matched to 'Yokohama' (blue) with the point location of account 31111 (red).
fig, ax = plt.subplots()

result.loc[result['name_conve']=='Yokohama', 'geometry'].plot(ax=ax, facecolor='blue');
result.loc[result['Account Number']==31111, 'City Location'].plot(ax=ax, facecolor='red');
plt.tight_layout();

In [None]:
# The Yokohama polygon on its own.
result.loc[result['name_conve']=='Yokohama', 'geometry'].plot()

In [None]:
# NOTE(review): no cell in the visible notebook installs `maup` (only geotext and geopy are
# pip-installed) — confirm the package is available in the environment.
import maup
# Pass the array of polygon geometries to maup.resolve_overlaps; the return value is only
# displayed, not stored.
maup.resolve_overlaps(poly_df['geometry'].values)

In [None]:
# `.values` is a property holding the geometry array, not a method; calling it raised a TypeError.
poly_df['geometry'].values

In [None]:
# Size of the city-points table.
points_df.shape

In [None]:
# Size of the joined table, for comparison with the points table.
result.shape

In [None]:
# First city as a single row.
city = points_df.iloc[0]
# A single row is a plain Series and carries no `.crs`; the CRS lives on the GeoDataFrame.
points_df.crs

In [None]:
# Test every city point against the union of all urban polygons.
# Iterating a (Geo)DataFrame directly yields column names, so loop over the geometry
# column instead; the union is built once, outside the loop.
urban_footprint = poly_df.unary_union
for city in points_df.geometry:
    # Cities without coordinates have no geometry to test.
    if city is not None:
        city.intersects(urban_footprint)

In [None]:
# Row-wise test of each city point against the union of all polygons.
# axis=1 is required: the default (axis=0) passes whole columns to the lambda, where
# x['City Location'] is not a valid lookup. The union is computed once instead of per row,
# and rows without a point yield False.
urban_footprint = poly_df.unary_union
points_df.apply(
    lambda x: x['City Location'].intersects(urban_footprint) if x['City Location'] is not None else False,
    axis=1)

In [None]:
# Does the first city point intersect the union of all polygons?
points_df[:1].intersects(poly_df.unary_union) 

In [None]:
# Same test using the un-copied frames `gdf` and `cities_poly_df`.
gdf[:1].intersects(cities_poly_df.unary_union)

In [None]:
# Test the first city point against the polygons labelled 0..10.
# NOTE(review): with a Series argument GeoPandas presumably compares element-wise after
# index alignment rather than testing the point against every polygon — confirm.
gdf[:1].within(cities_poly_df.loc[:10, 'geometry'])

In [None]:
# Display the polygons table.
cities_poly_df

## Now we can merge by account with Cities Responses

In [None]:
## TODO: merge by account with the Cities Responses


# import cities response df
cdp_full_cities_df = pd.read_csv("../input/cdp-unlocking-climate-solutions/Cities/Cities Responses/2020_Full_Cities_Dataset.csv")

# Strip the administrative prefixes from every string in the two columns, one regex pass
# per prefix (the 'City of ' pass runs twice, exactly as in the original chain), then
# preview the first 100 distinct (Country, Organization) pairs in sorted order.
country_org = cdp_full_cities_df[['Country', 'Organization']]
for prefix_pattern in ('City of ', 'City of ', 'Township of '):
    country_org = country_org.replace(prefix_pattern, '', regex=True)

country_org.drop_duplicates().sort_values(by=['Country', 'Organization'])[:100]

## Combine with raster data for cities

In [None]:
# Preview the de-duplicated city table.
unique_cities_disclosing_df.head()

In [None]:
# Number of polygons and attributes.
cities_poly_df.shape

In [None]:
# First three polygons, without the last column, transposed so attributes read as rows.
cities_poly_df.iloc[:, :-1].head(3).T

In [None]:
# Decode a clean city name from each polygon's 'name_conve' with GeoText.
# Non-string names (e.g. missing values) are skipped instead of being handed to GeoText.
cities_poly_df['City_clean'] = cities_poly_df['name_conve']\
                                .apply(lambda x: GeoText(x).cities if isinstance(x, str) else [])\
                                .apply(lambda x: ','.join(x))\
                                .replace('', np.nan)
print('GeoTexted: ', cities_poly_df['City_clean'].dropna().shape[0], ' out of ', cities_poly_df.shape[0])

# the rest (names not found in GeoText) will be filled with the original names.
# Fill the single column directly: assigning the two-column forward-filled frame to one
# column does not produce the intended Series.
cities_poly_df['City_clean'] = cities_poly_df['City_clean'].fillna(cities_poly_df['name_conve'])

In [None]:
# Size of the de-duplicated city table before attaching polygons.
unique_cities_disclosing_df.shape

In [None]:
# assign polygons to cities
df_uniq_cities_polygons_comb = unique_cities_disclosing_df.merge(cities_poly_df, left_on='City_clean', right_on='City_clean', how='left')

In [None]:
# Number of rows that received a polygon from the name-based merge (383 when the author ran it).
df_uniq_cities_polygons_comb[['geometry']].dropna().shape   ## 383 

In [None]:
## Now we can print on map and calculate Co2 Footprint for those cities ~380 cities .... only... 
from rasterio.mask import mask
import matplotlib.pyplot as plt
from rasterio.plot import show


In [None]:
# Compute and plot the mean carbon footprint for the first 10 cities that have a polygon.

i = 0
# Read the EPSG:4326 raster written by the reprojection cell; the un-reprojected file
# name does not exist in the working directory.
with rasterio.open('/kaggle/working/global-gridded-model-of-carbon-footprints-ggmcf/GGMCF_v1.0.EPSG4326.tif') as src:
    carbon_meta = src.meta
    for city_name in df_uniq_cities_polygons_comb.dropna(subset=['geometry'])['City_clean'].tolist()[:10]:
        city_rows = df_uniq_cities_polygons_comb['City_clean']==city_name
        city_shapes = df_uniq_cities_polygons_comb.loc[city_rows, 'geometry'].dropna()

        # filled=False returns a masked array, so pixels outside the polygon are
        # excluded from the mean instead of contributing a fill value.
        carbon_data, carbon_transform = mask(src, city_shapes, crop=True, filled=False)

        # A fully masked clip has no valid pixels; record NaN in that case.
        co2_mean = carbon_data.mean()
        co2_mean = np.nan if co2_mean is np.ma.masked else float(co2_mean)

        print(city_name, 'CO2 footprint: ', co2_mean)

        df_uniq_cities_polygons_comb.loc[city_rows, 'co2_footprint'] = co2_mean

        # Left: the city polygon(s); right: the clipped raster.
        fig, axs = plt.subplots(1,2)
        gpd.GeoSeries(city_shapes).plot(ax=axs[0])
        show(carbon_data, ax=axs[1], title=city_name)
        i += 1

In [None]:
# Geometry of the polygon(s) named 'Warsaw'.
cities_poly_df.loc[cities_poly_df['name_conve']=='Warsaw']['geometry']

In [None]:
cities_poly_df.