In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import geopandas as gpd

# Do not truncate cell contents, and show up to 200 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

In [None]:
PREFIX = '../input/cdp-unlocking-climate-solutions/Cities/Cities Disclosing/'

# One "Cities Disclosing" CSV per year; stack them into a single frame.
# Row labels from the individual files are kept as-is (no ignore_index).
yearly_files = (
    '2018_Cities_Disclosing_to_CDP.csv',
    '2019_Cities_Disclosing_to_CDP.csv',
    '2020_Cities_Disclosing_to_CDP.csv',
)
cdp_cities_disclosing_df = pd.concat([pd.read_csv(PREFIX + csv_name) for csv_name in yearly_files])
cdp_cities_disclosing_df.head()

In [None]:
# # Geometries often come in WKT format, as is the case for 'City Location':

## TODO: fix the parsing below so that it handles missing/null values

# import geopandas as gpd
# import shapely.wkt

# # for null geometries
# from shapely.geometry import GeometryCollection

# geometry = cdp_cities_disclosing_df['City Location'].map(shapely.wkt.loads)
# cdp_cities_disclosing_df = cdp_cities_disclosing_df.drop('City Location', axis=1)
# cdp_cities_disclosing_df = gpd.GeoDataFrame(cdp_cities_disclosing_df, crs="EPSG:4326", geometry=geometry)
# cdp_cities_disclosing_df = cdp_cities_disclosing_df.rename({'geometry': 'City Location'}, axis=1)

In [None]:
# ## convert 'City Location' to GeoSeries
# cdp_cities_disclosing_df['City Location x'] = cdp_cities_disclosing_df['City Location'].str.replace('POINT \(', '')
# cdp_cities_disclosing_df['City Location x'] = cdp_cities_disclosing_df['City Location x'].str.replace(')', '')
# cdp_cities_disclosing_df['City Location LONG'] = cdp_cities_disclosing_df['City Location x'].apply(lambda x: x.split(' ')[0] if (pd.isnull(x)==False) else None)
# cdp_cities_disclosing_df['City Location LAT'] = cdp_cities_disclosing_df['City Location x'].apply(lambda x: x.split(' ')[1] if (pd.isnull(x)==False) else None)
# cdp_cities_disclosing_df['City Location'] = gpd.points_from_xy(cdp_cities_disclosing_df['City Location LONG'], cdp_cities_disclosing_df['City Location LAT'])
# cdp_cities_disclosing_df.loc[cdp_cities_disclosing_df['City Location LONG'].isnull() |  cdp_cities_disclosing_df['City Location LAT'].isnull(), ]
# cdp_cities_disclosing_df.loc[cdp_cities_disclosing_df['City Location x'].isna(), 'City Location'] = np.NaN
# cdp_cities_disclosing_df = cdp_cities_disclosing_df.drop(['City Location x', 'City Location LONG', 'City Location LAT'],axis=1)

In [None]:
# One row per distinct (account, city, country, location) combination across all years.
df_unique_cities = (
    cdp_cities_disclosing_df
    .loc[:, ['Account Number', 'City', 'Country', 'City Location']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
df_unique_cities.count()

In [None]:
# Count rows lacking 'City Location' directly from the data rather than
# hard-coding totals, so the message stays correct when the input changes.
n_without_coordinates = int(df_unique_cities['City Location'].isna().sum())
print(f'There are {n_without_coordinates} cities without coordinates')

# Let's geocode City Name

In [None]:
!pip install geopy

In [None]:
# Geocoding: look up every "City,Country" address with Nominatim and keep the
# returned result object in the 'geoloc_string' column.

import geopy
import geopandas as gpd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

# Build the address only when City is present (a bare country would be misleading).
city_part = df_unique_cities['City'].fillna('').astype(str)
country_part = df_unique_cities['Country'].fillna('').astype(str)
df_unique_cities['addr'] = (city_part + ',' + country_part).where(df_unique_cities['City'].notna(), np.nan)

geolocator = Nominatim(user_agent="Kaggle_CDP_challenge")
# Keep at least one second between requests to the geocoding service.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Registers Series.progress_apply so the (slow) lookup shows a progress bar.
tqdm.pandas(desc='geocoding')

# Initialise the column that is actually filled and read below. The original
# created an unused 'geolocated_string' column by mistake.
has_addr = df_unique_cities['addr'].notnull()
df_unique_cities['geoloc_string'] = None

# GEOCODE:
df_unique_cities.loc[has_addr, 'geoloc_string'] = df_unique_cities.loc[has_addr, 'addr'].progress_apply(geocode)


In [None]:
df_unique_cities.iloc[0]['geoloc_string'].raw

In [None]:
# The geocoder result exposes its point as (latitude, longitude, ...), as in Google Maps.
# GeoPandas/shapely want (x, y) = (LONGITUDE, LATITUDE), so the two are swapped here.
from shapely.geometry import Point


def _lonlat_point(loc):
    """Return Point(lon, lat) for a geocoding result, or None when there is no result."""
    if pd.isnull(loc):
        return None
    lat, lon = tuple(loc.point)[:2]
    return Point(lon, lat)


def _resolved_address(loc):
    """Return the address reported by the geocoder, or None when there is no result."""
    return None if pd.isnull(loc) else loc.address


geocoded = df_unique_cities['geoloc_string']
df_unique_cities['geoloc_point'] = gpd.GeoSeries(geocoded.apply(_lonlat_point), crs="EPSG:4326")
df_unique_cities['geoloc_addr'] = geocoded.apply(_resolved_address)

In [None]:
df_unique_cities[['City', 'Country', 'City Location', 'geoloc_point', 'geoloc_addr']].count()

In [None]:
# Rows that had no CDP-provided location but did get a geocoded address.
newly_located = df_unique_cities['City Location'].isnull() & df_unique_cities['geoloc_addr'].notnull()
print('New addresses geolocated:', df_unique_cities.loc[newly_located].shape[0])

In [None]:
df_unique_cities.dtypes

In [None]:
# We use shapely.wkt sub-module to parse geoinfo (City Location) in wkt format:
from shapely import wkt

# Parse only the values that are still WKT text. Nulls are skipped, and values
# already converted to geometries are left alone, so the cell can safely be
# executed more than once.
is_wkt_text = df_unique_cities['City Location'].apply(lambda value: isinstance(value, str))
df_unique_cities.loc[is_wkt_text, 'City Location'] = df_unique_cities.loc[is_wkt_text, 'City Location'].apply(wkt.loads)

# cast Pandas DF into Geo Pandas DF, with 'City Location' as the active geometry
gdf = gpd.GeoDataFrame(df_unique_cities, geometry='City Location', crs="EPSG:4326")

points_df = gdf.reset_index(drop=True).copy()     #.loc[gdf['Country']=='Poland'].reset_index(drop=True)
points_df.shape

In [None]:
points_df.head()

# Checking distances

In [None]:
# Distance (km) between the CDP-provided location and the geocoded point, for
# rows that have both.
has_both_points = points_df['City Location'].notna() & points_df['geoloc_point'].notna()
df_tmp = points_df.loc[has_both_points].reset_index(drop=True)


def _haversine_km(lon1, lat1, lon2, lat2, radius_km=6371.0088):
    """Great-circle distance in km between points given in decimal degrees (array-friendly)."""
    lon1, lat1, lon2, lat2 = (np.radians(np.asarray(v, dtype=float)) for v in (lon1, lat1, lon2, lat2))
    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    # clip guards against tiny floating-point overshoot before the square root / arcsin
    return 2 * radius_km * np.arcsin(np.sqrt(np.clip(a, 0, 1)))


# The cities are spread worldwide, so a regional planar projection (the original
# used EPSG:3310, California Albers) distorts distances far from its area of use.
# Computing the great-circle distance from lon/lat avoids that and needs no
# re-projection, so both columns simply stay in WGS84.
cdp_points = gpd.GeoSeries(df_tmp['City Location'])
geocoded_points = gpd.GeoSeries(df_tmp['geoloc_point'])
# distance in km
df_tmp['distance'] = np.round(
    _haversine_km(cdp_points.x, cdp_points.y, geocoded_points.x, geocoded_points.y), 2
)

# over 50 km difference
df_tmp[df_tmp['distance']>50]

In [None]:
# Share of cities whose geocoded point is more than 50 km from the CDP location
far_count = df_tmp[df_tmp['distance']>50].shape[0]
total_count = df_tmp.shape[0]
far_share = round(far_count/total_count*100, 2)
print(far_count, ' out of ', total_count, 'were geocoded with over 50 km away. ~', far_share, '%' )
print('Mistakes are on both sides. In case of wrong geocoding, I see that comparing countries could help a lot.')
print('If country is ok, than often coordinates of a City Location were given in the wrong order or without - sign.')


## Load cities polygons

In [None]:
cities_poly_df = gpd.read_file('../input/1-cities-landscan/1_CITIES_landscan/ne_10m_urban_areas_landscan/ne_10m_urban_areas_landscan.shp')
cities_poly_df.crs

In [None]:
import matplotlib.pyplot as plt

# Draw the Warsaw polygon twice: as loaded, and re-projected to ETRS-LAEA
# (EPSG:3035), which is more realistic for this region.
warsaw_area = cities_poly_df.loc[cities_poly_df['name_conve']=='Warsaw']

# (frame to draw, fill colour, figure title)
views = [
    (warsaw_area, 'gray', "WGS84 (EPSG:4326) projection"),
    (warsaw_area.to_crs(epsg=3035), 'blue', "ETRS Lambert Azimuthal Equal Area projection"),
]
for frame, fill, caption in views:
    frame.plot(facecolor=fill);
    plt.title(caption);
    # Remove empty white space around the plot
    plt.tight_layout()

In [None]:
poly_df = cities_poly_df.reset_index(drop=True).copy()     #.loc[cities_poly_df['name_conve'].isin(['Warsaw', 'Moscow'])].reset_index(drop=True)
poly_df.shape

In [None]:
# Overlay the city points (blue) on the urban-area polygons (red) on one axes.
fig, ax = plt.subplots()
layers = (
    (poly_df, {'facecolor': 'red'}),
    (points_df, {'color': 'blue', 'markersize': 5}),
)
for layer, style in layers:
    layer.plot(ax=ax, **style);
plt.tight_layout();

# Both layers must share a CRS for the spatial join that follows.
assert points_df.crs == poly_df.crs, 'Coordinate Systems do not match!!'


# Spatial join with cities' polygons

In [None]:
## Assigning polygons: find, for every city point, the urban-area polygon(s) it falls within.

print(f'Initially we have {points_df.shape} unique cities')
# This spatial join works well here, but some polygons in the dataset overlap
# (e.g. Yokohama inside Tokyo), so a point can match more than one polygon.
try:
    result = gpd.sjoin(points_df, poly_df, how="left", predicate="within")
except TypeError:
    # Older geopandas releases only know the previous keyword name, `op`.
    result = gpd.sjoin(points_df, poly_df, how="left", op="within")
print(f'after left joining: {result.shape}')
# sjoin drops the right-hand geometry, so attach the polygon again. Use the
# exact row of poly_df that matched ('index_right') instead of joining on
# 'name_conve': names are not guaranteed to be unique, and pandas also matches
# null keys with null keys, which would hand polygons to unmatched points.
result = (
    result
    .merge(poly_df[['geometry']], left_on='index_right', right_index=True, how='left')
    .reset_index(drop=True)
)
print(f'and then: {result.shape}')

In [None]:
# But problem: city Tokyo was found in Tokio and in Yokohama polygon 
result.loc[result['Account Number']==31111].iloc[:,:-1]

In [None]:
# Several cities sit inside overlapping polygons (e.g. Tokyo and Yokohama).
# For those, keep the polygon with the smaller area: sort ascending by
# 'min_areakm' and keep the first row per city key (Yokohama in this example).
city_keys = ['Account Number', 'City', 'Country', 'City Location']
result = result.sort_values(by=['min_areakm'])
result = result.drop_duplicates(subset=city_keys)

print('Shape: ', result.shape)

# check if Yokohama.
result.loc[result['Account Number']==31111].iloc[:,:-3]

In [None]:
## How many points have a location but no polygon assigned
missing_polygon = result['City Location'].notnull() & result['geometry'].isnull()
print(result.loc[missing_polygon].shape)
result.loc[missing_polygon]

In [None]:
# Number of cities that ended up with a polygon
has_polygon = result['geometry'].notnull()
print('we have polygons for:', result.loc[has_polygon].shape[0])

In [None]:
# Visual check: Warsaw's polygon (gray) with its city point (red) on top.
warsaw_rows = result.loc[result['name_conve']=='Warsaw']
fig, ax = plt.subplots()
warsaw_rows['geometry'].plot(ax=ax, facecolor='gray');
warsaw_rows['City Location'].plot(ax=ax, facecolor='red');
plt.tight_layout();

In [None]:
# Visual check for a few cities: polygon in gray, city point in red.
for city in ['Santiago', 'City of Sydney', 'Kasama', 'San Francisco']:
    city_rows = result.loc[result['City']==city]
    fig, ax = plt.subplots()
    try:
        city_rows['geometry'].plot(ax=ax, facecolor='gray');
        city_rows['City Location'].plot(ax=ax, facecolor='red');
        ax.set_title(city)
        fig.tight_layout();
    except Exception as err:
        # Still best-effort (one city must not stop the others), but say which
        # city failed and why instead of hiding it, and drop the unused figure.
        print(f'Could not plot {city!r}: {err}')
        plt.close(fig)

# Save results

In [None]:
# create an output directory
!mkdir /kaggle/working/CDP

In [None]:
## Saving results
# Persist the joined table (`result`), which carries the matched polygon in its
# 'geometry' column. `df_unique_cities` has no polygon columns, so saving it
# under this file name would not match its content.
result.to_pickle('/kaggle/working/CDP/CDP_cities_with_polygons_v1.pkl')