# Choropleth Map Using a GPKG File

This notebook aims to explore the GPKG file and use its information to create a choropleth map.

Objectives:
- Explore the GPKG file from the GreenSpace datasets
- Create a choropleth map using the converted GPKG file and the merged datasets


Conclusion:
- The file can be read by geopandas.
- The ...V1_2.gpkg file contains MULTIPOLYGON data, which can be used as the area boundaries for the choropleth map.

*Note: the ...V1_2.gpkg file is very large, so it may take a few minutes to load.*

In [1]:
import json
import os

import folium
import geopandas as gpd
import numpy as np
import pandas as pd

import GeoBound_ChoroplethMap as gcm

## Explore GPKG

In [2]:
gpkg_path = 'GreenspaceDownload/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_2.gpkg'

def load_gpkg(gpkg_path):
    """Read the GeoPackage at ``gpkg_path`` with geopandas and return the result.

    A notice is printed before reading because the file used in this
    notebook is large and the read is slow.
    """
    print('Loading large file, will take 1-2 minutes...')
    return gpd.read_file(gpkg_path)

In [3]:
# Read the complete GeoPackage (slow, per the notice printed by load_gpkg)
# and preview the first rows.
gdf = load_gpkg(gpkg_path)
gdf.head()

Loading large file, will take 1-2 minutes...


Unnamed: 0,ID_HDC_G0,QA2_1V,AREA,BBX_LATMN,BBX_LONMN,BBX_LATMX,BBX_LONMX,GCPNT_LAT,GCPNT_LON,CTR_MN_NM,...,EX_SS_P00,EX_SS_P15,EX_EQ19PGA,EX_EQ19MMI,EX_EQ19_Q,EX_HW_IDX,SDG_LUE9015,SDG_A2G14,SDG_OS15MX,geometry
0,1.0,1.0,185.0,21.247683,-158.043016,21.422193,-157.730529,21.340678,-157.893497,United States,...,397443.031445,444041.529529,,,missing,,0.074385,0.226415,56.41,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915..."
1,2.0,2.0,42.0,-17.641184,-149.628088,-17.517631,-149.508018,-17.534103,-149.568053,French Polynesia,...,0.0,0.0,,,missing,,0.128,0.284119,,"MULTIPOLYGON (((-149.56967 -17.51763, -149.508..."
2,3.0,1.0,55.0,34.858517,-120.475511,34.989334,-120.389183,34.923123,-120.434372,United States,...,0.0,0.0,0.0,0.0,available,2.79174,0.48114,0.040129,23.64,"MULTIPOLYGON (((-120.46375 34.98933, -120.4411..."
3,4.0,1.0,48.0,36.582997,-121.952215,36.635743,-121.811816,36.60772,-121.882378,United States,...,0.0,0.0,0.0,0.0,available,,0.44484,0.138683,42.17,"MULTIPOLYGON (((-121.95221 36.63574, -121.9179..."
4,5.0,1.0,60.0,34.38822,-119.853855,34.457831,-119.658413,34.427664,-119.743693,United States,...,0.0,0.0,0.0,0.0,available,4.25502,0.55676,0.061348,36.5,"MULTIPOLYGON (((-119.82444 34.45783, -119.8131..."


The file V1_2.gpkg contains columns that are almost identical to those in the greenspace CSV file, with the addition of a geometry column containing MULTIPOLYGON data.

However, this file takes a long time to load because of its size, so it is better to export it as a smaller, more readable GeoJSON file. Additionally, we should retain only the data we are interested in -- data about the United States.

Hence, in the following code, I will filter out data that is not related to the United States and output the result as a geojson file named `Greenspace_US.geojson`.

In [4]:
# NOTE(review): same preview as the previous code cell; no visible cell
# modifies `gdf` in between, so this cell looks redundant.
gdf.head()

Unnamed: 0,ID_HDC_G0,QA2_1V,AREA,BBX_LATMN,BBX_LONMN,BBX_LATMX,BBX_LONMX,GCPNT_LAT,GCPNT_LON,CTR_MN_NM,...,EX_SS_P00,EX_SS_P15,EX_EQ19PGA,EX_EQ19MMI,EX_EQ19_Q,EX_HW_IDX,SDG_LUE9015,SDG_A2G14,SDG_OS15MX,geometry
0,1.0,1.0,185.0,21.247683,-158.043016,21.422193,-157.730529,21.340678,-157.893497,United States,...,397443.031445,444041.529529,,,missing,,0.074385,0.226415,56.41,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915..."
1,2.0,2.0,42.0,-17.641184,-149.628088,-17.517631,-149.508018,-17.534103,-149.568053,French Polynesia,...,0.0,0.0,,,missing,,0.128,0.284119,,"MULTIPOLYGON (((-149.56967 -17.51763, -149.508..."
2,3.0,1.0,55.0,34.858517,-120.475511,34.989334,-120.389183,34.923123,-120.434372,United States,...,0.0,0.0,0.0,0.0,available,2.79174,0.48114,0.040129,23.64,"MULTIPOLYGON (((-120.46375 34.98933, -120.4411..."
3,4.0,1.0,48.0,36.582997,-121.952215,36.635743,-121.811816,36.60772,-121.882378,United States,...,0.0,0.0,0.0,0.0,available,,0.44484,0.138683,42.17,"MULTIPOLYGON (((-121.95221 36.63574, -121.9179..."
4,5.0,1.0,60.0,34.38822,-119.853855,34.457831,-119.658413,34.427664,-119.743693,United States,...,0.0,0.0,0.0,0.0,available,4.25502,0.55676,0.061348,36.5,"MULTIPOLYGON (((-119.82444 34.45783, -119.8131..."


In [27]:
# Code brought over from Greenspace_Data_Cleaning.ipynb to generate the UC_Grouping column,
# which is the row index carried over after cleaning the whole dataset and exploding the 'Cities in Urban Center' column

def Greenspace_Data_Cleaning(rawdf):
    """Reduce the raw greenspace table to US rows with one row per listed city.

    Steps, in order:
      1. keep only the columns in ``cols_to_keep`` (including 'geometry');
      2. keep rows whose 'CTR_MN_NM' equals 'United States';
      3. turn the placeholder strings '?', '??', '???' and 'NAN' into NaN;
      4. rename the location/name columns to readable labels;
      5. split 'Cities in Urban Center' on ';' and explode it so every city
         gets its own row, keeping the unsplit text in
         'Cities in Urban Center_copy';
      6. expose the pre-explode row index as the 'UC_Grouping' column, so
         rows that came from the same urban center share one value.

    Parameters
    ----------
    rawdf : pandas.DataFrame or geopandas.GeoDataFrame
        Must contain every column named in ``cols_to_keep``. It is not
        modified.

    Returns
    -------
    df : the cleaned, exploded frame ('UC_Grouping' is the first column).
    checker : the US rows (before exploding) whose 'Urban Center' still
        contains a literal '?' character.

    Raises
    ------
    KeyError
        If ``rawdf`` lacks one of the required columns.
    """

    cols_to_keep = ['GCPNT_LAT', 'GCPNT_LON', 'CTR_MN_NM', 'UC_NM_MN', 'UC_NM_LST', 'E_GR_AV14', 'E_GR_AT14', 'SDG_A2G14', 'SDG_OS15MX', 'P15', 'B15', 'BUCAP15', 'INCM_CMI', 'DEV_CMI', 'GDP15_SM', 'E_BM_NM_LST', 'E_WR_T_14','geometry'] # add 'geometry' to keep the geometry column

    df = rawdf[cols_to_keep]
    # Explicit copy: the assignments below must act on an independent frame,
    # not on a slice of `rawdf` (avoids SettingWithCopyWarning).
    df = df[df['CTR_MN_NM'] == 'United States'].copy()
    df = df.replace(to_replace=['?', '??', '???', 'NAN'], value=np.nan)
    df = df.rename(columns={'GCPNT_LAT': 'Latitude', 'GCPNT_LON': 'Longitude', 'CTR_MN_NM': 'Country', 'UC_NM_MN': 'Urban Center', 'UC_NM_LST': 'Cities in Urban Center'})

    # na=False: names that were just replaced by NaN must count as "no match";
    # otherwise the mask can contain NaN and boolean indexing raises.
    checker = df[df['Urban Center'].str.contains("?", regex=False, na=False)]

    df['Cities in Urban Center_copy'] = df['Cities in Urban Center']
    df['Cities in Urban Center'] = df['Cities in Urban Center'].str.split(';')
    df = df.explode('Cities in Urban Center')
    # Name the index before resetting so the new column is always called
    # 'UC_Grouping', even when the incoming index already carries a name.
    df = df.rename_axis('UC_Grouping').reset_index()
    df['Cities in Urban Center'] = df['Cities in Urban Center'].str.strip()

    return df, checker

In [28]:
# Clean the full table. `checker` holds the rows whose 'Urban Center' name
# contains a literal '?', so its length is reported as a quick sanity check.
green_us, checker = Greenspace_Data_Cleaning(gdf)
print(f"'Urban Center' contains {len(checker)} rows with '?'")
print(green_us.shape)
green_us.head()

'Urban Center' contains 0 rows with '?'
(585, 20)


Unnamed: 0,UC_Grouping,Latitude,Longitude,Country,Urban Center,Cities in Urban Center,E_GR_AV14,E_GR_AT14,SDG_A2G14,SDG_OS15MX,P15,B15,BUCAP15,INCM_CMI,DEV_CMI,GDP15_SM,E_BM_NM_LST,E_WR_T_14,geometry,Cities in Urban Center_copy
0,0,21.340678,-157.893497,United States,Honolulu,Honolulu,0.36929,183.811667,0.226415,56.41,512853.666675,80.647377,157.252219,HIC,MDR,21926680000.0,Tropical and Subtropical Dry Broadleaf Forests,23.526622,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915...",Honolulu; Waipahu; Pearl City; Aiea
1,0,21.340678,-157.893497,United States,Honolulu,Waipahu,0.36929,183.811667,0.226415,56.41,512853.666675,80.647377,157.252219,HIC,MDR,21926680000.0,Tropical and Subtropical Dry Broadleaf Forests,23.526622,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915...",Honolulu; Waipahu; Pearl City; Aiea
2,0,21.340678,-157.893497,United States,Honolulu,Pearl City,0.36929,183.811667,0.226415,56.41,512853.666675,80.647377,157.252219,HIC,MDR,21926680000.0,Tropical and Subtropical Dry Broadleaf Forests,23.526622,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915...",Honolulu; Waipahu; Pearl City; Aiea
3,0,21.340678,-157.893497,United States,Honolulu,Aiea,0.36929,183.811667,0.226415,56.41,512853.666675,80.647377,157.252219,HIC,MDR,21926680000.0,Tropical and Subtropical Dry Broadleaf Forests,23.526622,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915...",Honolulu; Waipahu; Pearl City; Aiea
4,2,34.923123,-120.434372,United States,Santa Maria,Santa Maria,0.312846,54.450694,0.040129,23.64,123181.284843,42.000805,340.96742,HIC,MDR,4174295000.0,"Mediterranean Forests, Woodlands, and Scrub",14.718191,"MULTIPOLYGON (((-120.46375 34.98933, -120.4411...",Santa Maria


In [31]:
# output cleaned gdf to geojson
# NOTE(review): `file_path=_` passes IPython's "last cell output" variable, so
# the argument depends on whichever cell produced output just before this one
# and is undefined outside IPython. Presumably the helper ignores `file_path`
# when `df` is supplied -- confirm in GeoBound_ChoroplethMap and pass an
# explicit value instead.
_ = gcm.bound_load_file_output_geojson(file_path=_, df=green_us, full_state=True, output=True, output_folder='', output_filename='Greenspace_US.geojson')

Be aware of large dataset!


In [29]:
# load merged data
# The first CSV column is used as the row index (index_col=0).
# NOTE(review): the recorded execution counts show this cell (29) ran before
# the GeoJSON export cell placed above it (31); re-check with
# Restart Kernel -> Run All that the notebook works top to bottom.
merged_df = pd.read_csv('uc_group_merged_greenspace_mh.csv', index_col = 0)
merged_df.head(3)

Unnamed: 0,Population2010,MHLTH_AdjPrev,UC_Grouping,Latitude,Longitude,E_GR_AV14,E_GR_AT14,SDG_A2G14,SDG_OS15MX,P15,B15,BUCAP15,GDP15_SM,E_WR_T_14,State,INCM_CMI,DEV_CMI,E_BM_NM_LST,Cities in Urban Center_copy
0,212237,15.6,485,33.509025,-86.823651,0.494568,219.99623,0.773812,74.85,196387.767,152.894608,778.534274,6184143000.0,17.497644,AL,HIC,MDR,Temperate Broadleaf and Mixed Forests,Birmingham;
1,180105,13.4,501,34.726065,-86.609995,0.521522,88.700999,0.802599,66.37,86467.06209,59.674004,690.135667,2498489000.0,16.321889,AL,HIC,MDR,Temperate Broadleaf and Mixed Forests,Huntsville
2,195111,15.0,422,30.692377,-88.093685,0.467515,122.669298,0.822213,63.32,118578.6789,71.298004,601.271703,4072112000.0,20.312027,AL,HIC,MDR,Temperate Coniferous Forests,Mobile


In [32]:
# Peek at the first 10 lines of the exported GeoJSON.
# The context manager closes the handle (the bare open() never did), and
# iterating the file stops after 10 lines instead of loading the whole large
# file the way readlines() does. GeoJSON is UTF-8, so decode it as such
# regardless of the platform default.
with open('Greenspace_US.geojson', 'r', encoding='utf-8') as f:
    geojson_head = [line for _line_no, line in zip(range(10), f)]
geojson_head

['{\n',
 '"type": "FeatureCollection",\n',
 '"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },\n',
 '"features": [\n',
 '{ "type": "Feature", "properties": { "UC_Grouping": 0, "Latitude": 21.340677689500001, "Longitude": -157.89349674900001, "Country": "United States", "Urban Center": "Honolulu", "Cities in Urban Center": "Honolulu", "E_GR_AV14": 0.36929038789038199, "E_GR_AT14": 183.81166679399999, "SDG_A2G14": 0.22641503950399999, "SDG_OS15MX": 56.41, "P15": 512853.66667499999, "B15": 80.647377014200003, "BUCAP15": 157.25221881900001, "INCM_CMI": "HIC", "DEV_CMI": "MDR", "GDP15_SM": 21926684672.0, "E_BM_NM_LST": "Tropical and Subtropical Dry Broadleaf Forests", "E_WR_T_14": 23.526622295399999, "Cities in Urban Center_copy": "Honolulu; Waipahu; Pearl City; Aiea" }, "geometry": { "type": "MultiPolygon", "coordinates": [ [ [ [ -158.012436513808552, 21.422192591895211 ], [ -157.991578163004789, 21.422192591895211 ], [ -157.986010505889595, 21.413880829

In [46]:
# --- Map view: centre and zoom of the initial folium view -------------------
lat = 39.5
lon = -98.35
zoom_start = 3

# --- Join definition ---------------------------------------------------------
# geo_col = [key column, value column] in the data frame;
# key     = path to the matching property inside each GeoJSON feature.
geo_col = ['UC_Grouping', 'MHLTH_AdjPrev']
key = 'feature.properties.UC_Grouping'

# --- Styling -----------------------------------------------------------------
color = 'YlGnBu'
opacity = 0.4
weight = 1
legend = 'Average Mental Health Prevalence (%)'

boundary_file = 'Greenspace_US.geojson'
df = merged_df

m = folium.Map(location=[lat, lon], zoom_start=zoom_start)

# Context manager so the file handle is closed after parsing; GeoJSON is UTF-8.
with open(boundary_file, 'r', encoding='utf-8') as boundary_fh:
    geodata = json.load(boundary_fh)

# The exported file has one feature per *city*: the cleaning step exploded the
# city list, so an urban center with several cities appears several times with
# the same geometry and the same UC_Grouping. Drawing those copies on top of
# each other compounds the fill opacity and bloats the map, so keep only the
# first feature for each UC_Grouping.
unique_features = {}
for feature in geodata['features']:
    unique_features.setdefault(feature['properties']['UC_Grouping'], feature)
geodata['features'] = list(unique_features.values())

cp = folium.Choropleth(
    geo_data=geodata,
    data=df,
    columns=geo_col,
    key_on=key,
    fill_color=color,
    fill_opacity=opacity,
    line_weight=weight,
    legend_name=legend
).add_to(m)


display(m)