# Explore GPKG File 
Exploring the GPKG file from GreenSpace downlaod folder

Conclusion:
- The file can be read by geopandas. 
- The ...short_pnt.gpkg file contains geometery POINT data, 
- The ...V1_2.gpkg contains MULTIPOLYGON data,
- Both can be used as area boundary for the choropleth map.

Note: ...V1_2.gpkg file is very large, so it may take few minutes to load.

In [41]:
import geopandas as gpd
import pandas as pd
import os

In [2]:
gpkg_path = 'GreenspaceDownload/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_2.gpkg'

def load_gpkg(gpkg_path):
    print('Loading large file, may take 1-2 minutes...')
    gdf = gpd.read_file(gpkg_path)
    return gdf

In [3]:
gdf = load_gpkg(gpkg_path)
gdf.head()

Loading large file, may take 1-2 minutes...


Unnamed: 0,ID_HDC_G0,QA2_1V,AREA,BBX_LATMN,BBX_LONMN,BBX_LATMX,BBX_LONMX,GCPNT_LAT,GCPNT_LON,CTR_MN_NM,...,EX_SS_P00,EX_SS_P15,EX_EQ19PGA,EX_EQ19MMI,EX_EQ19_Q,EX_HW_IDX,SDG_LUE9015,SDG_A2G14,SDG_OS15MX,geometry
0,1.0,1.0,185.0,21.247683,-158.043016,21.422193,-157.730529,21.340678,-157.893497,United States,...,397443.031445,444041.529529,,,missing,,0.074385,0.226415,56.41,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915..."
1,2.0,2.0,42.0,-17.641184,-149.628088,-17.517631,-149.508018,-17.534103,-149.568053,French Polynesia,...,0.0,0.0,,,missing,,0.128,0.284119,,"MULTIPOLYGON (((-149.56967 -17.51763, -149.508..."
2,3.0,1.0,55.0,34.858517,-120.475511,34.989334,-120.389183,34.923123,-120.434372,United States,...,0.0,0.0,0.0,0.0,available,2.79174,0.48114,0.040129,23.64,"MULTIPOLYGON (((-120.46375 34.98933, -120.4411..."
3,4.0,1.0,48.0,36.582997,-121.952215,36.635743,-121.811816,36.60772,-121.882378,United States,...,0.0,0.0,0.0,0.0,available,,0.44484,0.138683,42.17,"MULTIPOLYGON (((-121.95221 36.63574, -121.9179..."
4,5.0,1.0,60.0,34.38822,-119.853855,34.457831,-119.658413,34.427664,-119.743693,United States,...,0.0,0.0,0.0,0.0,available,4.25502,0.55676,0.061348,36.5,"MULTIPOLYGON (((-119.82444 34.45783, -119.8131..."


The file V1_2.gpkg contains columns that are almost identical to those in the greenspace CSV file, with the addition of a geometry column containing MULTIPOLYGON data.

However, this file takes a long time to be loaded due to its size, it is better to output it as a smaller, more readable geojson file. Additionally, we should only retain the data we are interested in -- data about the United States.

Hence, in the following code, I will filter out data that is not related to the United States and output the result as a geojson file named `Greenspace_US.geojson`.

In [14]:
# filter out US data and output to geojson
def output_US_geojson(gdf, output_path):
    df = gdf[gdf['CTR_MN_NM'] == 'United States'].reset_index(drop=True)
    
    if os.path.exists(output_path):
        print(f'{output_path} already exists')
    else:
        print(f'Output {output_path}...')
        df.to_file(output_path, driver='GeoJSON')
    
    return df

In [31]:
output_path = 'Greenspace_US.geojson'
us_gdf = output_US_geojson(gdf, output_path)

print(us_gdf.shape)
us_gdf.head()

Output Greenspace_US.geojson...
(324, 161)


Unnamed: 0,ID_HDC_G0,QA2_1V,AREA,BBX_LATMN,BBX_LONMN,BBX_LATMX,BBX_LONMX,GCPNT_LAT,GCPNT_LON,CTR_MN_NM,...,EX_SS_P00,EX_SS_P15,EX_EQ19PGA,EX_EQ19MMI,EX_EQ19_Q,EX_HW_IDX,SDG_LUE9015,SDG_A2G14,SDG_OS15MX,geometry
0,1.0,1.0,185.0,21.247683,-158.043016,21.422193,-157.730529,21.340678,-157.893497,United States,...,397443.031445,444041.529529,,,missing,,0.074385,0.226415,56.41,"MULTIPOLYGON (((-158.01244 21.42219, -157.9915..."
1,3.0,1.0,55.0,34.858517,-120.475511,34.989334,-120.389183,34.923123,-120.434372,United States,...,0.0,0.0,0.0,0.0,available,2.79174,0.48114,0.040129,23.64,"MULTIPOLYGON (((-120.46375 34.98933, -120.4411..."
2,4.0,1.0,48.0,36.582997,-121.952215,36.635743,-121.811816,36.60772,-121.882378,United States,...,0.0,0.0,0.0,0.0,available,,0.44484,0.138683,42.17,"MULTIPOLYGON (((-121.95221 36.63574, -121.9179..."
3,5.0,1.0,60.0,34.38822,-119.853855,34.457831,-119.658413,34.427664,-119.743693,United States,...,0.0,0.0,0.0,0.0,available,4.25502,0.55676,0.061348,36.5,"MULTIPOLYGON (((-119.82444 34.45783, -119.8131..."
4,6.0,1.0,57.0,36.943697,-122.067166,36.996536,-121.86873,36.971274,-121.978684,United States,...,0.0,0.0,0.007287,2.0,available,1.5197,0.31759,0.109287,39.46,"MULTIPOLYGON (((-122.02481 36.99654, -122.0133..."


In [17]:
nm_df = us_gdf[['UC_NM_MN','UC_NM_LST']]
nm_df.head()

Unnamed: 0,UC_NM_MN,UC_NM_LST
0,Honolulu,Honolulu; Waipahu; Pearl City; Aiea
1,Santa Maria,Santa Maria
2,Monterey,Monterey
3,Santa Barbara,Santa Barbara
4,Santa Cruz,Santa Cruz


In [35]:
def nm_labels(df):
    nm_dict = {}
    for n in range(len(df)):
        nm_dict[nm_df.loc[n]['UC_NM_MN']] = nm_df.loc[n]['UC_NM_LST'].split(';')
    return nm_dict

nm_dict = nm_labels(nm_df)
nm_dict

{'Honolulu': ['Honolulu', ' Waipahu', ' Pearl City', ' Aiea'],
 'Santa Maria': ['Santa Maria'],
 'Monterey': ['Monterey'],
 'Santa Barbara': ['Santa Barbara'],
 'Santa Cruz': ['Santa Cruz'],
 'Salinas': ['Salinas'],
 'Oxnard': ['Oxnard', ' Ventura'],
 'Watsonville': ['Watsonville'],
 'San Jose': ['San Jose',
  ' San Francisco',
  ' Oakland',
  ' Fremont',
  ' Sunnyvale',
  ' Hayward',
  ' Berkeley',
  ' Santa Clara',
  ' San Mateo',
  ' Richmond',
  ' Mountain View',
  ' Redwood City',
  ' Alameda',
  ' Palo Alto'],
 'Camarillo': ['Camarillo'],
 'Thousand Oaks': ['Thousand Oaks'],
 'San Rafael': ['San Rafael', ' San Anselmo', ' Larkspur'],
 'Los Angeles': ['Los Angeles',
  ' Long Beach',
  ' Anaheim',
  ' Santa Ana',
  ' Riverside',
  ' Irvine',
  ' Glendale',
  ' San Bernardino',
  ' Fontana',
  ' Huntington Beach',
  ' Ontario',
  ' Garden Grove',
  ' Rancho Cucamonga',
  ' Pomona',
  ' Corona',
  ' Torrance',
  ' Pasadena',
  ' Fullerton',
  ' Orange',
  ' Inglewood',
  ' Downey',
 

In [38]:
# apply labels to mental health data 
mhdf = pd.read_csv('MH_cleaned.csv')
mhdf.head()

Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,Population2010,MHLTH_AdjPrev,MHLTH_Adj95CI,Geolocation
0,AL,Birmingham,107000,212237,15.6,"(15.4, 15.8)","[33.5275663773, -86.7988174678]"
1,AL,Hoover,135896,81619,10.4,"(10.1, 10.7)","[33.3767602729, -86.8051937568]"
2,AL,Huntsville,137000,180105,13.4,"(13.2, 13.7)","[34.6989692671, -86.6387042882]"
3,AL,Mobile,150000,195111,15.0,"(14.9, 15.2)","[30.6776248648, -88.1184482714]"
4,AL,Montgomery,151000,205764,14.8,"(14.6, 15.1)","[32.3472645333, -86.2677059552]"


In [70]:
def apply_labels(mhdf, nm_dict):
    for key, values in nm_dict.items():
        mhdf.loc[mhdf['PlaceName'].isin(values), 'MainCity'] = key

    mhdf['MainCity'] = mhdf['MainCity'].fillna('NaN')
    return mhdf

lab_df = apply_labels(mhdf, nm_dict)
print(f" There are {len(lab_df[lab_df['MainCity'] == 'NaN'])} cities with no label.")
lab_df.head()

 There are 259 cities with no label.


Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,Population2010,MHLTH_AdjPrev,MHLTH_Adj95CI,Geolocation,MainCity
0,AL,Birmingham,107000,212237,15.6,"(15.4, 15.8)","[33.5275663773, -86.7988174678]",Birmingham
1,AL,Hoover,135896,81619,10.4,"(10.1, 10.7)","[33.3767602729, -86.8051937568]",
2,AL,Huntsville,137000,180105,13.4,"(13.2, 13.7)","[34.6989692671, -86.6387042882]",Huntsville
3,AL,Mobile,150000,195111,15.0,"(14.9, 15.2)","[30.6776248648, -88.1184482714]",Mobile
4,AL,Montgomery,151000,205764,14.8,"(14.6, 15.1)","[32.3472645333, -86.2677059552]",Montgomery
