In [1]:
# Notebook setup: import pandas and widen its display limits for wide/long tables
import pandas as pd
# Set the maximum number of rows and columns to display
pd.set_option('display.max_rows', 200)  # Adjust this number as needed
pd.set_option('display.max_columns', 50)  # Adjust this number as needed

In [7]:
# Load the 2019 US block-group polygons, reproject them to EPSG:4326,
# and keep the rows whose STATEFP is '06' (stored as `california`).
import geopandas as gpd
poly = gpd.read_file('../../data_CityEvent/Shp/US_blck_grp_2019.shp').to_crs(epsg=4326)
# Alternative path kept from the original cell:
# poly = gpd.GeoDataFrame.from_file(r'data_CityEvent\\Shp\\US_blck_grp_2019.shp')
print(poly.shape)
california = poly.loc[poly['STATEFP'] == '06']
print(california.shape)

(219773, 16)
(23192, 16)


In [8]:
# Preview one block-group row to check the available columns (GEOID is the id joined onto events later)
display(california[:1])

Unnamed: 0,GISJOIN,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,GEOID,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area,geometry
10295,G06000104001001,6,1,400100,1,60014001001,Block Group 1,G5030,S,6894339.0,0.0,37.8676275,-122.231946,14302.721555,6894335.0,"POLYGON ((-122.2132 37.8576, -122.21313 37.857..."


In [None]:
import pandas as pd
import json
from shapely.geometry import shape, Point
import geopandas as gpd

# Function to parse GEO column and create geometry
def parse_geo(geo):
    """Convert one GEO value into a Shapely geometry.

    Parameters
    ----------
    geo : str or dict
        A GeoJSON-style geometry, either as a JSON string or as an
        already-decoded dict.

    Returns
    -------
    The object built by ``shapely.geometry.shape`` from the decoded mapping.
    Non-point geometries are returned unchanged; take ``.centroid`` at the
    call site if a point representation is needed.

    Raises
    ------
    ValueError
        If ``geo`` is neither a string nor a dict (for example a NaN coming
        from a missing cell). ``json.JSONDecodeError``, raised for a string
        that is not valid JSON, is a subclass of ``ValueError``.
    """
    if isinstance(geo, str):
        # If geo is a string, parse it as JSON
        geo_dict = json.loads(geo)
    elif isinstance(geo, dict):
        # If it's already a dict, no need to parse
        geo_dict = geo
    else:
        raise ValueError("GEO column contains invalid data")

    # The geometry has to be returned explicitly: without this the function
    # yields None for every row and the derived 'geometry' column is empty.
    return shape(geo_dict)

def assign_cbgid(df, poly):
    """Attach block-group GEOIDs to events through a spatial join.

    A 'geometry' column parsed from 'GEO' is written onto ``df`` itself, the
    events are joined to ``poly`` with the 'within' predicate, and ``df`` is
    returned left-merged with the matched GEOIDs on 'EVENT_ID'. Events with
    no matching polygon keep a missing GEOID. Event coordinates are assumed
    to be WGS84 (EPSG:4326).

    NOTE: a later cell defines another ``assign_cbgid`` with a different
    implementation; once that cell runs, it replaces this one.
    """
    # Parse GEO into geometries (this modifies the caller's frame)
    df['geometry'] = df['GEO'].apply(parse_geo)

    events_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
    joined = gpd.sjoin(events_gdf, poly, how='left', predicate='within')

    # Only the event key and the matched block-group id are carried back
    cbgid_lookup = joined[['EVENT_ID', 'GEOID']]
    return df.merge(cbgid_lookup, on='EVENT_ID', how='left')

In [68]:
# Load the events table for the "unscheduled_intermediate / disasters" category
# (plain string literal: the path has no placeholders, so no f-string is needed)
df = pd.read_csv('../../data_CityEvent/processed/1.4_events_cbgid_by_category/unscheduled_intermediate/unscheduled_intermediate_disasters.csv')
print(df.shape)

(24768, 34)


In [14]:
import json 

# Function to extract geometry type from GEO
def get_geometry_type(geo_json):
    """Return the ``"type"`` field of one GEO value, or None if unavailable.

    Parameters
    ----------
    geo_json : str, bytes, dict or missing value
        A GeoJSON-style geometry as a JSON document, or an already-decoded
        dict. Anything else (NaN, None, numbers, ...) is treated as missing.

    Returns
    -------
    The value stored under ``"type"`` (e.g. ``"Point"``), or None when the
    input is missing, is not valid JSON, does not decode to a JSON object,
    or has no ``"type"`` key.
    """
    if isinstance(geo_json, dict):
        geo = geo_json
    else:
        try:
            geo = json.loads(geo_json)
        except (TypeError, ValueError):
            # TypeError: non-text input such as NaN/None from an empty cell.
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            return None

    # Valid JSON can still be a list/number/string, which has no .get()
    if not isinstance(geo, dict):
        return None
    return geo.get("type", None)

def get_dfs_by_geometry_type(df):
    """Split an events frame into Point, Polygon and MultiPolygon subsets.

    A 'geometry_type' column derived from 'GEO' is written onto ``df``
    itself. Rows whose type is anything else (or could not be determined)
    end up in none of the three subsets. The shape of each subset is printed.

    Returns
    -------
    tuple
        ``(df_points, df_polygons, df_multipolygons)``, each with its index
        reset.
    """
    # Tag every row with its geometry type (modifies the caller's frame)
    df['geometry_type'] = df['GEO'].apply(get_geometry_type)

    def _rows_of(kind):
        # Rows of one geometry type, renumbered from zero
        return df[df['geometry_type'] == kind].reset_index(drop=True)

    df_points = _rows_of('Point')
    df_polygons = _rows_of('Polygon')
    df_multipolygons = _rows_of('MultiPolygon')

    # Report how many rows landed in each subset
    print("Points:", df_points.shape)
    print("\nPolygons:", df_polygons.shape)
    print("\nMultiPolygons:", df_multipolygons.shape)

    return df_points, df_polygons, df_multipolygons

In [63]:
# Split the loaded events by GEO geometry type (the call also adds a 'geometry_type' column to df)
df_points, df_polygons, df_multipolygons = get_dfs_by_geometry_type(df)

Points: (23357, 35)

Polygons: (1231, 35)

MultiPolygons: (180, 35)


In [62]:
import pandas as pd
import geopandas as gpd
import json
from shapely.geometry import shape

# Step 1: Parse 'GEO' field into geometry
def parse_geometry(geo):
    """Turn one GEO value into a Shapely Point, Polygon or MultiPolygon.

    Missing values give None. A value that fails to parse, or that parses to
    any other geometry type, is reported with ``print`` and also gives None,
    so one bad row does not abort a whole ``.apply``.
    """
    if pd.isna(geo):
        return None

    supported_types = {"Point", "Polygon", "MultiPolygon"}
    try:
        # Decode the JSON text, then let shapely build the geometry from it
        geometry = shape(json.loads(geo))

        # Anything outside the supported set is rejected
        if geometry.geom_type not in supported_types:
            print(f"Unsupported geometry type: {geometry.geom_type}")
            return None
        return geometry
    except Exception as e:
        print(f"Error parsing geometry: {e}")
        return None

def assign_cbgid(df, shapefile):
    """Assign block-group GEOIDs to events by spatial intersection.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with a 'GEO' column holding GeoJSON text. A 'geometry' column
        is written onto this frame in place (kept for backward compatibility).
    shapefile : geopandas.GeoDataFrame or pandas.DataFrame
        Polygons with 'GEOID' and 'geometry' columns. If it carries no CRS it
        is taken to be EPSG:4326; if it carries a different CRS it is
        reprojected to EPSG:4326 before joining.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per (event, intersecting polygon) pair, with the polygon's
        'GEOID' appended. Events whose GEO could not be parsed, or that
        intersect no polygon, are dropped (inner join). An event that
        overlaps several polygons appears once per polygon.

    Notes
    -----
    Prints three shapes for inspection: the parsed events, the join result,
    and the join result after dropping 'index_right'.
    An earlier cell defines another function with this same name; this
    definition replaces it once the cell runs.
    """
    target_crs = 'EPSG:4326'

    # Parse GEO and discard rows without a usable geometry
    df['geometry'] = df['GEO'].apply(parse_geometry)
    df = df.dropna(subset=['geometry'])
    df = gpd.GeoDataFrame(df, geometry='geometry', crs=target_crs)

    # Keep only the join key and geometry, and make sure the polygons are
    # really expressed in the same CRS as the events: label them when the
    # CRS is missing, reproject them when it differs.
    shapefile = gpd.GeoDataFrame(shapefile[['GEOID', 'geometry']], geometry='geometry')
    if shapefile.crs is None:
        shapefile = shapefile.set_crs(target_crs)
    elif shapefile.crs != target_crs:
        shapefile = shapefile.to_crs(target_crs)

    assigned_df = gpd.sjoin(df, shapefile, how='inner', predicate='intersects')
    print(df.shape)
    print(assigned_df.shape)
    # 'index_right' is bookkeeping added by sjoin; it is not needed downstream
    assigned_df = assigned_df.drop(columns=['index_right'])
    print(assigned_df.shape)
    return assigned_df

In [64]:
# Assign block groups to the point events; the inner join keeps only events that intersect a block group in `california`
df_points = assign_cbgid(df_points, california)
display(df_points[:2])

(23357, 36)
(1309, 38)
(1309, 37)


Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry_type,geometry,GEOID
44,8zgiLS2SoFFrStHZpq,2021-08-25 01:36:10.386,2022-04-06 03:36:28.116,Outdoor fire - North America - United States,disasters,"[""disaster"",""fire"",""wildfire""]",Surface: 647; Total surface: 41098; Frp: 285.5...,2021-08-24 20:17:00.000,2021-08-24 20:17:00.000,,America/Los_Angeles,,"{""coordinates"":[-120.249438,38.7734],""type"":""P...",,locality,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""534...",,21,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2021-08-24 23:17:00,2021-08-24 23:17:00,,America/New_York,Point,POINT (-120.24944 38.7734),60170319001
59,HBMXcacXB355TR3Ei5,2021-06-17 23:15:50.623,2022-04-06 03:38:04.609,Outdoor fire - North America - United States,disasters,"[""disaster"",""fire"",""wildfire""]","The Dairy Fire, located south of Red Bluff adj...",2021-06-23 19:53:30.000,2021-06-23 19:53:30.000,,America/Los_Angeles,,"{""coordinates"":[-122.136944,40.105833],""type"":...",,locality,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""557...",,21,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2021-06-23 22:53:30,2021-06-23 22:53:30,,America/New_York,Point,POINT (-122.13694 40.10583),61030001003


In [65]:
# Number of distinct point events that received a block group
df_points['EVENT_ID'].nunique()

1309

In [59]:
# Assign block groups to the polygon events; one output row per (event, intersecting block group) pair
# NOTE(review): the stored output of this cell still shows an 'index_right' column, which the
# current assign_cbgid drops - it appears to come from an earlier definition; re-run to refresh.
df_polygons = assign_cbgid(df_polygons, california)
display(df_polygons[:2])

(1231, 36)
(54358, 38)
(54358, 38)


Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry_type,geometry,index_right,GEOID
6,EtDA2ovojkqYeHLWKS,2022-12-30 15:45:12.198,2022-12-30 15:45:31.709,Avalanche Watch,disasters,"[""avalanche"",""disaster"",""disaster-warning""]",The Eastern Sierra Avalanche Center in Mammoth...,2022-12-30 15:31:00.000,2022-12-31 15:00:00.000,,America/Los_Angeles,,"{""coordinates"":[[[-118.63999938999996,37.46301...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""535...",,13,,,active,True,,,,,2022-12-31 00:30:14.718,2023-01-31 20:57:26.060,2022-12-30 18:31:00,2022-12-31 18:00:00,,America/New_York,Polygon,"POLYGON ((-118.64 37.46301, -118.644 37.40951,...",13224,60270008002
6,EtDA2ovojkqYeHLWKS,2022-12-30 15:45:12.198,2022-12-30 15:45:31.709,Avalanche Watch,disasters,"[""avalanche"",""disaster"",""disaster-warning""]",The Eastern Sierra Avalanche Center in Mammoth...,2022-12-30 15:31:00.000,2022-12-31 15:00:00.000,,America/Los_Angeles,,"{""coordinates"":[[[-118.63999938999996,37.46301...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""535...",,13,,,active,True,,,,,2022-12-31 00:30:14.718,2023-01-31 20:57:26.060,2022-12-30 18:31:00,2022-12-31 18:00:00,,America/New_York,Polygon,"POLYGON ((-118.64 37.46301, -118.644 37.40951,...",13576,60290052012


In [38]:
# Number of distinct polygon events that intersect at least one block group
df_polygons['EVENT_ID'].nunique()

174

In [36]:
# Assign block groups to the multipolygon events; one output row per (event, intersecting block group) pair
# NOTE(review): the stored output prints two shapes and still shows 'index_right', unlike the
# current assign_cbgid - it appears to come from an earlier definition; re-run to refresh.
df_multipolygons = assign_cbgid(df_multipolygons, california)
display(df_multipolygons[:2])

(180, 36)
(17141, 38)


Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry_type,geometry,index_right,GEOID
9,9N4ZN3qMXRiu7wdWdx,2022-07-20 11:15:13.714,2022-07-21 03:00:21.963,Fire Weather Watch,disasters,"[""disaster"",""disaster-warning"",""fire""]",The National Weather Service in Medford has is...,2022-07-21 21:00:00.000,2022-07-22 03:00:00.000,,America/Los_Angeles,,"{""coordinates"":[[[[-122.85999298099995,42.0033...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""557...",,56,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2022-07-22 00:00:00,2022-07-22 06:00:00,,America/New_York,MultiPolygon,"MULTIPOLYGON (((-122.85999 42.00331, -122.8546...",32562,61050002001
9,9N4ZN3qMXRiu7wdWdx,2022-07-20 11:15:13.714,2022-07-21 03:00:21.963,Fire Weather Watch,disasters,"[""disaster"",""disaster-warning"",""fire""]",The National Weather Service in Medford has is...,2022-07-21 21:00:00.000,2022-07-22 03:00:00.000,,America/Los_Angeles,,"{""coordinates"":[[[[-122.85999298099995,42.0033...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""557...",,56,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2022-07-22 00:00:00,2022-07-22 06:00:00,,America/New_York,MultiPolygon,"MULTIPOLYGON (((-122.85999 42.00331, -122.8546...",32559,61050001021


In [39]:
# Number of distinct multipolygon events that intersect at least one block group
df_multipolygons['EVENT_ID'].nunique()

16

In [69]:
# Run the assignment on the full events table in one pass (all geometry types together).
# The first and last prints show the shape before and after; the three in between come
# from assign_cbgid itself. The call also adds a 'geometry' column to df.
print(df.shape)
whole_df_assigned = assign_cbgid(df, california)
print(whole_df_assigned.shape)

(24768, 34)
(24768, 35)
(72808, 37)
(72808, 36)
(72808, 36)


In [70]:
# Preview the first two assigned rows (GEOID is the appended block-group id)
whole_df_assigned[:2]

Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry,GEOID
47,8zgiLS2SoFFrStHZpq,2021-08-25 01:36:10.386,2022-04-06 03:36:28.116,Outdoor fire - North America - United States,disasters,"[""disaster"",""fire"",""wildfire""]",Surface: 647; Total surface: 41098; Frp: 285.5...,2021-08-24 20:17:00.000,2021-08-24 20:17:00.000,,America/Los_Angeles,,"{""coordinates"":[-120.249438,38.7734],""type"":""P...",,locality,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""534...",,21,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2021-08-24 23:17:00,2021-08-24 23:17:00,,America/New_York,POINT (-120.24944 38.7734),60170319001
62,HBMXcacXB355TR3Ei5,2021-06-17 23:15:50.623,2022-04-06 03:38:04.609,Outdoor fire - North America - United States,disasters,"[""disaster"",""fire"",""wildfire""]","The Dairy Fire, located south of Red Bluff adj...",2021-06-23 19:53:30.000,2021-06-23 19:53:30.000,,America/Los_Angeles,,"{""coordinates"":[-122.136944,40.105833],""type"":...",,locality,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""557...",,21,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2021-06-23 22:53:30,2021-06-23 22:53:30,,America/New_York,POINT (-122.13694 40.10583),61030001003


In [71]:
# Split the assigned rows by geometry type as a cross-check against the per-type runs above.
# get_dfs_by_geometry_type already prints each subset's shape, so the three prints below repeat them.
df_points_assigned, df_polygons_assigned, df_multipolygons_assigned = get_dfs_by_geometry_type(whole_df_assigned)
print(df_points_assigned.shape)
print(df_polygons_assigned.shape)
print(df_multipolygons_assigned.shape)

Points: (1309, 37)

Polygons: (54358, 37)

MultiPolygons: (17141, 37)
(1309, 37)
(54358, 37)
(17141, 37)


In [72]:
# Preview one assigned multipolygon row
df_multipolygons_assigned[:1]

Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry,GEOID,geometry_type
0,9N4ZN3qMXRiu7wdWdx,2022-07-20 11:15:13.714,2022-07-21 03:00:21.963,Fire Weather Watch,disasters,"[""disaster"",""disaster-warning"",""fire""]",The National Weather Service in Medford has is...,2022-07-21 21:00:00.000,2022-07-22 03:00:00.000,,America/Los_Angeles,,"{""coordinates"":[[[[-122.85999298099995,42.0033...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5332921"",""557...",,56,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2022-07-22 00:00:00,2022-07-22 06:00:00,,America/New_York,"MULTIPOLYGON (((-122.85999 42.00331, -122.8546...",61050002001,MultiPolygon
