In [1]:
# read data
import pandas as pd
# Set the maximum number of rows and columns to display
pd.set_option('display.max_rows', 200)  # Adjust this number as needed
pd.set_option('display.max_columns', 50)  # Adjust this number as needed

In [2]:
# Get shapefiles
import geopandas as gpd
poly = gpd.GeoDataFrame.from_file('../../data_CityEvent/Shp/US_blck_grp_2019.shp')
# poly = gpd.GeoDataFrame.from_file(r'data_CityEvent\\Shp\\US_blck_grp_2019.shp')
poly = poly.to_crs(epsg=4326)

In [3]:
print(poly.shape)

(219773, 16)


In [4]:
california = poly[poly['STATEFP']=='06']
print(california.shape)

(23192, 16)


In [5]:
import os 

# List all files in the folder
folder_path = '../../data_CityEvent/processed/1.4_events_cbgid_by_category/unscheduled_intermediate/'
unscheduleds = os.listdir(folder_path)

In [6]:
import json 

# Function to extract geometry type from GEO
def get_geometry_type(geo_json):
    try:
        geo = json.loads(geo_json)
        return geo.get("type", None)
    except json.JSONDecodeError:
        return None

def get_dfs_by_geometry_type(df):
    # Create a new column for geometry type
    df['geometry_type'] = df['GEO'].apply(get_geometry_type)

    # Split dataset based on geometry type
    df_points = df[df['geometry_type'] == 'Point'].reset_index(drop=True)
    df_polygons = df[df['geometry_type'] == 'Polygon'].reset_index(drop=True)
    df_multipolygons = df[df['geometry_type'] == 'MultiPolygon'].reset_index(drop=True)

    # Print the results
    print("Points:", df_points.shape)
    # display(df_points[:1])
    print("\nPolygons:", df_polygons.shape)
    # display(df_polygons[:1])
    print("\nMultiPolygons:", df_multipolygons.shape)
    # display(df_multipolygons[:1])

    return df_points, df_polygons, df_multipolygons

In [7]:
geometry_types = {
    'dataframe': [],
    'shape': [],
    'point': [],
    'polygon': [],
    'multipolygon': []
}
geometry_types = pd.DataFrame(geometry_types)

for unscheduled in unscheduleds:
    df = pd.read_csv(f'../../data_CityEvent/processed/1.4_events_cbgid_by_category/unscheduled_intermediate/{unscheduled}')
    print(df.shape)
    df_points, df_polygons, df_multipolygons = get_dfs_by_geometry_type(df)
    
    # Create a new row as a dictionary
    current_geometry = {
        'dataframe': unscheduled,
        'shape': df.shape,
        'point': df_points.shape,
        'polygon': df_polygons.shape,
        'multipolygon': df_multipolygons.shape
    }
    
    # Append the new row to the DataFrame using pd.concat
    geometry_types = pd.concat([geometry_types, pd.DataFrame([current_geometry])], ignore_index=True)


(556078, 34)
Points: (556078, 35)

Polygons: (0, 35)

MultiPolygons: (0, 35)
(24768, 34)
Points: (23357, 35)

Polygons: (1231, 35)

MultiPolygons: (180, 35)
(578, 34)
Points: (578, 35)

Polygons: (0, 35)

MultiPolygons: (0, 35)
(151229, 34)
Points: (2344, 35)

Polygons: (139464, 35)

MultiPolygons: (9421, 35)
(1880, 34)
Points: (1880, 35)

Polygons: (0, 35)

MultiPolygons: (0, 35)


In [8]:
geometry_types

Unnamed: 0,dataframe,shape,point,polygon,multipolygon
0,unscheduled_intermediate_airport-delays.csv,"(556078, 35)","(556078, 35)","(0, 35)","(0, 35)"
1,unscheduled_intermediate_disasters.csv,"(24768, 35)","(23357, 35)","(1231, 35)","(180, 35)"
2,unscheduled_intermediate_health-warnings.csv,"(578, 35)","(578, 35)","(0, 35)","(0, 35)"
3,unscheduled_intermediate_severe-weather.csv,"(151229, 35)","(2344, 35)","(139464, 35)","(9421, 35)"
4,unscheduled_intermediate_terror.csv,"(1880, 35)","(1880, 35)","(0, 35)","(0, 35)"


In [15]:
import os 

# List all files in the folder
folder_path = '../../data_CityEvent/processed/1.4_events_cbgid_by_category/'
scheduleds = os.listdir(folder_path)
scheduleds.remove('archive_before_dropping_missing_geometry')
scheduleds.remove('unscheduled_intermediate')
print(scheduleds)



In [16]:
for schedulued in scheduleds:
    df = pd.read_csv(f'../../data_CityEvent/processed/1.4_events_cbgid_by_category/{schedulued}')
    print(df.shape)
    df_points, df_polygons, df_multipolygons = get_dfs_by_geometry_type(df)
    
    # Create a new row as a dictionary
    current_geometry = {
        'dataframe': schedulued,
        'shape': df.shape,
        'point': df_points.shape,
        'polygon': df_polygons.shape,
        'multipolygon': df_multipolygons.shape
    }
    
    # Append the new row to the DataFrame using pd.concat
    geometry_types = pd.concat([geometry_types, pd.DataFrame([current_geometry])], ignore_index=True)

(35060, 36)
Points: (35057, 37)

Polygons: (3, 37)

MultiPolygons: (0, 37)
(96444, 36)
Points: (96438, 37)

Polygons: (6, 37)

MultiPolygons: (0, 37)
(10407, 36)
Points: (10407, 37)

Polygons: (0, 37)

MultiPolygons: (0, 37)
(5381, 36)
Points: (5380, 37)

Polygons: (1, 37)

MultiPolygons: (0, 37)
(8490, 36)
Points: (7459, 37)

Polygons: (1031, 37)

MultiPolygons: (0, 37)
(29042, 36)
Points: (29040, 37)

Polygons: (2, 37)

MultiPolygons: (0, 37)
(16271, 36)
Points: (16005, 37)

Polygons: (266, 37)

MultiPolygons: (0, 37)
(5275, 36)
Points: (5275, 37)

Polygons: (0, 37)

MultiPolygons: (0, 37)
(6, 36)
Points: (6, 37)

Polygons: (0, 37)

MultiPolygons: (0, 37)
(1282, 36)
Points: (1282, 37)

Polygons: (0, 37)

MultiPolygons: (0, 37)
(6, 36)
Points: (6, 37)

Polygons: (0, 37)

MultiPolygons: (0, 37)
(96, 36)
Points: (96, 37)

Polygons: (0, 37)

MultiPolygons: (0, 37)
(13027, 36)
Points: (0, 37)

Polygons: (11693, 37)

MultiPolygons: (1334, 37)
(475296, 36)
Points: (475296, 37)

Polygons: (0

In [17]:
geometry_types

Unnamed: 0,dataframe,shape,point,polygon,multipolygon
0,unscheduled_intermediate_airport-delays.csv,"(556078, 35)","(556078, 35)","(0, 35)","(0, 35)"
1,unscheduled_intermediate_disasters.csv,"(24768, 35)","(23357, 35)","(1231, 35)","(180, 35)"
2,unscheduled_intermediate_health-warnings.csv,"(578, 35)","(578, 35)","(0, 35)","(0, 35)"
3,unscheduled_intermediate_severe-weather.csv,"(151229, 35)","(2344, 35)","(139464, 35)","(9421, 35)"
4,unscheduled_intermediate_terror.csv,"(1880, 35)","(1880, 35)","(0, 35)","(0, 35)"
5,attended_community.csv,"(35060, 37)","(35057, 37)","(3, 37)","(0, 37)"
6,attended_concerts.csv,"(96444, 37)","(96438, 37)","(6, 37)","(0, 37)"
7,attended_conferences.csv,"(10407, 37)","(10407, 37)","(0, 37)","(0, 37)"
8,attended_expos.csv,"(5381, 37)","(5380, 37)","(1, 37)","(0, 37)"
9,attended_festivals.csv,"(8490, 37)","(7459, 37)","(1031, 37)","(0, 37)"


In [47]:
import json
print(type(json.loads(df['GEO'][0])))
print(json.loads(df['GEO'][0]))
print(json.loads(df['GEO'][0])['type'])

<class 'dict'>
{'coordinates': [-148.464206, 70.196561], 'type': 'Point'}
Point


In [54]:
# Function to extract geometry type from GEO
def get_geometry_type(geo_json):
    try:
        geo = json.loads(geo_json)
        return geo.get("type", None)
    except json.JSONDecodeError:
        return None


In [49]:

# Create a new column for geometry type
df['geometry_type'] = df['GEO'].apply(get_geometry_type)

# Split dataset based on geometry type
df_point = df[df['geometry_type'] == 'Point'].reset_index(drop=True)
df_polygon = df[df['geometry_type'] == 'Polygon'].reset_index(drop=True)
df_multipolygon = df[df['geometry_type'] == 'MultiPolygon'].reset_index(drop=True)

# Print the results
print("Points:", df_point.shape)
display(df_point[:1])
print("\nPolygons:", df_polygon.shape)
display(df_polygon[:1])
print("\nMultiPolygons:", df_multipolygon.shape)
display(df_multipolygon[:1])

Points: (556078, 35)


Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry_type
0,WNgy97K76LxgUwb5Sb,2022-09-14 20:00:24.152,2022-09-14 21:30:42.448,Moderate Delays - Prudhoe Bay/Deadhorse Airpor...,airport-delays,"[""airport"",""delay""]",,2022-09-14 19:00:00,2022-09-14 23:30:00,,America/Anchorage,"[{""entity_id"":""maSP9vgtdgaa88Z6Fz7jJi"",""format...","{""coordinates"":[-148.464206,70.196561],""type"":...",,locality,,US,"[[""6295630"",""6255149"",""6252001"",""5879092"",""587...",,20,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2022-09-14 23:00:00,2022-09-15 03:30:00,,America/New_York,Point



Polygons: (0, 35)


Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry_type



MultiPolygons: (0, 35)


Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry_type


In [34]:
import pandas as pd
import json
from shapely.geometry import shape, Point
import geopandas as gpd

# Function to parse GEO column and create geometry
def parse_geo(geo):
    if isinstance(geo, str):
        # If geo is a string, parse it as JSON
        geo_dict = json.loads(geo)
    elif isinstance(geo, dict):
        # If it's already a dict, no need to parse
        geo_dict = geo
    else:
        raise ValueError("GEO column contains invalid data")
    
    geom = shape(geo_dict)
    
    # If the geometry is not a Point, return the centroid
    if not isinstance(geom, Point):
        return geom.centroid
    return geom

def assign_cbgid(df, poly):
    # Apply the function to create the new 'geometry' column
    df['geometry'] = df['GEO'].apply(parse_geo)

    # Convert to GeoDataFrame and set the initial CRS (assuming WGS84)
    gdf_events = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

    # Perform spatial join between the event points and the polygon census block groups
    gdf_with_cbgid = gpd.sjoin(gdf_events, poly, how='left', predicate='within')

    # Merge the original df with the gdf_with_cbgid based on 'EVENT_ID'
    merged_events = df.merge(
        gdf_with_cbgid[['EVENT_ID', 'GEOID']],  # Selecting only relevant columns
        on='EVENT_ID',  # Merging on 'EVENT_ID'
        how='left'  # Keep all rows from the original DataFrame
    )

    return merged_events

In [36]:
def check_cbgid(df, poly, california):
    print('df shape is: ', df.shape)
    df_cbgid_assigned = assign_cbgid(df, poly)
    df_cbgid_assigned_cal = assign_cbgid(df, california)

    print('remaining with poly:', df_cbgid_assigned['GEOID'].isna().sum())
    print('remaining with california:', df_cbgid_assigned_cal['GEOID'].isna().sum())
    

In [37]:
check_cbgid(df_point, poly, california)

df shape is:  (23357, 36)
remaining with poly: 17312
remaining with california: 22048


In [38]:
check_cbgid(df_multipolygon, poly, california)

df shape is:  (180, 36)
remaining with poly: 10
remaining with california: 169


In [39]:
check_cbgid(df_polygon, poly, california)

df shape is:  (1231, 36)
remaining with poly: 2
remaining with california: 1125


In [40]:
df_multipolygon[:2]

Unnamed: 0,EVENT_ID,CREATE_DT,UPDATE_DT,TITLE,CATEGORY,LABELS,DESCRIPTION,EVENT_START,EVENT_END,PREDICTED_END,TIMEZONE,ENTITIES,GEO,IMPACT_PATTERNS,SCOPE,PLACEKEY,COUNTRY_CODE,PLACE_HIERARCHIES,PHQ_ATTENDANCE,PHQ_RANK,LOCAL_RANK,AVIATION_RANK,STATUS,BRAND_SAFE,PARENT_EVENT_ID,PREDICTED_EVENT_SPEND_ACCOMMODATION,PREDICTED_EVENT_SPEND_HOSPITALITY,PREDICTED_EVENT_SPEND_TRANSPORTATION,ROW_INSERTED_DT,ROW_UPDATED_DT,EVENT_START_BOSTON,EVENT_END_BOSTON,PREDICTED_END_BOSTON,NEW_TIMEZONE,geometry,geometry_type
0,Uvmd3Znm3fYB7UKRVU,2021-02-23 01:56:06.925,2022-03-05 02:10:17.053,Avalanche Warning,disasters,"[""avalanche"",""disaster"",""disaster-warning""]",The following message is transmitted at the re...,2021-02-23 01:14:00.000,2021-02-24 01:00:00.000,,America/Denver,,"{""coordinates"":[[[[-113.82699659999997,46.6605...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5667009"",""565...",,13,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2021-02-23 03:14:00,2021-02-24 03:00:00,,America/New_York,POINT (-114.01693074590078 46.58368479462088),MultiPolygon
1,HgR6H8wPJHWq3LMZPk,2022-12-15 04:45:16.324,2022-12-15 04:45:29.229,Avalanche Warning,disasters,"[""avalanche"",""disaster"",""disaster-warning""]",The following message is transmitted at the re...,2022-12-15 04:31:00.000,2022-12-15 22:15:00.000,,America/Anchorage,,"{""coordinates"":[[[[-146.64622854699996,61.0239...",,county,,US,"[[""6295630"",""6255149"",""6252001"",""5879092"",""587...",,13,,,active,True,,,,,2022-12-21 04:54:20.629,2023-01-31 20:57:26.060,2022-12-15 08:31:00,2022-12-16 02:15:00,,America/New_York,POINT (-146.35876933227496 61.1545788350478),MultiPolygon


In [23]:
print(type(df['GEO'][0]))

<class 'str'>


In [24]:
df['GEO'] = df['GEO'].apply(lambda x: json.loads(x) if pd.notnull(x) else None)

In [25]:
print(type(df['GEO'][0]))

<class 'dict'>


In [28]:
df['GEO'][0]['type']

'MultiPolygon'

In [30]:
print(type_counts)

GEO
Point           23357
Polygon          1231
MultiPolygon      180
Name: count, dtype: int64


In [35]:
print(poly.shape)

(219773, 16)


In [39]:
from shapely.geometry import shape, MultiPolygon, Polygon

In [41]:
gdf = gpd.GeoDataFrame(df, crs="EPSG:4326")

# Explode MultiPolygon into individual polygons
exploded_gdf = gdf.explode('GEO', ignore_index=True)

print(exploded_gdf.shape)

ValueError: Assigning CRS to a GeoDataFrame without a geometry column is not supported. Supply geometry using the 'geometry=' keyword argument, or by providing a DataFrame with column name 'geometry'