In [None]:
# Import pandas and configure its display options
import pandas as pd
# Show up to 200 rows and 50 columns before pandas truncates displayed frames.
# Tune either limit as needed.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50

In [None]:
# Get shapefiles
import geopandas as gpd
# Read the block-group shapefile, reproject it to EPSG:4326, and keep the rows
# whose state FIPS code is '06' (California).
poly = gpd.read_file('../../data_CityEvent/Shp/US_blck_grp_2019.shp').to_crs(epsg=4326)
california = poly.loc[poly['STATEFP'] == '06']

In [None]:
import pandas as pd
import geopandas as gpd
import json
from shapely.geometry import shape

def parse_geometry(geo):
    """Turn a GeoJSON string from the GEO field into a Shapely geometry.

    Returns the geometry when it is a Point, Polygon, or MultiPolygon.
    Returns None when the value is missing, when decoding or conversion
    raises (the error is printed), or when the geometry is of any other
    type (the type is printed).
    """
    if pd.isna(geo):
        return None

    accepted_types = {"Point", "Polygon", "MultiPolygon"}
    try:
        # Decode the JSON text, then let shapely build the geometry from it.
        parsed = shape(json.loads(geo))

        # Reject anything outside the geometry types this notebook handles.
        if parsed.geom_type not in accepted_types:
            print(f"Unsupported geometry type: {parsed.geom_type}")
            return None
        return parsed
    except Exception as err:
        # Best effort: report the bad value's error and carry on with None.
        print(f"Error parsing geometry: {err}")
        return None

def assign_cbgid(df, shapefile):
    """Attach a block-group GEOID to each event through a spatial join.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table with a ``GEO`` column readable by ``parse_geometry``.
        The frame passed in is left unmodified.
    shapefile : geopandas.GeoDataFrame
        Polygons with ``GEOID`` and ``geometry`` columns, handled as
        EPSG:4326.

    Returns
    -------
    geopandas.GeoDataFrame
        Rows of ``df`` whose parsed geometry intersects a shapefile polygon,
        with the matching ``GEOID`` added. Because the join is an inner
        ``intersects`` join, an event touching several polygons appears once
        per match and events matching none are dropped.
    """
    # Build the geometry column on a new frame: assigning into ``df`` directly
    # would add a column to the caller's DataFrame as a side effect.
    events = df.assign(geometry=df['GEO'].apply(parse_geometry))
    # Rows whose GEO value could not be parsed have no geometry to join on.
    events = events.dropna(subset=['geometry'])
    events = gpd.GeoDataFrame(events, geometry='geometry', crs='EPSG:4326')

    block_groups = gpd.GeoDataFrame(shapefile, geometry='geometry', crs="EPSG:4326")
    block_groups = block_groups[['GEOID', 'geometry']]

    assigned_df = gpd.sjoin(events, block_groups, how='inner', predicate='intersects')
    # Shapes before the join, after the join, and after dropping the join's
    # bookkeeping column, for a quick visual check of how many rows matched.
    print(events.shape)
    print(assigned_df.shape)
    assigned_df = assigned_df.drop(columns=['index_right'])
    print(assigned_df.shape)
    return assigned_df

In [None]:
# Load the attended-events CSV for California into a DataFrame.
df_attended = pd.read_csv(
    filepath_or_buffer='../../data_CityEvent/Cityevents/Demand_Intelligence_for_Attended_Events_California-0.csv'
)

In [None]:
# Save function
def save_df_by_event_category(
    df,
    output_dir='../../data_CityEvent/processed/1.4_updated_events_cbgid_assigned_by_category',
    prefix='attended',
):
    """Write one CSV per distinct value of the ``CATEGORY`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``CATEGORY`` column.
    output_dir : str, optional
        Directory that receives the files; created if it does not exist.
        Defaults to the notebook's by-category output folder.
    prefix : str, optional
        Filename prefix; each file is named ``<prefix>_<category>.csv``.

    Rows whose ``CATEGORY`` is missing are not written to any file.
    """
    import os  # local import: the notebook's imports are spread across cells

    # Without this, to_csv fails when the target folder has not been created.
    os.makedirs(output_dir, exist_ok=True)

    # dropna(): a NaN category never compares equal to itself, so keeping it
    # would only produce an empty "<prefix>_nan.csv".
    categories = df['CATEGORY'].dropna().unique()
    for category in categories:
        df_by_category = df[df['CATEGORY'] == category]
        print(f'{category} shape: {df_by_category.shape}')
        # index=False keeps the DataFrame index out of the saved file.
        df_by_category.to_csv(os.path.join(output_dir, f'{prefix}_{category}.csv'), index=False)

    print('Successful')

In [None]:
from utils.time.time_converted_to_boston import convert_time_of_dataframe


# Keep only events with no cancellation and no postponement timestamp, then
# drop those two columns.
df_attended = (
    df_attended
    .loc[lambda events: events['CANCELLED_DT'].isna() & events['POSTPONED_DT'].isna()]
    .drop(columns=['CANCELLED_DT', 'POSTPONED_DT'])
)

# Parse the timestamp columns; values that cannot be parsed become NaT.
datetime_columns = ['CREATE_DT', 'UPDATE_DT', 'EVENT_START', 'EVENT_END', 'PREDICTED_END', 'ROW_INSERTED_DT', 'ROW_UPDATED_DT']
df_attended = df_attended.assign(**{
    column: pd.to_datetime(df_attended[column], errors='coerce')
    for column in datetime_columns
})

# NOTE(review): convert_time_of_dataframe is a project helper; judging by its
# module name it presumably converts timestamps to Boston time - confirm.
df_attended_converted = convert_time_of_dataframe(df_attended)

# Attach a GEOID from the California polygons to each event.
df_attended_cbgid = assign_cbgid(df_attended_converted, california)

# Save the full joined table, then one file per event category.
df_attended_cbgid.to_csv('../../data_CityEvent/processed/1.4_updated_events_cbgid_assigned/df_attended.csv', index=False)
save_df_by_event_category(df_attended_cbgid)
