In [None]:
# Import pandas and configure how DataFrames are displayed
import pandas as pd
# Display limits applied to every DataFrame rendered in this notebook:
# at most 200 rows and 50 columns are shown (adjust these numbers as needed).
for option_name, option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', 50),
):
    pd.set_option(option_name, option_value)

### Get Mobility Data

In [None]:
import os

# Folder holding the mobility files
mobility_data_path = '../../data_CityEvent/SafeGraph/'

# Names of the mobility files to load.
# Hidden entries (e.g. '.DS_Store', '.ipynb_checkpoints') and sub-folders are
# skipped because every name in this list is later handed to pd.read_csv.
# Sorting makes the processing order identical on every run and machine
# (os.listdir returns entries in arbitrary order).
mobility_data = sorted(
    entry
    for entry in os.listdir(mobility_data_path)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(mobility_data_path, entry))
)

### Get hourly events data by category

In [None]:
# Folder holding the hourly events datasets (one file per event category)
events_data_path = '../../data_CityEvent/processed/3_hourly_events_cbgid_category/'

# Names of the events files to process.
# Only regular '.csv' files are kept: sub-folders and hidden files would make
# pd.read_csv fail, and the save step derives the output name by stripping a
# '.csv' extension. Sorting keeps the processing order reproducible
# (os.listdir returns entries in arbitrary order).
events_data = sorted(
    entry
    for entry in os.listdir(events_data_path)
    if entry.lower().endswith('.csv')
    and os.path.isfile(os.path.join(events_data_path, entry))
)

### Function to add mobility visit counts to the events data

In [None]:
def get_visit_in_events_dataset(events_dataset, mobility_dataset):
    """Add the visit counts of one mobility dataset onto an events dataset.

    Rows are matched on ('BGFIPS', 'Date'). For every events row the matching
    'Visit' total is added to 'Visits_Mobility'; rows without a match are left
    unchanged (they receive +0).

    Parameters
    ----------
    events_dataset : pandas.DataFrame
        Must contain 'BGFIPS' and 'Date'. If 'Visits_Mobility' is missing it
        is started at 0.
    mobility_dataset : pandas.DataFrame
        Must contain 'BGFIPS', 'Date' and 'Visit'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, in the same order, as
        ``events_dataset``, with 'Date' converted to datetime and
        'Visits_Mobility' updated. Neither input frame is modified.
    """
    key_columns = ['BGFIPS', 'Date']

    # Build the lookup table on a copy so the caller's mobility frame keeps
    # its original 'Date' column.
    visits = mobility_dataset[key_columns + ['Visit']].assign(
        Date=lambda frame: pd.to_datetime(frame['Date'])
    )
    # Collapse repeated (BGFIPS, Date) keys into a single total. Without this,
    # the left merge below would emit one events row per duplicate and silently
    # grow the events dataset. dropna=False keeps rows whose key is missing,
    # matching how merge treats missing keys.
    visits = visits.groupby(key_columns, as_index=False, dropna=False)['Visit'].sum()

    # assign() returns a new frame, so the caller's events frame is untouched.
    events_dataset = events_dataset.assign(Date=pd.to_datetime(events_dataset['Date']))
    if 'Visits_Mobility' not in events_dataset.columns:
        events_dataset['Visits_Mobility'] = 0

    # Left merge: every events row is kept exactly once.
    merged = events_dataset.merge(visits, on=key_columns, how='left')

    # Unmatched rows have Visit = NaN; treat that as zero additional visits.
    merged['Visits_Mobility'] = merged['Visits_Mobility'] + merged['Visit'].fillna(0)

    # 'Visit' was only a temporary carrier for the merged values.
    return merged.drop(columns=['Visit'])

### Join mobility visits onto each event-category dataset and save the result

In [None]:
# Destination folder for the joined datasets. It is created up front so the
# first save cannot fail after all the mobility files have already been read.
save_data_path = '../../data_CityEvent/processed/4_events_join_w_mobility/'
os.makedirs(save_data_path, exist_ok=True)

# Loop for events data
for events_data_file_name in events_data:
    print(events_data_file_name)

    # Load the events dataset and rename its columns to the names used as
    # merge keys by get_visit_in_events_dataset.
    events = (
        pd.read_csv(os.path.join(events_data_path, events_data_file_name))
        .rename(columns={'GEOID': 'BGFIPS', 'hourly_times': 'Date'})
    )
    # Running total that each mobility file adds its visits to
    events['Visits_Mobility'] = 0

    # Loop for mobility data.
    # NOTE: every mobility file is re-read for every events file; this keeps
    # only one mobility file in memory at a time at the cost of repeated reads.
    for mobility_file_name in mobility_data:
        print(mobility_file_name)
        mobility = pd.read_csv(os.path.join(mobility_data_path, mobility_file_name))

        # Get Visit counts from mobility dataset on events dataset
        events = get_visit_in_events_dataset(events_dataset=events, mobility_dataset=mobility)

    # Save as '<original file name without extension>_visits.csv'
    output_stem = os.path.splitext(events_data_file_name)[0]
    events.to_csv(os.path.join(save_data_path, f'{output_stem}_visits.csv'), index=False)

### Box Plot Function

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def create_box_plot(df, df_name='event_category'):
    """Draw, save and show a box plot of visit change by number of events.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'num_events' (x axis) and
        'percentage_visit_change' (y axis).
    df_name : str
        Used as the plot title and as the name of the saved PNG file.

    The figure is written to '../../results/1_box_plots/<df_name>.png'
    (the folder is created when missing), shown, and then closed.
    """
    output_dir = '../../results/1_box_plots'
    # savefig does not create missing folders
    os.makedirs(output_dir, exist_ok=True)

    # Create the plot on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x='num_events', y='percentage_visit_change', palette='coolwarm', ax=ax)
    ax.set_xlabel('Number of Events')
    ax.set_ylabel('Percentage Visit Change')
    ax.set_title(df_name)

    # Save the plot
    fig.savefig(f'{output_dir}/{df_name}.png', dpi=300, bbox_inches='tight')

    # Show the plot
    plt.show()

    # Release the figure: this function is called once per dataset in a loop,
    # and unclosed figures would otherwise accumulate in memory.
    plt.close(fig)


### Process data to get average visits for box plot

In [None]:
def data_processing(df, df_name=''):
    """Derive daily visit statistics for an events dataset, save them and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date', 'BGFIPS', 'Visits_Mobility' and 'EVENT_ID'.
        The frame passed in is not modified.
    df_name : str
        Name used for the saved CSV file and for the box plot.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these added columns:
        'date_without_time', 'day_of_week', 'hour',
        'total_visits_on_the_day' (sum of 'Visits_Mobility' over all rows of the
        same BGFIPS and calendar date),
        'avg_visits_perday_perweekday' (mean of that total over all rows of the
        same BGFIPS and weekday),
        'num_events' (distinct EVENT_IDs per BGFIPS and date) and
        'percentage_visit_change' ((total - average) / average, i.e. a fraction,
        not multiplied by 100; NaN where the average is 0).

    Notes
    -----
    Processing is best-effort: any exception is printed rather than raised, and
    the frame is returned in whatever state it reached, so later steps' columns
    may be missing after a failure.

    NOTE(review): both the daily total and the weekday average are computed over
    rows, not over distinct hours or days, so repeated rows for the same hour
    and days with more rows carry more weight - confirm this is intended.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    df = df.copy()

    try:
        # Step 1: Get new columns - day of the week and hour
        df['Date'] = pd.to_datetime(df['Date'])  # Ensure the 'Date' column is in datetime format
        df['date_without_time'] = df['Date'].dt.date  # Extract only the date
        df['day_of_week'] = df['Date'].dt.day_name()  # Get the day name (e.g., Monday, Tuesday)
        df['hour'] = df['Date'].dt.hour  # Get the hour of the day
        print("Step 1: Date processing completed successfully.")

        # Step 2: Calculate total visits on the day
        df['total_visits_on_the_day'] = df.groupby(['BGFIPS', 'date_without_time'])['Visits_Mobility'].transform('sum')
        print("Step 2: Total visits calculation completed successfully.")

        # Step 3: Calculate the mean visits per day per weekday
        df['avg_visits_perday_perweekday'] = df.groupby(['BGFIPS', 'day_of_week'])['total_visits_on_the_day'].transform('mean')
        print("Step 3: Average visits calculation completed successfully.")

        # Step 4: Count unique EVENT_IDs per date and BGFIPS and attach the count to every row
        df_num_events = df.groupby(['date_without_time', 'BGFIPS'], as_index=False).agg(num_events=('EVENT_ID', 'nunique'))
        df = pd.merge(df, df_num_events, on=['date_without_time', 'BGFIPS'], how='left')
        print("Step 4: Event count and merge completed successfully.")

        # Step 5: Calculate percentage visit change (relative change against the weekday average)
        df['percentage_visit_change'] = (df['total_visits_on_the_day'] - df['avg_visits_perday_perweekday']) / df['avg_visits_perday_perweekday']
        print("Step 5: Percentage visit change calculation completed successfully.")

        # Step 6: Save dataset (to_csv does not create missing folders, so do it here)
        output_dir = '../../data_CityEvent/processed/processed_daily'
        os.makedirs(output_dir, exist_ok=True)
        output_path = f'{output_dir}/{df_name}.csv'
        df.to_csv(output_path, index=False)
        print(f"Step 6: Dataset saved successfully to {output_path}.")

        # Step 7: Show Box Plot
        create_box_plot(df, df_name)
        print("Step 7: Box plot created successfully.")

    except Exception as e:
        # Deliberate best-effort: report the failure and fall through so a
        # loop over many datasets keeps going.
        print(f"An error occurred during processing: {e}")

    # Return the processed copy for further use
    return df


In [None]:
import os

# Every file written by the join step ends with this suffix; whatever precedes
# it is the dataset name.
VISITS_SUFFIX = '_hourly_visits.csv'

filepath = '../../data_CityEvent/processed/4_events_join_w_mobility/'

# Only pick up files that follow the naming scheme. Slicing a fixed number of
# characters off any other entry (hidden file, sub-folder, ...) would produce a
# path that does not exist. Sorting keeps the run order reproducible.
filenames = sorted(
    entry for entry in os.listdir(filepath) if entry.endswith(VISITS_SUFFIX)
)

for filename in filenames:
    # Dataset name = file name without the suffix
    df_name = filename[:-len(VISITS_SUFFIX)]
    df = pd.read_csv(os.path.join(filepath, filename))

    # Computes the daily statistics, saves them and draws the box plot
    data_processing(df, df_name)