In [1]:
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
# Render floats with five decimal places in pandas output.
pd.options.display.float_format = '{:.5f}'.format

import seaborn as sns
# Adjust pandas display limits (column width, rows, columns, total line width)
# used when frames are printed in this notebook.
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
import warnings
# Suppresses every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import sys
import logging

nblog = open("./logs/entropy_"+str(datetime.datetime.now())+".log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [3]:
%%time
df_localness = pd.read_csv('./data/food_localness_oct.csv', header=0)
df_vendor = pd.read_csv('./data/all_vendor_oct_processed.csv',header=0)
total_cuisines = df_vendor.main_cuisine.unique().shape[0] #66 types
print(total_cuisines)

66
CPU times: user 11.6 s, sys: 1.52 s, total: 13.1 s
Wall time: 19.1 s


## Compute entropy of restaurant cuisine types based on a buffer around customer location

In [4]:
# %%time
# # create geodf for customer locations
# df_localness['customer_loc_id'] = df_localness['customer_lon'].astype(str)+df_localness['customer_lat'].astype(str)
# df_unique_customers = df_localness[['customer_loc_id','customer_lon','customer_lat']].drop_duplicates()
# print(df_unique_customers.shape[0])
# geometry = [Point(xy) for xy in zip(df_unique_customers.customer_lon, df_unique_customers.customer_lat)]
# gdf_customer_loc = GeoDataFrame(df_unique_customers, crs="EPSG:4326", geometry=geometry)
# gdf_customer_loc = gdf_customer_loc.to_crs(epsg=3857) #convert customer locations to 3857
# gdf_customer_loc = gdf_customer_loc.rename(columns={"geometry": "customer_geometry"})

In [5]:
# %%time
# # prepare vendor geodf for spatial join
# df_vendor = pd.read_csv('./data/all_vendor_oct_processed.csv',header=0)
# geometry = [Point(xy) for xy in zip(df_vendor.vendor_lon, df_vendor.vendor_lat)]
# gdf_vendor = GeoDataFrame(df_vendor, crs="EPSG:4326", geometry=geometry)
# gdf_vendor = gdf_vendor.rename(columns={"vendor_id": "n_vendor_id","main_cuisine":"n_main_cuisine",'vendor_name':'n_vendor_name',
#                                        'vendor_lon':'n_vendor_lon','vendor_lat':'n_vendor_lat','vertical':'n_vertical'})
# gdf_vendor = gdf_vendor.to_crs(epsg=3857)

# gdf_foodvendor_need = gdf_vendor[['geometry','n_main_cuisine']]
# print('total number of restaurants regardless of cusine types:',gdf_foodvendor_need.shape[0])

In [5]:
# import gc
# del df_localness
# del df_vendor
# del gdf_vendor
# del df_unique_customers
# gc.collect()

In [4]:
def compute_entropy_function(x, value_col=None, n_categories=None):
    """Return the normalised Shannon entropy of a set of proportions.

    The result is ``sum(p * ln(1 / p)) / ln(n_categories)`` over the values in
    ``x[value_col]``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (typically one group from ``groupby(...).apply``) whose
        ``value_col`` column holds one proportion per category.
    value_col : str, optional
        Name of the proportion column. When omitted, the notebook-level
        variable ``col_name`` is used, which is how the function is called
        from ``groupby(...).apply(compute_entropy_function)``.
    n_categories : int, optional
        Number of possible categories used for normalisation. When omitted,
        the notebook-level variable ``total_cuisines`` is used.

    Returns
    -------
    float
        The normalised entropy. ``0.0`` is returned when ``n_categories`` is
        1 or less, because ``ln(1) == 0`` would otherwise be the divisor.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if value_col is None:
        value_col = col_name
    if n_categories is None:
        n_categories = total_cuisines

    if n_categories <= 1:
        return 0.0

    props = np.asarray(x[value_col], dtype=float)
    # A zero proportion contributes nothing (0 * ln(1/0) is taken as 0); dropping
    # it avoids a division by zero. NaN values are kept so they still propagate.
    props = props[props != 0]
    return float((props * np.log(1.0 / props)).sum() / np.log(n_categories))

In [None]:
%%time

# list of open resturant's ID during each hour of the day
available_vendors_list_by_hour = df_localness.groupby(['order_time_hour']).vendor_id.unique()

# distance_list = [0.5, 1, 2, 3, 4, 5] #unit:km
distance_list = [0.5, 1, 2] #unit:km
batch_size = 10000 #100000 50000

# Loop hour, for each hour we compute number of available restaurants considering the opening hour
for hour in range(0,24):
    print(datetime.datetime.now(),'Looping hour: ', hour)
    df_order_byhour = df_localness[df_localness['order_time_hour']==hour]
    print(' Number of orders during this hour: ',df_order_byhour.shape[0])
    
    df_vendor_byhour = df_vendor[df_vendor['vendor_id'].isin(available_vendors_list_by_hour[hour])]
    print(' Number of available restaurants during this hour: ',len(available_vendors_list_by_hour[hour]))
    
    # convert restaurant points of a specific hour to geopandas
    geometry = [Point(xy) for xy in zip(df_vendor_byhour.vendor_lon, df_vendor_byhour.vendor_lat)]
    gdf_vendor_byhour = GeoDataFrame(df_vendor_byhour, crs="EPSG:4326", geometry=geometry)
    gdf_vendor_byhour = gdf_vendor_byhour.to_crs(epsg=3857)
    gdf_vendor_byhour = gdf_vendor_byhour[['geometry','main_cuisine']]
    gdf_vendor_byhour = gdf_vendor_byhour.rename(columns={"main_cuisine":"near_main_cuisine"})
    
    for distance_threshold in distance_list:
        col_name = 'entropy_'+str(distance_threshold)+'km'
        output_file = './data/entropy_'+str(distance_threshold)+'km_customerlocations_oct.csv'
        print(datetime.datetime.now(), '************** Distance:',distance_threshold,',', output_file, '**************')

        # Create buffer in meters around customer locations
        geometry = [Point(xy) for xy in zip(df_order_byhour.customer_lon, df_order_byhour.customer_lat)]
        gdf_order_byhour = GeoDataFrame(df_order_byhour, crs="EPSG:4326", geometry=geometry)
        gdf_order_byhour = gdf_order_byhour.to_crs(epsg=3857) # convert customer locations to 3857
        gdf_order_byhour = gdf_order_byhour.rename(columns={"geometry": "customer_geometry"})
        gdf_order_byhour['buffer'] = gdf_order_byhour.customer_geometry.buffer(distance_threshold*1000)
        gdf_order_byhour = gdf_order_byhour.set_geometry('buffer')
        
        # Spatial join to find points within the buffer
        gdf_vendor_byhour[col_name] = 1

        for i in range(0, gdf_order_byhour.shape[0],batch_size):
            print(datetime.datetime.now(),i,i+batch_size,'/', gdf_order_byhour.shape[0])
            groupped_data = gpd.sjoin(gdf_vendor_byhour, gdf_order_byhour.iloc[i:i+batch_size], op='within').groupby(
                ['order_id','near_main_cuisine']).agg({col_name: 'sum'})
            groupped_data = groupped_data.groupby(level=0).apply(lambda x:  x / x.sum()) #convert to proportions
#             print(datetime.datetime.now(),'done spatial join!')
            groupped_data = groupped_data.groupby('order_id').apply(compute_entropy_function).reset_index()  #compute entropy
            groupped_data.columns = ['order_id', col_name]
#             print(datetime.datetime.now(),'done entropy computation!')

            if not os.path.isfile(output_file): # if file does not exist write header
                groupped_data.to_csv(output_file, index=False)
            else: # else it exists so append without writing the header
                groupped_data.to_csv(output_file, mode='a', header=False,index=False)

2024-09-30 22:03:33.903094 Looping hour:  0
 Number of orders during this hour:  71470
 Number of available restaurants during this hour:  3317
2024-09-30 22:03:34.176520 ************** Distance: 0.5 , ./data/entropy_0.5km_customerlocations_oct.csv **************
2024-09-30 22:03:35.759975 0 10000 / 71470
2024-09-30 22:03:39.788035 done spatial join!
2024-09-30 22:03:40.210554 done entropy computation!
2024-09-30 22:03:40.250696 10000 20000 / 71470
2024-09-30 22:03:43.581270 done spatial join!
2024-09-30 22:03:43.932048 done entropy computation!
2024-09-30 22:03:43.943655 20000 30000 / 71470
2024-09-30 22:03:47.031742 done spatial join!
2024-09-30 22:03:47.355506 done entropy computation!
2024-09-30 22:03:47.366624 30000 40000 / 71470
2024-09-30 22:03:51.413853 done spatial join!
2024-09-30 22:03:51.855850 done entropy computation!
2024-09-30 22:03:51.870230 40000 50000 / 71470
2024-09-30 22:03:55.648008 done spatial join!
2024-09-30 22:03:56.052236 done entropy computation!
2024-09-30

In [9]:
import pandas as pd

# Print the number of data rows in each per-distance entropy file.
distance_list = [0.5, 1, 2, 3, 4, 5] #unit:km
entropy_paths = ['./data/entropy_'+str(km)+'km_customerlocations_oct.csv' for km in distance_list]
for entropy_path in entropy_paths:
    print(len(pd.read_csv(entropy_path, header=0)))

2147696
2632917
2793351
2815535
2822420
2825245


In [None]:
# %%time
# Optional step: check if entropy results are reliable
# examine_entropy = df_nearest_compute_entropy[['customer_lon','customer_lat','customer_loc_id','entropy_3.6km']].drop_duplicates()
# print(examine_entropy.shape[0])
# geometry = [Point(xy) for xy in zip(examine_entropy.customer_lon, examine_entropy.customer_lat)]
# gdf_examine_entropy = GeoDataFrame(examine_entropy, crs="EPSG:4326", geometry=geometry)
# gdf_examine_entropy.head(10000).explore(column='entropy_3.6km',cmap="rainbow",tiles="CartoDB positron")

In [None]:
# import sys

# # These are the usual ipython objects, including this one you are creating
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# # Get a sorted list of the objects and their sizes
# sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)