In [3]:
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
pd.options.display.float_format = '{:.5f}'.format

import seaborn as sns
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
import warnings
warnings.filterwarnings("ignore")

In [4]:
import sys
import logging

nblog = open("./logs/entropy_"+str(datetime.datetime.now())+".log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [5]:
%%time
df_localness = pd.read_csv('./data/food_localness_oct.csv',header=0)

CPU times: user 13.8 s, sys: 6.52 s, total: 20.3 s
Wall time: 58.2 s


## Compute entropy of restaurant cusine types based on a buffer around customer location

In [6]:
%%time
# create geodf for customer locations
df_localness['customer_loc_id'] = df_localness['customer_lon'].astype(str)+df_localness['customer_lat'].astype(str)
df_unique_customers = df_localness[['customer_loc_id','customer_lon','customer_lat']].drop_duplicates()
print(df_unique_customers.shape[0])
geometry = [Point(xy) for xy in zip(df_unique_customers.customer_lon, df_unique_customers.customer_lat)]
gdf_customer_loc = GeoDataFrame(df_unique_customers, crs="EPSG:4326", geometry=geometry)
gdf_customer_loc = gdf_customer_loc.to_crs(epsg=3857)#convert customer locations to 3857
gdf_customer_loc = gdf_customer_loc.rename(columns={"geometry": "customer_geometry"})

791511
CPU times: user 12.7 s, sys: 532 ms, total: 13.3 s
Wall time: 29.7 s


In [7]:
%%time
# prepare vendor geodf for spatial join
df_vendor = pd.read_csv('./data/all_vendor_oct_processed.csv',header=0)
geometry = [Point(xy) for xy in zip(df_vendor.vendor_lon, df_vendor.vendor_lat)]
gdf_vendor = GeoDataFrame(df_vendor, crs="EPSG:4326", geometry=geometry)
gdf_vendor = gdf_vendor.rename(columns={"vendor_id": "n_vendor_id","main_cuisine":"n_main_cuisine",'vendor_name':'n_vendor_name',
                                       'vendor_lon':'n_vendor_lon','vendor_lat':'n_vendor_lat','vertical':'n_vertical'})
gdf_vendor = gdf_vendor.to_crs(epsg=3857)

gdf_foodvendor_need = gdf_vendor[['geometry','n_main_cuisine']]
print('total number of restaurants regardless of cusine types:',gdf_foodvendor_need.shape[0])
total_cuisines = gdf_foodvendor_need.n_main_cuisine.unique().shape[0] #66 types

total number of restaurants regardless of cusine types: 8105
CPU times: user 273 ms, sys: 0 ns, total: 273 ms
Wall time: 838 ms


In [8]:
import gc
del df_localness
del df_vendor
del gdf_vendor
del df_unique_customers
gc.collect()

20

In [9]:
def compute_entropy_function(x):
    sum_nominator = 0
    for prop in x[col_name].tolist(): # each element is a proportion of one cuisine
        sum_nominator += prop*(np.log(1/prop))
    entropy = sum_nominator/np.log(total_cuisines)
    return entropy

In [None]:
%%time

TESTING = False #False, True
# distance_list = [0.5, 1, 2, 3, 4, 5] #unit:km
distance_list = [3, 4, 5] #unit:km
batch_size = 40000 #100000 50000
print('TESTING:',TESTING)

for distance_threshold in distance_list:
    col_name = 'entropy_'+str(distance_threshold)+'km'
    output_file = './data/entropy_'+str(distance_threshold)+'km_customerlocations_oct.csv'
    print(datetime.datetime.now(), '************** Distance:',distance_threshold,',', output_file, '**************')

    # Create buffer in meters around customer locations
    gdf_customer_loc_buffer = gdf_customer_loc.copy()
    gdf_customer_loc_buffer['buffer'] = gdf_customer_loc_buffer.customer_geometry.buffer(distance_threshold*1000)
    gdf_customer_loc_buffer = gdf_customer_loc_buffer.set_geometry('buffer')
    print(gdf_customer_loc_buffer.shape[0],gdf_customer_loc_buffer.customer_loc_id.unique().shape[0])
    
    # Spatial join to find points within the buffer
    gdf_foodvendor_need[col_name] = 1
    
    for i in range(0,gdf_customer_loc_buffer.shape[0],batch_size):
        print(datetime.datetime.now(),i,i+batch_size,'/',gdf_customer_loc_buffer.shape[0])
        if TESTING:
            groupped_data = gpd.sjoin(gdf_foodvendor_need, gdf_customer_loc_buffer.head(5), op='within').groupby(
                ['customer_loc_id','n_main_cuisine']).agg({col_name: 'sum'}) # debugging line
        else:
            groupped_data = gpd.sjoin(gdf_foodvendor_need, gdf_customer_loc_buffer.iloc[i:i+batch_size], op='within').groupby(
                ['customer_loc_id','n_main_cuisine']).agg({col_name: 'sum'})
        groupped_data = groupped_data.groupby(level=0).apply(lambda x:  x / x.sum()) #convert to proportions
        print(datetime.datetime.now(),'done spatial join! Distance:',distance_threshold)

        groupped_data = groupped_data.groupby('customer_loc_id').apply(compute_entropy_function).reset_index()  #compute entropy
        groupped_data.columns = ['customer_loc_id', col_name]
        print(datetime.datetime.now(),'done entropy computation! Distance:',distance_threshold)
        
        if not TESTING:
            if not os.path.isfile(output_file):# if file does not exist write header
                groupped_data.to_csv(output_file,index=False)
            else: # else it exists so append without writing the header
                groupped_data.to_csv(output_file, mode='a', header=False,index=False)

TESTING: False
2024-06-04 13:37:06.456183 ************** Distance: 3 , ./data/entropy_3km_customerlocations_oct.csv **************
791511 791511
2024-06-04 13:37:48.563621 0 40000 / 791511
2024-06-04 13:39:48.281246 done spatial join! Distance: 3
2024-06-04 13:39:56.245117 done entropy computation! Distance: 3
2024-06-04 13:39:56.497747 40000 80000 / 791511
2024-06-04 13:42:16.673250 done spatial join! Distance: 3
2024-06-04 13:42:29.301674 done entropy computation! Distance: 3
2024-06-04 13:42:29.627451 80000 120000 / 791511
2024-06-04 13:44:42.108906 done spatial join! Distance: 3
2024-06-04 13:44:48.289886 done entropy computation! Distance: 3
2024-06-04 13:44:48.451210 120000 160000 / 791511
2024-06-04 13:46:45.696274 done spatial join! Distance: 3
2024-06-04 13:46:55.732954 done entropy computation! Distance: 3
2024-06-04 13:46:55.990192 160000 200000 / 791511
2024-06-04 13:48:22.826992 done spatial join! Distance: 3
2024-06-04 13:48:31.328109 done entropy computation! Distance: 3

2024-06-04 15:45:54.285739 done spatial join! Distance: 5
2024-06-04 15:46:05.928417 done entropy computation! Distance: 5
2024-06-04 15:46:06.204850 240000 280000 / 791511
2024-06-04 15:50:17.368920 done spatial join! Distance: 5
2024-06-04 15:50:28.368619 done entropy computation! Distance: 5
2024-06-04 15:50:28.627551 280000 320000 / 791511
2024-06-04 15:53:42.258061 done spatial join! Distance: 5
2024-06-04 15:53:51.747372 done entropy computation! Distance: 5
2024-06-04 15:53:52.172127 320000 360000 / 791511
2024-06-04 15:57:06.706460 done spatial join! Distance: 5
2024-06-04 15:57:19.685196 done entropy computation! Distance: 5
2024-06-04 15:57:19.980107 360000 400000 / 791511
2024-06-04 16:00:32.886800 done spatial join! Distance: 5
2024-06-04 16:00:46.272759 done entropy computation! Distance: 5
2024-06-04 16:00:46.665259 400000 440000 / 791511
2024-06-04 16:04:25.486522 done spatial join! Distance: 5
2024-06-04 16:04:37.397014 done entropy computation! Distance: 5
2024-06-04 1

In [None]:
# %%time
# Optional step: check if entropy results are reliable
# examine_entropy = df_nearest_compute_entropy[['customer_lon','customer_lat','customer_loc_id','entropy_3.6km']].drop_duplicates()
# print(examine_entropy.shape[0])
# geometry = [Point(xy) for xy in zip(examine_entropy.customer_lon, examine_entropy.customer_lat)]
# gdf_examine_entropy = GeoDataFrame(examine_entropy, crs="EPSG:4326", geometry=geometry)
# gdf_examine_entropy.head(10000).explore(column='entropy_3.6km',cmap="rainbow",tiles="CartoDB positron")

In [17]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('gdf_customer_loc', 88942994),
 ('gdf_customer_loc_buffer', 88942994),
 ('gdf_foodvendor_need', 584149),
 ('geometry', 69152),
 ('GeoDataFrame', 1192),
 ('Point', 896),
 ('nblog', 208),
 ('compute_entropy_function', 136),
 ('distance_list', 96),
 ('output_file', 93),
 ('gpd', 72),
 ('np', 72),
 ('pd', 72),
 ('plt', 72),
 ('sns', 72),
 ('col_name', 60),
 ('batch_size', 28),
 ('distance_threshold', 28),
 ('total_cuisines', 28),
 ('TESTING', 24)]