In [2]:
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
# Render floats with five decimal places when pandas displays them.
pd.options.display.float_format = '{:.5f}'.format

import seaborn as sns
# Raise pandas' display limits (column width, rows, columns, total line width)
# for inspecting frames in the notebook.
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# which also hides deprecation warnings from the libraries used below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
import sys
import logging

# Mirror notebook output into a timestamped log file under ./logs.
log_dir = "./logs"
# open() below raises FileNotFoundError when the directory is missing.
os.makedirs(log_dir, exist_ok=True)

# Use a timestamp without spaces or colons so the filename is valid on every
# platform (str(datetime.now()) contains both).
log_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")

# The handle is intentionally left open for the lifetime of the kernel session.
nblog = open(os.path.join(log_dir, "density_" + log_stamp + ".log"), "a+")

# NOTE(review): presumably relies on the kernel's stdout/stderr stream objects
# copying their output to whatever is assigned to `echo` — confirm for the
# kernel version in use.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the IPython application logger at the same file; guard against a
# logger without handlers instead of failing with IndexError.
ipython_log = get_ipython().log
if ipython_log.handlers:
    ipython_log.handlers[0].stream = nblog
ipython_log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [4]:
%%time
# Read the two input tables used by the cells below: the order-level table
# (`df_localness`) and the processed vendor table (`df_vendor`). The first row
# of each CSV is taken as the header.
localness_csv = './data/food_localness_oct.csv'
vendor_csv = './data/all_vendor_oct_processed.csv'

df_localness = pd.read_csv(localness_csv, header=0)
df_vendor = pd.read_csv(vendor_csv, header=0)

CPU times: user 11.9 s, sys: 1.38 s, total: 13.3 s
Wall time: 18.8 s


## Compute number of restaurants based on a buffer around customer location

In [26]:
# %%time
# # create geodf for customer locations
# df_localness['customer_loc_id'] = df_localness['customer_lon'].astype(str)+df_localness['customer_lat'].astype(str)
# df_unique_customers = df_localness[['customer_loc_id','customer_lon','customer_lat']].drop_duplicates()
# print(df_unique_customers.shape[0])
# geometry = [Point(xy) for xy in zip(df_unique_customers.customer_lon, df_unique_customers.customer_lat)]
# gdf_customer_loc = GeoDataFrame(df_unique_customers, crs="EPSG:4326", geometry=geometry)
# gdf_customer_loc = gdf_customer_loc.to_crs(epsg=3857)#convert customer locations to 3857
# gdf_customer_loc = gdf_customer_loc.rename(columns={"geometry": "customer_geometry"})

In [27]:
# %%time
# # prepare vendor geodf for spatial join
# geometry = [Point(xy) for xy in zip(df_vendor.vendor_lon, df_vendor.vendor_lat)]
# gdf_vendor = GeoDataFrame(df_vendor, crs="EPSG:4326", geometry=geometry)
# gdf_vendor = gdf_vendor.rename(columns={"vendor_id": "n_vendor_id","main_cuisine":"n_main_cuisine",'vendor_name':'n_vendor_name',
#                                        'vendor_lon':'n_vendor_lon','vendor_lat':'n_vendor_lat','vertical':'n_vertical'})
# gdf_vendor = gdf_vendor.to_crs(epsg=3857)

# gdf_foodvendor_need = gdf_vendor[['geometry']]
# print('total number of restaurants regardless of cusine types:',gdf_foodvendor_need.shape[0])

In [18]:
# import gc
# # del df_localness
# del df_vendor
# del gdf_vendor
# del df_unique_customers
# gc.collect()

In [34]:
# gdf_order_byhour.head(2)

In [None]:
%%time

import inspect  # local import: only used to pick the sjoin keyword below

# For every hour of the day and every buffer radius in `distance_list`, count
# how many vendors that are "open" in that hour lie within the radius of each
# order's customer location, and write one CSV per radius with the columns
# `order_id` and `poi_count_<radius>km`.
#
# Notes:
# * Orders with no vendor inside the buffer get no row in the output (the
#   spatial join is an inner join), so the per-radius files have different
#   row counts. Treat a missing order_id as a count of 0 downstream.
# * NOTE(review): buffers are built in EPSG:3857 (Web Mercator), where metric
#   distances are stretched away from the equator, so the radius on the ground
#   is only approximately `distance_threshold` km — confirm this is acceptable
#   for the study area.
# * Results are written to `<output_file>.part` and moved to `<output_file>`
#   only after all 24 hours completed, so re-running the cell or interrupting
#   it can no longer append duplicate rows to a finished output file.

# IDs of the vendors that appear in at least one order during each hour; these
# are treated as the restaurants open in that hour.
available_vendors_list_by_hour = df_localness.groupby(['order_time_hour']).vendor_id.unique()

distance_list = [0.5, 1, 2, 3, 4, 5] #unit:km
# distance_list = [3, 4, 5] #unit:km

batch_size = 10000 #100000 50000

# geopandas renamed sjoin's `op` keyword to `predicate` and later removed `op`;
# use whichever keyword the installed version accepts.
sjoin_predicate_kw = 'predicate' if 'predicate' in inspect.signature(gpd.sjoin).parameters else 'op'

# Final and in-progress output paths, one pair per buffer radius.
output_files = {
    distance_threshold: './data/poi_count_'+str(distance_threshold)+'km_customerlocations_oct.csv'
    for distance_threshold in distance_list
}
partial_files = {
    distance_threshold: output_file + '.part'
    for distance_threshold, output_file in output_files.items()
}

# Drop leftovers of an earlier interrupted run so this run starts clean.
for partial_file in partial_files.values():
    if os.path.isfile(partial_file):
        os.remove(partial_file)

# Loop hour, for each hour we compute number of available restaurants considering the opening hour
for hour in range(0,24):
    print(datetime.datetime.now(),'Looping hour: ', hour)
    df_order_byhour = df_localness[df_localness['order_time_hour']==hour]
    print(' Number of orders during this hour: ',df_order_byhour.shape[0])

    # An hour without orders has no entry in available_vendors_list_by_hour
    # (lookup would raise KeyError) and nothing to count.
    if df_order_byhour.empty:
        continue

    df_vendor_byhour = df_vendor[df_vendor['vendor_id'].isin(available_vendors_list_by_hour[hour])]
    print(' Number of available restaurants during this hour: ',len(available_vendors_list_by_hour[hour]))

    # convert restaurant points of a specific hour to geopandas (only the
    # projected geometry is needed for the join)
    geometry = [Point(xy) for xy in zip(df_vendor_byhour.vendor_lon, df_vendor_byhour.vendor_lat)]
    gdf_vendor_byhour = GeoDataFrame(df_vendor_byhour, crs="EPSG:4326", geometry=geometry)
    gdf_vendor_byhour = gdf_vendor_byhour.to_crs(epsg=3857)
    gdf_vendor_byhour = gdf_vendor_byhour[['geometry']]

    # Customer locations do not depend on the radius: build and reproject them
    # once per hour instead of once per distance threshold.
    customer_points = gpd.GeoSeries(
        [Point(xy) for xy in zip(df_order_byhour.customer_lon, df_order_byhour.customer_lat)],
        index=df_order_byhour.index,
        crs="EPSG:4326",
    ).to_crs(epsg=3857) # convert customer locations to 3857

    for distance_threshold in distance_list:
        col_name = 'poi_count_'+str(distance_threshold)+'km'
        output_file = output_files[distance_threshold]
        partial_file = partial_files[distance_threshold]
        print(datetime.datetime.now(), '************** Distance:',distance_threshold,',', output_file, '**************')

        # Buffer (in meters) around customer locations; carry only order_id
        # through the join to keep the joined frame small.
        gdf_order_byhour = GeoDataFrame(
            {'order_id': df_order_byhour['order_id']},
            geometry=customer_points.buffer(distance_threshold*1000),
        )

        total_orders = gdf_order_byhour.shape[0]
        for i in range(0, total_orders, batch_size):
            print(datetime.datetime.now(), i, min(i+batch_size, total_orders), '/', total_orders)

            # Spatial join to find vendor points within each order's buffer,
            # then count the matched vendors per order.
            joined = gpd.sjoin(
                gdf_vendor_byhour,
                gdf_order_byhour.iloc[i:i+batch_size],
                **{sjoin_predicate_kw: 'within'}
            )
            groupped_data = joined.groupby('order_id').size().reset_index(name=col_name)

            if not os.path.isfile(partial_file): # first batch of this run: write header
                groupped_data.to_csv(partial_file, index=False)
            else: # later batches: append without writing the header
                groupped_data.to_csv(partial_file, mode='a', header=False,index=False)

# Publish the results only after every hour was processed successfully.
for distance_threshold in distance_list:
    if os.path.isfile(partial_files[distance_threshold]):
        os.replace(partial_files[distance_threshold], output_files[distance_threshold])

2024-09-30 18:31:04.828033 Looping hour:  0
 Number of orders during this hour:  71470
 Number of available restaurants during this hour:  3317
2024-09-30 18:31:04.960817 ************** Distance: 3 , ./data/poi_count_3km_customerlocations_oct.csv **************
2024-09-30 18:31:06.480189 0 10000 / 71470
2024-09-30 18:31:08.561032 10000 20000 / 71470
2024-09-30 18:31:10.135313 20000 30000 / 71470
2024-09-30 18:31:11.577060 30000 40000 / 71470
2024-09-30 18:31:13.759913 40000 50000 / 71470
2024-09-30 18:31:15.644367 50000 60000 / 71470
2024-09-30 18:31:17.552561 60000 70000 / 71470
2024-09-30 18:31:19.510120 70000 80000 / 71470
2024-09-30 18:31:19.794175 ************** Distance: 4 , ./data/poi_count_4km_customerlocations_oct.csv **************
2024-09-30 18:31:21.330066 0 10000 / 71470
2024-09-30 18:31:24.567112 10000 20000 / 71470
2024-09-30 18:31:27.232641 20000 30000 / 71470
2024-09-30 18:31:29.588750 30000 40000 / 71470
2024-09-30 18:31:32.959726 40000 50000 / 71470
2024-09-30 18:31:

In [2]:
import pandas as pd

# Sanity check: print how many rows each per-radius output file contains.
distance_list = [0.5, 1, 2, 3, 4, 5]  # buffer radii, unit: km
for radius_km in distance_list:
    counts_path = f'./data/poi_count_{radius_km}km_customerlocations_oct.csv'
    df = pd.read_csv(counts_path, header=0)
    print(len(df))

2147696
2632917
2793351
2815535
2822420
2825245
