In [1]:
# --- Standard library ---
import datetime
import os
import warnings

# --- Third-party ---
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopandas import GeoDataFrame
from shapely.geometry import Point

# --- pandas display configuration ---
# Floats are rendered with five decimals; the remaining options widen the
# rendered tables (max_colwidth is set to 0, rows/columns capped at 500).
pd.set_option("display.float_format", '{:.5f}'.format)
pd.options.display.max_colwidth = 0
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings("ignore")

In [2]:
import sys
import logging

# Open a per-run log file under ./logs and attach it to the kernel's output
# streams and to the IPython application logger.
# The directory is created on demand so a fresh checkout does not fail with
# FileNotFoundError, and the timestamp is formatted without spaces or colons
# so the file name is valid on every filesystem.
log_dir = "./logs"
os.makedirs(log_dir, exist_ok=True)
log_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
# The handle is intentionally left open: it is referenced by the streams and
# the log handler below for as long as the kernel lives.
nblog = open(os.path.join(log_dir, "density_" + log_stamp + ".log"), "a+")

# NOTE(review): presumably the kernel's stdout/stderr stream objects mirror
# whatever they write to their `.echo` attribute — confirm against ipykernel.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython logger at the same file and log at INFO.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [3]:
%%time
# Read the October food-localness table; the first CSV row supplies the column names.
df_localness = pd.read_csv(
    './data/food_localness_oct.csv',
    header=0,
)

CPU times: user 15.9 s, sys: 5.62 s, total: 21.5 s
Wall time: 49.2 s


## Count the restaurants within a buffer around each customer location

In [4]:
%%time
# Build a GeoDataFrame with one point per distinct customer coordinate pair,
# projected to EPSG:3857.
#
# NOTE(review): customer_loc_id is the longitude string immediately followed by
# the latitude string, with no separator, so two different coordinate pairs can
# in principle produce the same id (e.g. "103.8"+"11.3" and "103.81"+"1.3").
# The format is kept unchanged here because the ids are written to the output
# CSVs; confirm whether anything joins on them before adding a separator.
df_localness['customer_loc_id'] = df_localness['customer_lon'].astype(str)+df_localness['customer_lat'].astype(str)
df_unique_customers = df_localness[['customer_loc_id','customer_lon','customer_lat']].drop_duplicates()
print(df_unique_customers.shape[0])

# points_from_xy builds the whole geometry array in one vectorised call instead
# of constructing one shapely Point per row in a Python loop.
gdf_customer_loc = GeoDataFrame(
    df_unique_customers,
    geometry=gpd.points_from_xy(df_unique_customers.customer_lon, df_unique_customers.customer_lat),
    crs="EPSG:4326",
)
gdf_customer_loc = gdf_customer_loc.to_crs(epsg=3857)  # convert customer locations to 3857

# rename_geometry renames the column AND keeps it registered as the active
# geometry; a plain DataFrame.rename would leave the frame pointing at a
# geometry column name that no longer exists.
gdf_customer_loc = gdf_customer_loc.rename_geometry("customer_geometry")

791511
CPU times: user 14.7 s, sys: 447 ms, total: 15.1 s
Wall time: 37 s


In [5]:
%%time
# Prepare the vendor GeoDataFrame for the spatial join: load the vendor table,
# build point geometries from lon/lat and project them to EPSG:3857.
df_vendor = pd.read_csv('./data/all_vendor_oct_processed.csv',header=0)

# points_from_xy creates the geometry array in one vectorised call rather than
# one shapely Point per row.
gdf_vendor = GeoDataFrame(
    df_vendor,
    geometry=gpd.points_from_xy(df_vendor.vendor_lon, df_vendor.vendor_lat),
    crs="EPSG:4326",
)
# Vendor attribute columns get an "n_" prefix.
gdf_vendor = gdf_vendor.rename(columns={"vendor_id": "n_vendor_id","main_cuisine":"n_main_cuisine",'vendor_name':'n_vendor_name',
                                       'vendor_lon':'n_vendor_lon','vendor_lat':'n_vendor_lat','vertical':'n_vertical'})
gdf_vendor = gdf_vendor.to_crs(epsg=3857)

# Only the geometry is needed for counting. Take an explicit copy so that
# columns added to this frame later are written to an independent object and
# not to a slice of gdf_vendor.
gdf_foodvendor_need = gdf_vendor[['geometry']].copy()
print('total number of restaurants regardless of cusine types:',gdf_foodvendor_need.shape[0])

total number of restaurants regardless of cusine types: 8105
CPU times: user 321 ms, sys: 0 ns, total: 321 ms
Wall time: 701 ms


In [6]:
import gc

# Release the large intermediate frames in one statement; the buffer-count
# loop below reads only gdf_customer_loc and gdf_foodvendor_need.
del df_localness, df_vendor, gdf_vendor, df_unique_customers

# Force a collection pass; as the cell's last expression, the number of
# unreachable objects found is shown as the cell output.
gc.collect()

20

In [7]:
%%time

# Count, for every customer location, the vendors that fall inside a circular
# buffer of each radius in distance_list, and write one CSV per radius.
#
# TESTING = True joins only the first 5 customer buffers, displays the result
# and writes nothing; TESTING = False processes every customer in batches.
TESTING = False #False, True
# Full set of radii: [0.5, 1, 2, 3, 4, 5]
distance_list = [4, 5] #unit:km

batch_size = 20000 #100000 50000
print('TESTING:',TESTING)

# Only the vendor geometry is needed for counting. Selecting it here (instead
# of adding a constant column to gdf_foodvendor_need for every radius) leaves
# the shared vendor frame unmodified.
vendor_points = gdf_foodvendor_need[['geometry']]

for distance_threshold in distance_list:
    col_name = 'poi_count_'+str(distance_threshold)+'km'
    output_file = './data/poi_count_'+str(distance_threshold)+'km_customerlocations_oct.csv'
    print(datetime.datetime.now(), '************** Distance:',distance_threshold,',', output_file, '**************')

    # Create buffer in meters around customer locations.
    # NOTE(review): the coordinates are in EPSG:3857, whose metres are stretched
    # by roughly 1/cos(latitude); the buffer radius matches ground distance only
    # near the equator — confirm this is acceptable for the study area.
    gdf_customer_loc_buffer = gdf_customer_loc.copy()
    gdf_customer_loc_buffer['buffer'] = gdf_customer_loc_buffer.customer_geometry.buffer(distance_threshold*1000)
    gdf_customer_loc_buffer = gdf_customer_loc_buffer.set_geometry('buffer')
    n_buffers = gdf_customer_loc_buffer.shape[0]
    print(n_buffers,gdf_customer_loc_buffer.customer_loc_id.unique().shape[0])

    for i in range(0, n_buffers, batch_size):
        print(datetime.datetime.now(),i,i+batch_size,'/',n_buffers)
        if TESTING:
            buffer_batch = gdf_customer_loc_buffer.head(5) # debugging subset
        else:
            buffer_batch = gdf_customer_loc_buffer.iloc[i:i+batch_size]

        # Spatial join: one row per (vendor, buffer) pair where the vendor point
        # lies within the buffer; the group size is the vendor count per customer.
        # Customers with no vendor inside the buffer produce no row at all, so
        # they are absent from the output rather than listed with a count of 0.
        # NOTE(review): `op=` was deprecated in favour of `predicate=` in
        # geopandas 0.10; it is kept so the cell still runs on the installed
        # version — switch when upgrading.
        groupped_data = (
            gpd.sjoin(vendor_points, buffer_batch, op='within')
            .groupby('customer_loc_id')
            .size()
            .reset_index(name=col_name)
        )
        print(datetime.datetime.now(),'done spatial join! Distance:',distance_threshold)

        if TESTING:
            display(groupped_data)
            break

        # The first batch (re)creates the file with a header and later batches
        # append to it, so re-running the cell replaces a previous run's output
        # instead of appending duplicate rows to it.
        is_first_batch = (i == 0)
        groupped_data.to_csv(
            output_file,
            mode='w' if is_first_batch else 'a',
            header=is_first_batch,
            index=False,
        )

TESTING: False
2024-06-04 12:28:15.627872 ************** Distance: 4 , ./data/poi_count_4km_customerlocations_oct.csv **************
791511 791511
2024-06-04 12:28:52.033585 0 20000 / 791511
2024-06-04 12:29:34.931067 done spatial join! Distance: 4
2024-06-04 12:29:35.049223 20000 40000 / 791511
2024-06-04 12:32:02.493126 done spatial join! Distance: 4
2024-06-04 12:32:02.663028 80000 100000 / 791511
2024-06-04 12:32:53.761230 done spatial join! Distance: 4
2024-06-04 12:32:53.844196 100000 120000 / 791511
2024-06-04 12:33:44.188776 done spatial join! Distance: 4
2024-06-04 12:33:44.244542 120000 140000 / 791511
2024-06-04 12:34:39.549289 done spatial join! Distance: 4
2024-06-04 12:34:39.613705 140000 160000 / 791511
2024-06-04 12:35:27.967568 done spatial join! Distance: 4
2024-06-04 12:35:28.035932 160000 180000 / 791511
2024-06-04 12:35:55.357623 done spatial join! Distance: 4
2024-06-04 12:35:55.462715 180000 200000 / 791511
2024-06-04 12:36:32.688919 done spatial join! Distance: 

2024-06-04 13:29:51.267836 done spatial join! Distance: 5
2024-06-04 13:29:51.347099 740000 760000 / 791511
2024-06-04 13:30:43.487839 done spatial join! Distance: 5
2024-06-04 13:30:43.577281 760000 780000 / 791511
2024-06-04 13:31:37.080700 done spatial join! Distance: 5
2024-06-04 13:31:37.123996 780000 800000 / 791511
2024-06-04 13:32:03.319481 done spatial join! Distance: 5
CPU times: user 16min 11s, sys: 15min 4s, total: 31min 16s
Wall time: 1h 3min 47s
