In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
from shapely.geometry import Point
def intpt_func(row):
    """Return a shapely Point built from one row's INTPTLON / INTPTLAT values.

    `row` only needs to support item access for the keys 'INTPTLON' and
    'INTPTLAT'. The longitude is passed as the first (x) coordinate and the
    latitude as the second (y) coordinate.
    """
    x_coord = row['INTPTLON']
    y_coord = row['INTPTLAT']
    return Point(x_coord, y_coord)

In [3]:
# Load the block-group table: read the CSV, add an `intpt` Point per row from
# the INTPTLON / INTPTLAT columns, then parse the WKT text in `geometry` into
# real geometries so the table becomes a GeoDataFrame.
ham_cbg = pd.read_csv('../data/ham_cbg.csv')
ham_cbg['intpt'] = ham_cbg[['INTPTLAT', 'INTPTLON']].apply(intpt_func, axis=1)
ham_cbg = gpd.GeoDataFrame(
    ham_cbg,
    geometry=gpd.GeoSeries.from_wkt(ham_cbg['geometry']),
)
# GEOID is stored as text, like the GEOID columns of the building tables.
ham_cbg['GEOID'] = ham_cbg['GEOID'].astype(str)

  arr = construct_1d_object_array_from_listlike(values)


In [4]:
# Residential buildings: the first CSV column is used as the index, the WKT
# text in `geometry` is parsed into geometries, and every geometry's
# coordinates are stored as a [y, x] list in `location`.
res_build = pd.read_csv('../data/ham_residential_buildings2.csv', index_col=0)
res_build = gpd.GeoDataFrame(
    res_build,
    geometry=gpd.GeoSeries.from_wkt(res_build['geometry']),
)
res_build['location'] = res_build.geometry.apply(lambda pt: [pt.y, pt.x])

# Work buildings: same treatment as the residential table (WKT -> geometry,
# [y, x] list in `location`). Afterwards the index read from the first CSV
# column is moved back into a regular column and GEOID is cast to text.
com_build = pd.read_csv('../data/work_loc_poi_com_civ.csv', index_col=0)
com_build = gpd.GeoDataFrame(
    com_build,
    geometry=gpd.GeoSeries.from_wkt(com_build['geometry']),
)
com_build['location'] = com_build.geometry.apply(lambda pt: [pt.y, pt.x])
com_build = com_build.reset_index()
com_build['GEOID'] = com_build['GEOID'].astype(str)

# All buildings (MS dataset): here the WKT text lives in `geo_centers`, which
# becomes the active geometry. GEOID is cast to text and each geometry's
# coordinates are stored as a [y, x] list in `location`.
ms_build = pd.read_csv('../data/ham_buildings_MS.csv')
ms_build = gpd.GeoDataFrame(
    ms_build,
    geometry=gpd.GeoSeries.from_wkt(ms_build['geo_centers']),
)
ms_build['GEOID'] = ms_build['GEOID'].astype(str)
ms_build['location'] = ms_build.geometry.apply(lambda pt: [pt.y, pt.x])

In [5]:
def datetime_range(start, end, delta):
    """Yield values from `start` up to, but not including, `end`.

    Each yielded value is the previous one advanced by `delta`. Nothing is
    yielded when `start` is not strictly less than `end`. The caller is
    responsible for passing a `delta` that moves towards `end`; otherwise the
    generator never terminates.
    """
    moment = start
    while True:
        if not (moment < end):
            return
        yield moment
        moment += delta

In [25]:
# Candidate departure and return times on a 15-minute grid.
# The 09:10 / 18:10 end bounds are exclusive, so the last generated slots are
# 09:00 and 18:00. Formatting to '%H:%M' and parsing back keeps only the
# hour and minute; the date of the resulting datetimes is strptime's default.
from datetime import datetime, timedelta

times_morning = [
    datetime.strptime(slot.strftime('%H:%M'), '%H:%M')
    for slot in datetime_range(
        datetime(2016, 9, 1, 7),
        datetime(2016, 9, 1, 9, 10),
        timedelta(minutes=15),
    )
]
times_evening = [
    datetime.strptime(slot.strftime('%H:%M'), '%H:%M')
    for slot in datetime_range(
        datetime(2016, 9, 1, 16),
        datetime(2016, 9, 1, 18, 10),
        timedelta(minutes=15),
    )
]

In [7]:
# Store GEOID as text on both building tables; later cells match these
# columns against the home_cbg / poi_cbg values of the movement table.
res_build['GEOID'] = res_build['GEOID'].astype(str)
com_build['GEOID'] = com_build['GEOID'].astype(str)

In [8]:
# Load the preprocessed SafeGraph movement table.
# The path below is a placeholder and must be replaced with a real file before
# this cell can run. Later cells read the columns home_cbg, poi_cbg, frequency
# and visits_monday ... visits_sunday from this table.
sg = gpd.read_file('path to safegraph data') # file not added due to privacy concerns

In [14]:
# Collapse the table to one row per (home_cbg, poi_cbg) pair by summing the
# total frequency and the per-weekday visit counts.
# The aggregation is named with the string 'sum' rather than the builtin `sum`:
# recent pandas versions warn when the builtin is passed and will stop
# translating it to the groupby sum.
sg = (
    sg.groupby(['home_cbg', 'poi_cbg'])
    .agg(**{
        column: (column, 'sum')
        for column in (
            'frequency',
            'visits_monday',
            'visits_tuesday',
            'visits_wednesday',
            'visits_thursday',
            'visits_friday',
            'visits_saturday',
            'visits_sunday',
        )
    })
    .reset_index()
)

In [30]:
import random
import tqdm
from tqdm.notebook import tqdm_notebook

#setting the random seed
# `random` drives the building picks and `np.random` the time-slot picks
# below, so both generators are seeded.
np.random.seed(42)
random.seed(42)


def _candidate_locations(primary, fallback, cbg, n_fallback):
    """Return the candidate `location` values for one census block group.

    Rows of `primary` whose GEOID equals `cbg` are used when there are any.
    Otherwise `n_fallback` rows are drawn with replacement (fixed
    random_state=42) from the rows of `fallback` with that GEOID.

    Raises:
        ValueError: propagated from `sample` when the fallback has no rows
            for `cbg` and `n_fallback` is greater than zero.
    """
    matches = primary[primary.GEOID == cbg]
    if matches.empty:
        matches = fallback[fallback.GEOID == cbg].sample(
            n=n_fallback, random_state=42, replace=True
        )
    return matches['location'].tolist()


# Each generated trip is collected as a plain dict and the frame is built once
# after the loop. Appending one-row frames with `DataFrame.append` is
# quadratic, and that method no longer exists in pandas >= 2.0.
trip_records = []
for _, movement in tqdm_notebook(sg.iterrows(), total=sg.shape[0]):

    # Home candidates come from the residential table, work candidates from
    # the work table; the MS building table is the fallback for both.
    home_candidates = _candidate_locations(
        res_build, ms_build, movement.home_cbg, movement.frequency
    )
    work_candidates = _candidate_locations(
        com_build, ms_build, movement.poi_cbg, movement.frequency
    )

    # Working pools: a picked location is removed so it is not reused until
    # the pool has been exhausted, at which point the pool is refilled.
    home_pool = list(home_candidates)
    work_pool = list(work_candidates)

    # frequency / 7 trips (truncated) are generated per pair -- presumably
    # converting a weekly count into a daily one; confirm against the data.
    for _ in range(int(movement.frequency/7)):

        if not work_pool:
            work_pool = list(work_candidates)
        if not home_pool:
            home_pool = list(home_candidates)

        # Draw order (home index, work index, morning slot, evening slot) is
        # kept fixed so results are reproducible for a given seed.
        home_pick = random.randrange(len(home_pool))
        work_pick = random.randrange(len(work_pool))
        home_loc = home_pool.pop(home_pick)
        work_loc = work_pool.pop(work_pick)

        go_time = np.random.choice(times_morning, size=1, replace=True)[0]
        return_time = np.random.choice(times_evening, size=1, replace=True)[0]

        # `location` values are [lat, lon] lists.
        trip_records.append({
            'home_cbg': movement.home_cbg,
            'poi_cbg': movement.poi_cbg,
            'frequency': movement.frequency,
            'home_loc_lat': home_loc[0],
            'home_loc_lon': home_loc[1],
            'work_loc_lat': work_loc[0],
            'work_loc_lon': work_loc[1],
            'go_time': go_time,
            'go_time_str': go_time.strftime('%H:%M'),
            'return_time': return_time,
            'return_time_str': return_time.strftime('%H:%M'),
        })

# Explicit column list keeps the column order stable and gives an empty result
# the same columns as a populated one.
prob_matrix_sg = gpd.GeoDataFrame(
    trip_records,
    columns=[
        'home_cbg', 'poi_cbg', 'frequency',
        'home_loc_lat', 'home_loc_lon',
        'work_loc_lat', 'work_loc_lon',
        'go_time', 'go_time_str',
        'return_time', 'return_time_str',
    ],
)

  0%|          | 0/16561 [00:00<?, ?it/s]

In [31]:
def func_home_pt(row):
    """Return the home end of a trip as a shapely Point.

    Reads the `home_loc_lon` and `home_loc_lat` attributes of `row` and passes
    them to `Point` in (lon, lat) order.
    """
    lon, lat = row.home_loc_lon, row.home_loc_lat
    return Point(lon, lat)
def func_work_pt(row):
    """Return the work end of a trip as a shapely Point.

    Reads the `work_loc_lon` and `work_loc_lat` attributes of `row` and passes
    them to `Point` in (lon, lat) order.
    """
    lon, lat = row.work_loc_lon, row.work_loc_lat
    return Point(lon, lat)

In [32]:
# Build one shapely Point per row for each end of the trip from the stored
# latitude / longitude columns.
prob_matrix_sg['home_geom'] = (
    prob_matrix_sg[['home_loc_lat', 'home_loc_lon']]
    .apply(func_home_pt, axis=1)
)
prob_matrix_sg['work_geom'] = (
    prob_matrix_sg[['work_loc_lat', 'work_loc_lon']]
    .apply(func_work_pt, axis=1)
)

  arr = construct_1d_object_array_from_listlike(values)
  arr = construct_1d_object_array_from_listlike(values)


In [34]:
# Write the generated trips to 'sg_combinations.parquet' in the current
# working directory, without the index. The CSV export is kept below as a
# disabled alternative.
# prob_matrix_sg.to_csv('sg_combinations.csv', index=False)
prob_matrix_sg.to_parquet('sg_combinations.parquet', index=False)