In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
from shapely.geometry import Point
def intpt_func(row):
    """Build a Point from a row's internal-point columns.

    ``row`` must support ``row['INTPTLON']`` and ``row['INTPTLAT']``; the
    longitude is passed as the first Point coordinate and the latitude as
    the second.
    """
    lon = row['INTPTLON']
    lat = row['INTPTLAT']
    return Point(lon, lat)

In [64]:
# loading LODES data

county_lodes = pd.read_csv('../data/county_lodes_2019.csv', dtype={"TRACTCE20_home":"string", "TRACTCE20_work":"string"})
# Drop the last three digits of each numeric geocode so it has the same
# 12-digit form as county_cbg.GEOID (presumably census block -> block group;
# confirm against the LODES schema). Integer floor division keeps the value
# exact and vectorised instead of round-tripping every row through a float;
# the int64 cast still fails loudly if a geocode is missing.
county_lodes.h_geocode = (county_lodes.h_geocode.astype('int64') // 1000).astype(str)
county_lodes.w_geocode = (county_lodes.w_geocode.astype('int64') // 1000).astype(str)

# loading county block-group geodata
# NOTE(review): this comment used to say "Hamilton county", but the previewed
# rows carry STATEFP 47 / COUNTYFP 187 - confirm which county this file holds.
county_cbg = pd.read_csv('../data/county_cbg.csv')
# 'intpt' is the point built from the INTPTLON / INTPTLAT columns.
county_cbg['intpt'] = county_cbg[['INTPTLAT', 'INTPTLON']].apply(intpt_func, axis=1)
# The CSV stores geometries as WKT text; parse them into real geometries.
county_cbg = gpd.GeoDataFrame(county_cbg, geometry=gpd.GeoSeries.from_wkt(county_cbg.geometry))
county_cbg.GEOID = county_cbg.GEOID.astype(str)
# 'location' is a point guaranteed to lie inside each polygon.
county_cbg['location'] = county_cbg.geometry.representative_point()

In [73]:
# Residential buildings: the CSV's first column is the index and 'geometry'
# holds WKT text. 'location' stores each geometry as a [y, x] pair.
res_build = pd.read_csv('../data/county_residential_buildings.csv', index_col=0)
res_build = gpd.GeoDataFrame(
    res_build,
    geometry=gpd.GeoSeries.from_wkt(res_build['geometry']),
)
res_build['location'] = res_build.geometry.apply(lambda pt: [pt.y, pt.x])

# Work buildings: same layout; the CSV index is turned back into a column
# and GEOID is kept as text.
com_build = pd.read_csv('../data/county_work_loc_poi_com_civ.csv', index_col=0)
com_build = gpd.GeoDataFrame(
    com_build,
    geometry=gpd.GeoSeries.from_wkt(com_build['geometry']),
)
com_build['location'] = com_build.geometry.apply(lambda pt: [pt.y, pt.x])
com_build = com_build.reset_index()
com_build['GEOID'] = com_build['GEOID'].astype(str)

# All buildings (MS dataset): the active geometry is parsed from the
# 'geo_centers' WKT column.
ms_build = pd.read_csv('../data/county_buildings_MS.csv')
ms_build = gpd.GeoDataFrame(
    ms_build,
    geometry=gpd.GeoSeries.from_wkt(ms_build['geo_centers']),
)
ms_build['GEOID'] = ms_build['GEOID'].astype(str)
ms_build['location'] = ms_build.geometry.apply(lambda pt: [pt.y, pt.x])

In [None]:
# Aggregate total jobs for each (home block group, work block group) pair and
# attach both polygons. The merges use pandas' default inner join, so pairs
# whose home or work GEOID is missing from county_cbg are dropped.
county_lodes = (
    county_lodes
    .groupby(['h_geocode', 'w_geocode'])
    # 'sum' (string) instead of the builtin: passing builtins to agg is a
    # deprecated alias in recent pandas and emits a FutureWarning.
    .agg(total_jobs=('total_jobs', 'sum'))
    .reset_index()
    # home block-group polygon
    .merge(county_cbg[['GEOID', 'geometry']], left_on='h_geocode', right_on='GEOID')
    .rename({'geometry':'home_geom'}, axis=1)
    .drop('GEOID', axis=1)
    # work block-group polygon
    .merge(county_cbg[['GEOID', 'geometry']], left_on='w_geocode', right_on='GEOID')
    .rename({'geometry':'work_geom'}, axis=1)
    .drop('GEOID', axis=1)
    # largest flows first
    .sort_values('total_jobs', ascending=False)
    .reset_index(drop=True)
)
county_lodes = gpd.GeoDataFrame(county_lodes)

In [74]:
def datetime_range(start, end, delta):
    """Yield ``start``, ``start + delta``, ... for as long as the value is < ``end``.

    ``end`` itself is never yielded. Nothing is yielded when ``start`` is not
    strictly less than ``end``.
    """
    moment = start
    while True:
        if not (moment < end):
            return
        yield moment
        moment += delta

In [75]:
# Candidate departure (07:00:00-09:09:45) and return (16:00:00-18:09:45) times.
# NOTE(review): the previous comment said "15 min intervals", but the step is
# timedelta(seconds=15), i.e. one slot every 15 seconds - confirm which was
# intended. The strftime/strptime round-trip keeps only the clock time, so
# every slot is a datetime dated 1900-01-01.
from datetime  import datetime, timedelta
times_morning = list(map(
    lambda stamp: datetime.strptime(stamp.strftime('%H%M%S'), '%H%M%S'),
    datetime_range(datetime(2021, 9, 1, 7), datetime(2021, 9, 1, 9, 10), timedelta(seconds=15)),
))
times_evening = list(map(
    lambda stamp: datetime.strptime(stamp.strftime('%H%M%S'), '%H%M%S'),
    datetime_range(datetime(2021, 9, 1, 16), datetime(2021, 9, 1, 18, 10), timedelta(seconds=15)),
))

In [76]:
# Sanity check: seconds elapsed between 1900-01-01 00:00:00 and the sixth
# morning slot (index 5).
(times_morning[5] - datetime(1900, 1, 1)).total_seconds()
# times_morning[1].strftime('%H:%M:%S')

25275.0

In [77]:
# Keep GEOID as text in both building tables so it can be compared with the
# string geocodes in county_lodes.
res_build['GEOID'] = res_build['GEOID'].astype(str)
com_build['GEOID'] = com_build['GEOID'].astype(str)

In [78]:
# Preview the first rows of the home/work flow table.
county_lodes.head()

Unnamed: 0,h_geocode,w_geocode,total_jobs,home_geom,work_geom
0,471870502041,471870503041,195,"POLYGON ((-86.72612 35.99715, -86.72599 35.997...","POLYGON ((-86.82119 36.01639, -86.82119 36.016..."
1,471870501021,471870503041,193,"POLYGON ((-86.68189 35.98940, -86.67941 35.988...","POLYGON ((-86.82119 36.01639, -86.82119 36.016..."
2,471870503071,471870503071,146,"POLYGON ((-86.84334 35.95729, -86.84326 35.957...","POLYGON ((-86.84334 35.95729, -86.84326 35.957..."
3,471870503041,471870503041,144,"POLYGON ((-86.82119 36.01639, -86.82119 36.016...","POLYGON ((-86.82119 36.01639, -86.82119 36.016..."
4,471870504041,471870503041,139,"POLYGON ((-86.88878 36.01290, -86.88873 36.013...","POLYGON ((-86.82119 36.01639, -86.82119 36.016..."


In [81]:
# Preview the first rows of the block-group table.
county_cbg.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,GEOID,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,intpt,location
0,47,187,51204,1,471870512041,Block Group 1,G5030,S,14787289,0,35.806428,-86.905028,"POLYGON ((-86.95749 35.82126, -86.95681 35.821...",POINT (-86.905028 35.8064278),POINT (-86.91476 35.78957)
1,47,187,50801,2,471870508012,Block Group 2,G5030,S,582160,0,35.916146,-86.892766,"POLYGON ((-86.89711 35.91794, -86.89710 35.917...",POINT (-86.89276580000001 35.9161462),POINT (-86.89383 35.91531)
2,47,187,50802,2,471870508022,Block Group 2,G5030,S,1084845,0,35.923286,-86.869286,"POLYGON ((-86.87499 35.92983, -86.87496 35.929...",POINT (-86.8692863 35.9232864),POINT (-86.86813 35.92294)
3,47,187,50801,4,471870508014,Block Group 4,G5030,S,1089401,0,35.911845,-86.881154,"POLYGON ((-86.89236 35.90984, -86.89231 35.909...",POINT (-86.88115380000001 35.911845),POINT (-86.88150 35.91212)
4,47,187,51203,2,471870512032,Block Group 2,G5030,S,128042910,28674,35.853633,-87.121033,"POLYGON ((-87.21510 35.85065, -87.21474 35.852...",POINT (-87.1210329 35.8536327),POINT (-87.12511 35.85495)


In [82]:
import random
import tqdm
from tqdm.notebook import tqdm_notebook

#setting the random seed
np.random.seed(42)
random.seed(42)

# Time slots are datetimes dated 1900-01-01, so subtracting this instant
# gives seconds since midnight.
_TIME_ORIGIN = datetime(1900, 1, 1)

# Column order of the generated table (one row per simulated job).
_COMMUTE_COLUMNS = [
    'h_geocode', 'w_geocode', 'total_jobs',
    'home_loc_lat', 'home_loc_lon', 'work_loc_lat', 'work_loc_lon',
    'go_time', 'go_time_secs', 'go_time_str',
    'return_time', 'return_time_secs', 'return_time_str',
]


def _candidate_locations(typed_buildings, geoid, n_jobs):
    """Return the candidate ``[lat, lon]`` locations for block group ``geoid``.

    Sources are tried in this order:
      1. every row of ``typed_buildings`` (``res_build`` or ``com_build``)
         in the block group;
      2. ``n_jobs`` rows sampled from ``ms_build`` in the block group -
         without replacement when enough rows exist, otherwise with
         replacement (fixed ``random_state=42``);
      3. the block group's ``location`` point in ``county_cbg``, converted
         to ``[lat, lon]`` so it has the same shape as the building
         locations.

    The result may be empty when ``n_jobs`` is 0 or the block group is
    unknown to every source.
    """
    typed = typed_buildings.loc[typed_buildings.GEOID == geoid, 'location']
    if not typed.empty:
        return typed.tolist()

    footprints = ms_build[ms_build.GEOID == geoid]
    if not footprints.empty:
        sampled = footprints.sample(
            n=n_jobs, random_state=42, replace=n_jobs > len(footprints)
        )
        return sampled['location'].tolist()

    # county_cbg.location holds shapely Points, not [lat, lon] lists.
    return [[pt.y, pt.x] for pt in county_cbg.loc[county_cbg.GEOID == geoid, 'location']]


def _simulate_commutes(od_pairs, total):
    """Return one record (dict) per job for every (home, work, total_jobs) triple.

    For each job a home and a work location are drawn without replacement
    from the pair's candidate pools (a pool is refilled once exhausted),
    followed by one morning and one evening time slot. The draw order per job
    (home index, work index, morning slot, evening slot) is fixed, so results
    are reproducible under the seeds set above.

    Raises:
        ValueError: if a pair has jobs but no candidate home or work location.
    """
    records = []
    for h_geocode, w_geocode, total_jobs in tqdm_notebook(od_pairs, total=total):
        n_jobs = int(total_jobs)
        if n_jobs <= 0:
            continue

        homes = _candidate_locations(res_build, h_geocode, n_jobs)
        works = _candidate_locations(com_build, w_geocode, n_jobs)
        if not homes:
            raise ValueError(f'no candidate home location for block group {h_geocode}')
        if not works:
            raise ValueError(f'no candidate work location for block group {w_geocode}')

        home_pool, work_pool = [], []
        for _ in range(n_jobs):
            # Refill an exhausted pool so locations are reused only after
            # every candidate has been assigned once.
            if not work_pool:
                work_pool = list(works)
            if not home_pool:
                home_pool = list(homes)

            home = home_pool.pop(random.randrange(0, len(home_pool)))
            work = work_pool.pop(random.randrange(0, len(work_pool)))

            go = np.random.choice(times_morning, size=1, replace=True)[0]
            back = np.random.choice(times_evening, size=1, replace=True)[0]

            records.append({
                'h_geocode': h_geocode,
                'w_geocode': w_geocode,
                'total_jobs': n_jobs,
                'home_loc_lat': home[0],
                'home_loc_lon': home[1],
                'work_loc_lat': work[0],
                'work_loc_lon': work[1],
                'go_time': go.time(),
                'go_time_secs': (go - _TIME_ORIGIN).total_seconds(),
                'go_time_str': go.strftime('%H:%M:%S'),
                'return_time': back.time(),
                'return_time_secs': (back - _TIME_ORIGIN).total_seconds(),
                'return_time_str': back.strftime('%H:%M:%S'),
            })
    return records


# Build the table once from the collected records: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
prob_matrix = gpd.GeoDataFrame(
    pd.DataFrame(
        _simulate_commutes(
            zip(county_lodes.h_geocode, county_lodes.w_geocode, county_lodes.total_jobs),
            county_lodes.shape[0],
        ),
        columns=_COMMUTE_COLUMNS,
    )
)

  0%|          | 0/1908 [00:00<?, ?it/s]

In [83]:
def func_home_pt(row):
    """Build a Point from a row's ``home_loc_lon`` / ``home_loc_lat`` attributes.

    Longitude is passed as the first Point coordinate, latitude as the second.
    """
    lon = row.home_loc_lon
    lat = row.home_loc_lat
    return Point(lon, lat)
def func_work_pt(row):
    """Build a Point from a row's ``work_loc_lon`` / ``work_loc_lat`` attributes.

    Longitude is passed as the first Point coordinate, latitude as the second.
    """
    lon = row.work_loc_lon
    lat = row.work_loc_lat
    return Point(lon, lat)

In [85]:
# convert the lat and lon points to shapely Points
prob_matrix['home_geom'] = prob_matrix[['home_loc_lat', 'home_loc_lon']].apply(lambda row: func_home_pt(row), axis=1)
prob_matrix['work_geom'] = prob_matrix[['work_loc_lat', 'work_loc_lon']].apply(lambda row: func_work_pt(row), axis=1)
prob_matrix.h_geocode = prob_matrix.h_geocode.astype(str)
prob_matrix.w_geocode = prob_matrix.w_geocode.astype(str)

In [87]:
# Write the simulated commute table to CSV; index=False omits the row index.
prob_matrix.to_csv('../data/county_lodes_combinations.csv', index=False)
# prob_matrix.to_parquet('../data/county_lodes_combinations.parquet', engine='pyarrow', index=False)