## Home and work location and POI Mapping 
The purpose of this notebook is to take the data for the users that passed quality control (QC) and compute home and work locations for those users, using several shapefiles to define the zones.

1) Compute and write out home and work locations 
2) Map POI information 

### Data reading and package imports

In [1]:
from dotenv import load_dotenv
load_dotenv()

import os
import glob
from tqdm.notebook import trange, tqdm

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import dask.dataframe as dd
import geopandas as gpd
from datetime import datetime as dt

import mobilkit #.loader.crop_spatial as mkcrop_spatial

# Access environment variables and define other necessary variables.
# WORKING_DIR is read from the process environment (load_dotenv() above may
# have populated it from a .env file).
data_dir = os.getenv('WORKING_DIR')
if not data_dir:
    # os.getenv returns None for an unset variable; the f-strings below would
    # then silently build paths such as 'Nonemetadata/' and the problem would
    # only surface later as a confusing file-not-found error.
    raise EnvironmentError(
        "The WORKING_DIR environment variable is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )
# The derived paths are built by plain string concatenation, so make sure the
# base directory ends with a path separator (joining with '' appends one only
# when it is missing).
data_dir = os.path.join(data_dir, '')
meta_dir = f'{data_dir}metadata/'        # shapefiles and other metadata
pq_dir = f'{data_dir}data/parquet/'      # parquet input data
out_dir_hw = f'{data_dir}data/home_work/'  # home/work outputs written by this notebook
# Tag embedded in every output file name.
data_info = 'bogota_area_2019_months1_2_60min_pings_10min_days'

In [2]:
#### FUNCTIONS FOR DATA PROCESSING ####

def find_home_work_locs(ddf, shapefile, home_hrs=(22.0, 6.0), work_hrs=(9.5,16.5)):
    """Tessellate the pings with a shapefile and compute per-user home/work stats.

    Args:
        ddf: ping data handed to ``mobilkit.spatial.tessellate`` (the notebook
            passes the dask dataframe of QC'd users).
        shapefile: path of the shapefile used as tessellation.
        home_hrs: forwarded as ``homeHours`` to ``mobilkit.stats.userHomeWork``.
            Presumably a (start, end) pair of decimal hours -- confirm against
            the mobilkit documentation.
        work_hrs: forwarded as ``workHours`` to ``mobilkit.stats.userHomeWork``
            (same presumed format as ``home_hrs``).

    Returns:
        A tuple ``(ddf_w_zones, tessellation_gdf, df_hw_locs)``:
        the tessellated data after ``userHomeWork`` has been applied, the
        tessellation returned by ``tessellate``, and the computed (in-memory)
        result of ``userHomeWorkLocation``.
    """
    # The shapefile path is passed straight to mobilkit; it was previously also
    # read here with geopandas into a variable that was never used, which only
    # added an extra read of the file.
    ddf_w_zones, tessellation_gdf = mobilkit.spatial.tessellate(
        ddf, tesselation_shp=shapefile, filterAreas=True
    )
    ddf_w_zones = mobilkit.stats.userHomeWork(
        ddf_w_zones, homeHours=home_hrs, workHours=work_hrs
    )
    # These next two steps actually detect the home/work locations and may take
    # some time: .compute() materialises the result.
    ddf_w_zones_stat = mobilkit.stats.userHomeWorkLocation(ddf_w_zones)
    df_hw_locs = ddf_w_zones_stat.compute()
    return ddf_w_zones, tessellation_gdf, df_hw_locs

def write_files_with_home_work_tile_info(df_hw_locs, tessellation_gdf, ddf_w_zones, 
                                         output_folder, data_info, shp_name, write_ddf_parquet=False): 
    """Write the home/work results for one shapefile to disk.

    Three CSV files are always written (names are prefixed with
    ``output_folder``, which is concatenated as-is and therefore expected to
    end with a path separator):

    * ``user_stats_{data_info}_w_homework_tiles_{shp_name}.csv`` -- ``df_hw_locs``
    * ``tessellation_gdf_tiles_{shp_name}.csv`` -- ``tessellation_gdf`` (no index)
    * ``tessellation_gdf_w_home_{data_info}_{shp_name}.csv`` -- the tessellation
      merged with the number of ``df_hw_locs`` rows per ``home_tile_ID``

    Args:
        df_hw_locs: table with at least ``home_tile_ID`` and ``home_pings`` columns.
        tessellation_gdf: tessellation table with a ``tile_ID`` column.
        ddf_w_zones: dataframe written as parquet when ``write_ddf_parquet`` is
            truthy; unused otherwise.
        output_folder: destination prefix/folder for all outputs.
        data_info: tag embedded in the output file names.
        shp_name: shapefile name embedded in the output file names.
        write_ddf_parquet: also write ``ddf_w_zones`` to
            ``ddf_w_zones_{data_info}_{shp_name}/`` as parquet.

    Returns:
        None.
    """
    hw_locs_filename = f'{output_folder}user_stats_{data_info}_w_homework_tiles_{shp_name}'
    ddf_w_zones_filename = f'{output_folder}ddf_w_zones_{data_info}_{shp_name}'
    tess_df_filename = f'{output_folder}tessellation_gdf_tiles_{shp_name}'
    tessellation_gdf_w_home_filename = f'{output_folder}tessellation_gdf_w_home_{data_info}_{shp_name}.csv'

    # Make sure the destination directory exists; to_csv does not create it and
    # would otherwise fail only after the expensive computation has finished.
    target_dir = os.path.dirname(hw_locs_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # After count(), 'home_pings' holds the number of rows with a non-null
    # home_pings value per home tile (one row per user, presumably -- confirm).
    homes_per_tile = (
        df_hw_locs.groupby(by="home_tile_ID").count().reset_index()[["home_tile_ID", "home_pings"]]
    )
    # Default (inner) merge: tiles without any matching home tile are dropped.
    tessellation_gdf_w_home = pd.merge(
        tessellation_gdf, homes_per_tile, left_on="tile_ID", right_on="home_tile_ID"
    )

    df_hw_locs.to_csv(f'{hw_locs_filename}.csv')
    tessellation_gdf.to_csv(f'{tess_df_filename}.csv', index=False)
    tessellation_gdf_w_home.to_csv(tessellation_gdf_w_home_filename)

    if write_ddf_parquet:
        # One parquet file per partition, named data-<partition index>.parquet.
        name_function = lambda x: f"data-{x}.parquet"
        ddf_w_zones.to_parquet(f'{ddf_w_zones_filename}/', name_function=name_function)

### Read in data for users 
This is the data for users who live in a region around Bogotá (roughly defined by a bounding box) and who passed quality control (a minimum of 60 pings and 10 active days).

In [3]:
# Parquet dataset with the pings of the users that passed QC.
data_for_qcd_users = pq_dir + 'bogota_area_year=2019_month=1and2_pass_qc.parquet'

# Open it as a dask dataframe and preview the first rows.
qc_user_data = dd.read_parquet(data_for_qcd_users)
qc_user_data.head()

Unnamed: 0,uid,lat,lng,datetime,geohash,horizontal_accuracy
0,8251b1dd-b9dc-4d1a-a5db-54863687b033,4.642742,-74.106285,2018-12-31 20:02:26-05:00,d2g63tjmymug,12.001812
1,6f64691d-f108-4e09-9a97-ccc618d8d4a6,4.618552,-74.12803,2019-01-01 05:38:24-05:00,d2g630v958x3,32.0
2,92a5a843-80e9-425f-9d4b-12fa5ae624c5,4.64666,-74.085205,2019-01-01 08:18:37-05:00,d2g66juuc85j,65.0
3,1fb81f41-7260-4fa1-b061-b4ed1e482984,4.664384,-74.07639,2019-01-01 12:48:51-05:00,d2g6d35hnbjv,10.0
4,29cf982a-3506-4d73-949a-b1eda673ec65,4.705379,-74.04616,2019-01-01 08:33:58-05:00,d2g6g094z6nu,32.0


### Compute home and work locations
For the different shapefiles, we can see whether users live in areas of the city that correspond to a particular income level, near the stations of interest, or in a particular neighborhood (depending on the shapefile used).

In [4]:
# Shapefile stems; each stem is also used as the suffix of the output file names.
shp_names = ['zat_stratum', 'Stations_Buffer_1000', 'utam_access']
# The first shapefile sits under metadata/income/, the other two directly under metadata/.
shapefiles = [
    f'{meta_dir}income/{shp_names[0]}.shp',
    f'{meta_dir}{shp_names[1]}.shp',
    f'{meta_dir}{shp_names[2]}.shp',
]

progress = tqdm(shp_names, desc='Calculating home and work locations based on the shapefiles...')
for i, shp_name in enumerate(progress):
    shapefile = shapefiles[i]
    # Note: these three names are overwritten on every pass, so after the loop
    # they hold the results for the last shapefile only.
    ddf_w_zones, tessellation_gdf, df_hw_locs = find_home_work_locs(
        qc_user_data,
        shapefile,
        home_hrs=(22.0, 6.0),
        work_hrs=(9.5, 16.5),
    )

    write_files_with_home_work_tile_info(
        df_hw_locs, tessellation_gdf, ddf_w_zones,
        out_dir_hw, data_info, shp_name=shp_name,
    )
    

Calculating home and work locations based on the shapefiles...:   0%|          | 0/3 [00:00<?, ?it/s]

  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():


### Visualize and load data

In [11]:
# NOTE: tessellation_gdf and df_hw_locs are whatever the home/work loop left
# behind, i.e. the results for its last shapefile ('utam_access'). The tiles
# summarised here are therefore UTAM regions, not ZAT regions.
# After count(), 'home_pings' is the number of rows (users) per home tile.
tessellation_gdf_w_home = pd.merge(tessellation_gdf,df_hw_locs.groupby(by="home_tile_ID").count().reset_index()[["home_tile_ID","home_pings"]],left_on="tile_ID",right_on="home_tile_ID")
print('The breakdown of users living by different neighborhoods that we can detect is as follows:\n', tessellation_gdf_w_home[['LOCNombre', 'home_pings']].head(10))
print(f"There are {tessellation_gdf_w_home['home_pings'].sum()} users that live in the UTAM regions that pass the filtering conditions for min pings and days detected.")

The breakdown of users living by different neighborhoods that we can detect is as follows:
         LOCNombre  home_pings
0     TEUSAQUILLO        2278
1         KENNEDY         100
2     TEUSAQUILLO         535
3     TEUSAQUILLO         715
4        SANTA FE         273
5         KENNEDY         412
6   PUENTE ARANDA         172
7        FONTIBON         374
8  BARRIOS UNIDOS         470
9        FONTIBON         546
There are 46268 users that live in the ZAT regions that pass the filtering conditions for min pings and days detected.


Let's visualize the number of people living in the three neighborhoods that Elena mentioned as areas of focus.

In [22]:
# Localities (LOCNombre values) of interest.
places = ['SAN CRISTOBAL', 'CIUDAD BOLIVAR', 'RAFAEL URIBE URIBE']

# Keep the tiles belonging to those localities, then total the per-tile home
# counts for each locality.
in_focus_locality = tessellation_gdf_w_home['LOCNombre'].isin(places)
df = tessellation_gdf_w_home[in_focus_locality]
grouped_df = df.groupby('LOCNombre')['home_pings'].sum()
grouped_df

LOCNombre
CIUDAD BOLIVAR        659
RAFAEL URIBE URIBE    350
SAN CRISTOBAL         370
Name: home_pings, dtype: int64

In [23]:
# List every column carried by the tessellation, then preview its first rows.
print(tessellation_gdf.columns)
tessellation_gdf.head()

Index(['MUNCodigo', 'MUNNombre', 'LOCNombre', 'USOSNum', 'USOPreNum',
       'USOPreCor', 'ESTRATO1', 'ESTRATO2', 'ESTRATO3', 'ESTRATO4', 'ESTRATO5',
       'ESTRATO6', 'ESTRATOPre', 'HOGARES', 'UTAM', 'UTAMNombre', 'UTAMArea',
       'homes', 'jobs', 'grav_home', 'grav_work', 'grav_both', 'amenities',
       'code', 'stratum', 'grav_amen', 'stratum2', 'grav_acces', 'dist_city_',
       'tot_amen', 'ca_o', 'ca_ew', 'ca_e', 'ca_b', 'access', 'grav_suppl',
       'ca_w', 'ca_c', 'ca_a', 'dcb', 'geometry', 'tile_ID'],
      dtype='object')


Unnamed: 0,MUNCodigo,MUNNombre,LOCNombre,USOSNum,USOPreNum,USOPreCor,ESTRATO1,ESTRATO2,ESTRATO3,ESTRATO4,...,ca_e,ca_b,access,grav_suppl,ca_w,ca_c,ca_a,dcb,geometry,tile_ID
0,11001.0,BOGOTA,TEUSAQUILLO,12345.0,1.0,RESIDENCIAL,NO,NO,SI,SI,...,0.001551,0.009383,3533.776012,3533.776012,0.015663,0.010204,0.00951,2687.619389,"POLYGON ((-74.06463 4.64965, -74.06619 4.64146...",0
1,11001.0,BOGOTA,KENNEDY,12345.0,2.0,COMERCIO Y SERVICIOS,SI,SI,NO,NO,...,0.000322,0.00845,3831.822018,3831.822018,0.009329,0.008428,0.008453,13824.656558,"POLYGON ((-74.17077 4.62848, -74.17141 4.62811...",1
2,11001.0,BOGOTA,TEUSAQUILLO,1234.0,1.0,RESIDENCIAL,NO,SI,SI,SI,...,0.001774,0.009334,3681.716567,3681.716567,0.012395,0.010293,0.009487,5100.30713,"POLYGON ((-74.09245 4.63648, -74.09184 4.63569...",2
3,11001.0,BOGOTA,TEUSAQUILLO,12345.0,1.0,RESIDENCIAL,NO,SI,SI,SI,...,0.001304,0.009147,3466.647544,3466.647544,0.014174,0.009899,0.009267,4144.480416,"POLYGON ((-74.06707 4.63644, -74.06840 4.62882...",3
4,11001.0,BOGOTA,SANTA FE,12345.0,2.0,COMERCIO Y SERVICIOS,SI,NO,SI,SI,...,0.000831,0.008915,3301.860312,3301.860312,0.011348,0.00939,0.008995,4993.453065,"POLYGON ((-74.05722 4.62403, -74.05719 4.62404...",4


In [45]:
# Subset of tessellation columns to display for the focus localities.
cols = [
    'MUNNombre', 'LOCNombre', 'USOPreNum', 'USOPreCor',
    'ESTRATO1', 'ESTRATO2', 'ESTRATO3', 'ESTRATO4', 'ESTRATO5', 'ESTRATO6',
    'ESTRATOPre', 'HOGARES', 'UTAM', 'UTAMNombre', 'UTAMArea',
    'homes', 'jobs', 'tile_ID',
]

# Tiles whose LOCNombre is one of `places`, restricted to the columns above.
tessellation_gdf.loc[tessellation_gdf['LOCNombre'].isin(places), cols].head(20)

Unnamed: 0,MUNNombre,LOCNombre,USOPreNum,USOPreCor,ESTRATO1,ESTRATO2,ESTRATO3,ESTRATO4,ESTRATO5,ESTRATO6,ESTRATOPre,HOGARES,UTAM,UTAMNombre,UTAMArea,homes,jobs,tile_ID
25,BOGOTA,SAN CRISTOBAL,1.0,RESIDENCIAL,SI,SI,SI,NO,NO,NO,3.0,12900.0,UTAM33,SOSIEGO,2346004.0,39515.390479,16532.49166,25
38,BOGOTA,CIUDAD BOLIVAR,1.0,RESIDENCIAL,SI,SI,SI,NO,NO,NO,1.0,12593.0,UTAM68,EL TESORO,2105724.0,48538.403764,2456.964133,38
39,BOGOTA,CIUDAD BOLIVAR,3.0,INDUSTRIAL,SI,SI,SI,NO,SI,NO,2.0,1185.0,UTAM64,MONTE BLANCO,6947220.0,6000.740328,0.0,39
45,BOGOTA,CIUDAD BOLIVAR,1.0,RESIDENCIAL,SI,SI,NO,NO,NO,NO,1.0,41890.0,UTAM67,EL LUCERO,5855784.0,175610.635738,11144.453745,45
46,BOGOTA,CIUDAD BOLIVAR,2.0,COMERCIO Y SERVICIOS,NO,SI,NO,NO,NO,NO,2.0,0.0,UTAM63,EL MOCHUELO,3163016.0,0.0,551.974308,46
47,BOGOTA,RAFAEL URIBE URIBE,1.0,RESIDENCIAL,SI,SI,SI,NO,NO,NO,1.0,21083.0,UTAM55,DIANA TURBAY,2111462.0,80582.015225,2013.638936,47
48,BOGOTA,SAN CRISTOBAL,1.0,RESIDENCIAL,SI,SI,NO,NO,NO,NO,2.0,19541.0,UTAM51,LOS LIBERTADORES,3644477.0,76930.224521,3138.559297,48
51,BOGOTA,CIUDAD BOLIVAR,1.0,RESIDENCIAL,SI,SI,SI,NO,NO,NO,3.0,17551.0,UTAM65,ARBORIZADORA,3058416.0,90313.367489,12637.049789,51
57,BOGOTA,SAN CRISTOBAL,1.0,RESIDENCIAL,SI,SI,NO,NO,NO,NO,2.0,29022.0,UTAM50,LA GLORIA,3853381.0,97100.132462,5792.401344,57
71,BOGOTA,CIUDAD BOLIVAR,1.0,RESIDENCIAL,SI,SI,NO,NO,NO,NO,1.0,26279.0,UTAM70,JERUSALEM,5367312.0,107833.227222,4818.932034,71


In [38]:
# Re-load the per-tile home counts written for the 'zat_stratum' shapefile.
# The name is assembled from data_info (instead of a hard-coded copy of it) so
# it keeps matching the file written earlier if the configuration changes.
tessellation_gdf_zat_name = f'{out_dir_hw}tessellation_gdf_w_home_{data_info}_zat_stratum.csv'
# The CSV was written with its index, which shows up as the 'Unnamed: 0' column.
tessellation_gdf_zat = pd.read_csv(tessellation_gdf_zat_name, index_col='Unnamed: 0')
# Keep only the tiles with municipality code 11001 (the rows shown below carry
# NOMMun 'Bogotá' for this code).
tessellation_gdf_zat = tessellation_gdf_zat[tessellation_gdf_zat['MUNCod'] == 11001.0]
tessellation_gdf_zat.head()

Unnamed: 0,Area,MUNCod,NOMMun,ZAT,UTAM,homes,jobs,code,stratum,geometry,tile_ID,home_tile_ID,home_pings
2,1085719000.0,11001.0,Bogotá,796.0,UPR3,0.0,440.719962,796.0,,POLYGON Z ((-74.08374277396507 4.4969999529411...,5,5.0,69
11,958365.1,11001.0,Bogotá,83.0,UTAM13,662.265211,1884.648559,83.0,5.146584,POLYGON Z ((-74.02063088880436 4.7294332330128...,22,22.0,60
12,324491.5,11001.0,Bogotá,854.0,UTAM11,0.0,710.276263,854.0,2.637681,POLYGON Z ((-74.02062215279005 4.7294345082514...,23,23.0,14
13,662306.7,11001.0,Bogotá,57.0,UTAM11,7700.034072,326.478939,57.0,1.836368,POLYGON Z ((-74.02006000084828 4.7464387396365...,24,24.0,16
14,194223.3,11001.0,Bogotá,20.0,UTAM10,757.632297,950.94803,20.0,2.114516,POLYGON Z ((-74.02245966468159 4.7465083522347...,25,25.0,14


In [40]:
# Re-load the per-tile home counts written for the 'Stations_Buffer_1000'
# shapefile. The name is assembled from data_info (instead of a hard-coded copy
# of it) so it keeps matching the file written earlier.
tessellation_gdf_stations_name = f'{out_dir_hw}tessellation_gdf_w_home_{data_info}_Stations_Buffer_1000.csv'
# The CSV was written with its index, which shows up as the 'Unnamed: 0' column.
tessellation_gdf_stations = pd.read_csv(tessellation_gdf_stations_name, index_col='Unnamed: 0')
tessellation_gdf_stations.head()

Unnamed: 0,Station,Y_wgs,X_wgs,BUFF_DIST,ORIG_FID,Shape_Leng,Shape_Area,geometry,tile_ID,home_tile_ID,home_pings
0,El Tunal,4.569088,-74.139714,1000.0,1,0.056719,0.000256,POLYGON ((-74.13971421699995 4.578131272000064...,0,0.0,1361
1,Juan Pablo II,4.556036,-74.147474,1000.0,2,0.056718,0.000256,POLYGON ((-74.14747414399994 4.565078821000043...,1,1.0,345
2,Manitas,4.550582,-74.150299,1000.0,3,0.056718,0.000256,POLYGON ((-74.15029856599995 4.559624694000036...,2,2.0,277
3,Mirador El Paraiso,4.550355,-74.158587,1000.0,4,0.056718,0.000256,POLYGON ((-74.15858652499998 4.559397704000048...,3,3.0,136


In [42]:
# would need to load the data for the stations first 
df = tessellation_gdf_stations[tessellation_gdf_stations['home_pings'] > 0]
print('The breakdown of users living by different stations that we can detect is as follows:\n', df[['Station', 'home_pings']])
print(f"There are {df['home_pings'].sum()} users that live in the ZAT regions that pass the filtering conditions for min pings and days detected.")

The breakdown of users living by different stations that we can detect is as follows:
               Station  home_pings
0            El Tunal        1361
1       Juan Pablo II         345
2             Manitas         277
3  Mirador El Paraiso         136
There are 2119 users that live in the ZAT regions that pass the filtering conditions for min pings and days detected.


In [62]:
# Re-load the per-user home/work table written for the 'Stations_Buffer_1000'
# shapefile. The name is assembled from data_info (instead of a hard-coded copy
# of it) so it keeps matching the file written earlier.
df_hw_locs_stations_name = f'{out_dir_hw}user_stats_{data_info}_w_homework_tiles_Stations_Buffer_1000.csv'
df_hw_locs_stations = pd.read_csv(df_hw_locs_stations_name)
# Number of rows in the table, followed by a preview.
print(len(df_hw_locs_stations))
df_hw_locs_stations.head()

2962


Unnamed: 0,uid,tot_pings,home_tile_ID,lat_home,lng_home,home_pings,work_tile_ID,lat_work,lng_work,work_pings
0,00002eec-9e3e-4e4d-9822-4e4858a0de0c,51.0,0.0,4.566214,-74.131834,4.0,0.0,,,0.0
1,0015bd0e-0604-4025-bebc-7279d2bee852,138.0,3.0,4.550926,-74.161019,8.0,3.0,4.551377,-74.16088,24.0
2,00206b33-0b12-49a5-8078-7047c04231b2,337.0,0.0,4.569858,-74.146072,63.0,0.0,4.570332,-74.14541,61.0
3,0022c667-dee1-449f-8d97-d1e53903e9d0,57.0,1.0,4.560172,-74.147907,9.0,1.0,4.560224,-74.147857,27.0
4,00269c64-4cbc-4ec6-af68-8ba37320b4a2,122.0,2.0,4.554071,-74.146554,88.0,0.0,,,0.0
