## Home and work location and POI Mapping 
The purpose of this notebook is to take the data for the users that passed qc and calculate home and work locations for those users using various shapefiles.

1) Compute and write out home and work locations 
2) Map POI information 

### Data reading and package imports

In [1]:
from dotenv import load_dotenv
load_dotenv()

import os
import glob
from pathlib import Path
from pprint import pprint
import math
from tqdm.notebook import trange, tqdm


import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import dask.dataframe as dd
import geopandas as gpd
from shapely.geometry import Point
from datetime import datetime as dt


import mobilkit #.loader.crop_spatial as mkcrop_spatial

# Access environment variables and define other necessary variables
data_dir = os.getenv('WORKING_DIR')
meta_dir = f'{data_dir}metadata/'
pq_dir = f'{data_dir}data/parquet/bogota_area_passqc/'
out_dir_hw = f'{data_dir}data/home_work/'

In [2]:
#### FUNCTIONS FOR DATA PROCESSING ####

def filter_pings(df, shapefile_path):
    """
    This function filters a DataFrame to include only rows where the point
    defined by the 'lat' and 'lng' columns falls within one of
    the regions defined in the global gdf_regions variable.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame with 'lat' and 'lng' columns.
    shapefile_path : str
        The path to the shapefile that defines the regions.

    Returns
    -------
    geopandas.GeoDataFrame
        A GeoDataFrame containing only the rows of df where the point falls
        within one of the regions. The 'lat' and 'lng' columns
        are replaced with a 'geometry' column that contains Point objects.
    """
    # Load shapefile
    gdf_regions = gpd.read_file(shapefile_path)

    # Convert shapefile's CRS to WGS84 if it's not already
    gdf_regions = gdf_regions.to_crs('EPSG:4326')

    geometry = [Point(xy) for xy in zip(df['lng'], df['lat'])]
    #df = df.drop(['lng', 'lat'], axis=1)
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')
    return gpd.sjoin(gdf, gdf_regions, op='within')

def filter_pings_w_shapefile(ping_pq_path, shapefile_path): 
    # Define an empty DataFrame to store the filtered pings
    filtered_pings = pd.DataFrame()

    # Open the parquet file
    parquet_file = pq.ParquetFile(ping_pq_path)
    cols = ['uid', 'datetime', 'lat', 'lng']

    # Define batch size
    batch_size = 10 ** 6  # adjust this value to your system's memory

    # Initialize progress bar
    total_batches = parquet_file.metadata.num_rows // batch_size
    pbar = tqdm(total=total_batches)

    # Read and filter the parquet file in batches
    for batch in parquet_file.iter_batches(batch_size, columns=cols):
        df = batch.to_pandas()
        filtered_chunk = filter_pings(df, shapefile_path)
        filtered_pings = pd.concat([filtered_pings, filtered_chunk])

        # Update progress bar
        pbar.update(1)

    # Close progress bar
    pbar.close()

    # Save the filtered pings to a new parquet file
    filtered_pq = f"{ping_pq_path.split('.parquet')[0]}_filtered.parquet"
    print(f'Writing to {filtered_pq}')
    filtered_pings.to_parquet(filtered_pq)
    filtered_pings = None 
    return

def find_home_work_locs(ddf, shapefile, home_hrs=(22.0, 6.0), work_hrs=(9.5,16.5)):
    #regions = gpd.read_file(shapefile)
    ddf_w_zones, tessellation_gdf = mobilkit.spatial.tessellate(ddf,tesselation_shp=shapefile,filterAreas=True)
    ddf_w_zones = mobilkit.stats.userHomeWork(ddf_w_zones,
                                         homeHours=home_hrs,
                                         workHours=work_hrs)
    #these next two actually detect home locations and may take some time
    ddf_w_zones_stat= mobilkit.stats.userHomeWorkLocation(ddf_w_zones)
    df_hw_locs = ddf_w_zones_stat.compute()
    return ddf_w_zones, tessellation_gdf, df_hw_locs

def write_files_with_home_work_tile_info(df_hw_locs, tessellation_gdf, ddf_w_zones, 
                                         output_folder, data_info, shp_name, write_ddf_parquet=False): 
    hw_locs_filename = f'{output_folder}user_stats_{data_info}_w_homework_tiles_{shp_name}'
    ddf_w_zones_filename = f'{output_folder}ddf_w_zones_{data_info}_{shp_name}'
    tess_df_filename = f'{output_folder}tessellation_gdf_tiles_{shp_name}'
    tessellation_gdf_w_home = pd.merge(tessellation_gdf,df_hw_locs.groupby(by="home_tile_ID").count().reset_index()[["home_tile_ID","home_pings"]],left_on="tile_ID",right_on="home_tile_ID")
    tessellation_gdf_w_home_filename = f'{output_folder}tessellation_gdf_w_home_{data_info}_{shp_name}.csv'
    df_hw_locs.to_csv(f'{hw_locs_filename}.csv')
    tessellation_gdf.to_csv(f'{tess_df_filename}.csv', index=False)
    tessellation_gdf_w_home.to_csv(tessellation_gdf_w_home_filename)
    if write_ddf_parquet==True: 
        name_function = lambda x: f"data-{x}.parquet"
        ddf_w_zones.to_parquet(f'{ddf_w_zones_filename}/', name_function=name_function)
    return 

In [3]:
from pathlib import Path

def filter_pings_w_shapefile(ping_pq_path, shapefile_path): 
    # Define the path for the new Parquet file
    filtered_pq_path = Path(ping_pq_path).parent
    filtered_pq_file = Path(ping_pq_path).stem + "_filtered"

    # Open the parquet file
    parquet_file = pq.ParquetFile(ping_pq_path)
    cols = ['uid', 'datetime', 'lat', 'lng']

    # Define batch size
    batch_size = 1000000  # adjust this value to your system's memory

    # Initialize progress bar
    total_batches = parquet_file.metadata.num_rows // batch_size
    pbar = tqdm(total=total_batches)

    # Columns to keep after filtering
    columns_to_keep = ['uid', 'lat', 'lng', 'datetime', 'LOCNombre', 'USOPreCor', 'UTAM', 'UTAMNombre']

    # Read and filter the parquet file in batches
    for i, batch in enumerate(parquet_file.iter_batches(batch_size, columns=cols)):
        df = batch.to_pandas()
        filtered_chunk = filter_pings(df, shapefile_path)

        # Select only the columns to keep
        filtered_chunk = filtered_chunk[columns_to_keep]

        # Convert filtered_chunk to PyArrow Table
        table = pa.Table.from_pandas(filtered_chunk)

        # Write the filtered chunk directly to a new Parquet file
        batch_file = filtered_pq_path / f"{filtered_pq_file}_{i}.parquet"
        print(f'Writing to {batch_file}')
        pq.write_table(table, batch_file)

        # Update progress bar
        pbar.update(1)

    # Close progress bar
    pbar.close()


###########

def filter_pings_w_shapefile(ping_pq_path, shapefile_path): 
    # Define the path for the new Parquet file
    parent_path = Path(ping_pq_path).parent
    base_name = Path(ping_pq_path).stem
    output_dir = parent_path / f"{base_name}_filtered"
    os.makedirs(output_dir, exist_ok=True)

    # Open the parquet file
    parquet_file = pq.ParquetFile(ping_pq_path)
    cols = ['uid', 'datetime', 'lat', 'lng']

    # Define batch size
    batch_size_bytes = 5 * 10**8  # Set your batch size in bytes. Here it's 2GB.

    # Initialize progress bar
    total_file_size = os.path.getsize(ping_pq_path)#/1.5 #divide because only reading in some columns 
    total_batches = math.ceil(total_file_size / batch_size_bytes)
    pbar = tqdm(total=total_batches)

    # Columns to keep after filtering
    columns_to_keep = ['uid', 'lat', 'lng', 'datetime', 'LOCNombre', 'USOPreCor', 'UTAM', 'UTAMNombre']

    # Read and filter the parquet file in batches
    for i, batch in enumerate(parquet_file.iter_batches(batch_size_bytes, columns=cols)):
        df = batch.to_pandas()
        filtered_chunk = filter_pings(df, shapefile_path)

        # Select only the columns to keep
        filtered_chunk = filtered_chunk[columns_to_keep]

        # Convert filtered_chunk to PyArrow Table
        table = pa.Table.from_pandas(filtered_chunk)

        # Write the filtered chunk directly to a new Parquet file
        batch_file = output_dir / f"{base_name}_{i}.parquet"
        print(f'Writing to {batch_file}')
        pq.write_table(table, batch_file)

        # Update progress bar
        pbar.update(1)

    # Close progress bar
    pbar.close()


### Read in data for users 
This is the data for users that live in a region around Bogota (defined roughly by a boundary box), that passed quality control (minimum 60 pings and 10 days active)

In [4]:
#data_for_qcd_users = f'{pq_dir}bogota_area_year=2019_month=1and2_pass_qc.parquet'
#data_for_qcd_users = f'{pq_dir}bogota_area_year=2019_month=3_pass_qc.parquet'
#qc_user_data = dd.read_parquet(data_for_qcd_users)
#qc_user_data.head()

## Compute home and work locations
For the different shapefiles, we can see whether users live in certain areas of the city that correspond with a particular income, near the stations of interest, and in a particular neighbhorhood (depending on the shapefile used).

### Filter pings outside of shapefile zones

For now, I was able to run this on the first 4 months of user data

In [4]:
shp_name = 'union_utam_localidad_study_area'
shapefile = f'{meta_dir}{shp_name}.shp'
gdf_regions = gpd.read_file(shapefile)
gdf_regions

Unnamed: 0,FID_poligo,Nombre_de_,Acto_admin,Area_de_la,Identifica,FID_utam_a,MUNCodigo,MUNNombre,LOCNombre,USOSNum,...,ca_b,access,grav_suppl,ca_w,ca_c,ca_a,dcb,Shape_Leng,Shape_Area,geometry
0,0,CIUDAD BOLIVAR,Acuerdo 14 de 1983,130002593080624,19,-1,0.0,,,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.677431,7.697443e-03,"MULTIPOLYGON (((-74.18090 4.58738, -74.18095 4..."
1,1,SUBA,Acuerdo 8 de 1977,100560477718254,11,-1,0.0,,,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.545705,3.083258e-03,"MULTIPOLYGON (((-74.04228 4.83042, -74.04242 4..."
2,3,KENNEDY,Acuerdo 8 de 1977,38589733814235,08,-1,0.0,,,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038693,1.870588e-06,"MULTIPOLYGON (((-74.16109 4.66299, -74.16157 4..."
3,4,USME,Acuerdo 15 de 1993,215066686644374,05,-1,0.0,,,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.980154,1.504180e-02,"MULTIPOLYGON (((-74.05597 4.50832, -74.05611 4..."
4,6,SANTA FE,Acuerdo 117 de 2003,45170645863322,03,-1,0.0,,,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.310500,3.113309e-03,"MULTIPOLYGON (((-74.05516 4.62272, -74.05516 4..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,8,FONTIBON,Acuerdo 8 de 1977,33281002494914,09,113,25473.0,MOSQUERA,,35.0,...,0.007637,2991.657453,2991.657453,0.001199,0.006675,0.007499,13016.989194,0.000885,7.259953e-09,"POLYGON ((-74.17240 4.69600, -74.17239 4.69606..."
275,8,FONTIBON,Acuerdo 8 de 1977,33281002494914,09,105,11001.0,BOGOTA,FONTIBON,12345.0,...,0.008606,3616.626913,3616.626913,0.004103,0.008588,0.008621,9604.609900,0.002404,6.215887e-08,"POLYGON ((-74.16982 4.68962, -74.17013 4.68988..."
276,8,FONTIBON,Acuerdo 8 de 1977,33281002494914,09,113,25473.0,MOSQUERA,,35.0,...,0.007637,2991.657453,2991.657453,0.001199,0.006675,0.007499,13016.989194,0.002404,6.215887e-08,"POLYGON ((-74.16982 4.68962, -74.17013 4.68988..."
277,18,BOSA,Acuerdo 14 de 1983,23933196086856,07,96,11001.0,BOGOTA,BOSA,12345.0,...,0.007900,3594.007792,3594.007792,0.007778,0.007458,0.007827,15225.694636,0.005466,1.018373e-07,"MULTIPOLYGON (((-74.20273 4.60052, -74.20406 4..."


In [5]:
shp_names = ['zat_stratum', 'Stations_Buffer_1000', 'utam_access']
shapefiles = [f'{meta_dir}income/{shp_names[0]}.shp', f'{meta_dir}{shp_names[1]}.shp', f'{meta_dir}{shp_names[2]}.shp']

for i in tqdm(range(0,1), desc=f'Calculating pings to include based on the shapefiles...'): 
    data_info  = f'bogota_area_year=2019_month={str(i)}_pass_qc'
    ping_pq_path = f'{pq_dir}{data_info}.parquet'
    shp_name, shapefile = shp_names[2], shapefiles[2]
    print('This function takes too long and my computer runs out of memory - fix later.')
    #filter_pings_w_shapefile(ping_pq_path, shapefile)

Calculating pings to include based on the shapefiles...:   0%|          | 0/1 [00:00<?, ?it/s]

This function takes too long and my computer runs out of memory - fix later.


### Compute home and work locations using the filtered pings

In [6]:
pings_paths = glob.glob((pq_dir + '*filtered.parquet'))
pprint(pings_paths)
cols = ['uid', 'datetime', 'lat', 'lng']
loc_cols_from_shp = ['LOCNombre', 'USOPreCor', 'UTAM', 'UTAMNombre'] # can see the UTAM and other information per ping if these cols are added
ddf = dd.read_parquet(pings_paths, columns=(cols))
ddf.head()

['/Users/emilyrobitschek/git/ETH/SPUR/mobile_data_colombia/data/parquet/bogota_area_passqc/bogota_area_year=2019_month=2_pass_qc_filtered.parquet',
 '/Users/emilyrobitschek/git/ETH/SPUR/mobile_data_colombia/data/parquet/bogota_area_passqc/bogota_area_year=2019_month=3_pass_qc_filtered.parquet',
 '/Users/emilyrobitschek/git/ETH/SPUR/mobile_data_colombia/data/parquet/bogota_area_passqc/bogota_area_year=2019_month=1_pass_qc_filtered.parquet']


Unnamed: 0,uid,datetime,lat,lng
0,8251b1dd-b9dc-4d1a-a5db-54863687b033,2018-12-31 20:02:26-05:00,4.642742,-74.106285
137,6d2eb92a-40b3-4797-bafa-5a9521e99cff,2018-12-31 19:13:05-05:00,4.644672,-74.101845
195,6de78d80-cda4-4ad6-8cce-eb2b1f1d1801,2019-01-01 04:19:02-05:00,4.638697,-74.09611
834,ac245d6c-c27f-4bbe-8d10-41e3149ed4fb,2019-01-01 18:35:57-05:00,4.640657,-74.09967
879,2cfdc8c7-e51a-411a-9b9e-dd44cd7ceb98,2019-01-01 18:46:53-05:00,4.643566,-74.105545


In [8]:
#ddf_test_path = f"{ping_pq_path.split('.parquet')[0]}_filtered.parquet"
#ddf_test = dd.read_parquet(ddf_test_path)
shp_names = ['utam_access']
shapefiles = [f'{meta_dir}{shp_names[0]}.shp']

def find_home_work_locs(ddf, shapefile, home_hrs=(22.0, 6.0), work_hrs=(9.5,16.5)):
    ddf_w_zones, tessellation_gdf = mobilkit.spatial.tessellate(ddf,tesselation_shp=shapefile, filterAreas=True)
    ddf_w_zones = mobilkit.stats.userHomeWork(ddf_w_zones,
                                         homeHours=home_hrs,
                                         workHours=work_hrs)
    #these next two actually detect home locations and may take some time
    ddf_w_zones_stat = mobilkit.stats.userHomeWorkLocation(ddf_w_zones)
    df_hw_locs = ddf_w_zones_stat.compute()
    return ddf_w_zones, tessellation_gdf, df_hw_locs

for i in tqdm(range(0,len(shp_names)), desc=f'Calculating home and work locations based on the shapefiles...'): 
    shp_name, shapefile = shp_names[i], shapefiles[i]
    ddf_w_zones, tessellation_gdf, df_hw_locs = find_home_work_locs(ddf, shapefile, 
                                                        home_hrs=(22.0, 6.0), work_hrs=(9.5,16.5))
    write_files_with_home_work_tile_info(df_hw_locs, tessellation_gdf, ddf_w_zones, 
                                         out_dir_hw, data_info, shp_name=shp_name)

Calculating home and work locations based on the shapefiles...:   0%|          | 0/1 [00:00<?, ?it/s]

  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():


### Visualize the results

In [97]:
home_locs = df_hw_locs[df_hw_locs['home_pings']>0] 
work_locs = df_hw_locs[df_hw_locs['work_pings']>0] 
home_w_work = home_locs[home_locs['work_pings']>0]
print(f'There are {len(df_hw_locs)} users in the dataset, {len(home_locs)} of which have home pings.')
print(f'There are {len(df_hw_locs)} users in the dataset, {len(work_locs)} of which have work pings.')
print(f'There are {len(home_w_work)} users in the dataset with home and work pings.')
home_locs.home_tile_ID.value_counts()
home_locs.head()

There are 250124 users in the dataset, 164769 of which have home pings.
There are 250124 users in the dataset, 168269 of which have work pings.
There are 124629 users in the dataset with home and work pings.


Unnamed: 0_level_0,tot_pings,home_tile_ID,lat_home,lng_home,home_pings,work_tile_ID,lat_work,lng_work,work_pings
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00032a22-9681-47d0-9381-f7b5e293c8e5,4.0,70.0,4.616458,-74.16872,1.0,70.0,,,0.0
0003a978-c383-4f67-8a5b-785b263964c0,31.0,4.0,4.65163,-74.156181,1.0,82.0,4.651726,-74.156231,2.0
0006df93-3e2f-41f7-acd5-75a1b2481b43,7.0,8.0,4.741859,-74.04475,1.0,70.0,4.699833,-74.08959,1.0
00071501-1df6-4eee-bbc4-dfd118c868c0,7.0,16.0,4.651428,-74.151798,2.0,13.0,,,0.0
00093784-f444-4b74-b683-44963b19d566,49.0,9.0,4.613179,-74.167755,1.0,55.0,4.638473,-74.116814,1.0


In [80]:
tessellation_gdf_w_home = pd.merge(tessellation_gdf, home_locs.groupby(by="home_tile_ID").count().reset_index()[["home_tile_ID","home_pings"]],left_on="tile_ID",right_on="home_tile_ID")
print('The breakdown of users living by different neighborhoods that we can detect is as follows:\n', tessellation_gdf_w_home[['LOCNombre', 'home_pings']].head(10))
print(f"There are {tessellation_gdf_w_home['home_pings'].sum()} users that live in the ZAT regions that pass the filtering conditions for min pings and days detected.")

The breakdown of users living by different neighborhoods that we can detect is as follows:
         LOCNombre  home_pings
0     TEUSAQUILLO        6220
1         KENNEDY          41
2     TEUSAQUILLO        1602
3     TEUSAQUILLO        4614
4        SANTA FE        2526
5         KENNEDY         458
6   PUENTE ARANDA         860
7        FONTIBON         511
8  BARRIOS UNIDOS        5598
9        FONTIBON        1916
There are 164769 users that live in the ZAT regions that pass the filtering conditions for min pings and days detected.


In [66]:
localidades = ['SAN CRISTOBAL', 'CIUDAD BOLIVAR', 'RAFAEL URIBE URIBE']
loc_df = tessellation_gdf_w_home[tessellation_gdf_w_home['LOCNombre'].isin(localidades)]
grouped_loc_df = loc_df.groupby('LOCNombre')['home_pings'].sum()
grouped_loc_df

LOCNombre
CIUDAD BOLIVAR        410
RAFAEL URIBE URIBE    264
SAN CRISTOBAL         373
Name: home_pings, dtype: int64

In [109]:
utams = ['EL LUCERO', 'EL TESORO', 'SAN FRANCISCO']
df = tessellation_gdf_w_home[tessellation_gdf_w_home['UTAMNombre'].isin(utams)]
grouped_df = df.groupby('UTAMNombre')['home_pings'].sum()
grouped_df

UTAMNombre
EL LUCERO        108
EL TESORO          5
SAN FRANCISCO     15
Name: home_pings, dtype: int64

In [108]:
tessellation_gdf[tessellation_gdf['UTAMNombre']=='SAN FRANCISCO']['tile_ID']

73    73
Name: tile_ID, dtype: int64

In [21]:
import folium
import math

def plot_data_foliummap(shapefile_path, df, lat_col, lng_col):
    user_data = df #ddf.sample(frac=frac).compute()
    map_df=gpd.read_file(shapefile_path)

    # Create a folium map centered on the user's latitude and longitude
    center_lat = user_data.iloc[0][lat_col]
    center_lng = user_data.iloc[0][lng_col]
    map_obj = folium.Map(location=[center_lat, center_lng], zoom_start=12)

    # Add markers for each measurement
    for index, row in user_data.iterrows():
        #folium.Marker([row[lat_col], row[lng_col]]).add_to(map_obj)
        if math.isnan(row['lat_home']):
            pass
        else: 
            folium.Circle(radius=0.1, location=[row[lat_col], row[lng_col]], color="orange", fill=False).add_to(map_obj)
        if math.isnan(row['lat_work']):
            pass
        else: 
            folium.Circle(radius=0.1, location=[row['lat_work'], row['lng_work']], color="purple", fill=False).add_to(map_obj)
    # Add the shapefile as an overlay
    folium.GeoJson(map_df).add_to(map_obj)

    return map_obj, user_data

home_locs_formap = home_locs
map_obj, user_data = plot_data_foliummap(shapefile_path=shapefiles[0], df=home_locs_formap, lat_col='lat_home', lng_col='lng_home')
map_obj.save(f"{data_dir}figures/user_month1-3_home_work_utam_w_utam_overlay.html")

In [38]:
population_per_area = home_locs.reset_index().groupby("home_tile_ID").agg({
                                                "uid": "nunique",
                                                "home_pings": "sum"}).reset_index()

population_per_area = population_per_area.rename(columns={
                                                "home_tile_ID": "tile_ID",
                                                "uid": "POP_DATA",
                                                "home_pings": "pings"})

population_per_area.head()

Unnamed: 0,tile_ID,POP_DATA,pings
0,0.0,6220,10290.0
1,1.0,41,41.0
2,2.0,1602,2427.0
3,3.0,4614,6954.0
4,4.0,2526,3343.0


In [102]:
home_summary = pd.merge(tessellation_gdf, population_per_area, left_on="tile_ID",right_on="tile_ID")
localidades = ['RAFAEL URIBE URIBE', 'SAN CRISTOBAL', 'CIUDAD BOLIVAR']
home_summary_locs = home_summary[home_summary['LOCNombre'].isin(localidades)]
home_summary_locs.groupby(by="LOCNombre")[['LOCNombre', 'UTAMNombre', 'tile_ID', 'POP_DATA', 'pings']].head()

Unnamed: 0,LOCNombre,UTAMNombre,tile_ID,POP_DATA,pings
25,SAN CRISTOBAL,SOSIEGO,25,70,86.0
38,CIUDAD BOLIVAR,EL TESORO,38,5,5.0
44,CIUDAD BOLIVAR,EL LUCERO,45,108,168.0
45,CIUDAD BOLIVAR,EL MOCHUELO,46,18,23.0
46,RAFAEL URIBE URIBE,DIANA TURBAY,47,21,36.0
47,SAN CRISTOBAL,LOS LIBERTADORES,48,6,6.0
49,CIUDAD BOLIVAR,ARBORIZADORA,51,147,162.0
55,SAN CRISTOBAL,LA GLORIA,57,22,24.0
69,CIUDAD BOLIVAR,JERUSALEM,71,22,37.0
70,RAFAEL URIBE URIBE,SAN JOSE,72,24,35.0


In [101]:
#plot the home locations of users in the 3 localidades we care about
def plot_homes_foliummap(shapefile_path, df, lat_col, lng_col):
    user_data = df #ddf.sample(frac=frac).compute()
    map_df=gpd.read_file(shapefile_path)

    # Create a folium map centered on the user's latitude and longitude
    center_lat = user_data.iloc[0][lat_col]
    center_lng = user_data.iloc[0][lng_col]
    map_obj = folium.Map(location=[center_lat, center_lng], zoom_start=12)

    # Add markers for each measurement
    for index, row in user_data.iterrows():
        #folium.Marker([row[lat_col], row[lng_col]]).add_to(map_obj)
        if row.home_tile_ID in ['25', '48', '57', '94', '97']: #'SAN CRISTOBAL'
            folium.Circle(radius=0.5, location=[row[lat_col], row[lng_col]], color="crimson", fill=False).add_to(map_obj)
        elif row.home_tile_ID in ['38', '45', '46', '51', '71']: #'CUIDAD BOLIVAR'
            folium.Circle(radius=0.5, location=[row[lat_col], row[lng_col]], color="green", fill=False).add_to(map_obj)
        elif row.home_tile_ID in ['47', '72', '106', '107', '109']: # 'RAFAEL URIBE URIBE'
            folium.Circle(radius=0.5, location=[row[lat_col], row[lng_col]], color="yellow", fill=False).add_to(map_obj)
        else: 
            #print('localidad not found')
            pass
    # Add the shapefile as an overlay
    folium.GeoJson(map_df).add_to(map_obj)
    return map_obj, user_data

#home_locs['home_tile_ID'] = home_locs['home_tile_ID'].astype(int).astype(str)
home_locs_formap = home_locs
map_obj, user_data = plot_homes_foliummap(shapefile_path=shapefiles[0], df=home_locs_formap, lat_col='lat_home', lng_col='lng_home')
map_obj

In [82]:
home_locs#.columns

Unnamed: 0_level_0,tot_pings,home_tile_ID,lat_home,lng_home,home_pings,work_tile_ID,lat_work,lng_work,work_pings
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00032a22-9681-47d0-9381-f7b5e293c8e5,4.0,70.0,4.616458,-74.168720,1.0,70.0,,,0.0
0003a978-c383-4f67-8a5b-785b263964c0,31.0,4.0,4.651630,-74.156181,1.0,82.0,4.651726,-74.156231,2.0
0006df93-3e2f-41f7-acd5-75a1b2481b43,7.0,8.0,4.741859,-74.044750,1.0,70.0,4.699833,-74.089590,1.0
00071501-1df6-4eee-bbc4-dfd118c868c0,7.0,16.0,4.651428,-74.151798,2.0,13.0,,,0.0
00093784-f444-4b74-b683-44963b19d566,49.0,9.0,4.613179,-74.167755,1.0,55.0,4.638473,-74.116814,1.0
...,...,...,...,...,...,...,...,...,...
fffc01e9-0ac3-4b61-a6b5-fdd006f0b8c5,58.0,15.0,4.647242,-74.060800,1.0,70.0,4.647294,-74.060797,7.0
fffc57a9-dc65-4456-8ff2-86ac0ce2746e,142.0,16.0,4.578889,-74.119339,3.0,77.0,4.576781,-74.097945,5.0
fffc8c2e-73d9-4346-946d-c4c0ea9eeca3,4682.0,81.0,4.715843,-74.107762,13.0,70.0,4.696206,-74.093434,201.0
fffceaa9-d1f8-4938-8a3e-c499b079ccaf,99.0,70.0,4.596863,-74.186455,2.0,70.0,4.601617,-74.070840,1.0


In [None]:
shp_names = ['zat_stratum', 'Stations_Buffer_1000', 'utam_access']
shapefiles = [f'{meta_dir}income/{shp_names[0]}.shp', f'{meta_dir}{shp_names[1]}.shp', f'{meta_dir}{shp_names[2]}.shp']


for i in tqdm(range(0,len(shp_names)), desc=f'Calculating home and work locations based on the shapefiles...'): 
    shp_name, shapefile = shp_names[i], shapefiles[i]
    ddf_w_zones, tessellation_gdf, df_hw_locs = find_home_work_locs(qc_user_data, 
                                                                    shapefile, 
                                                                    home_hrs=(22.0, 6.0), work_hrs=(9.5,16.5))

    write_files_with_home_work_tile_info(df_hw_locs, tessellation_gdf, ddf_w_zones, 
                                        out_dir_hw, data_info, shp_name=shp_name)
    

### Visualize and load data

Let's visualize the number of people living in the three neighbhoords that Elena mentioned as an area of focus

In [None]:
places = ['SAN CRISTOBAL', 'CIUDAD BOLIVAR', 'RAFAEL URIBE URIBE']
df = tessellation_gdf_w_home[tessellation_gdf_w_home['LOCNombre'].isin(places)]
grouped_df = df.groupby('LOCNombre')['home_pings'].sum()
grouped_df

In [None]:
print(tessellation_gdf.columns)
tessellation_gdf.head()

In [None]:
cols = ['MUNNombre', 'LOCNombre', 'USOPreNum',
       'USOPreCor', 'ESTRATO1', 'ESTRATO2', 'ESTRATO3', 'ESTRATO4', 'ESTRATO5',
       'ESTRATO6', 'ESTRATOPre', 'HOGARES', 'UTAM', 'UTAMNombre', 'UTAMArea',
       'homes', 'jobs', 'tile_ID']

tessellation_gdf[tessellation_gdf['LOCNombre'].isin(places)][cols].head(20)

In [None]:
tessellation_gdf_zat_name = f'{out_dir_hw}tessellation_gdf_w_home_bogota_area_2019_months1_2_60min_pings_10min_days_zat_stratum.csv'
tessellation_gdf_zat = pd.read_csv(tessellation_gdf_zat_name, index_col='Unnamed: 0')
tessellation_gdf_zat = tessellation_gdf_zat[tessellation_gdf_zat['MUNCod'] == 11001.0]
tessellation_gdf_zat.head()

In [None]:
tessellation_gdf_stations_name = f'{out_dir_hw}tessellation_gdf_w_home_bogota_area_2019_months1_2_60min_pings_10min_days_Stations_Buffer_1000.csv'
tessellation_gdf_stations = pd.read_csv(tessellation_gdf_stations_name, index_col='Unnamed: 0')
#tessellation_gdf_zat = tessellation_gdf_zat[tessellation_gdf_zat['MUNCod'] == 11001.0]
tessellation_gdf_stations.head()

In [None]:
# would need to load the data for the stations first 
df = tessellation_gdf_stations[tessellation_gdf_stations['home_pings'] > 0]
print('The breakdown of users living by different stations that we can detect is as follows:\n', df[['Station', 'home_pings']])
print(f"There are {df['home_pings'].sum()} users that live in the ZAT regions that pass the filtering conditions for min pings and days detected.")

In [None]:
df_hw_locs_stations_name = f'{out_dir_hw}user_stats_bogota_area_2019_months1_2_60min_pings_10min_days_w_homework_tiles_Stations_Buffer_1000.csv'
df_hw_locs_stations = pd.read_csv(df_hw_locs_stations_name)
df_hw_locs_stations#.home_tile_ID #.value_counts()
print(len(df_hw_locs_stations))
df_hw_locs_stations.head()