### Load and filter the data by day
For each month of data, load the data for each day and filter it first with a box that includes a region around Bogota and then with a shapefile with regions relevant to the study area that we care about (e.g. neighborhoods within Bogota). Write out that data in an efficient format (parquet) for calculation of user stats and filtering by user stats.

In [None]:
from dotenv import load_dotenv
load_dotenv()

from plotting import * 
from preprocess import *

import os
import glob
import pandas as pd

os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd

# Access environment variables and define other necessary variables.
# Every path below is built by string concatenation onto data_dir, so fail
# early if WORKING_DIR is missing (otherwise paths would start with "None")
# and make sure the root ends with a slash.
data_dir = os.getenv('WORKING_DIR')
if not data_dir:
    raise EnvironmentError(
        "WORKING_DIR is not set; define it in the environment or in the .env file."
    )
if not data_dir.endswith('/'):
    data_dir += '/'
meta_dir = f'{data_dir}metadata/'

in_dir = f'{data_dir}data/'
year = 'year=2019'
data_year = f'{in_dir}{year}/'
# glob returns matches in arbitrary order; sort so that indexing into
# data_folders (e.g. data_folders[0]) is reproducible between runs.
data_folders = sorted(glob.glob(data_year + '*/'))
out_dir = f'{data_dir}data/parquet/in_study_area/'

#### Load the shapefile and plot it to see if it is what we might expect for the study areas we want to analyze

In [None]:
# Read the study-area shapefile from the metadata folder into a GeoDataFrame.
shp_name = 'union_utam_localidad_study_area'
shapefile = meta_dir + shp_name + '.shp'
gdf_regions = gpd.read_file(shapefile)

# Quick visual check that the polygons look like the expected study areas.
gdf_regions.plot()

### Testing and sanity checks: 
For one day, filter the points by region and check the plots to ensure that the filtering is working as expected before applying the function to the whole dataset. 
- Time to run: ~ 2 minutes, including plotting functions 

#### Load and plot a fraction of all the data

In [None]:
# Select one day for testing: the same index picks entry 0 of the month
# folders and entry 0 of the days inside that month.
i = 0
data_folder = data_folders[i]
# The month label is the first path component after the year directory.
month = data_folder.split(data_year)[1].partition('/')[0]
day_dirs = get_days(data_folder)
day_dir = day_dirs[i]

# Collect that day's files and load them with the column selections
# (initial_cols, sel_cols, final_cols), which are defined outside this cell.
filepaths, day = get_files(data_folder, day_dir)
day_name = day.partition('/')[0]
ddf = load_data(filepaths, initial_cols, sel_cols, final_cols)
ddf.head()

In [None]:
# Map a small fraction (frac=0.0001) of the unfiltered pings for the test day
# on top of the study-area shapefile, and save the map as HTML.
raw_map_path = f"{data_dir}figures/{year}_{month}_{day_name}_user_pings_raw_f0001.html"
map_obj, user_data = plot_frac_data_on_map(
    shapefile_path=shapefile,
    ddf=ddf,
    frac=0.0001,
)
map_obj.save(raw_map_path)
map_obj

#### Load and plot a fraction of the data limited to a box around the area of Bogota
This reduces the data size quite substantially and quickly (by a factor of ~3 or so depending on how much data is outside the roughly defined region around Bogota)

In [None]:
# Keep only pings inside the lon/lat bounding box (bounds are defined outside
# this cell), then map a small fraction of them and save the map as HTML.
within_box_ddf = find_within_box(ddf, minlon, maxlon, minlat, maxlat)
box_map_path = f"{data_dir}figures/{year}_{month}_{day_name}_user_pings_bogbox_f0001.html"
map_obj, user_data = plot_frac_data_on_map(
    shapefile_path=shapefile,
    ddf=within_box_ddf,
    frac=0.0001,
)
map_obj.save(box_map_path)
map_obj

#### Load and plot a fraction of the data limited to the regions within the area of Bogota that are designated by our shapefile

In [None]:
# Filter the one-day data (`ddf`) by the study-area regions in gdf_regions,
# then map a larger fraction (frac=0.01) of what remains and save it as HTML.
ddf_in_regions = find_within_regions(ddf, gdf=gdf_regions)
regions_map_path = f"{data_dir}figures/{year}_{month}_{day_name}_user_pings_in_studyareashp_f01.html"
map_obj, user_data = plot_frac_data_on_map(
    shapefile_path=shapefile,
    ddf=ddf_in_regions,
    frac=0.01,
)
map_obj.save(regions_map_path)
map_obj

This works well. Let's make a function to more easily apply this filtering to our data. 

In [None]:
# function testing for the one day: 
ddf_in_regions = filter_data_for_day(filepaths, gdf=gdf_regions)
ddf_in_regions.head(5)

### Filter all the days in the dataset for all months
Because the previous functions appeared to filter the data in the desired way, we will now apply that filtering to the whole dataset
- Time to run ~83 minutes

In [None]:
# Apply the per-day filtering to every month folder of the year and write the
# filtered pings to parquet under out_dir.
for data_folder in data_folders:
    from_month_write_filter_days_to_pq(
        data_folder,
        gdf=gdf_regions,
        out_dir=out_dir,
        data_year=data_year,
        year=year,
    )

### Compute the user stats based on the pings from the study area regions
May take quite some time. 
- Time to run ~115 minutes

In [None]:
import mobilkit.stats

def compute_user_stats_from_pq(pq_dir):
    """Compute per-user statistics from the parquet data under `pq_dir`.

    Only the 'uid' and 'datetime' columns are read. The lazily loaded table
    is passed to `mobilkit.stats.userStats` and the result is materialised
    with `.compute()` before being returned.

    NOTE(review): `dd` is not imported in the visible cells; presumably it is
    dask.dataframe provided by one of the star imports -- confirm.
    """
    pings = dd.read_parquet(pq_dir, columns=['uid', 'datetime'])
    return mobilkit.stats.userStats(pings).compute()


# Run the stats over everything written to the study-area parquet folder.
user_stats = compute_user_stats_from_pq(out_dir)
user_stats.head()

#### Write out user stats for the whole dataset for filtering
Based on 60 ping and 10 day minimum cutoffs, kept 701961 of a total of 3557494 users for this dataset.

In [None]:
# Report how many users have stats, then persist the unfiltered stats as CSV.
print(len(user_stats))
output_filepath = f'{data_dir}/data/user_stats/user_stats_2019_allpings_months1-8s_shp_filtered.csv'
# Create the target folder first so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
user_stats.to_csv(output_filepath, index=False)

In [None]:
# Quality-control thresholds: a user is kept only with at least this many
# pings and at least this many active days.
min_pings, min_days = 60, 10
output_filepath = f'{data_dir}/data/user_stats/user_stats_2019_months1-8_60min_pings_10min_days_shp_filtered.csv'

# WARNING: running this cell overwrites the CSV at output_filepath.
has_enough_pings = user_stats['pings'] >= min_pings
has_enough_days = user_stats['daysActive'] >= min_days
user_stats_filtered = user_stats[has_enough_pings & has_enough_days]
print(f"Based on {min_pings} ping and {min_days} day mininum cutoffs, kept {len(user_stats_filtered)} of a total of {len(user_stats)} users for this dataset.")
# Create the target folder first so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
user_stats_filtered.to_csv(output_filepath, index=False)

### Filter dataset by users that pass quality control 
Write out to parquet file for downstream analysis. (Reorganized the files into folders with 1 month of data each, except the first three months that are smaller in size.) 
- Time to run: ~14 min

In [None]:
# Reload the QC-filtered user stats from the CSV path used when writing them.
output_filepath = (
    f'{data_dir}/data/user_stats/'
    'user_stats_2019_months1-8_60min_pings_10min_days_shp_filtered.csv'
)
user_stats_filtered = pd.read_csv(output_filepath)
user_stats_filtered.head()

In [None]:
# Entries in out_dir that hold the filtered pings. The QC-filtered outputs are
# also written into out_dir (named "*_pass_qc.parquet"), so exclude them here;
# otherwise a re-run would treat earlier outputs as inputs. Sorting makes the
# processing order reproducible, since glob order is arbitrary.
qc_output_suffix = '_pass_qc.parquet'
pq_dirs_months = sorted(
    path for path in glob.glob(out_dir + '*')
    if not path.endswith(qc_output_suffix)
)
# Name of each entry relative to out_dir, used to label the output files.
pq_dirs_months_names = [path.split(f'{out_dir}')[1] for path in pq_dirs_months]
uids_pass_qc = list(user_stats_filtered['uid'])
# Output path for the first entry, using the same pattern the write loop uses.
data_for_qcd_users = f'{out_dir}bogota_study_area_year=2019_{pq_dirs_months_names[0]}{qc_output_suffix}'

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq


def _write_filtered_parquet(dataset, filter_expr, dest_path, rows_per_group=1_000_000):
    """Stream the rows of `dataset` that match `filter_expr` into one parquet file.

    Batches are read one at a time and flushed to `dest_path` once roughly
    `rows_per_group` rows have accumulated, so memory use is bounded by the
    buffer size rather than by the size of the whole filtered dataset.
    If no row matches, a file containing only the schema is written.
    """
    pending, pending_rows = [], 0
    with pq.ParquetWriter(dest_path, dataset.schema) as writer:
        for batch in dataset.to_batches(filter=filter_expr):
            if batch.num_rows == 0:
                continue
            pending.append(batch)
            pending_rows += batch.num_rows
            if pending_rows >= rows_per_group:
                writer.write_table(pa.Table.from_batches(pending))
                pending, pending_rows = [], 0
        if pending:
            writer.write_table(pa.Table.from_batches(pending))


# The uid filter is the same for every month, so build the expression once.
qc_filter = ds.field('uid').isin(uids_pass_qc)

for month_name, month_dir in tqdm(
    zip(pq_dirs_months_names, pq_dirs_months),
    total=len(pq_dirs_months),
    desc='Writing data for users that pass qc',
):
    print(f'Filtering data for {month_name}...')
    dataset = ds.dataset(month_dir, format="parquet")
    data_for_qcd_users = f'{out_dir}bogota_study_area_year=2019_{month_name}_pass_qc.parquet'
    # Stream in batches instead of dataset.to_table(...): loading a whole
    # filtered month at once is what exhausted memory and crashed the kernel.
    _write_filtered_parquet(dataset, qc_filter, data_for_qcd_users)