### Load and filter the data by day
For each month of data, load the data for each day and filter it first with a box that includes a region around Bogota and then with a shapefile containing the regions relevant to the study area (e.g. neighborhoods within Bogota). Write the filtered data out in an efficient format (parquet) for calculating user stats and filtering by those stats.

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Base directory used to build the config paths below. os.getenv returns None
# when WORKING_DIR is unset. NOTE(review): the value is concatenated directly
# with "configs/..." later, so it presumably has to end with a path separator
# -- confirm.
working_dir = os.getenv("WORKING_DIR")
# Set before the project imports below run. NOTE(review): presumably one of
# those modules imports geopandas, which reads USE_PYGEOS at import time --
# confirm before reordering these lines.
os.environ["USE_PYGEOS"] = "0"

from setup import *
from plotting import *
from preprocess import *

In [None]:
# Load the 2018 configuration; the cells below take their settings from `c`
# and from the variables unpacked here.
c = read_config(f"{working_dir}configs/config_2018.yml")
# Preprocessing settings: the data year, the column lists and dtypes handed to
# the loaders, and the lon/lat bounds handed to the bounding-box filter.
(
    year,
    datatypes,
    initial_cols,
    sel_cols,
    final_cols,
    minlon,
    maxlon,
    minlat,
    maxlat,
) = get_config_vars(c=c, mode="preprocess")
# User quality-control cutoffs (minimum days and minimum pings per user).
min_days, min_pings = get_config_vars(c=c, mode="user_qc")

# `where` carries the directory paths used throughout the notebook
# (meta_dir, data_folders, plot_dir, study_area_dir, user_stats_dir, pass_qc_dir).
where = get_dirs(working_dir, year=year, min_days=min_days, min_pings=min_pings)

#### Load the shapefile and plot it to see if it is what we might expect for the study areas we want to analyze

In [None]:
# Load the study-area shapefile named in the config. `shapefile` is later
# passed to the plotting helper as `shapefile_path`, and `gdf_regions` to the
# region filters as `gdf`.
shapefile, gdf_regions = get_shp(
    meta_dir=where.meta_dir, shp_name=c["meta"]["shp"]["study_area"], load=True
)

### Testing and sanity checks: 
For one day, filter the points by region and check the plots to ensure that the filtering is working as expected before applying the function to the whole dataset. 
- Time to run: ~ 2 minutes, including plotting functions 

#### Load and plot a fraction of all the data

In [None]:
# Pick a single day to test on: index 0 selects both the first data folder and
# the first day directory inside that folder.
i = 0
data_folder = where.data_folders[i]
# The month label is the path component that follows the data-year part.
after_year = data_folder.split(where.data_year)[1]
month = after_year.split("/")[0]
day_dirs = get_days(data_folder)
day_dir = day_dirs[i]

filepaths, day = get_files(data_folder, day_dir)
day_name = day.split("/")[0]

# Column selections and dtypes come from the config values unpacked earlier.
load_options = dict(
    initial_cols=initial_cols,
    sel_cols=sel_cols,
    final_cols=final_cols,
    datatypes=datatypes,
)
ddf = load_data(filepaths, **load_options)
ddf.head()

In [None]:
# Plot a small sample (frac=0.0001) of the unfiltered pings for the test day
# and save the map as HTML in the plot directory.
raw_map_path = (
    f"{where.plot_dir}/{year}_{month}_{day_name}_user_pings_raw_f0001.html"
)
map_obj, user_data = plot_frac_data_on_map(
    ddf=ddf,
    frac=0.0001,
    shapefile_path=shapefile,
)
map_obj.save(raw_map_path)
map_obj

#### Load and plot a fraction of the data limited to a box around the area of Bogota
This reduces the data size quite substantially and quickly (by a factor of ~3 or so depending on how much data is outside the roughly defined region around Bogota)

In [None]:
# Keep only the pings inside the configured lon/lat bounds, then plot a small
# sample (frac=0.0001) of what is left and save the map as HTML.
within_box_ddf = find_within_box(ddf, minlon, maxlon, minlat, maxlat)
box_map_path = (
    f"{where.plot_dir}/{year}_{month}_{day_name}_user_pings_bogbox_f0001.html"
)
map_obj, user_data = plot_frac_data_on_map(
    ddf=within_box_ddf,
    frac=0.0001,
    shapefile_path=shapefile,
)
map_obj.save(box_map_path)
map_obj

#### Load and plot a fraction of the data limited to the regions within the area of Bogota that are designated by our shapefile

In [None]:
# Keep only the pings that fall inside the study-area regions, then plot a
# sample (frac=0.01) of them and save the map as HTML.
# NOTE(review): this filters the full `ddf`, not the `within_box_ddf` built in
# the previous cell -- confirm that is intended.
ddf_in_regions = find_within_regions(ddf, gdf=gdf_regions)
regions_map_path = (
    f"{where.plot_dir}/{year}_{month}_{day_name}_user_pings_in_studyareashp_f01.html"
)
map_obj, user_data = plot_frac_data_on_map(
    ddf=ddf_in_regions,
    frac=0.01,
    shapefile_path=shapefile,
)
map_obj.save(regions_map_path)
map_obj

This works well. Let's make a function to more easily apply this filtering to our data. 

In [None]:
# Test the combined helper on the same single day: it receives the file list,
# the study-area regions, the column/dtype settings and the lon/lat bounds in
# one call. The result replaces the `ddf_in_regions` built step by step above.
ddf_in_regions = filter_data_for_day(
    filepaths,
    gdf=gdf_regions,
    initial_cols=initial_cols,
    sel_cols=sel_cols,
    final_cols=final_cols,
    datatypes=datatypes,
    minlon=minlon,
    maxlon=maxlon,
    minlat=minlat,
    maxlat=maxlat,
)
# Preview the first rows as a quick sanity check.
ddf_in_regions.head(5)

### Filter all the days in the dataset for all months
Because the previous functions appeared to filter the data in the desired way, we will now apply that filtering to the whole dataset
- Time to run:
    - ~83 minutes (2019 data), ~44 minutes (2018 data)

In [None]:
def write_data_in_study_area(where: Where, config_path: str):
    """Filter every data folder listed in ``where`` to the study area.

    Reads the config at ``config_path``, loads the study-area regions it
    names, and then calls ``from_month_write_filter_days_to_pq`` once per
    entry of ``where.data_folders``, with ``where.study_area_dir`` as the
    output directory.

    Args:
        where: Project directory bundle; ``meta_dir``, ``data_folders``,
            ``study_area_dir`` and ``data_year`` are read from it.
        config_path: Path of the config file to read.
    """
    print(config_path)
    config = read_config(path=config_path)

    # Only the loaded regions are used here; the shapefile path is discarded.
    _, regions = get_shp(
        meta_dir=where.meta_dir,
        shp_name=config["meta"]["shp"]["study_area"],
        load=True,
    )

    preprocess_vars = get_config_vars(c=config, mode="preprocess")
    (
        year,
        datatypes,
        initial_cols,
        sel_cols,
        final_cols,
        minlon,
        maxlon,
        minlat,
        maxlat,
    ) = preprocess_vars

    # Arguments that are the same for every folder.
    shared_options = dict(
        gdf=regions,
        out_dir=where.study_area_dir,
        data_year=where.data_year,
        year=year,
        initial_cols=initial_cols,
        sel_cols=sel_cols,
        final_cols=final_cols,
        datatypes=datatypes,
        minlon=minlon,
        maxlon=maxlon,
        minlat=minlat,
        maxlat=maxlat,
    )
    for data_folder in where.data_folders:
        from_month_write_filter_days_to_pq(data_folder, **shared_options)


# Run the filter-and-write pass over all data folders. The config path is
# hard-coded to the same 2018 config that `where` was built from earlier in
# this notebook; keep the two in sync when switching years.
write_data_in_study_area(where, config_path=f"{working_dir}configs/config_2018.yml")

### Compute the user stats based on the pings from the study area regions
May take quite some time. 
- Time to run ~115 minutes (2019 data) ~73 minutes (2018 data)

In [None]:
# Compute per-user stats from the parquet output in the study-area directory
# (the directory the filtering step above writes to).
user_stats = compute_user_stats_from_pq(where.study_area_dir)
user_stats.head()

#### Write out user stats for the whole dataset for filtering
Based on 60-ping and 10-day minimum cutoffs, kept 701961 of a total of 3557494 users for the 2019 dataset. Based on the same 60-ping and 10-day minimum cutoffs, kept 365261 of a total of 2637382 users for the 2018 dataset.

In [None]:
# Write the stats with frequency filtering switched off. `user_stats` is
# rebound to whatever the helper returns.
user_stats = write_filter_user_stats(
    user_stats, output_dir=where.user_stats_dir, year=year, filter_by_frequency=False
)

In [None]:
# Same call with frequency filtering switched on, using the min_pings and
# min_days cutoffs read from the config.
user_stats_filtered = write_filter_user_stats(
    user_stats,
    output_dir=where.user_stats_dir,
    year=year,
    filter_by_frequency=True,
    min_pings=min_pings,
    min_days=min_days,
)

### Filter dataset by users that pass quality control 
Write out to parquet file for downstream analysis. (Reorganized the files into folders with 1 month of data each, except the first three months that are smaller in size.) 
- Time to run: ~14 min (2019 data) ~ 3 min (2018 data)

In [None]:
# Reload the filtered user stats from CSV so this section can be run without
# recomputing the stats above.
# NOTE(review): this reads from `where.pass_qc_dir`, while the filtered stats
# above were written with `output_dir=where.user_stats_dir` -- confirm the
# file is really expected in this directory.
output_filepath = f"{where.pass_qc_dir}user_stats_{year}_{min_pings}min_pings_{min_days}min_days_shp_filtered.csv"
user_stats_filtered = pd.read_csv(output_filepath)
user_stats_filtered.head()

After moving the data into month subfolders in the study area dir, we can filter the data for each month down to the users that pass quality control.

In [None]:
# Month-level parquet paths in the study-area directory. glob returns matches
# in arbitrary order, so sort them to make the processing order reproducible.
pq_dirs_months = sorted(glob.glob(where.study_area_dir + "*"))
# Month label = the path with the directory prefix and any ".parquet" suffix
# stripped; it is used below to name the per-month output files.
pq_dirs_months_names = [
    month_path.split(f"{where.study_area_dir}")[1].split(".parquet")[0]
    for month_path in pq_dirs_months
]
# IDs of the users that passed quality control.
uids_pass_qc = list(user_stats_filtered["uid"])
# The per-month output path is built inside the write loop that follows, so
# no placeholder path is computed here. Indexing the first month name here
# would raise IndexError when the directory is empty.

In [None]:
import pyarrow.dataset as ds

# The user filter is identical for every month, so build the expression once.
passes_qc = ds.field("uid").isin(uids_pass_qc)

for month_dir, month_name in tqdm(
    zip(pq_dirs_months, pq_dirs_months_names),
    total=len(pq_dirs_months),
    desc="Writing data for users that pass qc",
):
    print(f"Filtering data for {month_name}...")
    dataset = ds.dataset(month_dir, format="parquet")
    data_for_qcd_users = f"{where.pass_qc_dir}bogota_study_area_{year}_{month_name}_pass_qc.parquet"
    # Stream the filtered rows batch by batch instead of materializing the
    # whole month with `to_table`, which exhausted memory on the full data.
    # Each non-empty batch is appended to the same output file.
    with pq.ParquetWriter(data_for_qcd_users, dataset.schema) as writer:
        for batch in dataset.to_batches(filter=passes_qc):
            if batch.num_rows:
                writer.write_batch(batch)

### Experiment with different QC thresholds
Loads the user stats for the data and produces user stats files filtered with different criteria (e.g. at least 150 or 300 pings) that can later be used to see the impacts of different qc settings

In [None]:
# Reload the user stats CSV (the "allpings" file) from the user-stats
# directory so the threshold experiments below start from it.
output_filepath = f"{where.user_stats_dir}user_stats_{year}_allpings_shp_filtered.csv"
user_stats = pd.read_csv(output_filepath)
user_stats

In [None]:
# Write one filtered user-stats file per alternative ping cutoff, keeping the
# configured min_days. After the loop `user_stats_filtered` holds the result
# for the last cutoff (300).
for ping_cutoff in (150, 300):
    user_stats_filtered = write_filter_user_stats(
        user_stats,
        output_dir=where.user_stats_dir,
        year=year,
        filter_by_frequency=True,
        min_pings=ping_cutoff,
        min_days=min_days,
    )