## Mobility analysis
The purpose of this notebook is to compare the mobility of users who live in the "treatment" zones (ZATs near the new cable car) and in the "control" zones (similar ZATs without a new cable car).

1) Compute distance between home and work locations 
2) Compute visits (minimum visit durations of 20, 30, and 60 minutes)

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables (e.g. WORKING_DIR) from a local .env file, if present.
load_dotenv()

# working_dir is concatenated directly with relative paths below
# (f"{working_dir}configs/..."), so it must already end with a path separator.
working_dir = os.getenv("WORKING_DIR")
if working_dir is None:
    # Fail early: otherwise every path built below would silently start with "None".
    raise RuntimeError(
        "WORKING_DIR is not set; define it in the environment or in the .env file."
    )

# Set before the project modules are imported
# (presumably so geopandas picks the flag up at import time - TODO confirm).
os.environ["USE_PYGEOS"] = "0"

# NOTE(review): several names used later in this notebook (e.g. glob, ds, dd,
# detection, read_config, get_dirs, get_shp, plot_user_on_map) have no explicit
# import here; they are presumably provided by these wildcard imports - TODO
# confirm and import them explicitly.
from setup import *
from plotting import *
from preprocess import *
from analysis import *

import pandas as pd
import numpy as np
import skmob

# Read the 2018 configuration and unpack the preprocessing / user-QC settings.
c = read_config(f"{working_dir}configs/config_2018.yml")
(
    year,
    datatypes,
    initial_cols,
    sel_cols,
    final_cols,
    minlon,
    maxlon,
    minlat,
    maxlat,
) = get_config_vars(c=c, mode="preprocess")
min_days, min_pings = get_config_vars(c=c, mode="user_qc")

# Directory layout for this year / QC threshold combination.
where = get_dirs(working_dir, year=year, min_days=min_days, min_pings=min_pings)

meta_dir = where.meta_dir

# Input directories for this notebook.
# Paths used for the 2019 analysis, kept for reference:
# pq_dir = f'{working_dir}data/parquet/in_study_area/pass_qc/in_zats/home_in_zats/'
# hw_dir = f'{working_dir}data/parquet/in_study_area/pass_qc/home_work_locs/'
hw_dir = f"{where.pass_qc_dir}home_work_locs/"
pq_dir = f"{hw_dir}home/in_zats/"

### Compute distance between home and work locations for users

In [None]:
# Paths used for the 2019 analysis, kept for reference:
# hw_dir = f'{working_dir}data/parquet/in_study_area/pass_qc/home_work_locs/'
# home_locs_file = f'{hw_dir}home/selected_txt_control_home_locs_2200_0600_w_zats_for_users_pass_qc.csv'
# work_locs_file = f'{hw_dir}work/all_work_locs_0800_1800_w_zats_for_users_pass_qc.csv'

home_locs_file = f"{hw_dir}home/selected_txt_control_home_locs_w_zats_for_users_pass_qc.csv"
work_locs_file = f"{hw_dir}work/selected_txt_control_work_locs_w_zats_for_users_pass_qc.csv"

# Columns requested from both the home and the work file.
zone_cols = ["Area", "MUNCod", "NOMMun", "stratum"]

# Read only the columns needed downstream; the order of `usecols` does not
# affect the column order of the resulting frames.
home_df = pd.read_csv(
    home_locs_file,
    usecols=["uid", "lat_home", "lng_home", "ZAT_home", "UTAM_home", *zone_cols],
)
work_df = pd.read_csv(
    work_locs_file,
    usecols=["uid", "lat_work", "lng_work", "ZAT_work", "UTAM_work", *zone_cols],
)

In [None]:
# Keep a single row per user in each frame (the first occurrence wins), then
# join home and work locations on the user id. Columns present in both frames
# get the "_home" / "_work" suffixes.
home_df, work_df = (
    frame.drop_duplicates(subset="uid") for frame in (home_df, work_df)
)

home_work_df = pd.merge(home_df, work_df, on="uid", suffixes=("_home", "_work"))

# Row counts: home users, work users, users that have both.
print(len(home_df), len(work_df), len(home_work_df))
home_work_df.head()

Calculate the distance between the home and work location for each user

In [None]:
# Apply the project helper `geodesic_distance` to every row
# (presumably returns the home-to-work distance in km, as the column name
# suggests - TODO confirm against the helper).
distance_km = home_work_df.apply(geodesic_distance, axis=1)
home_work_df["distance_km"] = distance_km
home_work_df.head()

In [None]:
# Persist the merged home/work table, including the distance column, next to
# the other home/work location outputs.
home_work_w_dist_file = (
    f"{hw_dir}"
    "home_work_locs_for_users_living_in_selected_txt_control_zats"
    "_w_distance_btwn_home_work.csv"
)
home_work_df.to_csv(home_work_w_dist_file, index=False)

### Load the pings data for the users living in the selected ZATs

Load 1. the shapefile with the ZAT and stratum information and 2. the user pings

In [None]:
# Name of the ZAT shapefile comes from the config; the file itself is looked up
# under the "income/" sub-folder of the metadata directory.
zat_shp_name = c["meta"]["shp"]["zat"]
zat_meta_dir = f"{meta_dir}income/"

shapefile, regions_gdf_zat = get_shp(
    meta_dir=zat_meta_dir,
    shp_name=zat_shp_name,
    load=True,
)

# Quick visual check: colour the regions by their "stratum" column.
regions_gdf_zat.plot(column="stratum")

In [None]:
# Collect the parquet files holding the pings of users living in the selected ZATs.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them to
# make the row order of `ping_df` reproducible between runs and machines.
ping_files = sorted(glob.glob(pq_dir + "*.parquet"))
if not ping_files:
    # Fail fast with a clear message instead of an obscure error further down.
    raise FileNotFoundError(f"No parquet files found in {pq_dir}")

# Read all files as one dataset, then replace the index with a fresh one:
# reset_index() moves the old index into an "index" column, which is dropped.
ping_df = (
    ds.dataset(ping_files, format="parquet")
    .to_table()
    .to_pandas()
    .reset_index()
    .drop(columns="index")
)
ping_df.head()

In [None]:
# Total number of pings (rows) loaded.
ping_df.shape[0]

### Compute visits and other mobility variables
We can also "compress" the pings data and apply other preprocessing and filtering steps.

### Load and preprocess pings data
Note that we can print the parameters of the functions, for example for the filtering with `print(ftdf.parameters)`. Filtering takes ~28 minutes on the full dataset of pings from users in the selected ZATs.

In [None]:
# Build a scikit-mobility TrajDataFrame from the pings and run its filtering step
from skmob.preprocessing import filtering

# Which ping columns play the user / latitude / longitude / timestamp roles.
traj_columns = dict(
    user_id="uid",
    latitude="lat",
    longitude="lng",
    datetime="datetime",
)
tdf = skmob.TrajDataFrame(ping_df, **traj_columns)

# Slow step (see the timing note above) - consider saving the result for reuse.
ftdf = filtering.filter(tdf)

# Number of points removed by the filter.
n_deleted_points = len(tdf) - len(ftdf)
print(n_deleted_points)

### Compute the stops 
We will compute stops of at least 20, 30, and 60 minutes for the dataset and output those stops. For the filtered dataset, there are 1861875 stops of at least 60 minutes, although this number is reduced to 1613447 if we don't count pings that are followed by a gap of more than 24 hours (1440 minutes) until the next ping (missing data). There are 2763819 stops of at least 20 minutes if we apply the same rule and don't count pings followed by a gap of more than 24 hours until the next ping (missing data).

It took ~20 minutes to compute the stops for each minimum stopping time.

In [None]:
# Detect stay locations ("stops") in the filtered trajectories.
number_min = 60  # minimum stop duration, in minutes

stop_params = dict(
    stop_radius_factor=0.5,
    minutes_for_a_stop=number_min,
    spatial_radius_km=0.2,
    leaving_time=True,
)
sftdf = detection.stay_locations(ftdf, **stop_params)

n_stops = len(sftdf)
print(
    f"The number of stops for {number_min} minutes in the filtered dataset is {n_stops}"
)

In [None]:
# out dir for 2019 data: f'{working_dir}data/parquet/in_study_area/pass_qc/in_zats/visits/'
pq_dir_out = f"{hw_dir}home/in_zats/visits/"
ensure_directory_exists(pq_dir_out)

# 1440 minutes = 24 hours. Stops with more than this many minutes of missing data
# between pings are not counted as stops, because they may just be missing points.
# The value is kept at 1440 so that it matches the
# "..._nodatafor_1440_minutes.csv" visits file that is loaded back afterwards.
# NOTE(review): the helper is presumed to embed this value in the output file
# name - confirm against calculate_visits_min_minutes.
no_data_for_minutes = 1440

calculate_visits_min_minutes(
    tdf=ftdf,
    visit_durations=[20, 30, 60],
    out_dir=pq_dir_out,
    no_data_for_minutes=no_data_for_minutes,
)

### Load in the visits df and visualize some of the stops

In [None]:
# Alternatives kept for reference:
# out_dir=f'{working_dir}data/parquet/in_study_area/pass_qc/in_zats/visits/'
# outfilename = f'{pq_dir_out}users_living_in_sel_zat_visits_atleast_60min.csv'

# Visits of at least 60 minutes, from the run that used a 1440-minute no-data limit.
outfilename = (
    f"{pq_dir_out}"
    "users_living_in_sel_zat_visits_atleast_60min"
    "_nodatafor_1440_minutes.csv"
)
visit_df = pd.read_csv(outfilename)
visit_df.head(n=10)

In [None]:
# Number of visits recorded per user, most frequent first.
visit_df["uid"].value_counts()

Map some of the visits

In [None]:
# User whose visits are mapped. Other user ids that were looked at previously:
#   'e846d741-5ee4-440f-b304-e4e3886c2210'
#   'f12efcd6-a347-416c-8a39-93e6fb67f7aa'
#   '324fe201-cce9-4395-91ab-ee421cdd34c9'
#   'f07076d8-32be-40f4-ad3a-e1ece90ec6f7'
#   '00002eec-9e3e-4e4d-9822-4e4858a0de0c'
user_id = "2ff377ea-68e7-4638-bebe-7df5bba77967"

visit_map_args = dict(
    shapefile_path=shapefile,
    df=visit_df,
    lat_col="lat",
    lng_col="lng",
    user_id=user_id,
)
map_obj, user_data = plot_user_on_map(**visit_map_args)

map_obj

### Compute other metrics (radius of gyration, etc)

In [None]:
# TODO: compute the other mobility metrics (e.g. radius of gyration)

### Visualizing trajectories and stops for a single user
Work with data for one user initially (with many stops)

In [None]:
# Restrict the pings to a single user (one with many stops) and map them.
# Note: this rebinds `ping_df` to the single-user subset.
selected_user = user_id
user_filter = ds.field("uid").isin([selected_user])

# Push the user filter down to the parquet read so only that user's rows are loaded.
single_user_table = ds.dataset(ping_files, format="parquet").to_table(
    filter=user_filter
)
ping_df = single_user_table.to_pandas()

# The plotting helper takes a dask dataframe; frac=1.0 is passed to use all rows.
single_user_ddf = dd.from_pandas(ping_df, npartitions=2)
map_obj, user_data = plot_frac_data_on_map(
    shapefile_path=shapefile,
    ddf=single_user_ddf,
    frac=1.0,
)
map_obj

In [None]:
# Map the selected user's points before (tdf) and after (ftdf) filtering.
# Both calls share every argument except the dataframe.
user_map_args = dict(
    shapefile_path=shapefile,
    lat_col="lat",
    lng_col="lng",
    user_id=selected_user,
)

map_obj_tdf, user_data_tdf = plot_user_on_map(df=tdf, **user_map_args)
map_obj_ftdf, user_data_ftdf = plot_user_on_map(df=ftdf, **user_map_args)

# Only the filtered map is displayed.
map_obj_ftdf

In [None]:
# Compute the stops on the unfiltered (tdf) and filtered (ftdf) trajectories and
# save one HTML map of the selected user's stops for each of them.
dfs_to_process = [tdf, ftdf]  # not used further in this cell

number_min = 60  # minimum stop duration, in minutes

# Same detection settings for both runs.
stop_kwargs = dict(
    stop_radius_factor=0.5,
    minutes_for_a_stop=number_min,
    spatial_radius_km=0.2,
    leaving_time=True,
)
stdf = detection.stay_locations(tdf, **stop_kwargs)
sftdf = detection.stay_locations(ftdf, **stop_kwargs)
print(
    f"The number of stops for {number_min} minutes in the filtered dataset is {len(sftdf)}"
)

# NOTE(review): the unfiltered map is written directly under plot_dir while the
# filtered one goes to plot_dir/figures/ - confirm whether both should share
# one directory.
stops_map_file = f"{where.plot_dir}{selected_user}_user_stops_{number_min}_minutes.html"
filtered_stops_map_file = f"{where.plot_dir}figures/{selected_user}_user_stops_{number_min}_minutes_filtered.html"

# Make sure the target directories exist before saving, so that .save() does not
# fail on a missing folder.
for out_file in (stops_map_file, filtered_stops_map_file):
    os.makedirs(os.path.dirname(out_file) or ".", exist_ok=True)

map_obj_stdf, user_data_stdf = plot_user_on_map(
    shapefile_path=shapefile,
    df=stdf,
    lat_col="lat",
    lng_col="lng",
    user_id=selected_user,
)
map_obj_stdf.save(stops_map_file)

map_obj_sftdf, user_data_sftdf = plot_user_on_map(
    shapefile_path=shapefile,
    df=sftdf,
    lat_col="lat",
    lng_col="lng",
    user_id=selected_user,
)
map_obj_sftdf.save(filtered_stops_map_file)