## Visit analysis
The purpose of this notebook is to compare the mobility of users who live in the "treatment" zones (ZATs near the new cable car) with that of users who live in the "control" zones (similar ZATs without a new cable car). We look for trends in the visits that were computed by the `mobility_analysis` notebook followed by the `visit_compute` notebook.

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

working_dir = os.getenv("WORKING_DIR")
if working_dir is None:
    # Fail fast: without this check the f-string paths built below would start
    # with the literal text "None" and fail later with a confusing error.
    raise RuntimeError(
        "WORKING_DIR is not set; define it in the environment or in a .env file."
    )
# NOTE: working_dir is concatenated directly into paths (e.g.
# f"{working_dir}configs/..."), so it is expected to end with a path separator.

# Set before the project modules below are imported.
# NOTE(review): presumably read by geopandas at import time — confirm.
os.environ["USE_PYGEOS"] = "0"

from setup import *
from plotting import *
from preprocess import *
from analysis import *
# NOTE(review): `plotting` is star-imported a second time; kept as-is because the
# order of star imports decides which module wins on a name clash — confirm
# whether this repetition is intentional.
from plotting import *

import pandas as pd
import numpy as np
import skmob

# Read the 2019 run configuration. working_dir is concatenated as-is, so it is
# expected to already end with a path separator.
config_fp = f"{working_dir}configs/config_2019.yml"
c = read_config(config_fp)

# Preprocessing settings; get_config_vars returns them positionally.
preprocess_vars = get_config_vars(c=c, mode="preprocess")
(year, datatypes, initial_cols, sel_cols, final_cols,
 minlon, maxlon, minlat, maxlat) = preprocess_vars

# User quality-control thresholds.
user_qc_vars = get_config_vars(c=c, mode="user_qc")
min_days, min_pings = user_qc_vars

# Directory layout for this year and these QC thresholds.
where = get_dirs(
    working_dir,
    year=year,
    min_days=min_days,
    min_pings=min_pings,
)
meta_dir = where.meta_dir

In [None]:
# Nested directories: home locations -> visits inside ZATs -> visits with POIs.
hw_dir = f"{where.pass_qc_dir}home_work_locs/home/"
visit_dir = f"{hw_dir}in_zats/visits/"
out_dir = f"{visit_dir}w_pois/"

# Make sure both the visit directory and the POI output directory are present.
for directory in (visit_dir, out_dir):
    ensure_directory_exists(directory)

# CSV of home locations with ZATs and treatment group for users passing QC.
sel_zat_home_locs_meta_fp = (
    f"{hw_dir}selected_txt_control_home_locs_w_zats_for_users_pass_qc_w_treatment_group.csv"
)

### Load home locations with ZATs and treatment group for users living in treatment and control ZATs and the shapefile of interest
There are two shapefiles: one with a 15-meter buffer around the Google Places POIs and one with a 20-meter buffer.

In [None]:
# Home locations (with ZAT and treatment group) of the selected users.
sel_zat_home_locs_meta = pd.read_csv(sel_zat_home_locs_meta_fp)

# POI shapefile built with a 15 m buffer; plot=True asks the helper to plot it.
poi_shp_dir = f"{meta_dir}places/Buffer Shapefiles/"
shp_name, gdf_from_shp = get_shp_to_assign_poi(
    shp_dir=poi_shp_dir,
    config=c,
    radius=15,
    plot=True,
)

# Preview the home-location table.
sel_zat_home_locs_meta.head()

### Load the visits and look at how many mapped to one or more POIs, group them by POI category

In [None]:
# Parameters that identify which visit file to load.
stopping_time = 20
min_minutes = 1e12  # or 1440
# Note: min_minutes is a float, so it is rendered as "1000000000000.0" in the
# file name below.
outfilename = f"users_living_in_sel_zat_visits_atleast_{stopping_time}min_nodatafor_{min_minutes}_minutes_w_poi_from_{shp_name}_shp"

# Load the visits that were already matched against the POI shapefile.
visits_w_poi_df_fp = f"{out_dir}{shp_name}/{outfilename}.csv"
visits_w_poi_df = pd.read_csv(visits_w_poi_df_fp)

# Split out the visits that mapped to named POIs and get the per-category
# proportions (exact semantics are defined by calc_group_poi_visits).
poi_visit_summary = calc_group_poi_visits(visits_w_poi_df)
(
    visits_w_named_pois,
    visits_w_more_than_one_named_poi,
    grouped_category_proportions,
) = poi_visit_summary

# Persist the named-POI visits next to the input file.
named_poi_visits_fp = f"{out_dir}{shp_name}/{outfilename}_drop_null.csv"
visits_w_named_pois.to_csv(named_poi_visits_fp, index=False)

visits_w_named_pois.head()

### Plot proportion of category of POI visits for control and treatment ZATs

In [None]:
# Path stem (no extension) for the category-proportion table; the ".csv" suffix
# is appended wherever the file is written or read.
# NOTE(review): this is written directly under out_dir, whereas the visit CSVs
# live under f"{out_dir}{shp_name}/" — confirm this is intended.
grouped_cat_prop_csv_fp = f"{out_dir}{outfilename}_grouped_by_txt_category_proportions"

grouped_category_proportions.to_csv(path_or_buf=f"{grouped_cat_prop_csv_fp}.csv")

# Preview the proportions table.
grouped_category_proportions.head()

In [None]:
# Output name for the stacked bar chart of POI-category proportions.
plotfilename = f"{where.plot_dir}users_living_in_sel_zat_visits_atleast_{stopping_time}min_w_poi_from_{shp_name}_shp_grouped_by_txt_category_proportions"

# The chart is built from the CSV written for the grouped category proportions.
plot_stacked_bar_from_csv(
    f"{grouped_cat_prop_csv_fp}.csv",
    out_file=plotfilename,
    colormap="Spectral",
)

In [None]:
# Preview the first five rows of the full visit table (named and unnamed POIs).
visits_w_poi_df.head(n=5)

### Compute and plot number of visits per month (or day of the year) for each group
Helps to assess variation in data quality over time scales and between years

In [None]:
# Temporal resolution used for every plot below: "dayofyear" or "month".
time_unit = "month"

# Columns handed to count_visits_over_time. Other columns present in the visit
# table that are deliberately left out: 'lat_visit', 'lng_visit',
# 'leaving_datetime'.
cols_for_bar = ["uid", "datetime", "Group", "name", "category"]

# Same counting options for both visit tables.
count_opts = {"cols": cols_for_bar, "time_unit": time_unit, "normalize": True}

# All visits, then only those that mapped to a named POI.
all_visits, all_visits_grouped = count_visits_over_time(
    visits_w_poi_df, **count_opts
)
named_poi_visits, named_visits_grouped = count_visits_over_time(
    visits_w_named_pois, **count_opts
)

In [None]:
# Bar chart of all visits per time unit, normalized by the number of users in
# each group (column "count_normalized_nusers").
# The unused `palette = sns.color_palette("Paired")` assignment was removed: it
# was never passed to plot_visits_bar and relied on `sns`, which this notebook
# does not import explicitly.
data = all_visits_grouped
plot_dir = where.plot_dir  # handed to plot_visits_bar for every plot below

title = f"Total >{stopping_time} min visits per number of users per group by {time_unit} for users living in selected ZATs"
plot_visits_bar(
    data=data,
    x=time_unit,
    y="count_normalized_nusers",
    hue="Group",
    plot_dir=plot_dir,
    title=title,
)


In [None]:
# Remaining visit-count bar charts. The user-normalized chart of all visits is
# produced by the preceding cell with exactly the same data, y column and
# title, so it is not repeated here. The unused `palette` assignment (which
# needed `sns`, never imported explicitly in this notebook) was dropped too.
plot_dir = where.plot_dir

# (data, y column, title) for each chart, in display order.
# To hide stray December data points, filter a frame first, e.g.
# named_visits_grouped[named_visits_grouped['month'] < 12].
visit_count_plots = [
    (
        all_visits_grouped,
        "count",
        f"Total >{stopping_time} min visits by {time_unit} for users living in treatment or control ZATs",
    ),
    (
        named_visits_grouped,
        "count_normalized_nusers",
        f"Total >{stopping_time} min visits to named POIs per number of users per group by {time_unit} for users living in selected ZATs",
    ),
    (
        named_visits_grouped,
        "count",
        f"Total >{stopping_time} min visits to named POIs by {time_unit} for users living in treatment or control ZATs",
    ),
]

for data, y_col, title in visit_count_plots:
    plot_visits_bar(
        data=data,
        x=time_unit,
        y=y_col,
        hue="Group",
        plot_dir=plot_dir,
        title=title,
    )

#### Plot the percentage of monthly visits that come from control group users versus treatment group users

In [None]:
# Same counting as before, but expressed as each group's share of the visits
# (as_proportion=True) instead of user-normalized counts.
proportion_opts = {
    "cols": cols_for_bar,
    "as_proportion": True,
    "time_unit": time_unit,
}

all_visits_p, all_visits_grouped_p = count_visits_over_time(
    visits_w_poi_df, **proportion_opts
)
named_poi_visits_p, named_visits_grouped_p = count_visits_over_time(
    visits_w_named_pois, **proportion_opts
)

Note: For the 2019 data, we may want to merge the 2018 and 2019 files. Otherwise, to avoid visualizing the data points from the last day of December 2018 that appear in the 2019 data, filter them out, e.g. `all_visits_grouped_p[all_visits_grouped_p['month']<12]`.

In [None]:
# Percentage charts: all visits first, then visits to named POIs.
# A frame can be filtered before plotting, e.g.
# all_visits_grouped_p[all_visits_grouped_p['month'] < 12].
percentage_plots = [
    (
        all_visits_grouped_p,
        f"Percentage >{stopping_time} min visits made by users living in treatment versus control ZATs over time",
    ),
    (
        named_visits_grouped_p,
        f"Percentage >{stopping_time} min visits made to named POIs by users living in treatment versus control ZATs over time",
    ),
]

for data, title in percentage_plots:
    plot_visits_bar(
        data=data,
        x=time_unit,
        y="percentage",
        hue="Group",
        plot_dir=plot_dir,
        title=title,
    )

### Make the same plots but only include users that have more than a particular number of visits

In [None]:
# Visit thresholds passed to filter_users_by_minimum_visits: one for the full
# visit table and a lower one for the named-POI subset.
thresh_pois, thresh_named_pois = 300, 50

visits_w_poi_df_filtered = filter_users_by_minimum_visits(
    visit_df=visits_w_poi_df,
    visit_threshold=thresh_pois,
)
visits_w_named_pois_filtered = filter_users_by_minimum_visits(
    visit_df=visits_w_named_pois,
    visit_threshold=thresh_named_pois,
)

In [None]:
# compute and graph filtered data
all_visits_p, all_visits_grouped_p = count_visits_over_time(
    visits_w_poi_df_filtered, cols=cols_for_bar, as_proportion=True, time_unit=time_unit
)
named_poi_visits_p, named_visits_grouped_p = count_visits_over_time(
    visits_w_named_pois_filtered, cols=cols_for_bar, as_proportion=True, time_unit=time_unit
)

data = all_visits_grouped_p  
title = f"Percentage >{stopping_time} min visits made by users living in treatment versus control ZATs (min {thresh_pois} visits per user)"
plot_visits_bar(
    data=data,
    x=time_unit,
    y="percentage",
    hue="Group",
    plot_dir=plot_dir,
    title=title,
)

data = named_visits_grouped_p   
title = f"Percentage >{stopping_time} min visits made to named POIs by users living in treatment versus control ZATs (min {thresh_named_pois} visits per user"
plot_visits_bar(
    data=data,
    x=time_unit,
    y="percentage",
    hue="Group",
    plot_dir=plot_dir,
    title=title,
)

### Filter and then plot the normalized version of the data

In [None]:
# Recount the filtered visits with normalize=True.
# NOTE: the *_p names are reused here, so from this point they hold
# user-normalized counts rather than proportions.
normalized_opts = {"cols": cols_for_bar, "normalize": True, "time_unit": time_unit}

all_visits_p, all_visits_grouped_p = count_visits_over_time(
    visits_w_poi_df_filtered, **normalized_opts
)
named_poi_visits_p, named_visits_grouped_p = count_visits_over_time(
    visits_w_named_pois_filtered, **normalized_opts
)

# (data, title) per chart. A frame can be filtered before plotting, e.g.
# all_visits_grouped_p[all_visits_grouped_p['month'] < 12].
normalized_plots = [
    (
        all_visits_grouped_p,
        f"Total >{stopping_time} min visits per number of users per group by {time_unit} for users living in selected ZATs (min {thresh_pois} visits per user)",
    ),
    (
        named_visits_grouped_p,
        f"Total >{stopping_time} min visits to named POIs per number of users per group by {time_unit} for users living in selected ZATs (min {thresh_named_pois} visits per user)",
    ),
]

for data, title in normalized_plots:
    plot_visits_bar(
        data=data,
        x=time_unit,
        y="count_normalized_nusers",
        hue="Group",
        plot_dir=plot_dir,
        title=title,
    )

### Check the frequency of visits to see if there are some lat, lng pairs that are outliers 

For instance, in the 2019 data there appears to be an outlier with >7000 visits.

In [None]:
df = visits_w_poi_df

# Count how many visits share each exact (lat_visit, lng_visit) pair and list
# the most frequent pairs first.
counted_pairs = df.groupby(["lat_visit", "lng_visit"]).size().reset_index(name="counts")
sorted_counted_pairs = counted_pairs.sort_values("counts", ascending=False)
print(sorted_counted_pairs.head(10))

# Inspect the most common pair by default. Taking the coordinates from the data
# itself keeps the check valid for whichever year is loaded and avoids exact
# float comparison against a hand-typed, rounded literal.
# Pairs inspected previously: (4.649300, -74.061699) for 2019 and
# (4.570431, -74.095920) for 2018. Assign target_lat / target_lng manually
# after this block to look at a different pair.
if sorted_counted_pairs.empty:
    # No coordinate pairs at all: nothing to inspect.
    target_lat = target_lng = None
    filtered_df = df.iloc[0:0]
else:
    top_pair = sorted_counted_pairs.iloc[0]
    target_lat = top_pair["lat_visit"]
    target_lng = top_pair["lng_visit"]
    # Rows of the visit table recorded at exactly the target pair.
    filtered_df = df[(df["lat_visit"] == target_lat) & (df["lng_visit"] == target_lng)]

filtered_df.head(10)

### Add additional filtering of users by stricter QC thresholds

In [None]:
# read in users who are in both 2018 and 2019 and who pass different qc thresholds
# Users present in both 2018 and 2019, under three minimum-ping QC thresholds
# (60, 150 and 300); min_days comes from the loaded config.
user_stats_both_years_dir = os.path.join(
    c["run"]["working_dir"], c["run"]["user_stats_dir_both_years"]
)

common_users_fp = os.path.join(
    user_stats_both_years_dir,
    f"user_stats_common_users_2018_2019_minpings60_mindays{min_days}_shp_filtered.csv",
)
uids_in_both60 = pd.read_csv(common_users_fp)

common_users_fp = os.path.join(
    user_stats_both_years_dir,
    f"user_stats_common_users_2018_2019_minpings150_mindays{min_days}_shp_filtered.csv",
)
uids_in_both150 = pd.read_csv(common_users_fp)

common_users_fp = os.path.join(
    user_stats_both_years_dir,
    f"user_stats_common_users_2018_2019_minpings300_mindays{min_days}_shp_filtered.csv",
)
uids_in_both300 = pd.read_csv(common_users_fp)

In [None]:
# filter the visit dfs by these users
visits_w_poi_df_60 = visits_w_poi_df[visits_w_poi_df['uid'].isin(uids_in_both60['uid'])]
visits_w_poi_df_150 = visits_w_poi_df[visits_w_poi_df['uid'].isin(uids_in_both150['uid'])]
visits_w_poi_df_300 = visits_w_poi_df[visits_w_poi_df['uid'].isin(uids_in_both300['uid'])]

visits_w_named_pois_60 = visits_w_named_pois[visits_w_named_pois['uid'].isin(uids_in_both60['uid'])]
visits_w_named_pois_150 = visits_w_named_pois[visits_w_named_pois['uid'].isin(uids_in_both150['uid'])]
visits_w_named_pois_300 = visits_w_named_pois[visits_w_named_pois['uid'].isin(uids_in_both300['uid'])]

# if desired, set the visits_dfs to be the desired filtered dfs and then use the graphing functions to make the graphs
# visits_w_poi_df = visits_w_poi_df_60
# visits_w_named_pois = visits_w_named_pois_60