In [1]:
import sys, os
import numpy as np
import pandas as pd
import geopandas as gpd
import datetime as dt
from survey import Survey, nine_to_county, purp_num_to_name18, purp_num_to_name23, mode_num_to_name23, county_order
sys.path.insert(0, r'Y:\champ\util\pythonlib-migration\master_versions\misc_utils')
from df_utils import df_to_excel

In [2]:
SURVEY_2023 = {'household':{'filepath_or_buffer':r'..\..\..\..\Review_20240809\flagged\hh.csv'},
               'person':{'filepath_or_buffer':r'..\..\..\..\Review_20240809\flagged\person.csv'},
               'day':{'filepath_or_buffer':r'..\..\..\..\Review_20240809\flagged\day.csv'},
               'trip':{'filepath_or_buffer':r'..\..\..\..\Review_20240809\flagged\trip.csv'},
               'vehicle':{'filepath_or_buffer':r'..\..\..\..\Deliverable_20240809\vehicle.csv'},
               'location':{'filepath_or_buffer':r'..\..\..\..\Deliverable_20240809\location.csv'},
               }
OUTDIR = r'..\..\..\..\Review_20240809'
COUNTIES = r'Q:\GIS\Boundaries\Counties\Counties.shp'
FIPS_COUNTIES   =['001','013','041','055','075','081','085','095','097']

In [5]:
def flag_complete_trips(trip, trip_reqs):
    complete = 1
    for c in trip_reqs:
        complete = complete * trip[c]
    trip['sfcta_is_complete'] = complete
    return trip

def flag_complete_days(day, trip, day_reqs):
    ## apply the day flags
    day.drop(columns=['sfcta_num_trips','sfcta_day_trips_complete'], inplace=True)
    
    # recalc day trips complete based on the trip completeness status above.
    tmp = (trip
           .groupby(['hh_id','person_id','day_num'])
           .agg({'sfcta_is_complete':'sum','trip_id':'count'})
           .rename(columns={'sfcta_is_complete':'sfcta_num_trips'}))
    tmp['sfcta_day_trips_complete'] = (tmp['sfcta_num_trips'].eq(tmp['trip_id'])*1)
    day = pd.merge(day, tmp[['sfcta_num_trips','sfcta_day_trips_complete']], left_on=['hh_id','person_id','day_num'], right_index=True, how='left')

    # correct for no-travel days
    day['sfcta_num_trips'] = day['sfcta_num_trips'].fillna(0)
    day.loc[day['sfcta_num_trips'].eq(0) & day['no_travel_1'].eq(0),'sfcta_day_trips_complete'] = 1
    day['sfcta_day_trips_complete'] = day['sfcta_day_trips_complete'].fillna(0)
    
    complete = 1
    for c in day_reqs:
        complete = complete * day[c]
    
    day['sfcta_day_complete'] = (complete * day['sfcta_day_trips_complete']) * 1
    return day

def flag_complete_persons(person, day, person_reqs, min_concurrent_days):
    person.drop(columns=['sfcta_num_days_complete'], inplace=True)
    # roll up to person level
    person = pd.merge(person,
                      day.groupby('person_id', as_index=False)
                         .agg({'sfcta_day_complete':'sum'})
                         .rename(columns={'sfcta_day_complete':'sfcta_num_days_complete'}),
                      how='left')
    complete = 1
    for c in person_reqs:
        complete = complete * person[c]
    
    ## apply the minimum concurrent days req
    person['sfcta_person_days_complete'] = (((person['diary_platform'].isin(['browser','call']) | person['has_proxy'].eq(1)) & person['sfcta_num_days_complete'].ge(1)) |
                                                    (person['diary_platform'].eq('rmove') & person['sfcta_num_days_complete'].ge(min_concurrent_days))) * 1
    person['sfcta_person_survey_complete'] = complete
    person['sfcta_person_complete'] = (person['sfcta_person_survey_complete'].eq(1) & person['sfcta_person_days_complete'].eq(1))*1
    return person

def flag_concurrent_days(hh, person, day, min_concurrent_days, min_concurrent_weighted_days):
    hh.drop(columns=['num_concurrent_non_proxy_days','num_concurrent_total_days',
                     'num_concurrent_weighted_weekdays','num_concurrent_weighted_days'], inplace=True)
    # concurrent days
    day = pd.merge(day, hh[['hh_id','diary_platform']])
    day = pd.merge(day, person[['person_id','has_proxy']])
    
    concurrent_day = day.pivot_table(index=['hh_id','diary_platform'], 
                                     columns=['has_proxy','travel_dow'], 
                                     values='sfcta_day_complete', 
                                     aggfunc='sum').fillna(0).astype(int).reset_index().set_index('hh_id')
    concurrent_weighted_day = day.pivot_table(index=['hh_id','diary_platform'], 
                                     columns=['has_weight','travel_dow'], 
                                     values='sfcta_day_complete', 
                                     aggfunc='sum').fillna(0).astype(int).reset_index().set_index('hh_id')
    
    hh_size = (person.groupby('hh_id')
                     .agg({'person_id':'nunique','has_proxy':'sum'})
                     .rename(columns={'person_id':'persons','has_proxy':'proxy_persons'})
              )
    hh_size['non_proxy_persons'] = hh_size['persons'] - hh_size['proxy_persons']
    concurrent_day['num_concurrent_non_proxy_days'] = (concurrent_day[0].eq(hh_size['non_proxy_persons'], axis=0)*1).sum(axis=1)
    concurrent_day['num_concurrent_total_days'] = ((concurrent_day[0]+concurrent_day[1]).eq(hh_size['persons'], axis=0)*1).sum(axis=1)
    concurrent_weighted_day['num_concurrent_weighted_weekdays'] = ((concurrent_weighted_day[1].loc[:,2:4]).eq(hh_size['persons'], axis=0)*1).sum(axis=1)
    concurrent_weighted_day['num_concurrent_weighted_days'] = ((concurrent_weighted_day[1]).eq(hh_size['persons'], axis=0)*1).sum(axis=1)
    
    j = pd.DataFrame(concurrent_day[['num_concurrent_non_proxy_days','num_concurrent_total_days']])
    j.columns=['num_concurrent_non_proxy_days','num_concurrent_total_days']
    k = pd.DataFrame(concurrent_weighted_day[['num_concurrent_weighted_weekdays','num_concurrent_weighted_days']])
    k.columns=['num_concurrent_weighted_weekdays','num_concurrent_weighted_days']
    jk = pd.merge(j, k, left_index=True, right_index=True)
    hh = pd.merge(hh, jk, left_on='hh_id', right_index=True)
    hh.set_index('hh_id', inplace=True)
    hh['sfcta_hh_concurrent_complete'] = ((concurrent_day['diary_platform'].eq('rmove') & 
                                           concurrent_day['num_concurrent_non_proxy_days'].ge(min_concurrent_days) & 
                                           concurrent_day['num_concurrent_total_days'].ge(1)) | 
                                          (concurrent_day['diary_platform'].isin(['call','browser']) &
                                           concurrent_day['num_concurrent_non_proxy_days'].ge(1) & 
                                           concurrent_day['num_concurrent_total_days'].ge(1)))* 1
    hh['sfcta_hh_concurrent_weighted_complete'] = concurrent_weighted_day['num_concurrent_weighted_weekdays'].ge(min_concurrent_weighted_days)*1
    hh.reset_index(inplace=True)
    return hh
    
def flag_complete_hhs(hh, person, hh_reqs):
    hh.drop(columns=['sfcta_num_persons_complete','sfcta_hh_persons_complete'], inplace=True)
    
    # person rollup to hh
    tmp = (person.groupby('hh_id', as_index=False)
                 .agg({'sfcta_person_complete':'sum', 'person_id':'count','sfcta_bipoc_flag':'sum'})
                 .rename(columns={'sfcta_person_complete':'sfcta_num_persons_complete',
                                  'person_id':'persons'})
           )
    tmp['sfcta_hh_persons_complete'] = (tmp['sfcta_num_persons_complete'].eq(tmp['persons']))*1
    hh = pd.merge(hh, tmp[['hh_id','sfcta_num_persons_complete','sfcta_hh_persons_complete']],
                  on='hh_id', how='left')
    
    complete = 1
    for c in hh_reqs:
        complete = complete * hh[c]
    
    hh['sfcta_hh_survey_complete'] = complete
    hh['sfcta_hh_complete'] = (hh['sfcta_hh_survey_complete'].eq(1) & 
                               hh['sfcta_hh_persons_complete'].eq(1) &
                               hh['sfcta_hh_concurrent_complete'].eq(1) & 
                               hh['sfcta_hh_concurrent_weighted_complete'].eq(1))*1
    return hh

In [6]:
s23 = Survey(**SURVEY_2023)

In [7]:
TRIP_REQS = ['o_purpose_category_complete','d_purpose_category_complete','mode_type_complete',
             'depart_hour_complete','depart_minute_complete','depart_seconds_complete',
             'arrive_hour_complete','arrive_minute_complete','arrive_second_complete',
             'o_lat_complete','o_lon_complete','d_lat_complete','d_lon_complete']
DAY_REQS = ['telecommute_time_complete']

HH_REQS = ['num_workers_complete','num_adults_complete','num_kids_complete', #'income_detailed','income_broad',
           'num_workers_complete','num_vehicles_complete','home_lat_complete','home_lon_complete']

ITEM_CRITERIA = ['lenient','strict']
HOUSEHOLD_UNIT = ['family','household']
CONCURRENT_DAYS = [1,2,3,4,5,6,7]
CONCURRENT_WEIGHTED_DAYS = [1]

mi = pd.MultiIndex.from_product([ITEM_CRITERIA,HOUSEHOLD_UNIT,CONCURRENT_DAYS,CONCURRENT_WEIGHTED_DAYS],
                                names=['item_critera','household_unit','concurrent_days','concurrent_weighted_days'])
cols = ['sfcta_hh_survey_complete','sfcta_hh_persons_complete',
        'sfcta_hh_concurrent_complete','sfcta_hh_concurrent_weighted_complete',
        'sfcta_hh_complete']

df = pd.DataFrame(index=mi, columns=cols)
for h1 in ITEM_CRITERIA:
    PERSON_REQS = ['race_complete','ethnicity_complete','gender_complete','age_complete','employment_complete','telework_complete',
               'school_loc_complete','work_park_complete']
    if h1 == 'strict':
        PERSON_REQS = PERSON_REQS + ['student_complete','work_loc_complete','transit_pass_complete','has_proxy_complete','can_drive_complete']
    
    for h2 in HOUSEHOLD_UNIT:
        for concurrent_days in CONCURRENT_DAYS:
            for concurrent_weighted_days in CONCURRENT_WEIGHTED_DAYS:
                
                #incentives = pd.read_csv(r'..\..\..\..\Review_20240809\flagged\incentives.csv')

                if h2 == 'family':
                    person = s23.person.loc[s23.person['relationship'].le(5)]
                    hh = s23.hh.loc[s23.hh['hh_id'].isin(s23.person['hh_id'])]
                    day = s23.day.loc[s23.day['person_id'].isin(s23.person['person_id'])]
                    trip = s23.trip.loc[s23.trip['person_id'].isin(s23.person['person_id'])]
                else:
                    person = s23.person.copy()
                    hh = s23.hh.copy()
                    day = s23.day.copy()
                    trip = s23.trip.copy()

                ## apply the trip flags
                trip = flag_complete_trips(trip, TRIP_REQS)

                ## apply the day flags
                day = flag_complete_days(day, trip, DAY_REQS)

                ## apply the person flags
                person = flag_complete_persons(person, day, PERSON_REQS, concurrent_days)

                ## flag household concurrent days
                hh = flag_concurrent_days(hh, person, day, concurrent_days, concurrent_weighted_days)

                ## apply the household flags
                hh = flag_complete_hhs(hh, person, HH_REQS)


                data = [h1,h2,concurrent_days,concurrent_weighted_days]+list(hh[cols].sum().values)
                d = pd.DataFrame(columns=['item_criteria','household_unit','concurrent_days','concurrent_weighted_days']+cols, data=[data])
                d.set_index(['item_criteria','household_unit','concurrent_days','concurrent_weighted_days'], inplace=True)
                df.update(d)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  person.drop(columns=['sfcta_num_days_complete'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  person.drop(columns=['sfcta_num_days_complete'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  person.drop(columns=['sfcta_num_days_complete'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [8]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sfcta_hh_survey_complete,sfcta_hh_persons_complete,sfcta_hh_concurrent_complete,sfcta_hh_concurrent_weighted_complete,sfcta_hh_complete
item_critera,household_unit,concurrent_days,concurrent_weighted_days,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lenient,family,1,1,8258.0,8031.0,8011.0,7753.0,7753.0
lenient,family,2,1,8258.0,7976.0,7952.0,7753.0,7708.0
lenient,family,3,1,8258.0,7892.0,7848.0,7753.0,7625.0
lenient,family,4,1,8258.0,7740.0,7672.0,7753.0,7468.0
lenient,family,5,1,8258.0,7485.0,7403.0,7753.0,7219.0
lenient,family,6,1,8258.0,6852.0,6731.0,7753.0,6582.0
lenient,family,7,1,8258.0,5244.0,5240.0,7753.0,5152.0
lenient,household,1,1,8258.0,7543.0,7523.0,7271.0,7271.0
lenient,household,2,1,8258.0,7490.0,7466.0,7271.0,7227.0
lenient,household,3,1,8258.0,7412.0,7370.0,7271.0,7152.0


In [9]:
df.to_csv(os.path.join(OUTDIR,r'completeness_sensitivity.csv'))

# explore weighted complete / incomplete
## hhsize (actual) by whether they have at least 1 concurrent weighted day

In [11]:
hh.pivot_table(index='num_people', columns='sfcta_hh_concurrent_weighted_complete', values='hh_id', aggfunc='size')

sfcta_hh_concurrent_weighted_complete,0,1
num_people,Unnamed: 1_level_1,Unnamed: 2_level_1
1,47.0,3532.0
2,356.0,2520.0
3,275.0,648.0
4,208.0,423.0
5,68.0,102.0
6,21.0,34.0
7,7.0,9.0
8,2.0,2.0
9,2.0,1.0
10,1.0,


## concurrent weighted complete fall households

In [12]:

hh.loc[hh['sfcta_hh_concurrent_weighted_complete'].eq(1) & hh['season'].eq('fall') & hh['num_people'].ge(2),[ 'hh_id','num_people','diary_platform','hh_weight']]

Unnamed: 0,hh_id,num_people,diary_platform,hh_weight
444,23100110,2,rmove,427.212388
453,23100853,2,rmove,59.890662
454,23100913,2,rmove,25.078525
456,23100970,3,browser,40.761685
457,23101043,2,rmove,16.335494
...,...,...,...,...
8236,23802629,4,rmove,67.163840
8241,23802812,2,rmove,32.000246
8243,23802914,3,rmove,11.837159
8246,23803034,2,rmove,31.457939


### Example 1, HH 23100110

This household is complete.  

All people have weights, all people are related.

Concurrent travel days are Tuesday, Wednesday, and Thursday.

In [22]:
person.loc[person['hh_id'].eq(23100110),['person_id','diary_platform','person_num','relationship','num_days_complete','sfcta_num_days_complete','person_weight']]

Unnamed: 0,person_id,diary_platform,person_num,relationship,num_days_complete,sfcta_num_days_complete,person_weight
876,2310011001,rmove,1,0,7,7.0,427.212388
877,2310011002,rmove,2,1,7,7.0,427.212388


In [24]:
(day.loc[day['hh_id'].eq(23100110),['hh_id','person_num','travel_dow','sfcta_day_complete','day_weight','has_weight']]
   .pivot_table(index='travel_dow',columns='person_num',values='has_weight',aggfunc='sum'))

person_num,1,2
travel_dow,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0
2,1,1
3,1,1
4,1,1
5,0,0
6,0,0
7,0,0


## concurrent weighted incomplete fall households

In [15]:
hh.loc[hh['sfcta_hh_concurrent_weighted_complete'].eq(0) & hh['season'].eq('fall') & hh['num_people'].ge(2),[ 'hh_id','num_people','diary_platform','hh_weight']]

Unnamed: 0,hh_id,num_people,diary_platform,hh_weight
448,23100522,4,rmove,81.855570
476,23102548,2,rmove,60.263538
487,23103239,2,rmove,21.279372
518,23105396,2,rmove,24.201401
526,23105808,4,rmove,249.832152
...,...,...,...,...
8219,23801487,2,rmove,54.825000
8228,23802298,2,rmove,16.990262
8232,23802428,3,rmove,75.595527
8238,23802755,2,rmove,80.798196


### Example 2, HH 23100522

This household is incomplete.  

The persons appear complete; they each have a weight and sfcta_num_days_complete >=5

They all appear to share a weighted travel day on Wednesday.  

But persons 3 and 4 did not complete any trips on Wednesday, and did not provide a no_travel reason.  

In [26]:
person.loc[person['hh_id'].eq(23100522),['person_id','diary_platform','person_num','relationship','num_days_complete','sfcta_num_days_complete','person_weight']]

Unnamed: 0,person_id,diary_platform,person_num,relationship,num_days_complete,sfcta_num_days_complete,person_weight
881,2310052201,rmove,1,0,6,6.0,81.85557
882,2310052202,rmove,2,1,7,7.0,81.85557
883,2310052203,rmove,3,2,1,5.0,81.85557
884,2310052204,rmove,4,2,1,5.0,81.85557


In [27]:
# weightedness by travel_dow by person
(day.loc[day['hh_id'].eq(23100522),['hh_id','person_num','travel_dow','sfcta_day_complete','day_weight','has_weight']]
   .pivot_table(index='travel_dow',columns='person_num',values='has_weight',aggfunc='sum'))

person_num,1,2,3,4
travel_dow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,1,1,1,1
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0


In [32]:
day.loc[day['hh_id'].eq(23100522),['hh_id','person_num','travel_dow','sfcta_day_complete','sfcta_day_trips_complete','sfcta_day_survey_complete','sfcta_num_trips','no_travel_1','no_travel_2','no_travel_3','no_travel_4','no_travel_5','no_travel_6','no_travel_7','no_travel_8','no_travel_9','no_travel_11','no_travel_12','day_weight','has_weight']]

Unnamed: 0,hh_id,person_num,travel_dow,sfcta_day_complete,sfcta_day_trips_complete,sfcta_day_survey_complete,sfcta_num_trips,no_travel_1,no_travel_2,no_travel_3,no_travel_4,no_travel_5,no_travel_6,no_travel_7,no_travel_8,no_travel_9,no_travel_11,no_travel_12,day_weight,has_weight
3808,23100522,1,1,0.0,0.0,1,3.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3809,23100522,2,1,1.0,1.0,1,2.0,1,0,0,0,0,0,0,0,0,0,0,0.0,0
3810,23100522,3,1,1.0,1.0,1,2.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3811,23100522,4,1,1.0,1.0,1,2.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3812,23100522,1,2,1.0,1.0,1,3.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3813,23100522,2,2,1.0,1.0,1,5.0,1,0,0,0,0,0,0,0,0,0,0,0.0,0
3814,23100522,3,2,1.0,1.0,1,3.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3815,23100522,4,2,1.0,1.0,1,3.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3816,23100522,1,3,1.0,1.0,1,2.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0
3817,23100522,2,3,1.0,1.0,1,6.0,995,995,995,995,995,995,995,995,995,995,995,0.0,0


In [29]:
trip.loc[trip['person_id'].eq(2310052203),['hh_id','person_num','travel_dow','sfcta_is_complete']]

Unnamed: 0,hh_id,person_num,travel_dow,sfcta_is_complete
14333,23100522,3,1,1
14334,23100522,3,1,1
14335,23100522,3,2,1
14336,23100522,3,2,1
14337,23100522,3,2,1
14338,23100522,3,3,1
14339,23100522,3,3,1
14340,23100522,3,6,1
14341,23100522,3,6,1
14342,23100522,3,7,1


In [21]:
# 
(day.loc[day['hh_id'].eq(23100522) & day['sfcta_day_complete'].eq(1),['hh_id','person_num','travel_dow','sfcta_day_complete','day_weight','has_weight']]
   .pivot_table(index='travel_dow',columns='person_num',values='has_weight',aggfunc='sum'))

person_num,1,2,3,4
travel_dow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,1.0,1.0,,
5,0.0,0.0,,
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
