In [1]:
import sys, os
import numpy as np
import pandas as pd
import geopandas as gpd
import datetime as dt
from survey import Survey, nine_to_county, purp_num_to_name18, purp_num_to_name23, mode_num_to_name23, county_order
sys.path.insert(0, r'Y:\champ\util\pythonlib-migration\master_versions\misc_utils')
from df_utils import df_to_excel

In [2]:
# Input tables of the 2024-11-27 deliverable.  Each value is the keyword
# dictionary for one table; the whole mapping is unpacked into Survey(...).
SURVEY_2023 = dict(
    household=dict(filepath_or_buffer=r'..\..\..\..\Deliverable_20241127\hh.csv'),
    person=dict(filepath_or_buffer=r'..\..\..\..\Deliverable_20241127\person.csv'),
    day=dict(filepath_or_buffer=r'..\..\..\..\Deliverable_20241127\day.csv'),
    trip=dict(filepath_or_buffer=r'..\..\..\..\Deliverable_20241127\trip.csv'),
    vehicle=dict(filepath_or_buffer=r'..\..\..\..\Deliverable_20241127\vehicle.csv'),
    location=dict(filepath_or_buffer=r'..\..\..\..\Deliverable_20241127\location.csv'),
)
# Directory that receives the flagged output tables.
OUTDIR = r'..\..\..\..\Review_20241127'
# Incentive workbook and ACS table B08301 extract.
INCENTIVES = r'<PATH>\incentives_disaggregate.xlsx'
ACS_MEANS_OF_TRANSPORTATION = r'<PATH>\ACS\ACSDT5Y2021.B08301-Data.csv'
# Three-digit county codes used to filter the ACS block groups.
FIPS_COUNTIES = ['001', '013', '041', '055', '075', '081', '085', '095', '097']
# Minimum number of complete days required of rmove households/persons.
CONCURRENT_DAY_MIN = 4
# Only referenced by a commented-out output filename in the visible cells.
HOW = 'lenient'

## Definitions
*Low income blockgroup*: a blockgroup in the 90th percentile of blockgroups when ranked by households with an annual income under \$25,000.
 - source: Table B19001
 - fields:
   - B19001_001E (total)
   - B19001_002E (lt \$10k)
   - B19001_003E (\\$10k-\$15k)
   - B19001_004E (\\$15k-\$20k)
   - B19001_005E (\\$20k-\$25k)
  
*BIPOC blockgroup*: a blockgroup in the 90th percentile of blockgroups by percent of households in the blockgroup who are Black, Indigenous, people of color, or Hispanic. Applies only if the blockgroup is not already labeled "Low-Income Oversample".
 - source: Table B03002
 - fields:
    - B03002_001E (total)
    - B03002_003E (white alone, not hispanic or latino)

*Walk/bike/transit blockgroup*: a blockgroup in the 90th percentile of walk/bike/transit shares among the blockgroups of its own county. Applies only if the blockgroup is not already labeled "Low-Income Oversample" or "BIPOC Oversample".
 - source: Table B08301
 - fields:
   - B08301_001E (total)
   - B08301_010E (public transportation, excluding taxicab)
   - B08301_018E (bicycle)
   - B08301_019E (walk)

## Sources
 - 2017-2021 5-year ACS
 - Public Use Microdata Areas

In [3]:
# Build the per-county walk/bike/transit flag (`tbw_flag`) for ACS block groups.
# skiprows=[1] drops the second line of the file
# (presumably the ACS descriptive-label row -- TODO confirm).
acs_mode = pd.read_csv(ACS_MEANS_OF_TRANSPORTATION, skiprows=[1])
acs_mode = acs_mode[['GEO_ID','NAME','B08301_001E','B08301_010E','B08301_018E','B08301_019E']]
# GEO_ID slices: characters 11-13 are used as the county code and everything
# from character 9 onward is parsed as the integer block-group id.
acs_mode.insert(1, 'county', acs_mode['GEO_ID'].map(lambda x: x[11:14]))
acs_mode.insert(2, 'bg', acs_mode['GEO_ID'].map(lambda x: int(x[9:])))
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice (avoids SettingWithCopyWarning).
acs_mode = acs_mode.loc[acs_mode['county'].isin(FIPS_COUNTIES)].copy()
# Share = (transit + bicycle + walk) / total.
acs_mode['tbw_share'] = acs_mode[['B08301_010E','B08301_018E','B08301_019E']].sum(axis=1) / acs_mode['B08301_001E']
# 90th-percentile share within each county, attached back to every block group.
q90 = acs_mode.groupby('county', as_index=False)['tbw_share'].quantile(0.9).rename(columns={'tbw_share':'q90'})
acs_mode = pd.merge(acs_mode, q90, on='county', how='left')
# 1 when the block group's share is at or above its county's 90th percentile.
acs_mode['tbw_flag'] = acs_mode['tbw_share'].ge(acs_mode['q90']) * 1

In [4]:
# Load the incentive workbook (first sheet, pandas defaults); later cells use its
# `hh_id` and `total_incentive_before_discount` columns.
incentives = pd.read_excel(INCENTIVES)

In [5]:
# Numeric codes treated as "no usable answer" by the completeness checks below.
MISSING_CODES = [-1, 995,998,999] # missing, missing, don't know, prefer not to answer

In [6]:
#w2 = pd.ExcelWriter(r'Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2022\Review_20240809\incentives_min_days_{}_{}_imputed.xlsx'.format(CONCURRENT_DAY_MIN, HOW))

In [7]:
# Load the survey tables; later cells read and overwrite s23.hh, s23.person,
# s23.day and s23.trip.
s23 = Survey(**SURVEY_2023)

# SFCTA Completeness Evaluation
## Trip Completeness
Add an SFCTA "trip complete" flag and cross-tab it against RSG completeness.

In [8]:
# Attach each person's diary_platform to their trips.  No `on=` is given, so
# pandas joins on every column name the two frames share (inner join by default).
trip = pd.merge(s23.person[['person_id','diary_platform']], s23.trip)

In [9]:
## Create the trip flags
# A field is "complete" when it is neither null nor one of MISSING_CODES.
# `flag` accumulates "any required field missing" across all columns.
# NOTE(review): 'depart_seconds' and 'arrive_second' differ in plurality --
# confirm both match the trip table headers.
flag = False
trip_flags = []
for c in ['o_purpose_category','d_purpose_category','mode_type',
          'depart_hour','depart_minute','depart_seconds',
          'arrive_hour','arrive_minute','arrive_second',
          'o_lat','o_lon','d_lat','d_lon']:
    # MISSING_CODES is already a list of codes; pass it to isin() directly.
    # Wrapping it in another list makes isin() look for the list itself
    # rather than for the individual codes.
    is_missing = trip[c].isin(MISSING_CODES) | pd.isnull(trip[c])
    trip_flags.append(c+'_complete')
    trip[c+'_complete'] = (~is_missing) * 1
    flag = flag | is_missing

# Flag the original
trip['sfcta_is_complete'] = (~flag)*1
trip['has_weight'] = trip['trip_weight'].gt(0) * 1
trip['has_weight_rmove'] = trip['trip_weight_rmove_only'].gt(0) * 1
# Copy the new flag columns back onto the survey's trip table, keyed by trip_id.
s23.trip = pd.merge(s23.trip, trip[trip_flags+['trip_id','sfcta_is_complete','has_weight','has_weight_rmove']], on='trip_id', how='left')

In [10]:
# Display one of the per-field completeness flags as a quick sanity check.
trip['o_purpose_category_complete']

0         1
1         1
2         1
3         1
4         1
         ..
365826    1
365827    1
365828    1
365829    1
365830    1
Name: o_purpose_category_complete, Length: 365831, dtype: int32

## Day Completeness

In [11]:
## Roll up the completeness analysis to the day.
# Per (household, person, day): number of complete trips and number of trips.
day_keys = ['hh_id','person_id','day_num']
tmp = s23.trip.groupby(day_keys).agg({'sfcta_is_complete':'sum','trip_id':'count'})
tmp = tmp.rename(columns={'sfcta_is_complete':'sfcta_num_trips'})
# A day's trips are complete when every trip on that day is complete.
tmp['sfcta_day_trips_complete'] = tmp['sfcta_num_trips'].eq(tmp['trip_id']) * 1
day = pd.merge(s23.day, tmp[['sfcta_num_trips','sfcta_day_trips_complete']],
               left_on=day_keys, right_index=True, how='left')

# Days with no trip records that are reported as no-travel days
# (made_travel == 2 or at least one no-travel reason) count as complete
# with zero trips.
reported_no_travel = day['made_travel'].eq(2) | day['num_reasons_no_travel'].gt(0)
no_trip_records = day['sfcta_day_trips_complete'].isna()
fill_rows = reported_no_travel & no_trip_records
day.loc[fill_rows, 'sfcta_num_trips'] = 0
day.loc[fill_rows, 'sfcta_day_trips_complete'] = 1

# Bring in the person attributes needed by the day-level checks.
person_cols = ['person_id','age','employment','is_proxy','has_proxy','student','diary_platform']
day = pd.merge(s23.person[person_cols], day)

In [12]:
# Day-level survey completeness: persons whose employment code is in the
# list below must have answered telecommute_time.
employed = day['employment'].isin([1,2,3,7,8])
flag = employed
flag_cols = []
for c in ['telecommute_time']:
    unanswered = day[c].isin(MISSING_CODES) | day[c].isna()
    flag_cols.append(c+'_complete')
    day[c+'_complete'] = (~(employed & unanswered)) * 1
    flag = flag & unanswered
missing_telecommute_flag = flag

day['sfcta_day_survey_complete'] = (~missing_telecommute_flag) * 1
# A day is complete when both its survey answers and its trips are complete.
survey_ok = day['sfcta_day_survey_complete'].eq(1)
trips_ok = day['sfcta_day_trips_complete'].eq(1)
day['sfcta_day_complete'] = (survey_ok & trips_ok) * 1
day['has_weight'] = day['day_weight'].gt(0) * 1
day['has_weight_rmove'] = day['day_weight_rmove_only'].gt(0) * 1

# flag the original
day_flag_cols = flag_cols + ['day_id','sfcta_num_trips','sfcta_day_survey_complete',
                             'sfcta_day_trips_complete','sfcta_day_complete',
                             'has_weight','has_weight_rmove']
s23.day = pd.merge(s23.day, day[day_flag_cols], on='day_id', how='left')

## Concurrent Days

In [13]:
# check that each travel_dow within a household is assigned the same `travel_date`
s = day.groupby(['hh_id','travel_dow']).agg({'travel_date':'nunique'})

In [14]:
# Complete-day counts per household, with one column per
# (has_proxy, travel_dow) pair; missing combinations become 0.
concurrent_day = (
    pd.pivot_table(day,
                   index=['hh_id','diary_platform'],
                   columns=['has_proxy','travel_dow'],
                   values='sfcta_day_complete',
                   aggfunc='sum')
    .fillna(0)
    .astype(int)
    .reset_index()
    .set_index('hh_id')
)

In [15]:
# Same layout as concurrent_day, but the columns are split by
# (has_weight, travel_dow) instead of (has_proxy, travel_dow).
concurrent_weighted_day = (
    pd.pivot_table(day,
                   index=['hh_id','diary_platform'],
                   columns=['has_weight','travel_dow'],
                   values='sfcta_day_complete',
                   aggfunc='sum')
    .fillna(0)
    .astype(int)
    .reset_index()
    .set_index('hh_id')
)

In [16]:
# Household size: distinct persons, persons with has_proxy set, and the rest.
hh_size = s23.person.groupby('hh_id').agg(
    persons=('person_id', 'nunique'),
    proxy_persons=('has_proxy', 'sum'),
)
hh_size['non_proxy_persons'] = hh_size['persons'].sub(hh_size['proxy_persons'])

In [17]:
# concurrent_day[0] / concurrent_day[1] select the has_proxy == 0 / == 1 column
# groups (one column per travel_dow).  A day-of-week counts as "concurrent" when
# the number of complete days equals the household's person count; the row sum
# is the number of such days-of-week.  Rows align with hh_size on hh_id.
# NOTE(review): a household with zero non-proxy persons matches 0 == 0 on every
# day-of-week in the first line -- confirm that is intended.
concurrent_day['num_concurrent_non_proxy_days'] = (concurrent_day[0].eq(hh_size['non_proxy_persons'], axis=0)*1).sum(axis=1)
concurrent_day['num_concurrent_total_days'] = ((concurrent_day[0]+concurrent_day[1]).eq(hh_size['persons'], axis=0)*1).sum(axis=1)

In [18]:
# concurrent_weighted_day[1] selects the has_weight == 1 column group.  The
# label slice 2:4 keeps travel_dow 2 through 4 inclusive
# (presumably Tuesday-Thursday -- TODO confirm the travel_dow coding).
# A day-of-week counts when the weighted complete days equal the household size.
concurrent_weighted_day['num_concurrent_weighted_weekdays'] = ((concurrent_weighted_day[1].loc[:,2:4]).eq(hh_size['persons'], axis=0)*1).sum(axis=1)
concurrent_weighted_day['num_concurrent_weighted_days'] = ((concurrent_weighted_day[1]).eq(hh_size['persons'], axis=0)*1).sum(axis=1)

In [19]:
# Household-level completeness rules.
platform = concurrent_day['diary_platform']
non_proxy_days = concurrent_day['num_concurrent_non_proxy_days']
total_days = concurrent_day['num_concurrent_total_days']

# "Pay complete": rmove households need CONCURRENT_DAY_MIN concurrent
# non-proxy days, call/browser households need one; both also need at
# least one day that is concurrent across the whole household.
rmove_ok = platform.eq('rmove') & non_proxy_days.ge(CONCURRENT_DAY_MIN) & total_days.ge(1)
call_browser_ok = platform.isin(['call','browser']) & non_proxy_days.ge(1) & total_days.ge(1)
concurrent_day['sfcta_pay_complete'] = (rmove_ok | call_browser_ok) * 1

# "Deliver complete": at least one whole-household concurrent day.
concurrent_day['sfcta_deliver_complete'] = total_days.ge(1) * 1

In [20]:
# = concurrent_day.groupby('diary_platform').agg({('sfcta_pay_complete',''):'sum',
#                                              ('sfcta_deliver_complete',''):'sum'})
#t.loc['total'] = t.sum()

In [21]:
#s = concurrent_day.loc[concurrent_day.index.isin(s23.hh.loc[s23.hh['home_county'].eq(6075),'hh_id'].tolist())].groupby('diary_platform').agg({('sfcta_pay_complete',''):'sum',
#                                              ('sfcta_deliver_complete',''):'sum'})
#s.loc['total'] = s.sum()

In [22]:
# Keep only the household-level summary columns and replace the two-level
# column labels left over from the pivot with plain names.
j = pd.DataFrame(concurrent_day[['num_concurrent_non_proxy_days','num_concurrent_total_days','sfcta_pay_complete','sfcta_deliver_complete']])
j.columns=['num_concurrent_non_proxy_days','num_concurrent_total_days','sfcta_pay_complete','sfcta_deliver_complete']

In [23]:
# Same flattening for the weighted-day summary columns.
k = pd.DataFrame(concurrent_weighted_day[['num_concurrent_weighted_weekdays','num_concurrent_weighted_days']])
k.columns=['num_concurrent_weighted_weekdays','num_concurrent_weighted_days']

In [24]:
# Combine both summaries on their shared hh_id index (inner join by default).
jk = pd.merge(j, k, left_index=True, right_index=True)

In [25]:
# Attach the concurrent-day summaries to the household table.  This is an inner
# join, so households without a row in `jk` are dropped from `hh`.
hh = pd.merge(s23.hh, jk, left_on='hh_id', right_index=True)

## Person Completeness

In [26]:
# roll up to person level
# Number of complete days per person, left-joined onto the person table
# (pandas joins on the shared column names because no `on=` is given).
days_complete_per_person = (
    day.groupby('person_id', as_index=False)['sfcta_day_complete']
       .sum()
       .rename(columns={'sfcta_day_complete':'sfcta_num_days_complete'})
)
s23.person = pd.merge(s23.person, days_complete_per_person, how='left')
# `person` is the same object as s23.person, not a copy.
person = s23.person

In [27]:
def _unanswered(col):
    """True where person[col] is null or holds one of MISSING_CODES."""
    return person[col].isin(MISSING_CODES) | person[col].isna()


def _imputed_unknown(col):
    """True where person[col] is null or the literal string 'missing'."""
    return person[col].eq('missing') | person[col].isna()


# Age thresholds on the coded `age` field
# (presumably categories for 16+ and 18+ -- TODO confirm against the codebook).
p16 = person['age'].ge(3)
p18 = person['age'].ge(4)

# Demographics; race, ethnicity and education are only required of p18.
missing_person_ethnicity_flag = p18 & _imputed_unknown('ethnicity_imputed')
missing_person_race_flag = p18 & _imputed_unknown('race_imputed')
missing_person_gender_flag = _imputed_unknown('gender_imputed')
missing_person_age_flag = _unanswered('age')
missing_person_has_proxy_flag = _unanswered('has_proxy')

missing_person_student_flag = p16 & _unanswered('student')
missing_person_employment_flag = p16 & _unanswered('employment')
missing_person_can_drive_flag = p16 & _unanswered('can_drive')
missing_person_education_flag = p18 & _unanswered('education')

# telework flag and work_loc, work_park flag
# first check whether they're employed
flag = p16 & person['employment'].isin([1,2,3,7])
# then check that they do not ONLY work from home
flag = flag & person['job_type'].isin([1,2,4,5])
# for work_loc, check that they do travel to work (commute frequency is not "never")
missing_work_loc_flag = flag & person['commute_freq'].isin([1,2,3,4,5,6,7,8])
# check missing work park
missing_work_park_flag = missing_work_loc_flag & _unanswered('work_park')
# check missing work loc: flagged only when BOTH coordinates are missing
missing_work_loc_flag = missing_work_loc_flag & _unanswered('work_lat') & _unanswered('work_lon')
# for telework check that they go to the office 5 or fewer days a week,
# then check that they reported telework frequency
missing_telework_flag = (flag
                         & person['commute_freq'].isin([2,3,4,5,6,7,8,996])
                         & _unanswered('telework_freq'))

# school loc
# check that person is student age, not cared for at home, attend dayschool outside home, or homeschooled
flag = p16 & person['school_type'].isin([3,5,6,7,10,11,12,13,997])
# then check that they go to a school in person
flag = flag & person['school_attend'].isin([1,2])
for c in ['school_lat','school_lon']:
    flag = flag & _unanswered(c)
missing_school_loc_flag = flag

# transit pass flag  NOTE THERE IS ANOTHER transit_pass, which is not being checked
# (unlike the other checks, this one does not treat nulls as missing)
missing_transit_pass_flag = person['clipper_card'].isin(MISSING_CODES)

In [28]:
# A person's survey is incomplete when any one of these flags is set.
# NOTE(review): missing_person_education_flag is computed in an earlier cell but
# is not part of this union -- confirm that is intended.
missing_person_any_flag = missing_person_ethnicity_flag
for _component in (missing_person_race_flag,
                   missing_person_gender_flag,
                   missing_person_age_flag,
                   missing_person_employment_flag,
                   missing_person_student_flag,
                   missing_telework_flag,
                   missing_school_loc_flag,
                   missing_work_loc_flag,
                   missing_work_park_flag,
                   missing_transit_pass_flag,
                   missing_person_has_proxy_flag,
                   missing_person_can_drive_flag):
    missing_person_any_flag = missing_person_any_flag | _component

In [29]:
# Person-level "missing" flags in reporting order.  The order matters: the flags
# are later paired positionally with a list of column names, and the cumulative
# completeness table multiplies them in this sequence.
person_flags_priority =    [missing_person_race_flag,
                            missing_person_ethnicity_flag,
                            missing_person_gender_flag,
                            missing_person_age_flag,
                            missing_person_employment_flag,
                            missing_telework_flag,
                            missing_school_loc_flag,
                            missing_work_park_flag,
                            missing_person_has_proxy_flag,
                            missing_person_can_drive_flag,
                            missing_work_loc_flag,
                            missing_person_student_flag,
                            missing_transit_pass_flag,
                           ]

In [30]:
# Output column names, positionally matched to person_flags_priority.
names = [
    'race_complete', 'ethnicity_complete', 'gender_complete', 'age_complete',
    'employment_complete', 'telework_complete', 'school_loc_complete',
    'work_park_complete', 'has_proxy_complete', 'can_drive_complete',
    'work_loc_complete', 'student_complete', 'transit_pass_complete',
]
# Each *_complete column is 1 where the matching "missing" flag is not set.
for n, f in dict(zip(names, person_flags_priority)).items():
    person[n] = (~f) * 1

In [31]:
# Per household and per field: 1 when every person in the household is complete
# on that field (the sum of the 0/1 flags equals the household's row count).
complete_counts_by_hh = person.groupby('hh_id')[names].sum()
rows_by_hh = person.groupby('hh_id').size()
hh_person_complete = complete_counts_by_hh.eq(rows_by_hh, axis='rows') * 1

In [32]:
# Day requirement: browser/call respondents and proxy-reported persons need one
# complete day; rmove respondents need CONCURRENT_DAY_MIN complete days.
one_day_rule = person['diary_platform'].isin(['browser','call']) | person['has_proxy'].eq(1)
one_day_met = one_day_rule & person['sfcta_num_days_complete'].ge(1)
rmove_met = person['diary_platform'].eq('rmove') & person['sfcta_num_days_complete'].ge(CONCURRENT_DAY_MIN)
person['sfcta_person_days_complete'] = (one_day_met | rmove_met) * 1

survey_ok = ~missing_person_any_flag
person['sfcta_person_survey_complete'] = survey_ok * 1
person['sfcta_person_complete'] = (survey_ok & person['sfcta_person_days_complete'].eq(1)) * 1
person['has_weight'] = person['person_weight'].gt(0) * 1
person['has_weight_rmove'] = person['person_weight_rmove_only'].gt(0) * 1
# don't need to copy back because person is an alias of s23.person

### Person hard-to-reach
- race_1: african american or black
- race_2: american indian or alaska native
- race_3: asian
- race_4: native hawaiian or pacific islander
- race_5: white
- race_997: other
- race_999: prefer not to answer
- ethnicity_1: not hispanic
- ethnicity_2: mexican, mexican american, chicano
- ethnicity_3: puerto rican
- ethnicity_4: cuban
- ethnicity_997: other hispanic origin
- ethnicity_999: prefer not to answer

In [33]:
# A person is flagged when any of the listed race/ethnicity indicator columns
# equals 1 (race_5, ethnicity_1 and the 999 codes are deliberately not listed).
bipoc_cols = ['race_1','race_2','race_3','race_4','race_997',
              'ethnicity_2','ethnicity_3','ethnicity_4','ethnicity_997']
re_flag = person[bipoc_cols].eq(1).any(axis=1)
person['sfcta_bipoc_flag'] = re_flag * 1

## Household Completeness

In [34]:
# Per household: number of complete persons, number of person rows, and the
# number of persons carrying the BIPOC flag.
tmp = s23.person.groupby('hh_id', as_index=False).agg(
    sfcta_num_persons_complete=('sfcta_person_complete', 'sum'),
    persons=('person_id', 'count'),
    sfcta_bipoc_flag=('sfcta_bipoc_flag', 'sum'),
)
# Household persons are complete when every person is complete.
all_persons_complete = tmp['sfcta_num_persons_complete'].eq(tmp['persons'])
tmp['sfcta_hh_persons_complete'] = all_persons_complete * 1
# Household is flagged BIPOC when at least one member is.
tmp['sfcta_bipoc_flag'] = tmp['sfcta_bipoc_flag'].ge(1) * 1
hh_rollup_cols = ['hh_id','sfcta_num_persons_complete','sfcta_hh_persons_complete','sfcta_bipoc_flag']
hh = pd.merge(hh, tmp[hh_rollup_cols], on='hh_id', how='left')

In [35]:
# Household basic fields: each gets a <field>_complete 0/1 column, and `flag`
# accumulates "any basic field missing".  'num_workers' used to appear twice in
# this list; the duplicate only repeated identical work and has been removed.
flag = False
for c in ['num_workers','num_adults','num_kids', #'income_detailed','income_broad',
          'num_vehicles','home_lat','home_lon']:
    # Compute the missing mask once and reuse it for both outputs.
    is_missing = hh[c].isin(MISSING_CODES) | pd.isnull(hh[c])
    hh[c+'_complete'] = (~is_missing) * 1
    flag = flag | is_missing

# Income is judged on the imputed field: null or the literal 'missing' counts as missing.
missing_hh_income_flag = hh['income_imputed'].eq('missing') | pd.isnull(hh['income_imputed'])
missing_hh_basic_flag = flag

In [36]:
# Attach the per-field "all persons complete" indicators (one column per entry
# of `names`) to the household table, matched on hh_id.
hh = pd.merge(hh, hh_person_complete, how='left', left_on='hh_id', right_index=True)

In [37]:
# Household survey is complete when neither a basic field nor income is missing.
hh_survey_ok = ~(missing_hh_basic_flag | missing_hh_income_flag)
hh['sfcta_hh_survey_complete'] = hh_survey_ok * 1
# Household is complete when its survey and all of its persons are complete.
hh_all_ok = hh['sfcta_hh_survey_complete'].eq(1) & hh['sfcta_hh_persons_complete'].eq(1)
hh['sfcta_hh_complete'] = hh_all_ok * 1
hh['has_weight'] = hh['hh_weight'].gt(0) * 1
hh['has_weight_rmove'] = hh['hh_weight_rmove_only'].gt(0) * 1

### Household hard-to-reach
- income_broad = 1: under \$25,000
- income_detailed in (1, 2): under \$25,000
- income_followup = 1: under \$25,000

In [38]:
# Low income: any of the three income questions reports under $25,000.
lowinc_flag = (hh['income_broad'].eq(1)
               | hh['income_followup'].eq(1)
               | hh['income_detailed'].isin([1,2]))
hh['sfcta_lowinc_flag'] = lowinc_flag * 1

# Season: first travel date before 2023-07-01 is 'spring', otherwise 'fall'.
is_spring = hh['first_travel_date'].lt('2023-07-01')
hh.insert(1, 'season', is_spring.map({True: 'spring', False: 'fall'}))

# Attach the walk/bike/transit flag of the home block group.  No `on=` is
# given, so pandas joins on every column name the two frames share.
tbw_lookup = acs_mode[['county','bg','tbw_flag']].rename(
    columns={'bg':'home_bg_2020','tbw_flag':'sfcta_tbw_flag'})
hh = pd.merge(hh, tbw_lookup, how='left')

# Hard to reach: at least one of the three flags is set.
hard_to_reach_cols = ['sfcta_bipoc_flag','sfcta_lowinc_flag','sfcta_tbw_flag']
hh['sfcta_hard_to_reach'] = hh[hard_to_reach_cols].sum(axis=1).ge(1) * 1

s23.hh = hh

In [39]:
# Distinct first travel dates in sorted order, with the gap to the next date.
first_dates = hh.groupby('first_travel_date').size().index
dates = pd.DataFrame(pd.to_datetime(first_dates))
next_date = dates['first_travel_date'].shift(-1)
dates['gap'] = next_date - dates['first_travel_date']
# Show only the gaps longer than one day.
dates.loc[dates['gap'].gt(dt.timedelta(days=1))]

Unnamed: 0,first_travel_date,gap
0,2023-05-07,2 days
32,2023-06-09,131 days
62,2023-11-16,5 days
63,2023-11-21,6 days
81,2023-12-14,5 days
82,2023-12-19,19 days
89,2024-01-13,3 days


In [40]:
# Smallest and largest hh_id on each side of the 2023-07-01 cut-off; the hh_id
# threshold used to assign the incentive season is read off this table.
hh.groupby([hh['first_travel_date'].lt('2023-07-01')]).agg({'hh_id':['min','max']})

Unnamed: 0_level_0,hh_id,hh_id
Unnamed: 0_level_1,min,max
first_travel_date,Unnamed: 1_level_2,Unnamed: 2_level_2
False,23100065,23803687
True,23000023,23041498


In [41]:
# hh_id below 23100000 is 'spring', otherwise 'fall' (the threshold separates
# the two hh_id ranges displayed in the preceding cell).
is_spring_id = incentives['hh_id'].lt(23100000)
incentives['season'] = is_spring_id.map({True: 'spring', False: 'fall'})

In [42]:
# Attach household size, platform and SFCTA flags to every incentive record.
# No `on=` is given, so pandas joins on every column name the two frames share.
incentives = pd.merge(incentives, hh[['hh_id','num_people', 'num_surveyable', 'num_participants',
                                      'num_adults', 'num_kids','first_travel_date', 'diary_platform',
                                      'sfcta_bipoc_flag','sfcta_lowinc_flag','sfcta_tbw_flag',
                                      'sfcta_hard_to_reach','sfcta_hh_complete','sfcta_num_persons_complete']],
                      how='left')

In [43]:
# Incentive per participant and per SFCTA-complete person.
# A zero denominator yields inf/NaN rather than an error.
incentive_total = incentives['total_incentive_before_discount']
incentives['per_person1'] = incentive_total.div(incentives['num_participants'])
incentives['per_person2'] = incentive_total.div(incentives['sfcta_num_persons_complete'])

In [44]:
# Labels for completeness values and missing-response codes.
# NOTE(review): `mp` is not referenced by any other visible cell.
mp = {0: 'incomplete',
      1: 'complete',
      995: 'missing',
      998: 'dont know',
      999: 'prefer not to answer'}

In [45]:
# Zero-filled summary table: one row per survey table, one column per
# completeness x weight-status combination.  Filled in by later cells.
df = pd.DataFrame(data=0,
                  index=['hh','person','day','trip'], 
                  columns=['complete_has_weight','complete_no_weight','incomplete_has_weight','incomplete_no_weight'])

In [46]:
# Maps a (complete flag, has_weight flag) pair to its summary column name.
rename = {(0,0):'incomplete_no_weight',
          (0,1):'incomplete_has_weight',
          (1,0):'complete_no_weight',
          (1,1):'complete_has_weight',}

In [47]:
def _count_by_status(frame, complete_col, id_col, label):
    """One-row frame of record counts per (complete flag, has_weight) pair, indexed by `label`."""
    counts = frame.groupby([complete_col, 'has_weight']).agg({id_col: 'count'})
    return counts.T.rename(index={id_col: label})


_h = _count_by_status(s23.hh, 'sfcta_hh_complete', 'hh_id', 'hh')
_p = _count_by_status(s23.person, 'sfcta_person_complete', 'person_id', 'person')
_d = _count_by_status(s23.day, 'sfcta_day_complete', 'day_id', 'day')
_t = _count_by_status(s23.trip, 'sfcta_is_complete', 'trip_id', 'trip')

In [48]:
# Replace the (complete, has_weight) column tuples with their summary names.
for x in (_h, _p, _d, _t):
    x.columns = [rename[c] for c in x.columns]

In [49]:
# Copy each table's counts into its row of the summary frame (in place).
for _counts in (_h, _p, _d, _t):
    df.update(_counts)

In [50]:
# NOTE(review): this file is written to the current working directory, unlike
# the other outputs of this notebook, which go under OUTDIR -- confirm intent.
df.to_csv('completeness_by_weight_status.csv')

In [51]:
# Person-level completeness by field: counts split by weight status, plus the
# cumulative counts after requiring every field up to and including this one.
pc_labels = [n.replace('_complete','') for n in names]
pc = pd.DataFrame(index=pc_labels,
                  columns=['complete_has_weight','complete_no_weight','complete_total',
                           'incomplete_has_weight','incomplete_no_weight','incomplete_total',
                           'cumulative_complete_has_weight','cumulative_complete_no_weight','cumulative_complete_total'])
cumulative = 1
for n, label in zip(names, pc_labels):
    # Running product of the 0/1 flags: 1 only while every field so far is complete.
    cumulative = cumulative * person[n]

    x = person.groupby([n, 'has_weight']).agg({'person_id':'size'}).T.rename(index={'person_id':label})
    x.columns = [rename[c] for c in x.columns]

    y = person.groupby([cumulative, 'has_weight']).agg({'person_id':'size'}).T.rename(index={'person_id':label})
    y.columns = ['cumulative_'+rename[c] for c in y.columns]

    # update() only fills columns that exist in pc, so the
    # cumulative_incomplete_* columns of y are ignored.
    pc.update(x)
    pc.update(y)

In [52]:
# Combinations that never occurred were left empty by update(); count them as 0.
pc.fillna(0, inplace=True)

In [53]:
# Totals = weighted + unweighted counts for each completeness group.
for _group in ('complete', 'incomplete', 'cumulative_complete'):
    pc[_group+'_total'] = pc[_group+'_has_weight'] + pc[_group+'_no_weight']

In [54]:
# Household-level version of the per-field table: a household counts as
# complete on a field when its <field>_complete column in `hh` is 1.
hpc_labels = [n.replace('_complete','') for n in names]
hpc = pd.DataFrame(index=hpc_labels,
                   columns=['complete_has_weight','complete_no_weight','complete_total',
                            'incomplete_has_weight','incomplete_no_weight','incomplete_total',
                            'cumulative_complete_has_weight','cumulative_complete_no_weight','cumulative_complete_total'])
cumulative = 1
for n, label in zip(names, hpc_labels):
    # Running product of the 0/1 flags: 1 only while every field so far is complete.
    cumulative = cumulative * hh[n]

    x = hh.groupby([n, 'has_weight']).agg({'hh_id':'size'}).T.rename(index={'hh_id':label})
    x.columns = [rename[c] for c in x.columns]

    y = hh.groupby([cumulative, 'has_weight']).agg({'hh_id':'size'}).T.rename(index={'hh_id':label})
    y.columns = ['cumulative_'+rename[c] for c in y.columns]

    # update() only fills columns that exist in hpc, so the
    # cumulative_incomplete_* columns of y are ignored.
    hpc.update(x)
    hpc.update(y)

In [55]:
# Combinations that never occurred were left empty by update(); count them as 0.
hpc.fillna(0, inplace=True)

In [56]:
# Totals = weighted + unweighted counts for each completeness group.
for _group in ('complete', 'incomplete', 'cumulative_complete'):
    hpc[_group+'_total'] = hpc[_group+'_has_weight'] + hpc[_group+'_no_weight']

In [57]:
# Write the flagged tables and the incentive records to OUTDIR/flagged.
flagged_dir = os.path.join(OUTDIR, 'flagged')
for _fname, _frame in [('hh.csv', s23.hh),
                       ('person.csv', s23.person),
                       ('day.csv', s23.day),
                       ('trip.csv', s23.trip),
                       ('incentives.csv', incentives)]:
    _frame.to_csv(os.path.join(flagged_dir, _fname), index=False)

In [58]:
# Write the per-field completeness tables; the index (field name) is kept as
# the first CSV column.
pc.to_csv(os.path.join(OUTDIR,'flagged','person_completeness.csv'))
hpc.to_csv(os.path.join(OUTDIR,'flagged','household_person_completeness.csv'))

In [59]:
# Households having at least one person who has no weight and whose can_drive
# answer is missing.
unweighted_no_can_drive = person['has_weight'].eq(0) & person['can_drive_complete'].eq(0)
affected_hh_ids = person.loc[unweighted_no_can_drive, 'hh_id'].tolist()
affected_hh = hh.loc[hh['hh_id'].isin(affected_hh_ids)]
affected_hh.to_csv(os.path.join(OUTDIR,'hh_with_persons_missing_weights.csv'))