In [1]:
# Convert Household, Person, and Trip files to Daysim format for estimation and calibration

In [1]:
import pandas as pd
import geopandas as gpd

In [2]:
# Flexible column names, given that these may change in future surveys
hhno = 'hhid'
hownrent = 'rent_own'
hrestype = 'res_type'
hhincome = 'hhincome_detailed'
hhtaz = 'final_home_taz2010'
hhexpfac = 'hh_wt_revised'
hhwkrs = 'numworkers'
hhvehs = 'vehicle_count'
pno = 'pernum'

# Household

In [193]:
def total_persons_to_hh(hh, person, daysim_field, filter_field, 
                        filter_field_list, hhid_col=hhno, wt_col=hhexpfac):
    
    """Use person field to calculate total number of person in a household for a given field
    e.g., total number of full-time workers"""
    
    df = person[person[filter_field].isin(filter_field_list)]
    df = df.groupby(hhid_col).count().reset_index()[[wt_col,hhid_col]]
    df.rename(columns={wt_col: daysim_field}, inplace=True)
    
    # Join to households
    hh = pd.merge(hh, df, how='left', on=hhid_col)
    hh[daysim_field].fillna(0, inplace=True)
    
    return hh

In [194]:
# lookup maps for various fields
hownrent_map = {1:1, # Own: own
                2:2, # Rent: rent
                3:3, # provided by job/military: other
                4:3, # other: other
                5:3} # prefer not to answer: other

hhrestype_map = {1:1, # SFH: SFH
                 2:2, # Townhouse (attached house): duplex/triplex/rowhouse
                 3:2, # Building with 3 or fewer apartments/condos: duplex/triplex/rowhouse
                 4:3, # Building with 4 or more apartments/condos: apartment/condo
                 5:4, # Mobile home/trailer: Mobile home/trailer
                 6:5, # Dorm or institutional housing: Dorm room/rented room
                 7:6, # other: other
                   }

# Use the midpoint of the ranges provided since DaySim uses actual values
income_map = {
    1: 5000,
    2: 17500,
    3: 30000,
    4: 42500,
    5: 62500,
    6: 87500,
    7: 125000,
    8: 175000,
    9: 225000,
    10: 250000,
    11: -1
}

In [195]:
hh = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-1-household.xlsx',
                         skiprows=1)
person = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-2-person.xlsx',
                      skiprows=1)

In [196]:
# Do some up-front data prep
# This may be different for new data sets
# Identify high school students based on their school name
# This will not include all students, but we can start with these students
person['high_school'] = 0
person['school_loc_name'].fillna(' ', inplace=True)
person.ix[(person['school_loc_name'].str.contains("High", na=False)) &
          (person['schooltype'].isin([3,4])), "high_school"] = 1

# Students not in this group will be assumed as high school students
# if they're in age group 16-17, and 18-24 and are in K12 (public or private) 3, 4
# This is probably excluding some in the 12-15 year group, should try to sort this out better in the future
person.ix[(person['high_school'] != 0) & 
          (person['age'].isin([4,5])) &
          (person['schooltype'].isin([3,4])), 'high_school'] = 1

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [197]:
# Full-time workers
hh = total_persons_to_hh(hh, person, daysim_field='hhftw', filter_field='employment', filter_field_list=[1])

# Part-time workers
hh = total_persons_to_hh(hh, person, daysim_field='hhptw', filter_field='employment', filter_field_list=[2])

# Retirees
hh = total_persons_to_hh(hh, person, daysim_field='hhret', filter_field='employment', filter_field_list=[6])

# Other Adults
hh = total_persons_to_hh(hh, person, daysim_field='hhoad', filter_field='employment', filter_field_list=[3,4,5,7])

# University Students
hh = total_persons_to_hh(hh, person, daysim_field='hhuni', filter_field='schooltype', filter_field_list=[6])

# High school students
hh = total_persons_to_hh(hh, person, daysim_field='hhhsc', filter_field='high_school', filter_field_list=[1])

# k12 age 5-15
age5_12 = person[person['age'].isin([2,3])]
hh = total_persons_to_hh(hh, age5_12, daysim_field='hh515', filter_field='schooltype', filter_field_list=[3,4])

# age under 5
hh = total_persons_to_hh(hh, person, daysim_field='hhcu5', filter_field='age', filter_field_list=[1])

hh['hownrent'] = hh[hownrent].map(hownrent_map) 
hh['hrestype'] = hh[hrestype].map(hhrestype_map) 
hh['hhincome'] = hh[hhincome].map(income_map) 
hh['hhtaz'] = hh[hhtaz]
hh['hhexpfac'] = hh[hhexpfac]
hh['hhwkrs'] = hh[hhwkrs]
hh['hhno'] = hh[hhno]
hh['hhvehs'] = hh[hhvehs]

In [175]:
# Need the parcel ID as well!
# Use geopandas to find nearest parcel node?
import geopandas as gpd

In [198]:
# daysim_fields = ['hhno','hhsize','hhvehs','hhwkrs','hhftw','hhptw','hhret','hhoad','hhuni','hhhsc','hh515',
#                  'hhcu5','hhincome','hownrent','hrestype','hhparcel','hhtaz','hhexpfac','samptype']
# Without parcel field
daysim_fields = ['hhno','hhsize','hhvehs','hhwkrs','hhftw','hhptw','hhret','hhoad','hhuni','hhhsc','hh515',
                 'hhcu5','hhincome','hownrent','hrestype','hhtaz','hhexpfac']

hh[daysim_fields].to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\household17.csv',index=False)

In [201]:
# hh[daysim_fields]

# Person

In [368]:
# Reload to start with fresh data
person = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-2-person.xlsx',
                      skiprows=1)

In [369]:
# person['pernum']

In [370]:
# Reload to start with fresh data
person = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-2-person.xlsx',
                      skiprows=1)

# Person Type

# Full time worker
person.ix[person['employment'] == 1, 'pptyp'] = 1

# Part-time worker
person.ix[person['employment'] == 2, 'pptyp'] = 2

# Non-working adult age 65+
person.ix[(person['employment'] != 1) &  (person['age'].isin([10,11,12])), 'pptyp'] = 3

# High school student age 16+
person.ix[(person['age'] >= 4) & (person['schooltype'].isin([3,4,5])), 'pptyp'] = 6

# university student (full-time)
person.ix[(person['schooltype'].isin([6,7])) & (person['student'] == 3), 'pptyp'] = 5

# Child age 5-15
person.ix[person['schooltype'].isin([2,3]), 'pptyp'] = 7

# child under 5
person.ix[person['schooltype'].isin([1]), 'pptyp'] = 8

# Non-working adult age 65 should accoutn for all others
person.ix[person['pptyp'].isnull(), 'pptyp'] = 4

# Person worker type
person.ix[person['employment'].isin([1]), 'pwtyp'] = 1
person.ix[person['employment'].isin([2]), 'pwtyp'] = 2
person.ix[person['employment'].isin([3,4,5,6,7]), 'pwtyp'] = 0
person['pwtyp'].fillna(0,inplace=True)
person['pwtyp'] = person['pwtyp'].astype('int')

# Transit pass availability
# Care about people that have subsidized/free passes
# people that perceive transit cost as 0
person['ptpass'] = 0
person.ix[(person['tran_pass_12'].isin([1,2])) | (person['benefits_3'].isin([2,3])),'ptpass'] = 1

# Paid parking at work (any subsidization counts as 'paid')
person['ppaidprk'] = 0
person.ix[person['workpass'].isin([3,4]), 'ppaidprk'] = 1

# Take median age
age_map = {
    1: 2,
    2: 8,
    3: 14,
    4: 17,
    5: 21,
    6: 30,
    7: 40,
    8: 50,
    9: 60,
    10: 70,
    11: 80,
    12: 85
}

gender_map = {
    1: 1,    # male: male
    2: 2,    # female: female
    3: 9,    # another: missing
    4: 9     # prefer not to answer: missing
}

pstyp_map = {
    1: 0,
    2: 1,
    3: 2
}

hownrent_map = {
    1: 1,
    2: 2,
    3: 3,
    4: 3,
    4: 3
}

person['age'] = person['age'].astype('int')
person['pagey'] = person['age'].map(age_map)
person['pgend'] = person['gender'].map(gender_map)
person['pstyp'] = person['student'].map(pstyp_map)
person['pstyp'].fillna(0,inplace=True)
person['hhno'] = person['hhid']
person['pno'] = person['pernum']
person['psexpfac'] = person['hh_wt_revised']
person['pwtaz'] = -1
person['pstaz'] = -1
person['pwpcl'] = -1
person['pspcl'] = -1
# Need:
# pwpcl
# pwtaz
# pwautime
# pwaudist
# pspcl
# psautime
# psaudist
# puwmode
# puwarrp
# puwdepp

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for

In [371]:
def get_zone_from_lat_lng(df, taz_gdf, zone_gdf_crs, loc_field, lat_field, lng_field, point_field):
    
    """ Joins lat long field to polygon file. 
        Requires specification of CRS of polyline file
        Assumes standard coordinate reference system of 4326 in lat long fields
    """
    lat_lng_crs = 'epsg:4326'
    
    # Filter for records with valid entries and create gdf
    _df = df[-df[loc_field].isnull()]
    _df = person[-person[lat_field].isnull()]
    
    gdf = gpd.GeoDataFrame(
        _df, geometry=gpd.points_from_xy(_df[lng_field], _df[lat_field]))
    gdf.crs = {'init' :lat_lng_crs}
    
    # Align CRS from zone file, convert to lat/long format
    taz_gdf.crs = {'init' :'epsg:'+str(zone_gdf_crs)}
    taz_gdf = taz_gdf.to_crs({'init': lat_lng_crs})
    
    # Spatial join between point and polyline
    result_df = gpd.sjoin(gdf, taz_gdf, how='left', op='intersects')
    
    # Update location field
    result_df[loc_field] = result_df[point_field]
    # Drop locations outside of the region
    result_df = result_df[-result_df[loc_field].isnull()]
    result_df[loc_field] = result_df[loc_field].astype('int')
    
    # Join back to original df
    df = df.drop(loc_field,axis=1)
    df = df.merge(result_df[['hhno','pno',loc_field]], on=['hhno','pno'], how='left')
    df[loc_field] = df[loc_field].fillna(-1)
    
    return df

In [372]:
def get_parcel_from_lat_lng():
    """ Load parcel file points, snap lat/lng to nearest parcel centroid
    """
    return None

In [9]:
person = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\person17.csv')

In [10]:
# Get nearst parcel from lat long
# Load parcel input as gdf
# parcel_df = pd.read_csv(r'R:\e2projects_two\SoundCast\Inputs\dev\landuse\2014\lodes\parcels_urbansim.txt', delim_whitespace=True)

parcel_gdf = gpd.read_file(r'J:\Projects\UrbanSim\NEW_DIRECTORY\GIS\Shapefiles\Parcels\Region\2014\gapwork\prcl15_4k.shp')

In [11]:
parcel_gdf.crs = {'init' : 'epsg:2285'}

In [454]:
# Load parcel coordinates
lat_field = 'xcoord_p'
lng_field = 'ycoord_p'
gdf_parcel = gpd.GeoDataFrame(
        parcel_df, geometry=gpd.points_from_xy(parcel_df[lng_field], parcel_df[lat_field]))
gdf_parcel.crs = {'init' : 'epsg:2285'}     # Set CRS: NAD83, WA North ft

loc_field = 'pwpcl'
lat_field = 'work_lat'
lng_field = 'work_lng'
_df = person.copy()
_df = _df[-_df[loc_field].isnull()]
_df = _df[-_df[lat_field].isnull()]

gdf = gpd.GeoDataFrame(
        _df, geometry=gpd.points_from_xy(_df[lng_field], _df[lat_field]))
gdf.crs = {'init' :lat_lng_crs}
gdf = gdf.to_crs({'init': 'epsg:2285'})    # conver to state plane WA 4601 N WA

In [132]:
# Get usual work and school TAZ
taz_gdf = gpd.read_file(r'W:\geodata\forecast\taz2010nowater.shp')
person = get_zone_from_lat_lng(person, taz_gdf, zone_gdf_crs='2926', loc_field='pwtaz', 
                             lat_field='work_lat', lng_field='work_lng', point_field='TAZ')

person = get_zone_from_lat_lng(person, taz_gdf, zone_gdf_crs='2926', loc_field='pstaz', 
                             lat_field='school_loc_lat', lng_field='school_loc_lng', point_field='TAZ')

In [None]:
person = get_zone_from_lat_lng(person, parcel_gdf, zone_gdf_crs='2926', loc_field='pwpcl', 
                             lat_field='work_lat', lng_field='work_lng', point_field='parcelid')

In [133]:

daysim_cols = ['hhno', 'pno', 'pptyp', 'pagey', 'pgend', 'pwtyp', 'pwpcl', 'pwtaz', 'pwautime',
               'pwaudist', 'pstyp', 'pspcl', 'pstaz', 'psautime', 'psaudist', 'puwmode', 'puwarrp', 
               'puwdepp', 'ptpass', 'ppaidprk', 'pdiary', 'pproxy', 'psexpfac']

# Add empty columns to fill in later with skims
for col in daysim_cols:
    if col not in person.columns:
        person[col] = -1
        
person = person[daysim_cols]

person.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\person17.csv',index=False)

In [85]:
person.head()

Unnamed: 0,hhno,pno,pptyp,pagey,pgend,pwtyp,pwpcl,pwtaz,pwautime,pwaudist,...,psautime,psaudist,puwmode,puwarrp,puwdepp,ptpass,ppaidprk,pdiary,pproxy,psexpfac
0,17100005,1,4.0,60,2,0,-1,-1.0,-1,-1,...,-1,-1,-1,-1,-1,0,0,-1,-1,24.441709
1,17100005,2,3.0,70,1,0,-1,-1.0,-1,-1,...,-1,-1,-1,-1,-1,0,0,-1,-1,24.441709
2,17100024,1,5.0,30,9,2,-1,-1.0,-1,-1,...,-1,-1,-1,-1,-1,0,0,-1,-1,26.224981
3,17100024,2,1.0,30,2,1,-1,597.0,-1,-1,...,-1,-1,-1,-1,-1,0,0,-1,-1,26.224981
4,17100024,3,8.0,2,9,0,-1,-1.0,-1,-1,...,-1,-1,-1,-1,-1,0,0,-1,-1,26.224981


# Trips

In [255]:
from sqlalchemy import create_engine
import pyodbc

conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=AWS-PROD-SQL\Coho;DATABASE=Elmer;Trusted_Connection=yes')
trip = pd.read_sql(sql='select * from HHSurvey.v_trips_2017', con=conn)

trip['hhno'] = trip['household_dim_id']
trip['pno'] = trip['person_dim_id']
trip['day'] = trip['daynum'].astype(int)
trip['tsvid'] = trip['recid']

####### FIXME:
# Need: 
# tour
# half
# tseg

# Select only weekday trips (Should we also include Friday?)
trip = trip[trip['dayofweek'].isin(['Monday','Tuesday','Wednesday','Thursday'])]

# Recode purposes
day_map = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4
}

purpose_map = {
    1: 0, # home
    6: 2, # school
    9: 3, # escort
    10: 1, # work
    11: 1, # work-related
    14: 1, # work-related
    30: 5, # grocery -> shop
    32: 5, # other shopping -> shop
    33: 4,
    34: 9, # medical
    50: 6, # restaurant -> meal
    51: 8, 
    52: 7,
    53: 8,
    54: 7, # religious/community/volunteer -> social
    56: 7, # family activity -> social
    60: 10, # change mode
    61: 4,
    62: 7, # other social
    97: -1 # other
}

dorp_map = {
    1: 1,
    2: 2,
    3: 9
}

# Survey DB is formatted with string values, need to translate again with above dict
df_purp_lookup = pd.read_sql(sql='select * from HHSurvey.DataExplorerValues2017 where VariableID = 125', con=conn)
new_purp_map = {}
for val in df_purp_lookup['ValueOrder'].unique():
    text = df_purp_lookup.loc[df_purp_lookup['ValueOrder'] == val,'ValueText'].values[0]
    new_purp_map[text] = purpose_map[val]

trip['day'] = trip['dayofweek'].map(day_map)

trip['opurp'] = trip['origin_purpose'].map(new_purp_map)
trip['dpurp'] = trip['dest_purpose'].map(new_purp_map)

trip['dorp'] = trip['dest_purpose'].map(new_purp_map)

# origin and destination TAZs
trip['otaz'] = trip['o_taz2010']
trip['dtaz'] = trip['d_taz2010']

##############################
# Start and end time
##############################
# Filter out rows with None
trip = trip[-trip['depart_time_hhmm'].isnull()]
trip = trip[-trip['arrival_time_hhmm'].isnull()]

# Minutes
for db_col_name, daysim_col_name in {'arrival_time_hhmm': 'arrtm', 'depart_time_hhmm': 'deptm'}.items():
    # Filter rows without valid depart and start times
    trip = trip[-trip[db_col_name].isnull()]
    
    # Get minutes from time stamp, as values to right of :
    minutes = trip[db_col_name].apply(lambda row: str(row).split(' ')[-1].split(':')[1])
    minutes = minutes.apply(lambda row: row.split('.')[0]).astype('int') # Trim any decimal places and takes whole numbers
    
    # Get hours from time stamp
    hours = trip[db_col_name].apply(lambda row: str(row).split(' ')[-1].split(':')[0]).astype('int')
    
    # In minutes after midnight****
    ##########
    # NOTE: Check that daysim uses MAM and not minutes after 3 A
    ##########
    trip[daysim_col_name] = hours*60 + minutes
    
#     # Convert to minutes after 3 AM; sum of minutes minus 3*60 
#     trip[daysim_col_name] = (hours*60)+minutes - 180
    # This gives negative values to numbers starting between midnight and 3 am, move these to *after* midnight
    # 60*24 (1440) - <negative value>
#     trip.loc[trip[daysim_col_name] < 0,daysim_col_name] = 1440 + trip.loc[trip[daysim_col_name] < 0,daysim_col_name]

##############################
# Mode
##############################
trip['mode'] = trip['main_mode'].copy()
# Get HOV2/HOV3 based on total number of travelers
trip.loc[trip['mode'] == 'HOV','mode'] = 'HOV2'
trip.loc[(trip['travelers_total'] > 2) & (trip['main_mode'] == 'HOV'),'mode'] = 'HOV3+'

trip.loc[trip['mode_1'] == 'Other hired service (e.g., Lyft, Uber)','mode'] = 'TNC'

# Lookup values
mode_dict = {
    'Walk': 1,
    'Bike': 2,
    'SOV': 3,
    'HOV2': 4,
    'HOV3+': 5,
    'Transit': 6,
    'TNC': 9,
    'Other': 10
}

trip['mode'] = trip['mode'].map(mode_dict)

trip['trexpfac'] = trip['trip_weight_revised']

##############################
# Origin and Destination Types
##############################

# Assume "other" by default
trip.loc[:,'oadtyp'] = 4
trip.loc[:,'dadtyp'] = 4

# Trips with origin/destination purpose of "Home" (0) have a origin/destination address type of "Home" (1)
trip.loc[trip['opurp'] == 0,'oadtyp'] = 1
trip.loc[trip['dpurp'] == 0,'dadtyp'] = 1

# Trips to/from work are considered "usual workplace" only if dpcl == workplace parcel
#### FIX ME: do not have PARCELS, only using TAZ
# must join person records to get usual work and school location
person = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\person17.csv')
trip = trip.merge(person[['hhno','pno','pwtaz','pstaz']], on=['hhno','pno'], how='left')

# If trip is to/from TAZ of usual workplace and trip purpose is work
trip.loc[(trip['opurp'] == 0) & (trip['otaz'] == trip['pwtaz']),'oadtyp'] = 2
trip.loc[(trip['dpurp'] == 0) & (trip['dtaz'] == trip['pwtaz']),'dadtyp'] = 2

# usual school
trip.loc[(trip['opurp'] == 0) & (trip['otaz'] == trip['pstaz']),'oadtyp'] = 3
trip.loc[(trip['dpurp'] == 0) & (trip['dtaz'] == trip['pstaz']),'dadtyp'] = 3

# Change mode
trip.loc[trip['opurp'] == 10,'oadtyp'] = 6
trip.loc[trip['dpurp'] == 10,'dadtyp'] = 6

##############################
# Skim Values
##############################

trip['travcost'] = -1
trip['travtime'] = -1
trip['travdist'] = -1

# Add submode
trip['trip_pathtype'] = 1
for index, row in trip.iterrows():
    if row['main_mode'] == 1:
        if 'Ferry or water taxi' in row[['mode_1','mode_2','mode_3','mode_4']].values:
            trip.loc[index,'trip_pathtype'] = 7
        elif 'Commuter rail (Sounder, Amtrak)' in row[['mode_1','mode_2','mode_3','mode_4']].values:
            trip.loc[index,'trip_pathtype'] = 6
        elif 'Urban rail (e.g., Link light rail, monorail)' in row[['mode_1','mode_2','mode_3','mode_4']].values:
            trip.loc[index,'trip_pathtype'] = 4
        else:
            trip.loc[index,'trip_pathtype'] = 3


# Attach skim values in a separate process 
# https://github.com/psrc/travel-modeling/blob/master/survey/survey_attach_skims/2017/attach_skim_values_2017_no_tours.py

In [None]:
trip.groupby('mode_1').count()

In [239]:
##############################
# Spatial Join Parcel ID for origin/dest
##############################

# NOTE: This takes a while to process and load the parcel data

import geopandas as gpd


# Load parcel shapefile
parcel_gdf = gpd.read_file(r'J:\Projects\UrbanSim\NEW_DIRECTORY\GIS\Shapefiles\Parcels\Region\2014\gapwork\prcl15_4k.shp')
parcel_gdf.crs = {'init' :'epsg:2285'}
parcel_gdf = parcel_gdf[['parcel_id','geometry']]
parcel_gdf = parcel_gdf.to_crs({'init': lat_lng_crs})

AttributeError: 'NoneType' object has no attribute 'is_empty'

In [None]:
# Spatial join!
lat_lng_crs = 'epsg:4326'

for trip_end in ['origin', 'dest']:
    # Filter for records with valid entries and create gdf
    lng_field = trip_end+'_lng'
    lat_field = trip_end+'_lat'
    gdf = gpd.GeoDataFrame(
        trip, geometry=gpd.points_from_xy(trip[lng_field], trip[lat_field]))
    gdf.crs = {'init' :lat_lng_crs}

    gdf_join = gpd.sjoin(gdf, parcel_gdf, how='left', op='intersects')
    parcel_field = trip_end[0]+'pcl'
    gdf_join[parcel_field] = gdf_join['parcel_id'].fillna(-1)
    gdf_join[parcel_field] = gdf_join[parcel_field].astype('int')
    trip = trip.merge(gdf_join[['recid',parcel_field]], on='recid', how='left')

In [256]:
trip['opcl'] = -1
trip['dpcl'] = -1

In [257]:
##############################
# Export to File
##############################

# For 

trip_cols = ['hhno','pno','tsvid','day','mode','opurp','dpurp','deptm',
             'otaz','dtaz','opcl','dpcl','oadtyp','dadtyp',
             'arrtm','trexpfac','travcost','travtime','travdist',
            'trip_pathtype']

trip = trip[-trip['mode'].isnull()]
trip = trip[-trip['opurp'].isnull()]
trip = trip[-trip['dpurp'].isnull()]
trip = trip[-trip['otaz'].isnull()]
trip = trip[-trip['dtaz'].isnull()]

# Write to file
trip = trip[trip_cols]
trip.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\trip17.csv', index=False)

In [84]:
# # Load the trip file with skim values attached
# trip = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\skims_attached\tripP17_w.dat')

# ### TEMP: Fix ME
# # Join with the day column - should remove this once skims are attached the file above
# _trip = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\trip17.csv')
# cols = [u'hhno', u'pno', u'mode', u'opurp', u'dpurp', u'deptm', u'otaz','oadtyp', 'dadtyp',
#        u'dtaz', u'arrtm', u'trexpfac']
# trip = trip.merge(_trip[cols+['day']], on=cols, how='inner')
# trip = trip.drop_duplicates()

# # Remove any people that haave Nan in day
# ####

# # Create a unique person record
# trip['personid'] = trip['hhno'].astype('str')+trip['pno'].astype('str')

# Tours

In [259]:
trip['personid'] = trip['hhno'].astype('int') + trip['pno'].astype('int')

tour_dict = {}
mylist = []
bad_trips = []
tour_id = 0

for personid in trip['personid'].value_counts().index.values:
# for personid in ['171000051','171317451']:

    person_df = trip.loc[trip['personid'] == personid]
    # Loop through each day
    for day in person_df['day'].unique():
        df = person_df.loc[person_df['day'] == day]
    
        # First trip record should be home (?)
        if df.groupby('personid').first()['opurp'].values[0] != 0:
            bad_trips.append(df['personid'].iloc[0])
            continue

        # identify home tours first, then check for work and other subtours 
        home_tours_start = df[df['opurp'] == 0]
        home_tours_end = df[df['dpurp'] == 0]

        # skip person if they have a different number of tour starts/ends at home
        if len(home_tours_start) != len(home_tours_end):
            bad_trips.append(df['personid'].iloc[0])
            continue

        # Loop through each set of home-based tours
        for set_index in range(len(home_tours_start)):

            tour_dict[tour_id] = {}       

            # start row for this set
            start_row_id = home_tours_start.index[set_index]
    #         print start_row
            end_row_id = home_tours_end.index[set_index]
    #         print '-----'
            # iterate between the start row id and the end row id to build the tour

            # Select slice of trips that correspond to a trip set
            _df = df.loc[start_row_id:end_row_id]

            #################################
            # Skip this trip set under certain conditions
            #################################

            if len(_df) == 0:
                continue

            # Trips with negative purposes
            if (_df['opurp'] < 0).any() or (_df['dpurp'] < 0).any():
                print('negative person :(' + str(_df['personid'].iloc[0]))
                bad_trips.append(df['personid'].iloc[0])
                continue

            # Trips with same opurp and dpurp that is home
            if len(_df[(_df['opurp'] == _df['dpurp']) & (_df['opurp'] == 0)]) > 0:
                bad_trips.append(df['personid'].iloc[0])
                continue

    #         # Trips that have different purposes in sequence
    #         if len (df[df.shift(-1)['opurp']!=df['dpurp']]) > 0:
    #             bad_trips.append(df['personid'].iloc[0])
    #             continue

            # First row
            tour_dict[tour_id]['tlvorig'] = _df.iloc[0]['deptm']
            tour_dict[tour_id]['tardest'] = _df.iloc[0]['arrtm']
            tour_dict[tour_id]['totaz'] = _df.iloc[0]['otaz']
            tour_dict[tour_id]['topcl'] = _df.iloc[0]['opcl']
            tour_dict[tour_id]['toadtyp'] = _df.iloc[0]['oadtyp']
            # NEED PARCEL DATA ON TRIP RECORDS!!!

            # Last row
            tour_dict[tour_id]['tlvdest'] = _df.iloc[-1]['deptm']
            tour_dict[tour_id]['tarorig'] = _df.iloc[-1]['arrtm']
            tour_dict[tour_id]['tdtaz'] = _df.iloc[-1]['dtaz']

            # Household and person info
            tour_dict[tour_id]['hhno'] = _df.iloc[0]['hhno']
            tour_dict[tour_id]['pno'] = _df.iloc[0]['pno']
            tour_dict[tour_id]['day'] = day

            # Identify primary purpose and figure out the tour halves
        #   ****ASSUMING primary tour is the activity that takes the longest amount of time

             # Determine if this is part of the first half tour or second half tour
            # calculate duration, as difference between arrival at a place and start of next trip
            _df['duration'] = _df.shift(-1).iloc[:-1]['deptm']-_df.iloc[:-1]['arrtm']

            if len(_df) > 3:
                mylist.append(_df['personid'].iloc[0])

            # For tour groups with only 2 trips, the halves are simply the first and second trips
            if len(_df) == 2:
                tour_dict[tour_id]['pdpurp'] = _df.iloc[0]['dpurp']
                tour_dict[tour_id]['tripsh1'] = 1
                tour_dict[tour_id]['tripsh2'] = 1
            # For tour groups with > 2 trips, calculate primary purpose and halves
            else:
                # Assuming that the primary purpose is the purpose for the trip to place with longest duration
                primary_purp_index = _df['duration'].argmax()
                tour_dict[tour_id]['pdpurp'] = _df.loc[_df['duration'].argmax()]['opurp']
                

                # Get the tour DTAZ as the DTAZ of the primary trip destination; also dest address type
                tour_dict[tour_id]['tdtaz'] = _df.loc[primary_purp_index]['dtaz']
                tour_dict[tour_id]['tdpcl'] = _df.loc[primary_purp_index]['dpcl']
                tour_dict[tour_id]['tdadtyp'] = _df.loc[_df['duration'].argmax()]['dadtyp']
                
                # Pathtype is defined by a heirarchy, where highest number is chosen first
                # Ferry > Commuter rail > Light Rail > Bus > Auto Network
                tour_dict[tour_id]['tpathtp'] = _df.loc[_df['duration'].argmax()]['trip_pathtype']
                
                # need destination parcel

                # Get number of trips in the first half tour
                tour_dict[tour_id]['tripsh1'] = len(_df.iloc[0:primary_purp_index+1])

                # trips in second half tour
                tour_dict[tour_id]['tripsh2'] = len(_df.iloc[primary_purp_index:])

                # look for subtours
                ##### FIX ME: #####
                # for now just set subtours as 0 - do not use this for tour estimation



            # Calculate number of subtours
            # trips that have the same origin/dest pairs before returning home

    #         print personid

            # Extract main mode type
            # use a heirarchy of modes used on the trip
            mode_list = _df['mode'].value_counts().index.astype('int').values
            mode_heirarchy = [3,4,5,6,9,2,1]
            for mode in mode_heirarchy:
                if mode in mode_list:
                    tour_dict[tour_id]['tmodetp'] = mode
                    break


            tour_id += 1
            


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 

negative person :(1730936183
negative person :(1729374117
negative person :(1730018498
negative person :(1730493399
negative person :(1731731357
negative person :(1731731356
negative person :(1729374119
negative person :(1729374120
negative person :(1731446941
negative person :(1730037589


In [260]:
# How to get subtours?
tour

Unnamed: 0,tlvorig,tardest,totaz,topcl,toadtyp,tlvdest,tarorig,tdtaz,hhno,pno,day,pdpurp,tripsh1,tripsh2,tmodetp,tdpcl,tdadtyp
0,450.0,455.0,184.0,1269054.0,1.0,455.0,460.0,184.0,17118701.0,1.711870e+09,4,8.0,1,1,1,,
1,465.0,480.0,184.0,1269054.0,1.0,780.0,810.0,1517.0,17118701.0,1.711870e+09,4,3.0,6,0,3,749153.0,4.0
2,810.0,815.0,184.0,1269054.0,1.0,820.0,825.0,184.0,17118701.0,1.711870e+09,4,8.0,1,1,1,,
3,1095.0,1110.0,184.0,1269054.0,1.0,1110.0,1120.0,184.0,17118701.0,1.711870e+09,4,3.0,1,1,3,,
4,1170.0,1185.0,184.0,1269054.0,1.0,1185.0,1200.0,184.0,17118701.0,1.711870e+09,4,8.0,1,1,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1896,510.0,530.0,1922.0,824146.0,1.0,930.0,960.0,1922.0,17142654.0,1.714265e+09,4,1.0,1,1,4,,
1897,405.0,435.0,721.0,800752.0,1.0,995.0,1020.0,721.0,17137682.0,1.713768e+09,4,2.0,1,1,6,,
1898,810.0,825.0,184.0,1269068.0,1.0,840.0,865.0,184.0,17126027.0,1.712603e+09,4,9.0,1,1,3,,
1899,1110.0,1130.0,1654.0,1125853.0,1.0,1170.0,1195.0,1654.0,17143729.0,1.714373e+09,4,5.0,1,1,5,,


In [None]:
# Some columns are not used but should be present in the estimation file

# To ADD:
# parent: parent tour ID, should be the ID of the tour except for subtours?
# subtours: need to calculate subtours
# pathtype

# 

for col in ['jtindex', 'parent', 'subtrs','tpathtp', 
            'tautotime', 'tautocost', 'tautodist', 
            'phtindx1', 'phtindx2', 'fhtindx1', 'fhtindx2']:
    result_df[col] = 0 
result_df['toexpfac'] = 1

tour.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\tour17.csv', index=False)

In [118]:
# After processing tours, need to add some of this info back to the Trip file

# tour: Tour ID
# half: tour half
# tseg: tour segment number within half tour
trip.columns

Index(['household_dim_id', 'person_dim_id', 'hh_group', 'sample_segment',
       'sample_county', 'final_cnty', 'cityof_redmond', 'cityofseattle',
       'psrc', 'finalhomehaddress',
       ...
       'deptm', 'mode', 'trexpfac', 'oadtyp', 'dadtyp', 'pwtaz', 'pstaz',
       'travcost', 'travtime', 'travdist'],
      dtype='object', length=303)

In [110]:
trip['dadtyp'].mean()

2.8957863054928517

In [83]:
# join empty columns
result_df.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\tour17.csv', index=False)

In [96]:
tour = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\tour17.csv')

In [97]:
# Origin/Destination Address Type
tour

Unnamed: 0,tdtaz,tlvdest,tripsh1,tmodetp,totaz,tardest,day,hhno,tlvorig,pdpurp,...,tdpcltpathtp,tautotime,tautocost,tautodist,phtindx1,phtindx2,fhtindx1,fhtindx2,toexpfac,tour
0,1881,1134,0,3,4,779,4,17131745,765,1,...,0,0,0,0,0,0,0,0,1,1
1,2392,1278,0,3,4,751,1,17131745,745,1,...,0,0,0,0,0,0,0,0,1,2
2,1586,761,0,3,1298,128,1,17144080,105,0,...,0,0,0,0,0,0,0,0,1,3
3,1586,723,0,3,1298,112,2,17144080,90,1,...,0,0,0,0,0,0,0,0,1,4
4,1586,728,0,3,1298,119,3,17144080,95,1,...,0,0,0,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7827,541,1180,0,6,71,370,3,17144118,335,1,...,0,0,0,0,0,0,0,0,1,7828
7828,503,960,0,3,141,535,3,17116364,495,1,...,0,0,0,0,0,0,0,0,1,7829
7829,1615,1080,0,3,1680,560,4,17114662,540,1,...,0,0,0,0,0,0,0,0,1,7830
7830,524,720,0,1,574,230,3,17139680,220,1,...,0,0,0,0,0,0,0,0,1,7831


# Person Day


In [55]:
#
# pday = person.copy()
tour = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\tour17.csv')
tour['person_id'] = tour['hhno'].astype('str') + tour['pno'].astype('str')
trip = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\trip17.csv')
trip['person_id'] = trip['hhno'].astype('str') + trip['pno'].astype('str')

pday_survey = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Public\2017-pr2-4-day.xlsx',
                           sheet_name='4-Day-v3', skiprows=1)
pday_survey['personid'] = pday_survey['hhid'].astype('str') + pday_survey['pernum'].astype('str')
pday_survey['telework_time'] = pday_survey['telework_time'].fillna(0)

In [56]:
# Work through each person's day using tour file
person['id'] = person['hhno'].astype('str') + person['pno'].astype('str')

In [57]:
pday = pd.DataFrame()
for person_rec in person['id'].unique():
    
    # get this person's tours
    _tour = tour[tour['person_id'] == person_rec]
    
    # Loop through each day
    for day in _tour['day'].unique():
        
        # from survey data
        
        _pday_survey = pday_survey[(pday_survey['personid'] == person_rec) & (pday_survey['dayofweek'] == day)]
        
        day_tour = _tour[_tour['day'] == day]
        
        prec_id = str(person_rec) + str(day)
        pday.loc[prec_id,'hhno'] = day_tour['hhno'].iloc[0]
        pday.loc[prec_id,'pno'] = day_tour['pno'].iloc[0]
        pday.loc[prec_id,'day'] = day
        
        # Begin/End at home-
        # need to get from first and last trips of tour days 
        pday.loc[prec_id,'beghom'] = 0
        pday.loc[prec_id,'endhom'] = 0
        _trip = trip[(trip['person_id'] == person_rec) & (trip['day'] == day)]
        if _trip.iloc[0]['opurp'] == 0:
            pday.loc[prec_id,'beghom'] = 1
        if _trip.iloc[-1]['dpurp'] == 0:
            pday.loc[prec_id,'endhom'] = 1
    
        # Home-based tours 
        
        # work subtours

        # Tours to usual workplace in a day
    
        # Number of tours by purpose
        purp_dict = {
            'uw': 1,    # tours to usual workplace
            'sc': 2,
            'es': 3,
            'pb': 4,
            'sh': 5,
            'ml': 6,
            'so': 7,
            're': 8,
            'me': 9
        }
        for purp_name, purp_val in purp_dict.items():
            # Number of tours
            pday.loc[prec_id,purp_name+'tours'] = len(day_tour[day_tour['pdpurp'] == purp_val])
        
            # Number of stops
            day_tour_purp = day_tour[day_tour['pdpurp'] == purp_val]
            pday.loc[prec_id,purp_name+'stops'] = day_tour_purp[['tripsh1','tripsh2']].sum().sum() - 2
        
        # Minutes worked at home
        pday.loc[prec_id,'wkathome'] = _pday_survey['telework_time'].values[0]

IndexError: single positional indexer is out-of-bounds

# Household Day

In [54]:
# hhno
# day
# day of week
# jttours
# ph tours
# fh tours
# hd exp fac
# These don't matter for our person-level model
# Load person day file to generate household day
personday = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\personday17.csv')

IOError: File R:\e2projects_two\SoundCastDocuments\2017Estimation\personday17.csv does not exist

In [53]:
hh

Unnamed: 0,hhno,hhsize,hhvehs,hhwkrs,hhftw,hhptw,hhret,hhoad,hhuni,hhhsc,hh515,hhcu5,hhincome,hownrent,hrestype,hhtaz,hhexpfac
0,17100005,2,2,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,87500,1,1,1709,24.441709
1,17100024,3,1,2,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,87500,1,2,557,26.224981
2,17100052,1,0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17500,2,1,426,25.692826
3,17100059,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125000,2,3,568,47.768728
4,17100060,1,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,17500,2,2,3347,278.147225
5,17100102,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87500,2,3,553,318.861344
6,17100108,2,1,2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,225000,2,3,458,24.899462
7,17100111,4,3,2,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,225000,1,1,184,167.400000
8,17100137,1,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,17500,1,3,574,142.036552
9,17100149,4,2,2,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,125000,1,3,1465,561.492027
