In [1]:
# Convert Household, Person, and Trip files to Daysim format for estimation and calibration

In [2]:
import pandas as pd
import geopandas as gpd

In [3]:
# Flexible column names, given that these may change in future surveys
hhno = 'hhid'
hownrent = 'rent_own'
hrestype = 'res_type'
hhincome = 'hhincome_detailed'
hhtaz = 'final_home_taz2010'
hhexpfac = 'hh_wt_revised'
hhwkrs = 'numworkers'
hhvehs = 'vehicle_count'
pno = 'pernum'

# Household

In [193]:
def total_persons_to_hh(hh, person, daysim_field, filter_field, 
                        filter_field_list, hhid_col=hhno, wt_col=hhexpfac):
    
    """Use person field to calculate total number of person in a household for a given field
    e.g., total number of full-time workers"""
    
    df = person[person[filter_field].isin(filter_field_list)]
    df = df.groupby(hhid_col).count().reset_index()[[wt_col,hhid_col]]
    df.rename(columns={wt_col: daysim_field}, inplace=True)
    
    # Join to households
    hh = pd.merge(hh, df, how='left', on=hhid_col)
    hh[daysim_field].fillna(0, inplace=True)
    
    return hh

In [194]:
# lookup maps for various fields
hownrent_map = {1:1, # Own: own
                2:2, # Rent: rent
                3:3, # provided by job/military: other
                4:3, # other: other
                5:3} # prefer not to answer: other

hhrestype_map = {1:1, # SFH: SFH
                 2:2, # Townhouse (attached house): duplex/triplex/rowhouse
                 3:2, # Building with 3 or fewer apartments/condos: duplex/triplex/rowhouse
                 4:3, # Building with 4 or more apartments/condos: apartment/condo
                 5:4, # Mobile home/trailer: Mobile home/trailer
                 6:5, # Dorm or institutional housing: Dorm room/rented room
                 7:6, # other: other
                   }

# Use the midpoint of the ranges provided since DaySim uses actual values
income_map = {
    1: 5000,
    2: 17500,
    3: 30000,
    4: 42500,
    5: 62500,
    6: 87500,
    7: 125000,
    8: 175000,
    9: 225000,
    10: 250000,
    11: -1
}

In [195]:
hh = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-1-household.xlsx',
                         skiprows=1)
person = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-2-person.xlsx',
                      skiprows=1)

In [196]:
# Do some up-front data prep
# This may be different for new data sets
# Identify high school students based on their school name
# This will not include all students, but we can start with these students
person['high_school'] = 0
person['school_loc_name'].fillna(' ', inplace=True)
person.ix[(person['school_loc_name'].str.contains("High", na=False)) &
          (person['schooltype'].isin([3,4])), "high_school"] = 1

# Students not in this group will be assumed as high school students
# if they're in age group 16-17, and 18-24 and are in K12 (public or private) 3, 4
# This is probably excluding some in the 12-15 year group, should try to sort this out better in the future
person.ix[(person['high_school'] != 0) & 
          (person['age'].isin([4,5])) &
          (person['schooltype'].isin([3,4])), 'high_school'] = 1

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [197]:
# Full-time workers
hh = total_persons_to_hh(hh, person, daysim_field='hhftw', filter_field='employment', filter_field_list=[1])

# Part-time workers
hh = total_persons_to_hh(hh, person, daysim_field='hhptw', filter_field='employment', filter_field_list=[2])

# Retirees
hh = total_persons_to_hh(hh, person, daysim_field='hhret', filter_field='employment', filter_field_list=[6])

# Other Adults
hh = total_persons_to_hh(hh, person, daysim_field='hhoad', filter_field='employment', filter_field_list=[3,4,5,7])

# University Students
hh = total_persons_to_hh(hh, person, daysim_field='hhuni', filter_field='schooltype', filter_field_list=[6])

# High school students
hh = total_persons_to_hh(hh, person, daysim_field='hhhsc', filter_field='high_school', filter_field_list=[1])

# k12 age 5-15
age5_12 = person[person['age'].isin([2,3])]
hh = total_persons_to_hh(hh, age5_12, daysim_field='hh515', filter_field='schooltype', filter_field_list=[3,4])

# age under 5
hh = total_persons_to_hh(hh, person, daysim_field='hhcu5', filter_field='age', filter_field_list=[1])

hh['hownrent'] = hh[hownrent].map(hownrent_map) 
hh['hrestype'] = hh[hrestype].map(hhrestype_map) 
hh['hhincome'] = hh[hhincome].map(income_map) 
hh['hhtaz'] = hh[hhtaz]
hh['hhexpfac'] = hh[hhexpfac]
hh['hhwkrs'] = hh[hhwkrs]
hh['hhno'] = hh[hhno]
hh['hhvehs'] = hh[hhvehs]

In [175]:
# Need the parcel ID as well!
# Use geopandas to find nearest parcel node?
import geopandas as gpd

In [198]:
# daysim_fields = ['hhno','hhsize','hhvehs','hhwkrs','hhftw','hhptw','hhret','hhoad','hhuni','hhhsc','hh515',
#                  'hhcu5','hhincome','hownrent','hrestype','hhparcel','hhtaz','hhexpfac','samptype']
# Without parcel field
daysim_fields = ['hhno','hhsize','hhvehs','hhwkrs','hhftw','hhptw','hhret','hhoad','hhuni','hhhsc','hh515',
                 'hhcu5','hhincome','hownrent','hrestype','hhtaz','hhexpfac']

hh[daysim_fields].to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\household17.csv',index=False)

In [201]:
# hh[daysim_fields]

# Person

In [3]:
# Reload to start with fresh data
person = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-2-person.xlsx',
                      skiprows=1)

In [145]:
# person['pernum']

In [128]:
# Reload to start with fresh data
person = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Restricted\In-house\2017-internal-v2-R-2-person.xlsx',
                      skiprows=1)

# Person Type

# Full time worker
person.ix[person['employment'] == 1, 'pptyp'] = 1

# Part-time worker
person.ix[person['employment'] == 2, 'pptyp'] = 2

# Non-working adult age 65+
person.ix[(person['employment'] != 1) &  (person['age'].isin([10,11,12])), 'pptyp'] = 3

# High school student age 16+
person.ix[(person['age'] >= 4) & (person['schooltype'].isin([3,4,5])), 'pptyp'] = 6

# university student (full-time)
person.ix[(person['schooltype'].isin([6,7])) & (person['student'] == 3), 'pptyp'] = 5

# Child age 5-15
person.ix[person['schooltype'].isin([2,3]), 'pptyp'] = 7

# child under 5
person.ix[person['schooltype'].isin([1]), 'pptyp'] = 8

# Non-working adult age 65 should accoutn for all others
person.ix[person['pptyp'].isnull(), 'pptyp'] = 4

# Person worker type
person.ix[person['employment'].isin([1]), 'pwtyp'] = 1
person.ix[person['employment'].isin([2]), 'pwtyp'] = 2
person.ix[person['employment'].isin([3,4,5,6,7]), 'pwtyp'] = 0
person['pwtyp'].fillna(0,inplace=True)
person['pwtyp'] = person['pwtyp'].astype('int')

# Transit pass availability
# Care about people that have subsidized/free passes
# people that perceive transit cost as 0
person['ptpass'] = 0
person.ix[(person['tran_pass_12'].isin([1,2])) | (person['benefits_3'].isin([2,3])),'ptpass'] = 1

# Paid parking at work (any subsidization counts as 'paid')
person['ppaidprk'] = 0
person.ix[person['workpass'].isin([3,4]), 'ppaidprk'] = 1

# Take median age
age_map = {
    1: 2,
    2: 8,
    3: 14,
    4: 17,
    5: 21,
    6: 30,
    7: 40,
    8: 50,
    9: 60,
    10: 70,
    11: 80,
    12: 85
}

gender_map = {
    1: 1,    # male: male
    2: 2,    # female: female
    3: 9,    # another: missing
    4: 9     # prefer not to answer: missing
}

pstyp_map = {
    1: 0,
    2: 1,
    3: 2
}

hownrent_map = {
    1: 1,
    2: 2,
    3: 3,
    4: 3,
    4: 3
}

person['age'] = person['age'].astype('int')
person['pagey'] = person['age'].map(age_map)
person['pgend'] = person['gender'].map(gender_map)
person['pstyp'] = person['student'].map(pstyp_map)
person['pstyp'].fillna(0,inplace=True)
person['hhno'] = person['hhid']
person['pno'] = person['pernum']
person['psexpfac'] = person['hh_wt_revised']
person['pwtaz'] = -1
person['pstaz'] = -1

# Need:
# pwpcl
# pwtaz
# pwautime
# pwaudist
# pspcl
# psautime
# psaudist
# puwmode
# puwarrp
# puwdepp

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for

In [129]:
def get_zone_from_lat_lng(df, taz_gdf, zone_gdf_crs, loc_field, lat_field, lng_field, point_field):
    
    """ Joins lat long field to polygon file. 
        Requires specification of CRS of polyline file
        Assumes standard coordinate reference system of 4326 in lat long fields
    """
    lat_lng_crs = 'epsg:4326'
    
    # Filter for records with valid entries and create gdf
    _df = df[-df[loc_field].isnull()]
    _df = person[-person[lat_field].isnull()]
    
    gdf = gpd.GeoDataFrame(
        _df, geometry=gpd.points_from_xy(_df[lng_field], _df[lat_field]))
    gdf.crs = {'init' :lat_lng_crs}
    
    # Align CRS from zone file, convert to lat/long format
    taz_gdf.crs = {'init' :'epsg:'+str(zone_gdf_crs)}
    taz_gdf = taz_gdf.to_crs({'init': lat_lng_crs})
    
    # Spatial join between point and polyline
    result_df = gpd.sjoin(gdf, taz_gdf, how='left', op='intersects')
    
    # Update location field
    result_df[loc_field] = result_df[point_field]
    # Drop locations outside of the region
    result_df = result_df[-result_df[loc_field].isnull()]
    result_df[loc_field] = result_df[loc_field].astype('int')
    
    # Join back to original df
    df = df.drop(loc_field,axis=1)
    df = df.merge(result_df[['hhno','pno',loc_field]], on=['hhno','pno'], how='left')
    df[loc_field] = df[loc_field].fillna(-1)
    
    return df

In [None]:
taz_gdf = gpd.read_file(r'W:\geodata\forecast\taz2010nowater.shp')

In [132]:
person = get_zone_from_lat_lng(person, taz_gdf, zone_gdf_crs='2926', loc_field='pwtaz', 
                             lat_field='work_lat', lng_field='work_lng', point_field='TAZ')

person = get_zone_from_lat_lng(person, taz_gdf, zone_gdf_crs='2926', loc_field='pstaz', 
                             lat_field='school_loc_lat', lng_field='school_loc_lng', point_field='TAZ')

In [133]:

daysim_cols = ['hhno', 'pno', 'pptyp', 'pagey', 'pgend', 'pwtyp', 'pwpcl', 'pwtaz', 'pwautime',
               'pwaudist', 'pstyp', 'pspcl', 'pstaz', 'psautime', 'psaudist', 'puwmode', 'puwarrp', 
               'puwdepp', 'ptpass', 'ppaidprk', 'pdiary', 'pproxy', 'psexpfac']

# Add empty columns to fill in later with skims
for col in daysim_cols:
    if col not in person.columns:
        person[col] = -1
        
person = person[daysim_cols]

person.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\person17.csv',index=False)

# Trips

In [125]:
from sqlalchemy import create_engine
import pyodbc

conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=AWS-PROD-SQL\Coho;DATABASE=Elmer;Trusted_Connection=yes')
# trip = pd.read_sql(sql='select * from HHSurvey.vTrip2017', con=conn)

trip = pd.read_excel(r'\\aws-prod-file01\datateam\Projects\Surveys\HHTravel\Survey2017\Data\Dataset_2 August 2017\Trips\5-Trip_rMove-v10-LINKED.xlsx',
             sheetname='5-Trip-rMove')

trip['hhno'] = trip['HHID']
trip['pno'] = trip['PerNum']
trip['day'] = trip['DayNum'].astype(int)
# Need: 
# tour
# half
# tseg
trip['tsvid'] = trip['TripNum']

# Select only weekday trips (Should we also include Friday?)
trip = trip[trip['Dayofweek'].isin(['Monday','Tuesday','Wednesday','Thursday'])]

# Recode purposes
day_map = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4
}

purpose_map = {
    1: 0, # home
    6: 2, # school
    9: 3, # escort
    10: 1, # work
    11: 1, # work-related
    14: 1, # work-related
    30: 5, # grocery -> shop
    32: 5, # other shopping -> shop
    33: 4,
    34: 9, # medical
    50: 6, # restaurant -> meal
    51: 8, 
    52: 7,
    53: 8,
    54: 7, # religious/community/volunteer -> social
    56: 7, # family activity -> social
    60: 10, # change mode
    61: 4,
    62: 7, # other social
    97: -1 # other
}

dorp_map = {
    1: 1,
    2: 2,
    3: 9
}

# Survey DB is formatted with string values, need to translate again with above dict
df_purp_lookup = pd.read_sql(sql='select * from HHSurvey.DataExplorerValues2017 where VariableID = 125', con=conn)
new_purp_map = {}
for val in df_purp_lookup['ValueOrder'].unique():
    text = df_purp_lookup.loc[df_purp_lookup['ValueOrder'] == val,'ValueText'].values[0]
    new_purp_map[text] = purpose_map[val]

trip['day'] = trip['Dayofweek'].map(day_map)

trip['opurp'] = trip['OriginPurpose'].map(new_purp_map)
trip['dpurp'] = trip['DestPurpose'].map(new_purp_map)

trip['dorp'] = trip['DestPurpose'].map(new_purp_map)

# origin and destination TAZs
trip['otaz'] = trip['OTaz2010']
trip['dtaz'] = trip['DTaz2010']

##############################
# Start and end time
##############################
# Filter out rows with None
trip = trip[-trip['DepartTimeTimestamp'].isnull()]
trip = trip[-trip['ArrivalTimeTimestamp'].isnull()]

# Minutes
for db_col_name, daysim_col_name in {'ArrivalTimeTimestamp': 'arrtm', 'DepartTimeTimestamp': 'deptm'}.iteritems():
    # Filter rows without valid depart and start times
    trip = trip[-trip[db_col_name].isnull()]
    
    # Get minutes from time stamp, as values to right of :
    minutes = trip[db_col_name].apply(lambda row: str(row).split(':')[-1])
    minutes = minutes.apply(lambda row: row.split('.')[0]).astype('int') # Trim any decimal places and takes whole numbers
    
    # Get hours from time stamp
    hours = trip[db_col_name].apply(lambda row: str(row).split(' ')[-1].split(':')[0]).astype('int')
    
    # In minutes after midnight****
    trip[daysim_col_name] = hours*60 + minutes
    
#     # Convert to minutes after 3 AM; sum of minutes minus 3*60 
#     trip[daysim_col_name] = (hours*60)+minutes - 180
    
    # This gives negative values to numbers starting between midnight and 3 am, move these to *after* midnight
    # 60*24 (1440) - <negative value>
#     trip.loc[trip[daysim_col_name] < 0,daysim_col_name] = 1440 + trip.loc[trip[daysim_col_name] < 0,daysim_col_name]

##############################
# Mode
##############################
trip['mode'] = trip['MainMode'].copy()
# Get HOV2/HOV3 based on total number of travelers
trip.loc[trip['mode'] == 'HOV','mode'] = 'HOV2'
trip.loc[(trip['TravelersTotal'] > 2) & (trip['MainMode'] == 'HOV'),'mode'] = 'HOV3+'

trip.loc[trip['Mode1'] == 'Other hired service (e.g., Lyft, Uber)','mode'] = 'TNC'

# Lookup values
mode_dict = {
    'Walk': 1,
    'Bike': 2,
    'SOV': 3,
    'HOV2': 4,
    'HOV3+': 5,
    'Transit': 6,
    'TNC': 9,
    'Other': 10
}

trip['mode'] = trip['mode'].map(mode_dict)

trip['trexpfac'] = trip['TripWtFinal']

trip['travcost'] = -1
trip['travtime'] = -1
trip['travdist'] = -1

trip_cols = ['hhno','pno','day','mode','opurp','dpurp','deptm','otaz','dtaz','arrtm','trexpfac','travcost','travtime','travdist']

trip = trip[-trip['mode'].isnull()]
trip = trip[-trip['opurp'].isnull()]
trip = trip[-trip['dpurp'].isnull()]
trip = trip[-trip['otaz'].isnull()]
trip = trip[-trip['dtaz'].isnull()]

# Write to file
trip = trip[trip_cols]
trip.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\trip17.csv', index=False)

# Attach skim values in a separate process 
# https://github.com/psrc/travel-modeling/blob/master/survey/survey_attach_skims/2017/attach_skim_values_2017_no_tours.py

  **kwds)


KeyError: 'HHID'

# Tours
Generate a tour file from the trips

In [46]:
# Load the trip file with skim values attached
trip = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\skims_attached\tripP17_w.dat')

### TEMP: Fix ME
# Join with the day column - should remove this once skims are attached the file above
_trip = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\trip17.csv')
cols = [u'hhno', u'pno', u'mode', u'opurp', u'dpurp', u'deptm', u'otaz',
       u'dtaz', u'arrtm', u'trexpfac']
trip = trip.merge(_trip[cols+['day']], on=cols, how='inner')
trip = trip.drop_duplicates()

# Remove any people that haave Nan in day
####

# Create a unique person record
trip['personid'] = trip['hhno'].astype('str')+trip['pno'].astype('str')

In [53]:
tour_dict = {}
mylist = []
bad_trips = []
tour_id = 0

for personid in trip['personid'].value_counts().index.values:
# for personid in ['171000051','171317451']:

    person_df = trip.loc[trip['personid'] == personid]
    # Loop through each day
    for day in person_df['day'].unique():
        df = person_df.loc[person_df['day'] == day]
    
        # First trip record should be home (?)
        if df.groupby('personid').first()['opurp'].values[0] != 0:
            bad_trips.append(df['personid'].iloc[0])
            continue

        # identify home tours first, then check for work and other subtours 
        home_tours_start = df[df['opurp'] == 0]
        home_tours_end = df[df['dpurp'] == 0]

        # skip person if they have a different number of tour starts/ends at home
        if len(home_tours_start) != len(home_tours_end):
            bad_trips.append(df['personid'].iloc[0])
            continue

        # Loop through each set of home-based tours
        for set_index in xrange(len(home_tours_start)):

            tour_dict[tour_id] = {}       

            # start row for this set
            start_row_id = home_tours_start.index[set_index]
    #         print start_row
            end_row_id = home_tours_end.index[set_index]
    #         print '-----'
            # iterate between the start row id and the end row id to build the tour

            # Select slice of trips that correspond to a trip set
            _df = df.loc[start_row_id:end_row_id]

            #################################
            # Skip this trip set under certain conditions
            #################################

            if len(_df) == 0:
                continue

            # Trips with negative purposes
            if (_df['opurp'] < 0).any() or (_df['dpurp'] < 0).any():
                print 'negative person :(' + str(_df['personid'].iloc[0])
                bad_trips.append(df['personid'].iloc[0])
                continue

            # Trips with same opurp and dpurp that is home
            if len(_df[(_df['opurp'] == _df['dpurp']) & (_df['opurp'] == 0)]) > 0:
                bad_trips.append(df['personid'].iloc[0])
                continue

    #         # Trips that have different purposes in sequence
    #         if len (df[df.shift(-1)['opurp']!=df['dpurp']]) > 0:
    #             bad_trips.append(df['personid'].iloc[0])
    #             continue

            # First row
            tour_dict[tour_id]['tlvorig'] = _df.iloc[0]['deptm']
            tour_dict[tour_id]['tardest'] = _df.iloc[0]['arrtm']
            tour_dict[tour_id]['totaz'] = _df.iloc[0]['otaz']
            # NEED PARCEL DATA ON TRIP RECORDS!!!

            # Last row
            tour_dict[tour_id]['tlvdest'] = _df.iloc[-1]['deptm']
            tour_dict[tour_id]['tarorig'] = _df.iloc[-1]['arrtm']
            tour_dict[tour_id]['tdtaz'] = _df.iloc[-1]['otaz']

            # Household and person info
            tour_dict[tour_id]['hhno'] = _df.iloc[0]['hhno']
            tour_dict[tour_id]['pno'] = _df.iloc[0]['pno']
            tour_dict[tour_id]['day'] = day

            # Identify primary purpose and figure out the tour halves
        #   ****ASSUMING primary tour is the activity that takes the longest amount of time

             # Determine if this is part of the first half tour or second half tour
            # calculate duration, as difference between arrival at a place and start of next trip
            _df['duration'] = _df.shift(-1).iloc[:-1]['deptm']-_df.iloc[:-1]['arrtm']

            if len(_df) > 3:
                mylist.append(_df['personid'].iloc[0])

            # For tour groups with only 2 trips, the halves are simply the first and second trips
            if len(_df) == 2:
                tour_dict[tour_id]['pdpurp'] = _df.iloc[0]['dpurp']
                tour_dict[tour_id]['tripsh1'] = 1
                tour_dict[tour_id]['tripsh2'] = 1
            # For tour groups with > 2 trips, calculate primary purpose and halves
            else:
                # Assuming that the primary purpose is the purpose for the trip to place with longest duration
                primary_purp_index = _df['duration'].argmax()
                tour_dict[tour_id]['pdpurp'] = _df.loc[_df['duration'].argmax()]['opurp']

                # Get the tour DTAZ as the DTAZ of the primary trip destination
                tour_dict[tour_id]['tdtaz'] = _df.loc[primary_purp_index]['dtaz']
                # destination parcel

                # Get number of trips in the first half tour
                tour_dict[tour_id]['tripsh1'] = len(_df.iloc[0:primary_purp_index+1])

                # trips in second half tour
                tour_dict[tour_id]['tripsh2'] = len(_df.iloc[primary_purp_index:])

                # look for subtours
                ##### FIX ME: #####
                # for now just set subtours as 0 - do not use this for tour estimation



            # Calculate number of subtours
            # trips that have the same origin/dest pairs before returning home

    #         print personid

            # Extract main mode type
            # use a heirarchy of modes used on the trip
            mode_list = _df['mode'].value_counts().index.astype('int').values
            mode_heirarchy = [3,4,5,6,9,2,1]
            for mode in mode_heirarchy:
                if mode in mode_list:
                    tour_dict[tour_id]['tmodetp'] = mode
                    break


            tour_id += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.


negative person :(171440801
negative person :(171440801
negative person :(171244111
negative person :(171244111
negative person :(171431541
negative person :(171150922
negative person :(171423971
negative person :(171091091
negative person :(171324671
negative person :(171324671
negative person :(171373821
negative person :(171035871
negative person :(171035871
negative person :(171035871
negative person :(171298461
negative person :(171298461
negative person :(171504073
negative person :(171003031
negative person :(171003031
negative person :(171504072
negative person :(171533672
negative person :(171328432
negative person :(171328432
negative person :(171328432
negative person :(171396421
negative person :(171372521
negative person :(171310881
negative person :(171052211
negative person :(171085952
negative person :(171085952
negative person :(171085952
negative person :(171085952
negative person :(171085952
negative person :(171488861
negative person :(171120241
negative person :(17

In [None]:
# trip
# df = trip.loc[(trip['personid'] == '171027872') & (trip['day']==2)]
# [i for i in trip['personid'].unique()]
# df

# # Identify subtours
# # Subtours begin with a trip from work to anywhere but home
# df[(df['opurp'] == 1) & (df['dpurp'] != 0)]
# # It's a work subtour if any of the subsequent trips return to work without returning home
# # loop thorugh the next trips
# for index, row in df[(df['opurp'] == 1) & (df['dpurp'] != 0)].iterrows():
#     if row['dpurp'] == 0:
#         break
#     if row['dpurp'] == 1:
#         end_trip = row
# begin_trip = df[(df['opurp'] == 1) & (df['dpurp'] != 0)].iloc[0]

In [96]:
end_trip

hhno         17102787
pno                 2
mode                9
opurp               1
dpurp               1
deptm            1120
otaz               47
dtaz              313
arrtm            1135
trexpfac      5.51514
id               3107
travcost            0
travdist         4.41
travtime        14.91
day                 2
personid    171027872
Name: 43093, dtype: object

In [54]:
result_df = pd.DataFrame.from_dict(tour_dict, orient='index')
result_df['personid'] = result_df['hhno'].astype('str')+result_df['pno'].astype('str')

In [69]:
# Read old tour file to get list of columns
_tour = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2014Estimation\New_06_20_16\inputs\tourP14.dat', delim_whitespace=True)
_tour.columns

Index([u'hhno', u'pno', u'day', u'tour', u'jtindex', u'parent', u'subtrs',
       u'pdpurp', u'tlvorig', u'tardest', u'tlvdest', u'tarorig', u'toadtyp',
       u'tdadtyp', u'topcl', u'totaz', u'tdpcl', u'tdtaz', u'tmodetp',
       u'tpathtp', u'tautotime', u'tautocost', u'tautodist', u'tripsh1',
       u'tripsh2', u'phtindx1', u'phtindx2', u'fhtindx1', u'fhtindx2',
       u'toexpfac'],
      dtype='object')

In [70]:
result_df.columns

Index([u'tdtaz', u'tlvdest', u'tripsh1', u'tmodetp', u'totaz', u'tardest',
       u'day', u'hhno', u'tlvorig', u'pdpurp', u'pno', u'tarorig', u'tripsh2',
       u'personid'],
      dtype='object')

In [81]:
# Need to fille these in
for col in [u'jtindex', u'parent', u'subtrs','toadtyp', u'tdadtyp', u'topcl',u'tdpcl'u'tpathtp', 
            u'tautotime', u'tautocost', u'tautodist',u'tripsh1',
       u'tripsh2', u'phtindx1', u'phtindx2', u'fhtindx1', u'fhtindx2']:
    result_df[col] = 0 
result_df['toexpfac'] = 1

In [82]:
result_df.head()

Unnamed: 0,tdtaz,tlvdest,tripsh1,tmodetp,totaz,tardest,day,hhno,tlvorig,pdpurp,...,topcl,tdpcltpathtp,tautotime,tautocost,tautodist,phtindx1,phtindx2,fhtindx1,fhtindx2,toexpfac
0,1881.0,1134,0,3,4.0,779,4,17131745,765,1.0,...,0,0,0,0,0,0,0,0,0,1
1,2392.0,1278,0,3,4.0,751,1,17131745,745,1.0,...,0,0,0,0,0,0,0,0,0,1
2,1586.0,761,0,3,1298.0,128,1,17144080,105,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1586.0,723,0,3,1298.0,112,2,17144080,90,1.0,...,0,0,0,0,0,0,0,0,0,1
4,1586.0,728,0,3,1298.0,119,3,17144080,95,1.0,...,0,0,0,0,0,0,0,0,0,1


In [83]:
# join empty columns
result_df.to_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\tour17.csv', index=False)

In [84]:
tour = result_df.copy()

# Person Day


In [252]:
#
# pday = person.copy()
tour = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\tour17.csv')
tour['person_id'] = tour['hhno'].astype('str') + tour['pno'].astype('str')
trip = pd.read_csv(r'R:\e2projects_two\SoundCastDocuments\2017Estimation\trip17.csv')
trip['person_id'] = trip['hhno'].astype('str') + trip['pno'].astype('str')

pday_survey = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Export\Version 2\Public\2017-pr2-4-day.xlsx',
                           sheet_name='4-Day-v3', skiprows=1)
pday_survey['personid'] = pday_survey['hhid'].astype('str') + pday_survey['pernum'].astype('str')
pday_survey['telework_time'] = pday_survey['telework_time'].fillna(0)

In [254]:
# Columns
# hhno
# pno

# Work through each person's day using tour file
# day
person['id'] = person['hhno'].astype('str') + person['pno'].astype('str')

In [None]:
pday = pd.DataFrame()
for person_rec in person['id'].unique():
    
    # get this person's tours
    _tour = tour[tour['person_id'] == person_rec]
    
    # Loop through each day
    for day in _tour['day'].unique():
        
        # from survey data
        
        _pday_survey = pday_survey[(pday_survey['personid'] == person_rec) & (pday_survey['dayofweek'] == day)]
        
        day_tour = _tour[_tour['day'] == day]
        
        prec_id = str(person_rec) + str(day)
        pday.loc[prec_id,'hhno'] = day_tour['hhno'].iloc[0]
        pday.loc[prec_id,'pno'] = day_tour['pno'].iloc[0]
        pday.loc[prec_id,'day'] = day
        
        # Begin/End at home-
        # need to get from first and last trips of tour days 
        pday.loc[prec_id,'beghom'] = 0
        pday.loc[prec_id,'endhom'] = 0
        _trip = trip[(trip['person_id'] == person_rec) & (trip['day'] == day)]
        if _trip.iloc[0]['opurp'] == 0:
            pday.loc[prec_id,'beghom'] = 1
        if _trip.iloc[-1]['dpurp'] == 0:
            pday.loc[prec_id,'endhom'] = 1
            
        # Home-based tours
        # Work based tours
        # Tours to usual workplace in a day
    
        # Number of tours by purpose
        purp_dict = {
            'wk': 1,
            'sc': 2,
            'es': 3,
            'pb': 4,
            'sh': 5,
            'ml': 6,
            'so': 7,
            're': 8,
            'me': 9
        }
        for purp_name, purp_val in purp_dict.items():
            # Number of tours
            pday.loc[prec_id,purp_name+'tours'] = len(day_tour[day_tour['pdpurp'] == purp_val])
        
            # Numbefr of stops
            day_tour_purp = day_tour[day_tour['pdpurp'] == purp_val]
            pday.loc[prec_id,purp_name+'stops'] = day_tour_purp[['tripsh1','tripsh2']].sum().sum() - 2

        pday.loc[prec_id,'wkathome'] = _pday_survey['telework_time'].values[0]

In [271]:
pday

Unnamed: 0,hhno,pno,day,beghom,endhom,wktours,wkstops,sctours,scstops,estours,...,shstops,mltours,mlstops,sotours,sostops,retours,restops,metours,mestops,wkathome
1710000514,17100005.0,1.0,4.0,1.0,1.0,0.0,-2.0,0.0,-2.0,0.0,...,-2.0,0.0,-2.0,1.0,-2.0,0.0,-2.0,0.0,-2.0,60.0


In [269]:
_pday_survey['telework_time']

0    60.0
Name: telework_time, dtype: float64

# Household Day

In [None]:
# hhno
# day
# day of week
# jttours
# ph tours
# fh tours
# hd exp fac
# These don't matter for our person-level model