In [1]:
# Combine separate walk-access/egress and transit trips into single transit trip

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the '5-Trip-rMove' worksheet of the survey trip workbook into a DataFrame.
# NOTE(review): `sheetname` is the keyword spelling of older pandas releases; later releases
# renamed it to `sheet_name` -- confirm against the installed pandas before upgrading.
trip = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Dataset_2 August 2017\Trips\5-Trip_rMove-v4.xlsx',
             sheetname='5-Trip-rMove')

In [5]:
# Overall approach (implemented in the cells below):
# First find the people whose mode changes at all between trips
# Loop through each person in the survey

# Ignore trips that are drop-off/pick-up. These might indicate a mode change (SOV to HOV 2 or 3+) but we
# don't want them to be linked. 

# Max size of trip sets (don't try to link more than 4 trips).
# Applied later by counting the '-' separators in each set's combined mode string: a set is linked
# automatically only when that count is below this value.
trip_set_max = 4

In [6]:
def unique_ordered_list(seq):
    """Return the distinct items of ``seq`` as a list, in order of first appearance.

    Items must be hashable because membership is tracked with a set.
    """
    already_seen = set()
    ordered = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        ordered.append(item)
    return ordered

In [7]:
# # Some trips have multiple modes listed - these need to be considered separately
# single_mode_trips = trip[trip['mode_2'].isnull()]
# multi_mode_trips = trip[-trip['mode_2'].isnull()]

In [8]:
# # Work with single mode trips first
# df = multi_mode_trips

# Distinct person IDs, read off the (sorted) group keys of a per-person grouping
uniquePersonIDs = trip.groupby('personid').size().index

In [9]:
# Sort the trip table by trip ID. The linking loop compares each row of a person-day with the
# row that follows it, so row order matters (presumably trip IDs increase chronologically --
# confirm against the survey documentation).
# `DataFrame.sort` was deprecated in pandas 0.17 and removed in later releases; `sort_values`
# is the supported spelling and orders the rows the same way.
trip.sort_values('tripid', inplace=True)

In [10]:
# Placeholder only: `daynum` is rebound as the per-day loop variable in the linking cell below.
daynum = 1

In [11]:
# Peek at the first person ID (a one-element list, handy for trying the linking loop on a single person)
[uniquePersonIDs[0]]

[1710005901]

In [12]:
# Flag pairs of consecutive trips that look like pieces of one journey ("unlinked" trips).
#
# Result: `problem_trips`, a list of [linked_flag, tripid] pairs. `linked_flag` is a string built from
# the zero-padded person-day counter (5 digits) followed by the zero-padded set number within that
# person-day (2 digits); trips that share a flag form one candidate set. A trip that links to both
# its predecessor and its successor is appended twice.
problem_trips = []
person_counter = 0   # despite the name, this advances once per person-day (see end of loop)

# One pass over the table: groupby yields the person-days in sorted (personid, daynum) order and
# keeps each group's rows in their existing order. That is the same visiting order as filtering
# the whole table once per person and once more per day, without re-scanning it every time.
for (person, daynum), trip_subsample in trip.groupby(['personid', 'daynum']):
    flag = 1   # set number within this person-day

    # Potential logic check: remove all persons that only drive for all trips
    # Consider only transit/walk/bike trips for linking?

    # Compare every trip of the day with the trip that follows it
    for row in range(len(trip_subsample) - 1):
        person_trip = trip_subsample.iloc[row]
        next_pers_trip = trip_subsample.iloc[row + 1]

        mode_now = person_trip['mode_1']
        mode_next = next_pers_trip['mode_1']
        purp_now = person_trip['d_purp']

        # A mode change (same mode is more likely the return leg of a tour),
        # or a bus-to-bus (23) / train-to-train (52) transfer
        mode_transfer = (mode_now != mode_next
                         or mode_now == mode_next == 23
                         or mode_now == mode_next == 52)
        # Activity at the end of the current trip lasts 15 minutes or less
        short_stop = person_trip['a_dur'] <= 15
        # Both trips share a purpose, or the current one is coded 60 ("mode change")
        same_purpose = (purp_now == next_pers_trip['d_purp'] or purp_now == 60)
        # Purpose 9 is drop-off/pick-up: may look like a mode change (SOV to HOV) but must not be linked
        not_dropoff = purp_now != 9
        # Purpose 15 followed by anything but a plane (31) is accepted on its own.
        # Python's `and` binds tighter than `or`, so this clause always acted as an independent
        # alternative that skips the duration/purpose/drop-off tests; the grouping below spells
        # that out without changing the result.
        # NOTE(review): confirm that bypass is intended and what purpose code 15 denotes.
        purp15_not_plane = purp_now == 15 and mode_next != 31

        if (mode_transfer and short_stop and same_purpose and not_dropoff) or purp15_not_plane:
            # If this looks unlinked, flag both trips of the pair with the current set ID
            linked_id = "%05d%02d" % (person_counter, flag)
            problem_trips.append([linked_id, person_trip['tripid']])
            problem_trips.append([linked_id, next_pers_trip['tripid']])

        # Start a new set after a long stop (over 30 minutes) or on reaching HOME/WORK - that is
        # probably a separate trip. Checked for every pair, linked or not, so set numbers are
        # unique within the person-day but not necessarily sequential.
        if next_pers_trip['a_dur'] > 30 or next_pers_trip['dest_name'] in ('HOME', 'WORK'):
            flag += 1

    person_counter += 1

In [13]:
# Attach the set IDs to the full trip table, matching on the `tripid` column.
problem_trips_df = pd.DataFrame(problem_trips, columns=['linked_flag', 'tripid'])
# The join key is given by `on` alone. Also passing `left_index=True` is contradictory (join on a
# column *and* on the left index) and is rejected outright by newer pandas releases. The index of
# the merged frame is not relied on afterwards: it is replaced by `linked_flag` before the sets
# are processed.
merged_trips = pd.merge(left=trip, right=problem_trips_df, on='tripid', how='outer')
merged_trips.drop_duplicates(inplace=True)   # a trip flagged twice with the same set ID appears once
merged_trips.fillna(0, inplace=True)         # unflagged trips get linked_flag 0; other blanks become 0 too

In [14]:
# Isolate unlinked trips: every row that received a set ID (unflagged rows carry 0 after fillna).
# `!=` is used rather than `<>`, which only Python 2 can parse.
unlinked_trips = merged_trips.query("linked_flag != 0")

# List of all linked trip sets and the number of records in each
unlinked_sets = unlinked_trips.groupby('linked_flag').count()['recid']

# Summary statistics on linked trip sets - gives us an idea of how well we identified linked trips.
# Maps each set ID to its number of rows, collected in one grouped pass instead of re-filtering
# the whole frame once per set.
setsize = {set_id: len(members) for set_id, members in unlinked_trips.groupby('linked_flag')}

In [15]:
# Find distribution of set sizes.
# Column 0 holds the set ID and column 1 the number of trips in that set. The keys/values are
# materialised with list() so the two-row frame is built from plain lists on Python 3 as well,
# where dict.keys()/values() are views rather than lists.
df_setsize = pd.DataFrame([list(setsize.keys()), list(setsize.values())]).T
df_setsize.index = df_setsize[0]    # Set index equal to the set ID

setsize_dist = df_setsize.groupby(1).count()   # Number of sets of each size
setsize_dist

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
2,1183
3,213
4,73
5,16
6,2
7,3
10,1
13,1


In [16]:
# Distribution shows that most (90%) of sets are 2 or 3 trips only. Let's automatically join the small sets only
# (see the trip_set_max filter at the end of this block) and do the others manually.
# linked_list = linked_list[linked_list <= 3]

# NOTE(review): pd.DataFrame(<DataFrame>) is called without asking for a copy, so this frame may share
# its data (and, on older pandas, its index) with `unlinked_trips` -- confirm before treating the two
# objects as independent.
unlinked_trips_df = pd.DataFrame(unlinked_trips)
unlinked_trips_df.index = unlinked_trips.linked_flag    # Change index to work with trip sets with same flag id

# Get mode combination for each set
unlinked_trips_df['mode_1'] = unlinked_trips_df['mode_1'].astype("int64")   # Convert from float to int first
unlinked_trips_df['mode_1'] = pd.DataFrame(unlinked_trips_df['mode_1'].astype("str"))     # Convert to string
# Create new column with concatenation of modes: one '-'-joined string of mode codes per set.
# The per-set result is indexed by linked_flag, so the assignment aligns it onto every row of the set.
unlinked_trips_df['combined_modes'] = unlinked_trips_df.groupby('linked_flag').apply(lambda x: '-'.join(x['mode_1']))

# We could also concatenate other fields in this way...
#unlinked_trips_df['driver'] = unlinked_trips_df['driver'].astype("int64")   # Convert from float to int first
#unlinked_trips_df['driver'] = pd.DataFrame(unlinked_trips_df['driver'].astype("str"))     # Convert to string
#unlinked_trips_df['linked_driver'] = unlinked_trips_df.groupby('linked_flag').apply(lambda x: '-'.join(x['driver']))

# Keep only the sets small enough to link automatically; larger sets are left for manual inspection.
# A set of n trips has n-1 '-' separators, so this keeps sets of at most trip_set_max trips.
# The name "..._max4" is historical: the maximum set size is whatever trip_set_max is set to.
unlinked_trips_max4 = unlinked_trips_df[unlinked_trips_df['combined_modes'].str.count('-') < trip_set_max]

# Want the sum of all trips in a set for these values
sum_fields = ['trip_path_distance', 'google_duration', 'reported_duration',
             'bus_pay','ferry_pay','rail_pay','air_pay']

# Want the max of all trips in a set for these values (to capture any instance of use)
# This captures any instance of use in the trip set and assumes only 1 instance per set.
# This is sort of okay since only small sets are linked and it's unlikely many of these fields will have multiple
# instances, but it should be more methodical in the future. 
max_fields = ['taxi_type', 'taxi_pay', 'driver', 'toll', 'toll_pay',
              'park_ride_area_start','park_ride_area_end', 'park_ride_lot_start',
              'park_ride_lot_end', 'bus_type','bus_cost_dk',
              'ferry_type','ferry_cost_dk','rail_type','rail_cost_dk',
              'air_type','airfare_cost_dk',
              'change_vehicles', 'park','park_type','park_pay', 'mode_acc', 'mode_egr', ]

# Convert to consistent type - float 64
# NOTE(review): unlinked_trips_max4 is a filtered selection of unlinked_trips_df, so these column
# assignments may trigger pandas' SettingWithCopyWarning -- confirm they take effect as intended.
for field in sum_fields:
    unlinked_trips_max4[field] = unlinked_trips_max4[field].astype("float64")

# Convert transitline data (transit_line_1 .. transit_line_4) into integer
for field in ['transit_line_' + str(x) for x in xrange(1,5)]:
    unlinked_trips_max4[field] = unlinked_trips_max4[field].astype("int")

# Get the sums and max values of trips grouped by each person's set.
# Both aggregate every column; only sum_fields / max_fields are read from the results further down.
sums = unlinked_trips_max4.groupby('linked_flag').sum()
maxes = unlinked_trips_max4.groupby('linked_flag').max()

# Now we want to squish those unlinked trips together!
# The "primary trip" will inherit characteristics of associated trips
# Return list of primary trips and max distance for each set
#primary_trips = linked_trips_df.groupby('linked_flag').max()[['tripID','gdist']]

# change index to be trip ID because this is the number we ultimately want
# NOTE(review): pd.DataFrame(<DataFrame>) is called without asking for a copy, so `df` may share its
# data and index with unlinked_trips_max4 -- the boolean selection below appears to rely on the two
# frames being indexed alike; confirm on the installed pandas.
df = pd.DataFrame(unlinked_trips_max4)
df.index = unlinked_trips_max4['tripid']
# Find the trip ID of the longest trip in each set.
# idxmax returns an index label and the index is the trip ID, so the resulting column - although
# still named 'trip_path_distance' - holds trip IDs, one per set.
primary_trips = pd.DataFrame(df.groupby('linked_flag')['trip_path_distance'].agg(lambda x: x.idxmax()))
#unlinked_trips_max4.groupby('linked_flag')

# Select only the primary trip from each set
primary_trips_df = unlinked_trips_max4[df['tripid'].isin(primary_trips['trip_path_distance'])]
primary_trips_df.index = primary_trips_df.linked_flag   # Reset index to trip set ID

# Change primary trip start time and origin details to those of the first trip in the linked set.
# Each right-hand side is a per-set Series indexed by linked_flag; the .loc assignment aligns it
# with primary_trips_df, which is indexed by linked_flag as well.
for field in ['depart_time_mam', 'depart_time_hhmm','depart_time_timestamp','o_purp',
              'origin_name','origin_address', 'origin_lat', 'origin_lng']:
    # Save the original data in a new column
    #primary_trips_df.loc[:,field + '_original'] = primary_trips_df[field]
    primary_trips_df.loc[:,field] = df.groupby('linked_flag').apply(lambda x: x[field].iloc[0])

# Change primary trip end time to time of last in linked trip set
# Change primary purpose, destination and activity duration to that of the last trip in the set
for field in ['arrival_time_mam', 'arrival_time_hhmm','arrival_time_timestamp',
              'a_dur', 'd_purp','dest_lat', 'dest_lng','dest_name','dest_address']:
    # Save the original data in a new column
    #primary_trips_df.loc[:,field + '_original'] = primary_trips_df[field]
    primary_trips_df.loc[:,field] = df.groupby('linked_flag').apply(lambda x: x[field].iloc[-1])
    
for field in sum_fields:
    # Save original primary trip info in a new column appended with "_original"
    #primary_trips_df.loc[:,field + '_original'] = primary_trips_df[field]
    # Replace the primary trip fields with summed data
    primary_trips_df.loc[:,field] = sums[field]

for field in max_fields:
    # Save original primary trip info in a new column appended with "_original"
    #primary_trips_df.loc[:,field + '_original'] = primary_trips_df[field]
    # Replace the primary trip fields with the per-set maximum
    primary_trips_df.loc[:,field] = maxes[field]

#df_min_stop_time = df_stop_times.sort('stop_sequence', ascending=True).groupby('trip_id', as_index=False).first()
##need min stop time
#df_trips = pd.merge(left = df_trips, right = df_min_stop_time, on=['trip_id'])

## Save transitline data into primary trip record
#tr1 = pd.DataFrame(df.groupby('linked_flag')[['transitline1']].agg(lambda x: x.tolist()))
## Create new column to store unique transitline trips
#for each in ['transitline' + str(x) for x in xrange(1,5)]:
#    primary_trips_df[each + '_list'] = ""
#tr2 = pd.DataFrame(df.groupby('linked_flag')['transitline2'].agg(lambda x: x.tolist()))

# this returns greater than zero values for a single row - a single list of a list

# Collect, for every trip set, the values each transit line / transit system column takes across
# the trips of the set.
def _values_per_set(column):
    """Return a one-column frame indexed by linked_flag; each cell gathers `column` over the set's trips."""
    return pd.DataFrame(df.groupby('linked_flag')[[column]].agg(lambda x: x.tolist()))

tr1 = _values_per_set('transit_line_1')
tr2 = _values_per_set('transit_line_2')
tr3 = _values_per_set('transit_line_3')
tr4 = _values_per_set('transit_line_4')
ts1 = _values_per_set('transit_system_1')
ts2 = _values_per_set('transit_system_2')
ts3 = _values_per_set('transit_system_3')
ts4 = _values_per_set('transit_system_4')

# Add together all the transitline values (1 through 4): adding the per-set collections joins
# them end to end. The sum has no name, so it lands in a column labelled 0.
all_line_values = (tr1['transit_line_1'] + tr2['transit_line_2']
                   + tr3['transit_line_3'] + tr4['transit_line_4'])
combined_transitlines = pd.DataFrame(all_line_values)

all_system_values = (ts1['transit_system_1'] + ts2['transit_system_2']
                     + ts3['transit_system_3'] + ts4['transit_system_4'])
combined_transitsys = pd.DataFrame(all_system_values)
#combined_transitlines[0].iloc[0]

# Number of columns for transit lines or transit systems (4 in 2014 survey design)
num_transitlines = 4
num_transys = 4

# Empty holder columns for the unpacked values: tr1..trN for lines, ts1..tsN for systems
for i in range(1, num_transitlines + 1):
    combined_transitlines["tr" + str(i)] = ""
for i in range(1, num_transys + 1):
    combined_transitsys["ts" + str(i)] = ""

# Reduce each set's collected values (column 0) to its distinct non-zero entries, pad to a fixed
# width, and spread the result over the holder columns.
# NOTE(review): np.pad rejects a negative pad width, so a set with more distinct non-zero values
# than there are columns will raise here -- decide whether such sets should be truncated or
# routed to manual review.
for row in range(0, len(combined_transitlines)):
    # Add all unlinked trips' transitline data into a list
    combined_transitlines[0].iloc[row] = unique_ordered_list(combined_transitlines[0].iloc[row])  #[0] selects df column
    combined_transitsys[0].iloc[row] = unique_ordered_list(combined_transitsys[0].iloc[row])  #[0] selects df column
    # Remove zeros (the fill value for "no line/system") wherever they occur in the list
    combined_transitlines[0].iloc[row] = [x for x in combined_transitlines[0].iloc[row] if x != 0]
    combined_transitsys[0].iloc[row] = [x for x in combined_transitsys[0].iloc[row] if x != 0]
    # But we want to pad the rest with zeros for consistent array shape
    combined_transitlines[0].iloc[row] = np.pad(combined_transitlines[0].iloc[row],
                                                (0, num_transitlines - len(combined_transitlines[0].iloc[row])),
                                                mode='constant')
    # Transit systems are padded to their own column count (previously the transit-line count was
    # used here; both are 4 today, so results are unchanged)
    combined_transitsys[0].iloc[row] = np.pad(combined_transitsys[0].iloc[row],
                                              (0, num_transys - len(combined_transitsys[0].iloc[row])),
                                              mode='constant')

    for i in range(num_transitlines):
        combined_transitlines["tr" + str(i + 1)].iloc[row] = combined_transitlines[0].iloc[row][i]
    for i in range(num_transys):
        combined_transitsys["ts" + str(i + 1)].iloc[row] = combined_transitsys[0].iloc[row][i]

# Add the transitline / transit system values to the primary trip record (aligned on the set ID index)
for i in range(1, num_transitlines + 1):
    primary_trips_df['transit_line_' + str(i)] = combined_transitlines['tr' + str(i)]
for i in range(1, num_transys + 1):
    primary_trips_df['transit_system_' + str(i)] = combined_transitsys['ts' + str(i)]

In [17]:
# Trips with all unlinked trips removed: every survey trip whose ID is NOT in unlinked_trips_df.
# `~` is the boolean-inversion operator for a mask; unary minus on a boolean Series is rejected by
# current numpy/pandas.
trip_unlinked_removed_all = trip[~trip['tripid'].isin(unlinked_trips_df.tripid)]   # ALL unlinked trips removed

###########
# Manually process list of trips to be linked optionally

# # Okay now we want to filter out some bad linked trips and just import the unlinked trip
# # Do this before we add the linked trips onto the main file
# home2home = primary_trips_df.query("place_end == 'HOME' and place_start == 'HOME'")

# # List of bad links in the manually ID'ed linked_flag value of trips that look incorrect.
# # Remove these from the auto-linked trip and keep in the unlinked trip file\
# #bad_links = pd.read_csv(bad_trips)
# with open(bad_trips, 'r') as f:
#     bad_trip_list = []
#     for item in f:
#         bad_trip_list.append(item[:-1])

# bad_trip_df = primary_trips_df[primary_trips_df['linked_flag'].isin(bad_trip_list)]
# # Append the home2home trips on the bad_trip_df
# bad_trip_df = bad_trip_df.append(home2home)

# # Remove bad trips from combined trip file
# primary_trips_df = primary_trips_df[-primary_trips_df['tripID'].isin(bad_trip_df.tripID)]
# primary_trips_df['linked_flag'] = primary_trips_df.index

# # Add unlinked trips back in to unlinked file
# #unlinked_trips_df = unlinked_trips_df.append(unlinked_trips_df[unlinked_trips_df['linked_flag'].isin(bad_trip_df.linked_flag)])     
############

# Final trip table: the trips that were never flagged, followed by the merged record of each
# automatically linked set
trip_with_linked = pd.concat([trip_unlinked_removed_all, primary_trips_df])

# Flagged trips whose set was too large to link automatically - these still need to be addressed
separators_in_set = unlinked_trips_df['combined_modes'].str.count('-')
unprocessed_unlinked_trips = unlinked_trips_df[separators_in_set >= trip_set_max]
# unprocessed_unlinked_trips = unprocessed_unlinked_trips.append(unlinked_trips_df[unlinked_trips_df['linked_flag'].isin(bad_trip_df.linked_flag)])    

# # Distribution of combined trip modes
# a = primary_trips_df.groupby('combined_modes').count()['recordID']

# Add the count of unlinked trips in each linked trip.
# df_setsize is indexed by set ID, so the assignment aligns on the index: rows indexed by a set ID
# receive that set's size and every other row receives NaN.
trip_with_linked['num_trips_linked'] = df_setsize[1]
# fillna returns a new Series; the result has to be stored back or the NaNs stay in the column
trip_with_linked['num_trips_linked'] = trip_with_linked['num_trips_linked'].fillna(0)

# Reorder columns to match original trip file
# trip_with_linked.columns(trip_unlinked_removed_all.columns)

# Send to excel: one workbook, one worksheet per table written below
writer = pd.ExcelWriter(r'J:\Projects\Surveys\HHTravel\Survey2017\Data\Dataset_2 August 2017\Trips\Linked\5-Trip-rMove-v4-LINKED.xlsx')

# Trip file with ALL unlinked files removed and new linked trips added (reording cols to match original trip file order)
# trip_with_linked.to_excel(writer, "Linked Trips Combined", cols=list(trip_unlinked_removed_all.columns) + ['combined_modes', 'num_trips_linked'])
trip_with_linked.to_excel(writer, "Linked Trips Combined")

# Trips with ALL unlinked trips removed
trip_unlinked_removed_all.to_excel(writer, 'All Unlinked Trips Removed')

# Linked Trips only (one merged record per automatically linked set)
# Join with regular trip file data
primary_trips_df.to_excel(writer, 'Linked Trips Only')

# Unlinked Trips only (every flagged trip, before merging)
unlinked_trips_df.to_excel(writer, 'Unlinked Trips Only')

# List of unprocessed unlinked trips - sets too large to link automatically; these need to be edited by hand
unprocessed_unlinked_trips.to_excel(writer, "Unprocessed Unlinked Trips")

# Finish the workbook
writer.close()