In [6]:
import pandas as pd
import numpy as np

# format sig figs
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [7]:
output_dir = r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap'

# Combine observed & modeled trip links 
----

**output: chosenpath_links_with_observed.csv**

In [8]:
def prep_df(data, record_type, unique_fields ,colname='record_type'):
    '''Load text data as df, create unique trip record ID, and tag as model/observed record'''
    df = pd.read_csv(data)
    df[colname] = record_type    # tag as model/observed record

    # Convert all specified unique_fields to string and concatenate as new unique_id field 
    df[unique_fields] = pd.DataFrame([df[col].astype('int').astype('str') for col in unique_fields]).T
    df['unique_id'] = df[unique_fields].apply(lambda x: '_'.join(x), axis=1)

    return df

In [9]:
def append(*args):
    '''Union dataframes with similar structures'''
    df = pd.DataFrame()
    for data in args:
        df = df.append(data)

    return df

In [10]:
def select_common_records(df1,df2,field):
    '''Return dataframe of matching, common records only.
       Example, person 1034 exists in df1, but not in df2, so new copy of df1 without 1034 is created
    '''
    df1 = df1[df1[field].isin(df2[field])]
    df2 = df2[df2[field].isin(df1[field])]

    return df1, df2

## Load modeled and observed data
Create unique ID based on person-id and trip-list-id-num fields, concatentated with "_"

In [11]:
# Load the modeled and observed datasets
model_results_dir = r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\chosenpaths_links.csv'
observed_results_di = r'..\data\obs\obs_links.csv'

- load data from CSV to dataframe
- add model/observed text field 
- add unique ID

In [12]:
model = prep_df(data=model_results_dir, record_type='model', unique_fields=['person_id','trip_list_id_num'])
observed = prep_df(data=observed_results_di, record_type='observed', unique_fields=['person_id','trip_list_id_num'])

In [13]:
print len(observed.groupby(['person_id','trip_list_id_num']).count()['mode'])
print len(model.groupby(['person_id','trip_list_id_num']).count()['mode'])

22599
21948


## Select unique IDs common to each dataset

In [14]:
model, observed = select_common_records(model,observed,'person_id')

In [15]:
print len(observed.groupby('unique_id').count()['person_id'])
print len(model.groupby('unique_id').count()['person_id'])

18911
18860


## Append observed data rows to modeled and export to file


In [16]:
df = append(model, observed)
df.to_csv(output_dir + '\chosenpaths_links_with_observed.csv')

# Compare paths
----

**output: path_comparison.csv**

In [17]:
obs = pd.read_csv(r'..\data\obs\obs_links.csv')
obs['person_id'] = obs['person_id'].astype('str')


# Fast Trips output: link-level results for the chosen path only (chosenpaths_links)
model = pd.read_csv(r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\chosenpaths_links.csv')
model['route_id'].fillna("",inplace=True)
# NOTE: assume we look at only the last iteration of path sets ?
model = model[model['iteration'] == model['iteration'].max()]
model['person_id'] = model['person_id'].astype('int').astype('str')


# Fast Trips output: link-level results for ALL paths created by Fast Trips (pathset_links)
pathset_links = pd.read_csv(r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\pathset_links.csv')
pathset_links['route_id'].fillna("",inplace=True)
pathset_links = pathset_links[pathset_links['iteration'] == pathset_links['iteration'].max()]

# Fast Trips output: paths-level results for all paths created by Fast Trips (pathset_paths)
pathset_paths = pd.read_csv(r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\pathset_paths.csv')

### Reset Indeces

In [18]:
obs.reset_index(inplace=True)
model.reset_index(inplace=True)
pathset_links.reset_index(inplace=True)
pathset_paths.reset_index(inplace=True)

### Create Unique ID

In [19]:
obs['unique_id'] = obs['person_id'].astype('str') +"_"+obs['trip_list_id_num'].astype('str')
model['unique_id'] = model['person_id'].astype('str') +"_"+model['trip_list_id_num'].astype('str')

pathset_links['unique_id'] = pathset_links['person_id'].astype('int').astype('str') +"_"+pathset_links['trip_list_id_num'].astype('str')

pathset_paths['person_id'] = pathset_paths['person_id'].astype('int').astype('str')
pathset_paths['unique_id'] = pathset_paths['person_id'].astype('str') +"_"+pathset_paths['trip_list_id_num'].astype('str')

** each unique ID should exist within the observed and the model results **

In [20]:
# Compare the unique_id fields between each data set
print len(obs.groupby('unique_id').count().index)
print len(model.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))

22599
20836
17876


Some observed trips were filtered out, but the number of modeled trips should be the same as the 
number of common unique_ids. For some reason it's less

**Filter out modeled & observed unique_ids not common to both**

In [21]:
# Use the common unique_ids to filter out the data
model = model[model['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]
obs = obs[obs['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]

In [22]:
print len(obs.groupby('unique_id').count().index)
print len(model.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))

17876
17876
17876


**Compare against the pathset links and paths files**

In [23]:
print len(obs.groupby('unique_id').count().index)
print len(pathset_links.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_links['unique_id'].values))))
obs = obs[obs['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]
pathset_links = pathset_links[pathset_links['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]
print len(obs.groupby('unique_id').count().index)
print len(pathset_links.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_links['unique_id'].values))))

17876
25348
17876
17876
17876
17876


In [24]:
print len(obs.groupby('unique_id').count().index)
print len(pathset_paths.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))
obs = obs[obs['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))]
pathset_paths = pathset_paths[pathset_paths['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))]
print len(obs.groupby('unique_id').count().index)
print len(pathset_paths.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))

17876
25441
17876
17876
17876
17876


# Join OBS-formatted transit routes/agencies to Fast Trips output
----

In [25]:
# model['route_id']

In [26]:
obs_to_gtfs = pd.read_csv('../data/obs/obs_to_gtfs_v1.8.csv')
agency_lookup = pd.read_csv('../data/obs/obs_agency_lookup.csv')
routes = pd.read_csv(r'../data/gtfs/routes.txt')

In [27]:
# from the routes file get a lookup for agency from the route_id prefix
routes['agency_num'] = routes['route_id'].apply(lambda x: x.split("_")[0])
routes['route_id_value'] = routes['route_id'].apply(lambda x: "_".join(x.split("_")[1:]))

### route info for model links

In [28]:
# FT's route_id format is xx_yyy where xx is an agency ID number, and yyy is the route_id
# Get the route number and agency ID
model['route_id'].fillna("",inplace=True)
subdf = model[model['route_id'] != ""]

# Select only rows with a route_id
subdf['agency_num'] = subdf['route_id'].apply(lambda x: x.split("_")[0])
subdf['route_id_value'] = subdf['route_id'].apply(lambda x: "_".join(x.split("_")[1:]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [29]:
# Join the agency name for all records
newmodel = pd.merge(subdf, obs_to_gtfs, left_on='route_id_value', right_on='GTFS1.8_route_id_value', how='left')

# get the agency name for bart records
newmodel = pd.merge(newmodel, routes, left_on='route_id_value', right_on='route_id_value', how='left')

In [30]:
# Add the OBS-version of the agency as both the route and the agency name,
# we aren't differentiating between route IDs for BART and caltrain
newmodel.ix[newmodel.agency_id=='bart', 'OBS_agency'] = 'BART'
newmodel.ix[newmodel.agency_id=='bart', 'OBS_route_id'] = 'BART'
newmodel.ix[newmodel.agency_id=='caltrain', 'OBS_agency'] = 'Caltrain'
newmodel.ix[newmodel.agency_id=='caltrain', 'OBS_route_id'] = 'Caltrain'

In [31]:
# Add the new columns back to the original model outputs dataset
model['agency'] = newmodel['OBS_agency']
model['route_id'] = newmodel['OBS_route_id']

### route info for pathset_links

In [32]:
# do the same thing for pathset_links
# FT's route_id format is xx_yyy where xx is an agency ID number, and yyy is the route_id
# Get the route number and agency ID
pathset_links['route_id'].fillna("",inplace=True)
subdf = pathset_links[pathset_links['route_id'] != ""]

# Select only rows with a route_id
subdf['agency_num'] = subdf['route_id'].apply(lambda x: x.split("_")[0])
subdf['route_id_value'] = subdf['route_id'].apply(lambda x: "_".join(x.split("_")[1:]))

newmodel = pd.merge(subdf, obs_to_gtfs, left_on='route_id_value', right_on='GTFS1.8_route_id_value', how='left')
# get the agency name for bart records
newmodel = pd.merge(newmodel, routes, left_on='route_id_value', right_on='route_id_value', how='left')

# Add the OBS-version of the agency as both the route and the agency name,
# we aren't differentiating between route IDs for BART and caltrain
newmodel.ix[newmodel.agency_id=='bart', 'OBS_agency'] = 'BART'
newmodel.ix[newmodel.agency_id=='bart', 'OBS_route_id'] = 'BART'
newmodel.ix[newmodel.agency_id=='caltrain', 'OBS_agency'] = 'Caltrain'
newmodel.ix[newmodel.agency_id=='caltrain', 'OBS_route_id'] = 'Caltrain'

# Add the new columns back to the original model outputs dataset
pathset_links['agency'] = newmodel['OBS_agency']
pathset_links['route_id'] = newmodel['OBS_route_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [33]:
model[model['person_id'] == "3"]

Unnamed: 0,index,person_id,trip_list_id_num,pf_iteration,pathnum,linkmode,trip_id_num,A_id_num,B_id_num,A_seq,...,missed_xfer,sim_cost,chosen,board_time,overcap,overcap_frac,alight_time,iteration,unique_id,agency
0,76967,3,1,1,4,access,,105000,7309,-1,...,0,20.29342,1.0,,,,,2,3_1,
1,76968,3,1,1,4,transit,8279.0,7309,7134,38,...,0,83.5054,1.0,07:51:46,-63.0,-63.0,08:25:20,2,3_1,SamTrans
2,76969,3,1,1,4,egress,,7134,105279,-1,...,0,50.72242,1.0,,,,,2,3_1,


In [34]:
obs[obs['person_id'] == "3"]

Unnamed: 0.1,index,Unnamed: 0,person_id,linkmode,A_id_num,B_id_num,A_id,B_id,linknum,mode,route_id,path_o_taz,path_d_taz,trip_list_id_num,o_taz,d_taz,unique_id,agency
2527,2527,2527,3,access,1280.0,,1280.0,,0,walk_access,,1280.0,1315.0,1,1280,1315,3_1,
2528,2528,2528,3,transit,,,,,1,commuter_rail,Caltrain,1280.0,1315.0,1,1280,1315,3_1,Caltrain
2529,2529,2529,3,egress,,1315.0,,1315.0,2,walk_egress,,1280.0,1315.0,1,1280,1315,3_1,


# Produce joined fields from pathset link files 

In [35]:
def produce_path_fields(df, group):
    '''
    '''
    # create "path_routes"
    df['path_routes'] = df['route_id'].apply(lambda x: x.strip())
    path_routes = pd.DataFrame(df.groupby(group)['path_routes'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    result_df = pd.DataFrame(index=path_routes.index)
    result_df['path_routes'] = path_routes
    
    # create "path_modes"
    df['path_modes'] = df['mode'].apply(lambda x: x.strip())
    result_df['path_modes'] = pd.DataFrame(df.groupby(group)['mode'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    # create "path_agencies"
    df['path_agencies'] = df['agency'].apply(lambda x: x.strip())
    result_df['path_agencies'] = pd.DataFrame(df.groupby(group)['agency'].apply(lambda x: "%s" % ' '.join(x).strip()))

    # Create "path_components"
    df['path_components'] = df['A_id'].astype('str')+" "+df['mode']+" "+df['route_id'] +"_"+ df['B_id'].astype('str')
    df['path_components'] = df['path_components'].apply(lambda x: x.strip())
    result_df['path_components'] = pd.DataFrame(df.groupby(group)['path_components'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    # Return ID field from index
    result_df['unique_id'] = result_df.index.get_level_values(0).values
    

    return result_df

In [36]:
# Create caluclated joined fields for the observed, modeled, and pathset links file

# Concatenate modes, route IDs, etc to produce unique trip identities
# Do this for each set of trips in the oberved data, as well as for the modeled, since we don't have the field produced by FT
obs['route_id'].fillna("",inplace=True)
obs['agency'].fillna("",inplace=True)

observed_path = produce_path_fields(obs, group=['unique_id'])

In [131]:
# obs['agency']

In [37]:
model['route_id'].fillna("",inplace=True)
model['agency'].fillna("",inplace=True)

modeled_path = produce_path_fields(model, group=['unique_id'])

In [38]:
# also process the detailed pathset_links files, so each path in the pathset has a unique trip identity
pathset_links['route_id'].fillna("",inplace=True)
pathset_links['agency'].fillna("",inplace=True)

new_pathset = produce_path_fields(pathset_links, group=['unique_id','pathnum'])

In [39]:
# Make sure we only evaluate the overlapping records

In [40]:
obs = obs[obs['unique_id'].isin(new_pathset['unique_id'].values)]
model = model[model['unique_id'].isin(new_pathset['unique_id'].values)]
new_pathset = new_pathset[new_pathset['unique_id'].isin(obs['unique_id'].values)]
new_pathset = new_pathset[new_pathset['unique_id'].isin(model['unique_id'].values)]

In [41]:
print len(new_pathset)
print len(model)
print len(obs)

301330
92452
61556


In [122]:
model.groupby('unique_id')['agency'].count()

unique_id
100066_15335     3
1000_431         3
1000_432         5
100122_15336     3
10022_3618       5
100262_15337     5
100281_15338     3
10028_3620       5
1002_433         3
10036_3621      11
10038_3622       5
10040_3623       7
100476_15340     5
100488_15341     9
100495_15342     7
...
955_401       5
957_402       3
957_403       3
964_405       9
968_406       7
974_408       3
975_411       7
977_413       3
978_414      21
978_415       5
980_417       5
980_418       5
984_420       5
987_423       7
997_430       7
Name: agency, Length: 17876, dtype: int64

## Compare if modeled/observed trips match, completed or partially

In [42]:
# Join the observed and modeled fields
df = pd.merge(observed_path, modeled_path, on='unique_id',suffixes=("_observed","_model"))

In [43]:
# Find rows with matching path routes
complete_route_match = df[df['path_routes_observed'] == df['path_routes_model']]
complete_mode_match = df[df['path_modes_observed'] == df['path_modes_model']]
# Add complete_agency_match when available
complete_agency_match = df[df['path_agencies_observed'] == df['path_agencies_model']]

In [44]:
## Extract order of transit routes taken
df['model_path_route_list'] = df['path_routes_model'].apply(lambda x: x.split(" "))
df['obs_path_route_list'] = df['path_routes_observed'].apply(lambda x: x.split(" "))

df['model_path_mode_list'] = df['path_modes_model'].apply(lambda x: x.split(" "))
df['obs_path_mode_list'] = df['path_modes_observed'].apply(lambda x: x.split(" "))

df['model_path_agencies_list'] = df['path_agencies_model'].apply(lambda x: x.split(" "))
df['obs_path_agencies_list'] = df['path_agencies_observed'].apply(lambda x: x.split(" "))

In [45]:
# Isolate transit modes only, because all trips should have walk & transfer components
non_transit_modes = ['transfer','walk_access','walk_egress','bike_access','bike_egress',
                     'PNR_access','PNR_egress','KNR_access','KNR_egress']
df['model_transit_modes'] = df['model_path_mode_list'].apply(
    lambda row: [element for element in row if element not in non_transit_modes])
df['obs_transit_modes'] = df['obs_path_mode_list'].apply(
    lambda row: [element for element in row if element not in non_transit_modes])

In [46]:
# Find the intersection between the chosen model/observed paths using different criteria

# transit route IDs only
df.apply(lambda row: all(i in row['model_path_route_list'] for i in row['obs_path_route_list']), axis=1)
df['routes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_route_list'], df['obs_path_route_list'])]

# All Modes (including transfer, access/egress)
df.apply(lambda row: all(i in row['model_path_mode_list'] for i in row['obs_path_mode_list']), axis=1)
df['all_modes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_mode_list'], df['obs_path_mode_list'])]

# Transit modes only (type of vehicle taken and number of boardings)
df.apply(lambda row: all(i in row['model_path_mode_list'] for i in row['obs_path_mode_list']), axis=1)
df['transit_modes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_mode_list'], df['obs_path_mode_list'])]

# Next steps: 
# Agency Intersection
df.apply(lambda row: all(i in row['model_path_agencies_list'] for i in row['obs_path_agencies_list']), axis=1)
df['transit_agencies_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_agencies_list'], df['obs_path_agencies_list'])]

# All path components

# Number of boardings?

# Exact Match of Path Routes, Modes, Components

In [95]:
# Exact match of path modes
complete_mode_match['complete_mode_match'] = 1
complete_agency_match['complete_agency_match'] = 1

df = pd.merge(df, complete_mode_match[['unique_id','complete_mode_match']], how='left', on='unique_id')
df['complete_mode_match'].fillna(0,inplace=True)

In [96]:
complete_route_match['complete_route_match'] = 1
df = pd.merge(df, complete_route_match[['unique_id','complete_route_match']], how='left', on='unique_id')

In [97]:
complete_agency_match['complete_agency_match'] = 1
df = pd.merge(df, complete_agency_match[['unique_id','complete_agency_match']], how='left', on='unique_id')

In [98]:
df['complete_route_match'] = df['complete_route_match'].replace('nan',0)
df['complete_agency_match'] = df['complete_agency_match'].replace('nan',0)

In [99]:
df.columns

Index([u'path_routes_obs', u'path_agencies_obs', u'path_components_obs', u'unique_id', u'path_routes_pathset', u'path_agencies_pathset', u'path_components_pathset', u'pathnum', u'path_modes_obs', u'path_modes_pathset', u'complete_mode_match', u'complete_route_match', u'complete_agency_match'], dtype='object')

In [100]:
df['complete_route_match'].mean()

0.0005548442168160478

In [101]:
df['complete_mode_match'].mean()

0.6068501920614596

In [102]:
df['complete_agency_match'].mean()

0.0033504054630815196

In [104]:
df[['path_agencies_obs','path_agencies_pathset']]

Unnamed: 0,path_agencies_obs,path_agencies_pathset
0,Sonoma County,
1,Sonoma County,
2,Sonoma County,
3,Sonoma County,
4,Sonoma County,
5,Sonoma County,
6,Sonoma County,
7,Sonoma County,
8,Sonoma County,
9,Sonoma County,


## Export fields to CSV for Tableau:
- 'complete_route_match' 0/1
- 'complete_mode_match' 0/1
- 'complete_agency_match' 0/1

In [57]:
len(df)

17876

# % trips with matching or partial matching routes

In [58]:
# Now we find the percent of trips with matching routes or partial matching routes
# df.mean()[['complete_match','partial_match']]
# Note, shuold be 100% when using the same data set, what's up??

#############
# Join the filtered data to the original results
df['common_mode_count'] = [len(row) for row in df['all_modes_intersection']]
df['common_transit_mode_count'] = [len(row) for row in df['transit_modes_intersection']]

# How many rows have at least one mode in common?
df['partial_mode_match'] = [1 if row > 0 else 0 for row in df['common_mode_count']]
df['partial_transit_mode_match'] = [1 if row > 0 else 0 for row in df['common_transit_mode_count']]

In [59]:

df['partial_mode_match'].mean()

1.0

In [60]:
df['partial_transit_mode_match'].mean()

1.0

## Export fields to CSV for Tableau:
- 'partial_mode_match' 0/1
- 'complete_mode_match' 0/1

# Compare probability of observed path to pathset
# & Check if path is in pathset


In [63]:
## Add a field to the new_pathset that lists the pathnum
new_pathset['pathnum'] = new_pathset.index.get_level_values(1)

In [67]:
# Do this with a merge?
# for path modes only
df = pd.merge(observed_path, new_pathset, how='left',
              left_on=['unique_id','path_modes'],right_on=['unique_id','path_modes'], suffixes=['_obs','_pathset'])

df['path_modes_obs'] = df['path_modes']
df.drop('path_modes',axis=1, inplace=True)
len(df)

46860

In [68]:
df = (pd.merge(df, modeled_path[['unique_id','path_modes']], how='left'))
df['path_modes_pathset'] = df['path_modes']
df.drop('path_modes',axis=1,inplace=True)

In [69]:
print len(df)
print len(observed_path)
print len(new_pathset)

46860
17876
301330


In [70]:
# How many unique ID's don't have a pathset attacheds
len(df[df['pathnum'].isnull()])/float(len(df))

0.27166026461801107

In [72]:
df['pathnum'] = df['pathnum'].fillna(0)
df['pathnum'] = df['pathnum'].astype('int')

In [73]:
# Do we already have a unique ID? Try it anyway
pathset_paths['unique_id'] = pathset_paths['person_id'].astype('int').astype('str')+"_"+pathset_paths['trip_list_id_num'].astype('int').astype('str')

In [74]:
# Now look up the probability of each path between new_pathset and pathset_paths
newdf = pd.merge(df,pathset_paths,left_on=['unique_id','pathnum'], right_on=['unique_id','pathnum'])

In [75]:
newdf['probability'] = newdf['probability'].fillna('no_match')

In [76]:
max_prob = newdf.groupby('unique_id').max()['probability']

# to take the mean, need to do some filtering
# mean_prob = newdf.groupby('unique_id').()['probability']
min_prob = newdf.groupby('unique_id').min()['probability']

In [77]:
#Create indicator for paths that exist
# New dataframe that has prob matching record for each unique ID
prob_export = pd.DataFrame([max_prob,min_prob]).T
prob_export.columns = ['max_prob','min_prob']

In [78]:
# Pull binary data for each person
prob_export['path_exists'] = prob_export['max_prob'].apply(lambda row_value: 0 if row_value == 'no_match' else 1)

In [79]:
prob_export['path_exists'].mean()

0.7456925486686059

In [80]:
# Path exists in the pathset, based on mode strings, for 25% of all trips

In [81]:
# Is the max probability above a defined threshold?
threshold = 0.3
# Mark no_match_records
prob_export.ix[prob_export['max_prob'] >= threshold, 'above_threshold'] = 1
prob_export.ix[prob_export['max_prob'] < threshold, 'above_threshold'] = 0
prob_export.ix[prob_export['max_prob'] == 'no_match', 'above_threshold'] = -1

In [82]:
# Percent of trips above a threshold
prob_export[prob_export['above_threshold'] != -1].mean()

max_prob          0.48728
min_prob          0.29646
path_exists       1.00000
above_threshold   0.63931
dtype: float64

# Join all relevant columns & export to csv

In [84]:
prob_export['unique_id'] = prob_export.index

In [85]:
export_df = pd.merge(df, prob_export, on='unique_id')

In [86]:
len(export_df) == len(df) == len(prob_export)

False

In [87]:
# We can consider dropping columns if needed
export_df.columns

Index([u'path_routes_obs', u'path_agencies_obs', u'path_components_obs', u'unique_id', u'path_routes_pathset', u'path_agencies_pathset', u'path_components_pathset', u'pathnum', u'path_modes_obs', u'path_modes_pathset', u'max_prob', u'min_prob', u'path_exists', u'above_threshold'], dtype='object')

In [88]:
export_df['person_id'] = export_df['unique_id'].apply(lambda row: row.split("_")[0])
export_df['trip_list_id_num'] = export_df['unique_id'].apply(lambda row: row.split("_")[-1])

In [89]:
# export_df['above_threshold'].astype('int')

In [90]:
export_df.to_csv(output_dir + '\path_comparison.csv', index=False)

In [91]:
# export_df['path_agencies_pathset']

In [92]:
export_df.columns

Index([u'path_routes_obs', u'path_agencies_obs', u'path_components_obs', u'unique_id', u'path_routes_pathset', u'path_agencies_pathset', u'path_components_pathset', u'pathnum', u'path_modes_obs', u'path_modes_pathset', u'max_prob', u'min_prob', u'path_exists', u'above_threshold', u'person_id', u'trip_list_id_num'], dtype='object')

In [93]:
df.columns

Index([u'path_routes_obs', u'path_agencies_obs', u'path_components_obs', u'unique_id', u'path_routes_pathset', u'path_agencies_pathset', u'path_components_pathset', u'pathnum', u'path_modes_obs', u'path_modes_pathset'], dtype='object')

In [94]:
export_df

Unnamed: 0,path_routes_obs,path_agencies_obs,path_components_obs,unique_id,path_routes_pathset,path_agencies_pathset,path_components_pathset,pathnum,path_modes_obs,path_modes_pathset,max_prob,min_prob,path_exists,above_threshold,person_id,trip_list_id_num
0,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9376 9376 local_bus _9378 93...,0,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
1,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9376 9376 local_bus _9378 93...,1,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
2,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9376 9376 local_bus _9378 93...,2,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
3,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9376 9376 local_bus _9378 93...,3,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
4,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9376 9376 local_bus _9378 93...,5,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
5,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9402 9402 local_bus _9378 93...,6,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
6,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9402 9402 local_bus _9378 93...,7,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
7,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9402 9402 local_bus _9378 93...,8,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
8,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9402 9402 local_bus _9378 93...,9,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
9,Sonoma County_62,Sonoma County,2361.0 walk_access _nan nan local_bus Sonoma C...,100066_15335,,,2361 walk_access _9402 9402 local_bus _9378 93...,10,walk_access local_bus walk_egress,walk_access premium_bus walk_egress,0.16506,0.00000,1,0.00000,100066,15335
