# Find redundant trips
Goal of this notebook is to find repeat trips, these will have similar start and end points and the GPS points will have similar coverage (so can be done before map-matching). In addition, we want to identify trip direction (from home/to home). Lastly, we want to inspect the duplicate users we found in the previous notebook to see if they have similar trip starts/ends.

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.cluster import DBSCAN
import similaritymeasures
import pickle
from pathlib import Path
from shapely.ops import LineString
from tqdm import tqdm

In [2]:
import json
config = json.load((Path.cwd() / 'config.json').open('rb'))
export_fp = Path(config['project_directory']) / 'CycleAtlanta'
if export_fp.exists() == False:
    export_fp.mkdir()
    
project_dir = Path(config['project_directory']) / "CycleAtlanta"
user_data_definitions = json.load(open(Path.cwd()/'user_data_definition.json'))
cycleatlanta_data_filepath = Path(config['cycleatlanta'])

Look at duplicate users to determine if we should combine

In [3]:
#project_dir = Path.home() / "Documents/BikewaySimData/Projects/gdot/gps_traces"
with (project_dir/'trips_1.pkl').open('rb') as fh:
    trips = pickle.load(fh)
trips = trips[['tripid','userid','remapped_userid','start_X','start_Y','end_X','end_Y']]
trips.set_index('tripid',inplace=True,drop=False)

with (project_dir/'users_1.pkl').open('rb') as fh:
    users = pickle.load(fh)

with (project_dir/'remapped_users.pkl').open('rb') as fh:
    user_remap = pickle.load(fh)

#precomputed and simplified trajectories
with (project_dir/'rdp.pkl').open('rb') as fh:
    trip_lines = pickle.load(fh)

For each user, we want to retrieve all their trip start and end coordinates (from trips) and run DBSCAN on them. 

In [4]:
#for each user run DBSCAN on their origins and destinations
start_labels = {}
end_labels = {}
max_cluster_distance = 1500
min_samples_in_cluster = 2
userid_col = 'remapped_userid' # put 'userid' if okay with potential duplicate users


for userid in tqdm(trips[userid_col].unique()):
    
    subset = trips.loc[trips[userid_col]==userid].copy()
    starts = subset[['tripid','start_X','start_Y']].to_numpy()
    ends = subset[['tripid','end_X','end_Y']].to_numpy()
    combined = np.vstack([starts,ends])

    #eps = maximum distance in cluster
    clustering = DBSCAN(eps=max_cluster_distance, min_samples=min_samples_in_cluster).fit(combined[:,1:])
    #add labels to array
    combined = np.column_stack([combined,clustering.labels_])
    #reshape array to unstack start/end
    labelled = np.column_stack([combined[0:starts.shape[0],[0,-1]],combined[starts.shape[0]:,[-1]]])

    start_labels.update({tripid:start_label for tripid, start_label, end_label in labelled})
    end_labels.update({tripid:end_label for tripid, start_label, end_label in labelled})

#add as columns to dataframe
trips['start_label'] = trips['tripid'].map(start_labels)
trips['end_label'] = trips['tripid'].map(end_labels)

  0%|          | 0/972 [00:00<?, ?it/s]

100%|██████████| 972/972 [00:04<00:00, 209.83it/s]


To prove the duplicates, we'd ideally want at least one of the duplicate user's origin/destination to have the same label as one of the other duplicate userids.

In [5]:
duplicate_users = users.loc[users['userid'].apply(lambda x: isinstance(x,list)),'userid']
duplicate_users

902      [361, 722]
903      [390, 604]
904      [260, 998]
905    [1524, 1671]
906     [209, 1307]
           ...     
967     [108, 1592]
968      [210, 667]
969     [451, 1406]
970    [1463, 1602]
971     [235, 1633]
Name: userid, Length: 70, dtype: object

In [6]:
for idx, duplicate in duplicate_users.items():
    sets = []
    # if len(duplicate) > 2:
    #     print('More than two')
    for userid in duplicate:
        new_set = trips.loc[trips['userid']==userid,['start_label','end_label']].to_numpy().flatten()
        new_set = set(new_set[new_set>=0])
        sets.append(new_set)

    all_shared = set.intersection(*sets)

    at_least_one = set()
    for i in range(len(sets)):
        for j in range(i+1, len(sets)):
            at_least_one |= sets[i].intersection(sets[j])

    if len(all_shared) > 0:
        #print("No shared origins or destinations")
        users.loc[idx,'shared_locations'] = 'yes'
    elif len(at_least_one) > 0: 
        users.loc[idx,'shared_locations'] = 'at least one pair'
    else:
        #print("Shared destinations")
        users.loc[idx,'shared_locations'] = 'no shared locations'


In [7]:
users[~users['shared_locations'].isna()].sort_values('count',ascending=False)

Unnamed: 0,userid,created_date,email,age,gender,income,ethnicity,homeZip,schoolZip,workZip,cycling_freq,rider_history,rider_type,count,shared_locations
942,"[931, 1331]",2013-06-18 21:17:57,kmyerschamberlain@gmail.com,"[NULL, 25-34]","[NULL, Female]","[NULL, $60,000 to $74,999]","[NULL, White]","[30030, 30306]",-1,"[30308, 30329]","[NULL, Several times per week]","[NULL, Since childhood]","[NULL, Enthused & confident]",3947,yes
937,"[940, 1275]",2013-06-21 04:38:21,josephmcintyre@mac.com,"[NULL, 25-34]","[NULL, Male]","$60,000 to $74,999",White,30317,-1,30318,Daily,"[Several years, Since childhood]","[Strong & fearless, Comfortable, but cautious]",1055,yes
966,"[78, 1369]",2012-10-16 03:28:50,stephenwillard@bellsouth.net,"[NULL, 45-54]","[NULL, Male]","[NULL, $100,000 or greater]",White,30030,-1,30306,Several times per week,Since childhood,"[Enthused & confident, Strong & fearless]",937,yes
967,"[108, 1592]",2012-10-16 15:50:09,syl.turner@gmail.com,25-34,Male,"[$75,000 to $99,999, $100,000 or greater]",White,30030,-1,"[30308, -1]","[NULL, Several times per week]",Several years,"Comfortable, but cautious",444,yes
922,"[341, 741, 1037, 1057, 1435, 1585, 1650, 1661,...",2012-11-15 14:15:57,cqholt@gmail.com,25-34,Male,"[$75,000 to $99,999, $60,000 to $74,999, $40,0...",White,30317,-1,"[30338, -1, 30008, 30303]","[NULL, Daily, Several times per week, Several ...",Since childhood,"[Strong & fearless, Enthused & confident]",242,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946,"[441, 504]",2013-01-08 09:18:11,laurenfun@me.com,18-24,Female,"$40,000 to $59,999",Hispanic / Mexican / Latino,30307,-1,30312,Several times per week,Several years,"Comfortable, but cautious",4,yes
905,"[1524, 1671]",2014-08-04 13:48:41,Luke.h.starr@gmail.com,25-34,Male,"[$60,000 to $74,999, $75,000 to $99,999]",White,30316,-1,30308,Several times per month,"[One year or less, Several years]","Comfortable, but cautious",4,yes
961,"[359, 1670]",2012-11-22 08:52:04,sam.ludden@gmail.com,"[18-24, 25-34]",Male,"[Less than $20,000, $20,000 to $39,999]",White,"[30305, 30324]","[30303, -1]","[30309, 30315]","[NULL, Several times per week]","[Since childhood, Several years]","[Enthused & confident, Strong & fearless]",4,no shared locations
960,"[313, 1572]",2012-11-06 16:39:28,rwolfe8@gatech.edu,18-24,Male,"$20,000 to $39,999",White,"[30313, 30309]",-1,30313,Daily,Several years,Strong & fearless,3,no shared locations


In [8]:
#remove loop trips and ones that arent labelled
no_noise = (trips['start_label'] != -1) & (trips['end_label'] != -1)
no_loop_trips = trips['start_label'] != trips['end_label']

#use numpy to make sort the labels
trips[['sort_start_label','sort_end_label']] = np.sort(trips[['start_label','end_label']])
trips['reverse_trajectory'] = (trips[['start_label','end_label']].to_numpy() != trips[['sort_start_label','sort_end_label']].to_numpy()).all(axis=1)

#groupby user so we can loop through
grouped = trips[no_noise & no_loop_trips].groupby([userid_col,'sort_start_label','sort_end_label'])['tripid'].agg(list).tolist()
#grouped = [tripids for tripids in grouped if len(tripids) > 1]

In [9]:
trips[no_noise & no_loop_trips].groupby(['userid','sort_start_label','sort_end_label'])['tripid'].nunique().sort_values(ascending=False)

userid  sort_start_label  sort_end_label
1275    0.0               1.0               824
1369    0.0               1.0               638
931     0.0               34.0              329
78      0.0               1.0               282
177     0.0               1.0               269
                                           ... 
1087    1.0               2.0                 1
317     0.0               16.0                1
                          7.0                 1
                          5.0                 1
1733    0.0               1.0                 1
Name: tripid, Length: 2221, dtype: int64

In [10]:
trip_pattern_mapping = {}
trip_pattern_prevalence = {}

In [11]:
grouped

[[68, 383, 1119, 1136, 5934],
 [71, 76, 655, 708, 993, 1052, 1267, 1293, 1545, 1702, 1753, 4864, 5965, 6094],
 [1089, 1092],
 [69, 5943],
 [4865],
 [5078, 5079],
 [80, 102, 432, 5028],
 [302],
 [110, 453],
 [260, 340, 387, 1461, 1521, 1967],
 [6478],
 [761, 3235],
 [704],
 [1654],
 [1675],
 [691, 1740],
 [1663],
 [9358],
 [636],
 [2057],
 [5953],
 [4806],
 [5955],
 [4925, 6765, 8215],
 [103, 241],
 [290, 647],
 [104, 233, 331, 480, 746, 801, 860, 912, 1083, 1135],
 [1880, 1883],
 [2514],
 [107, 236, 338, 431, 1990, 2927, 4571, 15625],
 [1989, 4144],
 [4143],
 [111,
  176,
  192,
  245,
  300,
  326,
  495,
  660,
  758,
  759,
  788,
  926,
  930,
  991,
  1043,
  1100,
  1140,
  1264,
  1290,
  1320,
  1339,
  1509,
  1704,
  1757,
  1811,
  1886,
  1937,
  1962,
  2077,
  2182,
  2183,
  2224,
  2269,
  2317,
  2655,
  2669,
  2693,
  2709,
  2745,
  2759,
  2804,
  2938,
  2983,
  3027,
  3088,
  3114,
  3150,
  3168,
  3229,
  3322,
  3417,
  3456,
  6855,
  28012,
  28097,
  29507

In [12]:
for tripids in tqdm(grouped):

    # #see if any of these trips have already been in mapped not sure why i need this
    # check = [x for x in tripids if x not in trip_pattern_mapping.keys()]
    # if len(check) == 0:
    #     continue
    
    assigned = []
    frechet_distance_max = 500 #needs to be calibrated, any literature on this?
    trip_pattern = 0

    # if there's only one trip then nothing to check against
    if len(tripids) == 1:
        trip_pattern_mapping.update({tripids[0]:trip_pattern})
        trip_pattern_prevalence.update({tripids[0]:1})
        continue

    # there is one user that has a lot of trips and will take substantially longer than the others
    if len(tripids) > 500:
        print('Warning: More than 500 trips for this user')

    while len(tripids) > 0:
        tripid0 = tripids[0]
        similar = [tripid0]
        tripids.remove(tripid0)

        # dont need this anymore
        # if len(tripids) == 0:
        #     continue

        for tripid1 in tripids:
            #retrieve lines
            line0 = trip_lines[tripid0]
            line1 = trip_lines[tripid1]

            #check to see if trajectory order needs to be reversed
            if trips.at[tripid0,'reverse_trajectory']:
                line0 = line0[::-1]
            if trips.at[tripid1,'reverse_trajectory']:
                line1 = line1[::-1]

            frechet_distance = similaritymeasures.frechet_dist(line0,line1)
            
            if frechet_distance < frechet_distance_max:
                similar.append(tripid1)

        #remove assigned trips
        tripids = [tripid for tripid in tripids if tripid not in similar]

        #once above loop is run then update the dictionary
        trip_pattern_mapping.update({tripid:trip_pattern for tripid in similar})
        trip_pattern_prevalence.update({tripid:len(similar) for tripid in similar})
        trip_pattern += 1  



#     #loop through each trip and find similar trajectories
#     for tripid0 in to_assign:
        
#         # #skip if this trip has already been assigned a pair
#         # if tripid0 in assigned:
#         #     continue

#         similar = [tripid0]
#         to_assign.remove(tripid0)

#         #we don't need to loop from beginning to end each time
#         for tripid1 in to_assign:
#             # #skip if it's a same match
#             # if tripid0 == tripid1:
#             #     continue
            
#             #retrieve lines
#             line0 = trip_lines[tripid0]
#             line1 = trip_lines[tripid1]

#             #check to see if trajectory order needs to be reversed
#             if trips.at[tripid0,'reverse_trajectory']:
#                 line0 = line0[::-1]
#             if trips.at[tripid1,'reverse_trajectory']:
#                 line1 = line1[::-1]

#             frechet_distance = similaritymeasures.frechet_dist(line0,line1)
            
#             if frechet_distance < frechet_distance_max:
#                 similar.append(tripid1)
#                 to_assign.remove(tripid1)
#                 #assigned.append(tripid1)

#         #once above loop is run then update the dictionary
#         trip_pattern_mapping.update({tripid:trip_pattern for tripid in similar})
#         trip_pattern_prevalence.update({tripid:len(similar) for tripid in similar})
#         trip_pattern += 1  

# trip_pattern_mapping

 10%|▉         | 209/2110 [00:12<00:44, 42.26it/s]



 67%|██████▋   | 1424/2110 [11:49<32:36,  2.85s/it]  



100%|██████████| 2110/2110 [19:18<00:00,  1.82it/s]  


In [13]:
trips['trip_patterns'] = trips['tripid'].map(trip_pattern_mapping)
trips['trip_pattern_prevalence'] = trips['tripid'].map(trip_pattern_prevalence)

In [14]:
with (project_dir/'trips_2.pkl').open('wb') as fh:
    pickle.dump(trips,fh)

# Go to map matching after this

In [15]:
# trips.columns
# # make sure that every labeled trip is assigned a trip pattern
# # the only ones that won't have a trip pattern will be loops
# # could be cool to look at these later though
# labelled = (trips[['start_label','end_label']] != (-1,-1)).all(axis=1)
# isna = trips['trip_patterns'].isna()
# noloop = trips['sort_start_label'] != trips['sort_end_label']
# trips[labelled & isna & noloop]
# trips[(trips['start_label','end_label'] != (-1,-1)).all(axis=1)]
# trips[trips['trip_patterns'].isna()]
# ## Inspect Results
# subset = trips.copy()

# subset['geometry'] = subset['tripid'].apply(lambda x: LineString(trip_lines[x]))
# subset = gpd.GeoDataFrame(subset,crs='epsg:2240')

# subset.loc[[71,76]].reset_index(drop=True).explore('tripid',categorical=True)


# Excess code past here (examining code for looking at redundant trips)
TODO: Streamline the viz code for examining redundant trips. It'd be neat to have all a persons unique trips in one view with each trip type being togglable by direction, pattern, etc. But this can be explored later.

Initial results show that the Frechet distance needs adjustment. Some trips are assigned two different trip patterns despite looking pretty similar.

Also, possible novel inclusion? It sounds like Aditi maybe experimented with this a tiny bit: "Recall that a similar path is one that has a Hausdorff distance of less than 500 feet between the other path and their lengths and origins are within 500 feet of each other." Frechet takes continuity into account while Hausdorff does not.

'''
Suppose a man is walking a dog. Assume the man is walking on one curve and the dog on
another curve. Both can adjust their speeds but are not allowed to move backwards. The Fr´echet distance of
the two curves is then the minimum length of leash necessary to connect the man and the dog. https://courses.cs.duke.edu/spring07/cps296.2/scribe_notes/lecture23.pdf
'''

In [16]:

# #only want ones with a filled in trip pattern
# notnull = ~subset['trip_patterns'].insa()
# high_prevalence = subset['trip_pattern_prevalence'] > 1

# subset = subset[notnull & high_prevalence]

# subset.reset_index(drop=True).to_file(Path.home()/'Downloads/test.gpkg',layer='labelled_trips')
# subset.loc[[tripid0,tripid1]].reset_index(drop=True).explore()
# Pick a random trip pair and look at the different trip types
# import random

# grouped = subset.groupby(['remapped_userid','sort_start_label','sort_end_label'])['trip_patterns'].unique()
# grouped = grouped[grouped.apply(lambda x: len(x) > 1)]
# random_key = random.choice(list(grouped.index))
# #want ones with 
# selected = subset[(subset[['remapped_userid','sort_start_label','sort_end_label']]==random_key).all(axis=1)]
# selected.reset_index(drop=True).explore('trip_patterns',popup=True,categorical=True)
# Check frechet distance between lines
# tripid0 = 71
# tripid1 = 76

# #retrieve lines
# line0 = trip_lines[tripid0]
# line1 = trip_lines[tripid1]

# #check to see if trajectory order needs to be reversed
# if trips.at[tripid0,'reverse_trajectory']:
#     print('Reversing trajectory on trip0')
#     line0 = line0[::-1]
# if trips.at[tripid1,'reverse_trajectory']:
#     print('Reversing trajectory on trip1')
#     line1 = line1[::-1]

# frechet_distance = similaritymeasures.frechet_dist(line0,line1)
# print(frechet_distance,'feet')
# # if frechet_distance < frechet_distance_max:
# #     similar.append(tripid1)
# #     assigned.append(tripid1)
# import matplotlib.pyplot as plt
# plt.plot(line0)
# plt.plot(line1)

# line0 = line0[::-1]
# frechet_distance = similaritymeasures.frechet_dist(line0,line1)
# print(frechet_distance,'feet')
# with (project_dir/'trips_2.pkl').open('wb') as fh:
#     pickle.dump(trips,fh)
# # Visualize
# Better in QGIS, but at the very least show all trips overlayed when they are the same
# userid = 1714
# trip_ids = trips.loc[trips['userid']==userid]

# trip_ids.groupby(['start_label','end_label'])['trip_pattern'].agg(list).reset_index()
# start_label = 2
# end_label = 
# trip_ids['geometry'] = trip_ids['tripid'].apply(lambda x: LineString(coords_dict[x]['geometry'].tolist()))
# trip_ids = gpd.GeoDataFrame(trip_ids,crs='epsg:2240')
# trip_ids[['tripid','geometry']].reset_index(drop=True).explore('tripid',categorical=True)

# trip0 = np.asarray(trip_ids.loc[33341,'geometry'].coords)
# trip1 = np.asarray(trip_ids.loc[33375,'geometry'].coords)
# similaritymeasures.frechet_dist(trip0, trip1)
# trip_ids.columns
# keep = ['tripid',
#        'userid', 'trip_type', 'start_label', 'end_label',
#        'trip_pattern', 'geometry']
# trip_ids[keep].reset_index(drop=True).to_file(Path.home()/'Downloads/test.gpkg',layer='labelled_trips')
# trip_ids = trip_ids[]


# #retrieve from coords dict
# LineString(coords_dict[trip_ids[0]]['geometry'].tolist())
# tripid_list_of_lists.items()
# trips[~trips['trip_pattern'].isna()]
# color_dict = {
# 'start':'green',
# 'end':'red'
# }

# comb.explore(color = comb['type'].map(color_dict))
# #USE DBSCAN Algorithm to cluster trip origins and destinations
# from sklearn.cluster import DBSCAN

# comb['X'] = comb.geometry.x
# comb['Y'] = comb.geometry.y
# X = comb[['X','Y']].to_numpy()
# Have both origins and destinations in the clustering, easy way to detect loop trips
# clustering = DBSCAN(eps=1500, min_samples=2).fit(X)
# comb['label'] = clustering.labels_
# comb_new = comb[['tripid','label']]
# labelled = comb_new.groupby('tripid').agg(list).reset_index()
# labelled[['start_label','end_label']] = labelled['label'].apply(lambda x: pd.Series([x[0],x[1]]))
# labelled.drop(columns=['label'],inplace=True)
# comb.explore('label',marker_kwds={'radius':5},style_kwds={'fillOpacity':1},categorical=True)
# to_merge = comb[['tripid','type','label']]
# start = comb.loc[comb['type']=='start',['tripid','label']]
# end = comb.loc[comb['type']=='end',['tripid','label']]
# merged = pd.merge(start,end,on='tripid')
# merged.columns = ['tripid','start_label','end_label']
# merged
# final = pd.merge(test,merged,on='tripid')
# final
# First to last distance correpsonds well with the DB SCAN results
# final[final['start_label']==final['end_label']]
# #remove -1 (noise)
# no_noise = final[(final['start_label']!=-1) & (final['end_label']!=-1)]
# grouped = no_noise.groupby(['start_label','end_label'])['tripid'].agg(list).reset_index(name='tripids')
# grouped[['start_grouped','end_grouped']] = np.sort(grouped[['start_label','end_label']])
# cond = (grouped[['start_grouped','end_grouped']].to_numpy() == grouped[['start_label','end_label']].to_numpy()).all(axis=1)
# grouped_fwd = grouped.loc[cond,['start_label','end_label','tripids']]
# grouped_bck = grouped.loc[~cond,['start_grouped','end_grouped','tripids']]
# grouped_bck.columns = ['start_label','end_label','tripids']
# final = pd.merge(grouped_fwd,grouped_bck,on=['start_label','end_label'],suffixes=('_fwd','_bck'),how='left')
# final.set_index(['start_label','end_label'],inplace=True)
# We want a to_from and from_to column
# final
# final
# df = pd.DataFrame()
# for tripid in fwd_tripids:
#     df_to_append = cycleatl_db['coords'][tripid]
#     df_to_append['direction'] = 'fwd'
#     df = pd.concat([df,df_to_append],ignore_index=True)

# for tripid in bck_tripids:
#     df_to_append = cycleatl_db['coords'][tripid]
#     df_to_append['direction'] = 'bck'
#     df = pd.concat([df,df_to_append],ignore_index=True)

# df
# color_dict = {
# 'fwd':'green',
# 'bck':'red'
# }

# gdf = gpd.GeoDataFrame(df,crs='epsg:2240')[['tripid','direction','datetime','geometry']]
# gdf['datetime'] = gdf['datetime'].astype(str)
# gdf.explore(color = gdf['direction'].map(color_dict),popup=True)
# #Frechet Distance https://github.com/cjekel/similarity_measures
# #frechet distance is the minimum distance needed to connect two points along lines along the entire length of them (lower the better)
# tripid0 = 13339
# tripid1 = 25295
# tripid2 = 25022
# gdf[gdf['tripid'].isin([tripid0,tripid1,tripid2])].explore()
# trip0 = gdf[gdf['tripid']==tripid0].copy()
# trip0['x'] = trip0.geometry.x
# trip0['y'] = trip0.geometry.y
# trip0 = trip0[['x','y']].to_numpy()
# trip1 = gdf[gdf['tripid']==tripid1].copy()
# trip1['x'] = trip1.geometry.x
# trip1['y'] = trip1.geometry.y
# trip1 = trip1[['x','y']].to_numpy()
# trip2 = gdf[gdf['tripid']==tripid2].copy()
# trip2['x'] = trip2.geometry.x
# trip2['y'] = trip2.geometry.y
# trip2 = trip2[['x','y']].to_numpy()
# import numpy as np
# import similaritymeasures
# import matplotlib.pyplot as plt

# # # Generate random experimental data
# # x = np.random.random(100)
# # y = np.random.random(100)
# # exp_data = np.zeros((100, 2))
# # exp_data[:, 0] = x
# # exp_data[:, 1] = y

# # # Generate random numerical data
# # x = np.random.random(100)
# # y = np.random.random(100)
# # num_data = np.zeros((100, 2))
# # num_data[:, 0] = x
# # num_data[:, 1] = y

# # quantify the difference between the two curves using PCM
# pcm = similaritymeasures.pcm(trip0, trip1)

# # quantify the difference between the two curves using
# # Discrete Frechet distance
# df0 = similaritymeasures.frechet_dist(trip0, trip1)
# df1 = similaritymeasures.frechet_dist(trip1, trip2)
# print(df0,df1)

# trip1

# gdf['tripid'].nunique()
# #frechet does take into account direction it apppears
# visited = []
# similar_results = []
# frechet_distance_max = 500

# for tripid0 in gdf['tripid'].unique():
#     print(tripid0)
#     similar = []

#     #skip if this trip has already been assigned a pair
#     if tripid0 in visited:
#         print('skip')
#         continue

#     trip0 = gdf[gdf['tripid']==tripid0].copy()
#     trip0['x'] = trip0.geometry.x
#     trip0['y'] = trip0.geometry.y
#     trip0 = trip0[['x','y']].to_numpy()

#     for tripid1 in gdf['tripid'].unique():
#         #skip if the same
#         if tripid0 == tripid1:
#             continue
        
#         trip1 = gdf[gdf['tripid']==tripid1].copy()
#         trip1['x'] = trip1.geometry.x
#         trip1['y'] = trip1.geometry.y
#         trip1 = trip1[['x','y']].to_numpy()

#         frechet_distance = similaritymeasures.frechet_dist(trip0, trip1)
        
#         if frechet_distance < frechet_distance_max:
#             print('test')
#             #add trip1 to the visited list so we know its been assigned a pair and doesn't need to be looked at again
#             visited.append(tripid1)
#             similar.append(tripid1)
#             #results[tripid1] = frechet_distance
    
#     #if matches were found for tripid0 add this to the results list and then add it to the main result lists
#     if len(similar) > 0:
#         #if 
#         similar.append(tripid0)
#         similar_results.append(similar)

# similar_results
# gdf[gdf['tripid'].isin(similar_results[0])]['direction'].unique()
# gdf[gdf['tripid'].isin(similar_results[0])].explore()
# Next we need to figure out where homes/school/work are to figure out which direction?
# merged = test.merge(to_merge,left_on='start',right_on='type',how='left',suffixes=(None,'_drop'))
# merged.drop(columns=['tripid_drop','type'],inplace=True)
# merged.rename(columns={'label':'start_label'},inplace=True)
# merged = merged.merge(to_merge,left_on='end',right_on='type',how='left',suffixes=(None,'_drop'))
# merged.drop(columns=['tripid_drop','type'],inplace=True)
# merged.rename(columns={'label':'end_label'},inplace=True)
# merged.drop(columns=['start','end'],inplace=True)
# merged
# merged[merged['start_label'] == merged['end_label']]
# # -1 means unique locaiton most likely
# # for each point, find the distance to all other points, if wihtin X distance, consider them to be the same point
# (gpd.GeoSeries(test['start_coord'],crs='epsg:2240').distance(test.iloc[30]['start_coord']) < 1500).sum()

# # if greater than 1, then it's 
# test
# cols_to_keep = ['tripid', 'initial_start_time', 'initial_end_time', 'initial_duration','total_distance_ft', 'first_to_last_ft']
# trip = trip[cols_to_keep]
# trips_['start_coord'] = trip['tripid'].apply(lambda tripid: coords_dict[tripid].iloc[0]['geometry'])
# trip['end_coord'] = trip['tripid'].apply(lambda tripid: coords_dict[tripid].iloc[-1]['geometry'])
# #determine zip code of start and end location
# zip_codes = Path.home() / 'Downloads/tl_2014_us_zcta510.zip'
# zip_codes = gpd.read_file(zip_codes,bbox=[-84.55144,33.589187,-84.177219,33.954485]).to_crs('epsg:2240')
# zip_codes = zip_codes[['GEOID10','geometry']]
# zip_codes.plot()
# trip.shape[0]
# gpd.overlay(gpd.GeoDataFrame(trip,geometry='start_coord',crs='epsg:2240'),zip_codes)[['tripid','GEOID10']]

# #import the coordinates dictionary
# #for each trip by a user, calculate the distance between the different start points and end points (all-to-all)
# #if that distance is low (less than half a mile) then consider it to be the same origin/destination and assign it a unique identifier
# #for trips with the same start/end point buffer the points between trips and find their intersection area percent if high then likely a repeat trip that can be ignored

