In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pickle
import time
import datetime
from pathlib import Path
from tqdm import tqdm
from shapely.ops import Point, LineString
import matplotlib.pyplot as plt

from importlib import reload

In [2]:
import json
config = json.load((Path.cwd() / 'config.json').open('rb'))
export_fp = Path(config['project_directory']) / 'CycleAtlanta'
if export_fp.exists() == False:
    export_fp.mkdir()
    
traces_fp = Path(config['project_directory']) / "CycleAtlanta"
user_data_definitions = json.load(open(Path.cwd()/'user_data_definition.json'))
cycleatlanta_data_filepath = Path(config['cycleatlanta'])


# Trip and User Data for Subsetting the Trace Data
Before importing the trace data, we want to subset the trips dataframe to know which trips to map match.
1. Remove loops
1. Retain the most prevalent trip pattern
1. If only two trip patterns between locations for a user, retain the first one that appears

In [3]:
#TODO these steps should be in the cycleatlanta repo
with (traces_fp/'users_1.pkl').open('rb') as fh:
    users = pickle.load(fh)

with (traces_fp/'trips_2.pkl').open('rb') as fh:
    trips_df = pickle.load(fh)

with (traces_fp/'trips_1.pkl').open('rb') as fh:
    trips_df1 = pickle.load(fh)

In [4]:
trips_df.reset_index(drop=True,inplace=True)

In [5]:
#add trip type, description, avg_speed, total_distance back in
trips_df1 = trips_df1[['tripid','trip_type','description','total_distance_ft','avg_speed_mph']]
trips_df = trips_df.merge(trips_df1,on='tripid')

In [6]:
trips_df.set_index('tripid',drop=False,inplace=True)

In [7]:
initial_trips = trips_df.shape[0]
print(initial_trips,'initial trips')

20338 initial trips


## Remove loop trips
These are trips where the origin is the same as the destination. The impedance calibration process can't account for these.

In [8]:
no_loops = trips_df['sort_start_label'] != trips_df['sort_end_label']
print('This many loops:',(~no_loops).sum())
trips_df = trips_df[no_loops]

This many loops: 3873


# Find unique trips
Will be labelled with a '-1' in either the start_label or end_label column. These will all be retained.

In [9]:
label = (trips_df[['sort_start_label','sort_end_label']] != (-1,-1)).all(axis=1)
unique_trips = trips_df.loc[~label]
print('This many unique trips without labels:',(~label).sum())

This many unique trips without labels: 1434


## Choose the most prevelant trip if the trip is not unique
Trip origins and destination were labelled by user using DBSCAN. Values of '-1' indicate that an origin or destination was not near others. For repeat trips between ODs (regardless of direction), each trip was labelled according to its trajectory similarity with other trips using Frechet distance. For instance, two trips going from point A to B with the same trip pattern label are similar in routing. A trip between A and B with a different trip pattern label would indicate that this trip takes a different route from the previous two. Each trip pattern label has a trip pattern prevalence that correpsonde to the total number of trips that follow this pattern.

For impedance calibration, we want the most common route between two points.

In [10]:
redundant_trips = trips_df.loc[label]

In [11]:
# if there is only one trip for a label pair, trip_patterns is nan and the trip is unique
#still nans popping up which tells me our code isn't working (need to return later though)
redundant_trips.groupby(['userid','sort_start_label','sort_end_label'])['trip_patterns'].agg(list)

userid  sort_start_label  sort_end_label
14      0.0               1.0                                       [0.0, 0.0, 0.0, 0.0, 1.0]
                          2.0               [0.0, 1.0, 2.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, ...
                          3.0                                                      [0.0, 0.0]
        1.0               2.0                                                      [0.0, 1.0]
        2.0               3.0                                                           [0.0]
                                                                  ...                        
1723    7.0               8.0                                                           [1.0]
                          10.0                                                          [0.0]
        10.0              12.0                                                          [0.0]
1727    0.0               1.0                                                      [0.0, 1.0]
1733    0.0        

In [12]:
grouped_maxes = redundant_trips.groupby(['userid','sort_start_label','sort_end_label'])['trip_pattern_prevalence'].idxmax()
most_prevalent = redundant_trips.loc[grouped_maxes]

Combine unique_trips and most_prevalent_trips

In [13]:
reduced_trip_set = pd.concat([most_prevalent,unique_trips])

# #for ones where it's idx select the first?
# isna = grouped_maxes[grouped_maxes.isna()].reset_index()
# isna.drop(columns=['trip_pattern_prevalence'],inplace=True)
# isna = pd.merge(trips_df,isna,on=['userid','sort_start_label','sort_end_label'])
# isna = isna.groupby(['userid','sort_start_label','sort_end_label'])['tripid'].agg(list)
# last_few = isna.apply(lambda x: x[0]).tolist()


In [14]:
#how many trips were not unique
print(trips_df.shape[0]-reduced_trip_set.shape[0],'trips were not unique')

12810 trips were not unique


In [15]:
print('Full trip set:',initial_trips,'Reduced trip set:',reduced_trip_set.shape[0])

Full trip set: 20338 Reduced trip set: 3655


# Additional Filters (trip type, description, distance, speed, etc.)

In [16]:
reduced_trip_set.columns

Index(['tripid', 'userid', 'remapped_userid', 'start_X', 'start_Y', 'end_X',
       'end_Y', 'start_label', 'end_label', 'sort_start_label',
       'sort_end_label', 'reverse_trajectory', 'trip_patterns',
       'trip_pattern_prevalence', 'trip_type', 'description',
       'total_distance_ft', 'avg_speed_mph'],
      dtype='object')

Trip Type

In [17]:
reduced_trip_set['trip_type'].value_counts()


Commute         1463
Social           751
Exercise         442
Errand           275
Shopping         190
Work-Related     181
School           125
Other            113
Work-related      59
other             56
Name: trip_type, dtype: int64

In [18]:
remove_type = reduced_trip_set['trip_type'].isin(['Exercise'])
print(remove_type.sum(),'removed exercise trips')
reduced_trip_set = reduced_trip_set[~remove_type]

442 removed exercise trips


Trip Distance

In [19]:

(reduced_trip_set['total_distance_ft'] / 5280).sort_values(ascending=False)

tripid
32568    24.137112
9151     20.275450
8796     20.248453
11904    19.330386
31397    18.740044
           ...    
7189      0.524510
7188      0.513198
9056      0.512816
8356      0.497441
3513      0.489709
Name: total_distance_ft, Length: 3213, dtype: float64

In [20]:
lower_end = 1
upper_end = 10
too_short = reduced_trip_set['total_distance_ft'] / 5280 < lower_end
too_long = reduced_trip_set['total_distance_ft'] / 5280 > upper_end
print(too_short.sum()+too_long.sum(),'were too long or short')
reduced_trip_set = reduced_trip_set[(~too_short) & (~too_long)]

222 were too long or short


Trip Speed

In [21]:
reduced_trip_set['avg_speed_mph'].sort_values(ascending=False)

tripid
13715    689.238694
13717    687.628536
17969     17.688240
7867      16.029994
379       15.710578
            ...    
5772       0.000000
10587      0.000000
10560      0.000000
5798            NaN
25844           NaN
Name: avg_speed_mph, Length: 2991, dtype: float64

In [22]:
too_slow = reduced_trip_set['avg_speed_mph'] < 5
too_fast = reduced_trip_set['avg_speed_mph'] > 16
print(too_slow.sum()+too_fast.sum(),'were too slow or fast')
reduced_trip_set = reduced_trip_set[(~too_slow)&(~too_fast)]

226 were too slow or fast


In [23]:
print(reduced_trip_set.shape[0],'trips remaining for calibration')

2765 trips remaining for calibration


# Export for Map Matching

In [24]:
with (traces_fp/'trips_3.pkl').open('wb') as fh:
    pickle.dump(reduced_trip_set,fh)