In [1]:
# import dependencies
import pandas as pd;
from geopy import distance;
from ast import literal_eval;
from dateutil import parser;
from datetime import timedelta;
import matplotlib;

In [2]:
# relevant constants
# NOTE(review): `secrets` here is presumably a local secrets.py holding the private
# home/work coordinates; it shares its name with the standard-library `secrets`
# module, so the local file must be found first on the import path -- confirm.
from secrets import HOME_LATLNG, WORK_LATLNG

# Both thresholds are compared against the dist_* columns, which are taken from
# geopy's `.m` attribute, so they are expressed in metres.
COMMUTE_START_THRESHOLD = 250 # max distance (m) between trip start and home/work to count as a commute
COMMUTE_END_THRESHOLD = 100 # max distance (m) between trip end and work/home to count as a commute

In [3]:
# load the Strava activities CSV into the working DataFrame
strava_csv_path = './strava_25261767_1581740508.csv'
df = pd.read_csv(strava_csv_path)

In [4]:
# summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   commute           200 non-null    bool   
 1   distance          200 non-null    float64
 2   elapsed_time      200 non-null    int64  
 3   end_latlng        200 non-null    object 
 4   id                200 non-null    int64  
 5   moving_time       200 non-null    int64  
 6   start_date_local  200 non-null    object 
 7   start_latlng      200 non-null    object 
dtypes: bool(1), float64(1), int64(3), object(3)
memory usage: 11.3+ KB


In [5]:
# Convert the raw CSV columns into proper Python/pandas types.
# This cell is safe to re-run: values that were already converted are left as they are.

def _to_latlng_tuple(value):
    """Return a lat/lng value as a tuple.

    value: the string literal read from the CSV, or an already-parsed
        list/tuple (which is what the column holds if this cell is re-run).
    """
    if isinstance(value, str):
        # only raw strings need parsing; literal_eval raises on non-string input
        value = literal_eval(value)
    return tuple(value)

# convert latlong literals into tuples
for latlng_col in ('end_latlng', 'start_latlng'):
    df[latlng_col] = df[latlng_col].apply(_to_latlng_tuple)

# convert start date (it's actually local, not UTC even though the
# string says Z): parse the whole column at once, then drop the timezone
# while keeping the wall-clock time. tz_localize(None) is a no-op on a
# column that is already timezone-naive, so a re-run does not fail.
df['start_date_local'] = pd.to_datetime(df['start_date_local']).dt.tz_localize(None)

In [6]:
# calculate distance from start and end points to home/work

# (new column, source lat/lng column, reference location); the order fixes the
# order in which the columns are added to the frame
distance_columns = [
    ('dist_start_home', 'start_latlng', HOME_LATLNG),
    ('dist_start_work', 'start_latlng', WORK_LATLNG),
    ('dist_end_home', 'end_latlng', HOME_LATLNG),
    ('dist_end_work', 'end_latlng', WORK_LATLNG),
]

for new_col, source_col, reference_latlng in distance_columns:
    # bind the reference point as a default so each lambda keeps its own target;
    # `.m` reads the geopy distance in metres
    df[new_col] = df[source_col].apply(
        lambda point, target=reference_latlng: distance.distance(point, target).m
    )

In [7]:
# calculate end time from start time
# elapsed_time is a duration in seconds; convert the whole column to timedeltas
# in one vectorized call instead of building one timedelta object per row
df['end_date_local'] = df['start_date_local'] + pd.to_timedelta(df['elapsed_time'], unit='s')

In [8]:
# apply distance thresholding to find commutes in and out
# a commute "in" starts near home and ends near work
starts_near_home = df['dist_start_home'] <= COMMUTE_START_THRESHOLD
ends_near_work = df['dist_end_work'] <= COMMUTE_END_THRESHOLD
df['is_commute_in'] = starts_near_home & ends_near_work

# a commute "out" starts near work and ends near home
starts_near_work = df['dist_start_work'] <= COMMUTE_START_THRESHOLD
ends_near_home = df['dist_end_home'] <= COMMUTE_END_THRESHOLD
df['is_commute_out'] = starts_near_work & ends_near_home

# count rows by summing the boolean masks rather than materialising filtered frames
commute_in = df['is_commute_in']
commute_out = df['is_commute_out']
num_commutes_in = int(commute_in.sum())
num_commutes_out = int(commute_out.sum())
num_tag_errors = int((commute_in & commute_out).sum())
num_not_commutes = int((~(commute_in | commute_out)).sum())

# sense check the data
summary_lines = (
    (num_commutes_in, 'commutes in (to office)'),
    (num_commutes_out, 'commutes out (from office)'),
    (num_tag_errors, 'trips tagged as both (SHOULD BE 0)'),
    (num_not_commutes, 'trips tagged as neither'),
)
for count, label in summary_lines:
    print(count, label)

# the three categories should add up to the number of rows in the frame
total_tagged = num_commutes_in + num_commutes_out + num_not_commutes
print(total_tagged, 'trips tagged ( checksum:', len(df), ')')

67 commutes in (to office)
60 commutes out (from office)
0 trips tagged as both (SHOULD BE 0)
73 trips tagged as neither
200 trips tagged ( checksum: 200 )


In [9]:
# slice and reshape raw data into separate commutes in/out; only take columns needed for analysis
all_commutes = df[df['is_commute_in'] | df['is_commute_out']]