In [73]:
## Download the modules needed for the analysis below

from zipfile import ZipFile
import pandas as pd
import json
from datetime import datetime

In [74]:
## Set the path to import our file
%cd "C:\Fitness\Cycling Data\Raw data"

C:\Fitness\Cycling Data\Raw data


In [75]:
#### Before I go any further, this step below borrows heavily from the link below and I have obviously adapted it for my use 
### case but credit must go where credit is due to Maksym Kozlenko and his amazing illustration of how to easily import and
### load your Google Timeline Location History data. Please see the link below 
### https://betterprogramming.pub/loading-location-history-places-from-google-timeline-into-pandas-and-csv-c26cb0ac5e89


# path to Google Location History takeout file
history_data_file = "takeout-20230915T180410Z-001.zip"

# store all places into this array
cycle_trips = []

# Import the zip file and unzip the Google Timeline location history JSON files
with ZipFile(history_data_file) as myzip:
    for file in myzip.filelist[:]:
        filename = file.filename

# We want to look for the "Semantic Location History" files that contains the cycling distance data that we want to analyse below
        if "Semantic Location History" in filename:
            # process all files in "Semantic Location History" directory
            history_json = json.load(myzip.open(filename))
            
            # We need to locate the timeline object within the JSON file that contains the cycling distance data
            for timeline_object in history_json["timelineObjects"]:   
             
 # The key timeline object is "activitySegment" and we set up an object which basically contains the data stored as a dictionary
                if "activitySegment" in timeline_object:
                    cycle_trips_json = timeline_object["activitySegment"]                 
                    
# skip records where there are missing values for distance and if Google cannot record a 'LOW','MEDIUM'.'HIGH' categorisation for
# your recorded activity      
                    if not "distance" in cycle_trips_json or not 'confidence' in cycle_trips_json:
                        continue

                    activity_data = {
                        'Activity_Type': cycle_trips_json['activities'][0]['activityType'],
                        "Activity_Type_probability": cycle_trips_json['activities'][0]['probability'],
                        "distance": cycle_trips_json['distance'],
                        "confidence": cycle_trips_json['confidence'],
                        "startTimestamp": cycle_trips_json["duration"]["startTimestamp"],
                        "endTimestamp": cycle_trips_json["duration"]["endTimestamp"],
                    }                    
                    cycle_trips.append(activity_data)

In [76]:
cycle_trips_df = pd.DataFrame(cycle_trips)

In [77]:
df = cycle_trips_df.copy()

In [78]:
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp
0,CYCLING,62.95,1131,MEDIUM,2020-05-18T18:00:09.211Z,2020-05-18T18:16:18.919Z
1,CYCLING,82.46,944,HIGH,2020-05-18T19:34:05.079Z,2020-05-18T19:44:14.080Z
2,CYCLING,82.52,1261,HIGH,2020-05-19T09:20:20.388Z,2020-05-19T10:07:08.805Z
3,CYCLING,88.82,1133,HIGH,2020-05-19T13:28:45.485Z,2020-05-19T13:49:58.977Z
4,CYCLING,69.85,2124,MEDIUM,2020-05-20T07:25:32.727Z,2020-05-20T08:14:30.274Z
...,...,...,...,...,...,...
10646,WALKING,0.00,1101,LOW,2015-01-17T22:46:19.429Z,2015-01-19T10:37:09.002Z
10647,UNKNOWN_ACTIVITY_TYPE,0.00,1828,UNKNOWN_CONFIDENCE,2015-01-20T16:53:06.911Z,2015-01-21T20:26:28.553Z
10648,CYCLING,0.00,613,LOW,2015-01-21T20:26:28.553Z,2015-01-21T20:28:28.625Z
10649,IN_VEHICLE,0.00,3719,LOW,2015-01-27T19:17:14.586Z,2015-01-27T21:04:03.619Z


In [79]:
## This removes TZ info from the start/end Timestamp datetime columns and remove the nanoseconds 
df['startTimestamp'] = pd.to_datetime(df.startTimestamp).dt.tz_localize(None).dt.floor('S')
df['endTimestamp'] = pd.to_datetime(df.endTimestamp).dt.tz_localize(None).dt.floor('S')
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp
0,CYCLING,62.95,1131,MEDIUM,2020-05-18 18:00:09,2020-05-18 18:16:18
1,CYCLING,82.46,944,HIGH,2020-05-18 19:34:05,2020-05-18 19:44:14
2,CYCLING,82.52,1261,HIGH,2020-05-19 09:20:20,2020-05-19 10:07:08
3,CYCLING,88.82,1133,HIGH,2020-05-19 13:28:45,2020-05-19 13:49:58
4,CYCLING,69.85,2124,MEDIUM,2020-05-20 07:25:32,2020-05-20 08:14:30
...,...,...,...,...,...,...
10646,WALKING,0.00,1101,LOW,2015-01-17 22:46:19,2015-01-19 10:37:09
10647,UNKNOWN_ACTIVITY_TYPE,0.00,1828,UNKNOWN_CONFIDENCE,2015-01-20 16:53:06,2015-01-21 20:26:28
10648,CYCLING,0.00,613,LOW,2015-01-21 20:26:28,2015-01-21 20:28:28
10649,IN_VEHICLE,0.00,3719,LOW,2015-01-27 19:17:14,2015-01-27 21:04:03


In [80]:
df['day'] = df['endTimestamp'].dt.day
df['month'] = df['endTimestamp'].dt.month
df['year'] = df['endTimestamp'].dt.year
df['date'] = df['endTimestamp'].dt.date
df['day_name'] = df['endTimestamp'].dt.day_name()

In [81]:
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name
0,CYCLING,62.95,1131,MEDIUM,2020-05-18 18:00:09,2020-05-18 18:16:18,18,5,2020,2020-05-18,Monday
1,CYCLING,82.46,944,HIGH,2020-05-18 19:34:05,2020-05-18 19:44:14,18,5,2020,2020-05-18,Monday
2,CYCLING,82.52,1261,HIGH,2020-05-19 09:20:20,2020-05-19 10:07:08,19,5,2020,2020-05-19,Tuesday
3,CYCLING,88.82,1133,HIGH,2020-05-19 13:28:45,2020-05-19 13:49:58,19,5,2020,2020-05-19,Tuesday
4,CYCLING,69.85,2124,MEDIUM,2020-05-20 07:25:32,2020-05-20 08:14:30,20,5,2020,2020-05-20,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...
10646,WALKING,0.00,1101,LOW,2015-01-17 22:46:19,2015-01-19 10:37:09,19,1,2015,2015-01-19,Monday
10647,UNKNOWN_ACTIVITY_TYPE,0.00,1828,UNKNOWN_CONFIDENCE,2015-01-20 16:53:06,2015-01-21 20:26:28,21,1,2015,2015-01-21,Wednesday
10648,CYCLING,0.00,613,LOW,2015-01-21 20:26:28,2015-01-21 20:28:28,21,1,2015,2015-01-21,Wednesday
10649,IN_VEHICLE,0.00,3719,LOW,2015-01-27 19:17:14,2015-01-27 21:04:03,27,1,2015,2015-01-27,Tuesday


In [82]:
onlycycling = df['Activity_Type'] == 'CYCLING'
not_non_or_low_confidence = df['confidence'].isin(['MEDIUM','HIGH'])
year_2023 = df['year'] == 2023
feb_to_may = df['month'].isin([2,3,4,5])

In [83]:
df = df.loc[onlycycling & not_non_or_low_confidence & year_2023 & feb_to_may].reset_index(drop=True)
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name
0,CYCLING,87.31,2418,HIGH,2023-05-01 06:42:25,2023-05-01 07:04:02,1,5,2023,2023-05-01,Monday
1,CYCLING,97.04,946,HIGH,2023-05-01 13:41:49,2023-05-01 13:51:20,1,5,2023,2023-05-01,Monday
2,CYCLING,99.27,5599,HIGH,2023-05-01 13:59:43,2023-05-01 14:30:28,1,5,2023,2023-05-01,Monday
3,CYCLING,99.05,5400,HIGH,2023-05-01 15:40:18,2023-05-01 16:15:45,1,5,2023,2023-05-01,Monday
4,CYCLING,96.38,1015,HIGH,2023-05-01 16:32:35,2023-05-01 16:41:59,1,5,2023,2023-05-01,Monday
...,...,...,...,...,...,...,...,...,...,...,...
427,CYCLING,97.60,1119,HIGH,2023-03-30 18:11:15,2023-03-30 18:24:21,30,3,2023,2023-03-30,Thursday
428,CYCLING,96.41,1140,HIGH,2023-03-30 19:37:14,2023-03-30 19:47:02,30,3,2023,2023-03-30,Thursday
429,CYCLING,93.84,291,HIGH,2023-03-30 19:55:04,2023-03-30 19:57:35,30,3,2023,2023-03-30,Thursday
430,CYCLING,95.16,1606,HIGH,2023-03-31 11:47:32,2023-03-31 11:57:20,31,3,2023,2023-03-31,Friday


In [84]:
df['distance'] = df['distance']/1000
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name
0,CYCLING,87.31,2.42,HIGH,2023-05-01 06:42:25,2023-05-01 07:04:02,1,5,2023,2023-05-01,Monday
1,CYCLING,97.04,0.95,HIGH,2023-05-01 13:41:49,2023-05-01 13:51:20,1,5,2023,2023-05-01,Monday
2,CYCLING,99.27,5.60,HIGH,2023-05-01 13:59:43,2023-05-01 14:30:28,1,5,2023,2023-05-01,Monday
3,CYCLING,99.05,5.40,HIGH,2023-05-01 15:40:18,2023-05-01 16:15:45,1,5,2023,2023-05-01,Monday
4,CYCLING,96.38,1.01,HIGH,2023-05-01 16:32:35,2023-05-01 16:41:59,1,5,2023,2023-05-01,Monday
...,...,...,...,...,...,...,...,...,...,...,...
427,CYCLING,97.60,1.12,HIGH,2023-03-30 18:11:15,2023-03-30 18:24:21,30,3,2023,2023-03-30,Thursday
428,CYCLING,96.41,1.14,HIGH,2023-03-30 19:37:14,2023-03-30 19:47:02,30,3,2023,2023-03-30,Thursday
429,CYCLING,93.84,0.29,HIGH,2023-03-30 19:55:04,2023-03-30 19:57:35,30,3,2023,2023-03-30,Thursday
430,CYCLING,95.16,1.61,HIGH,2023-03-31 11:47:32,2023-03-31 11:57:20,31,3,2023,2023-03-31,Friday


In [85]:
daily_metrics = df.groupby(['month','day'])[['distance']].agg('sum')
daily_metrics = daily_metrics.rename({'distance':'Total distance'}, axis = 1)
daily_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Total distance
month,day,Unnamed: 2_level_1
2,1,5.73
2,2,6.63
2,3,19.55
2,4,7.87
2,5,1.96
...,...,...
5,27,6.38
5,28,10.42
5,29,15.39
5,30,0.71


In [95]:
weekday_analysis = df.groupby('day_name')['distance'].sum()
weekday_analysis

day_name
Friday      197.33
Monday      147.61
Saturday     96.17
Sunday       89.17
Thursday    118.47
Tuesday     135.96
Wednesday   173.05
Name: distance, dtype: float64

In [103]:
df_weekday = df.merge(weekday_analysis, how='inner', on=['day_name'], indicator = True)
df_weekday = df_weekday.groupby('day_name').head(1).reset_index(drop=True)
df_weekday['distance_y'] = df_weekday['distance_y']/17
df_weekday

Unnamed: 0,Activity_Type,Activity_Type_probability,distance_x,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name,distance_y,_merge
0,CYCLING,87.31,2.42,HIGH,2023-05-01 06:42:25,2023-05-01 07:04:02,1,5,2023,2023-05-01,Monday,8.68,both
1,CYCLING,89.07,2.67,HIGH,2023-05-02 14:28:33,2023-05-02 14:52:03,2,5,2023,2023-05-02,Tuesday,8.0,both
2,CYCLING,97.78,4.6,HIGH,2023-05-03 14:20:45,2023-05-03 14:45:51,3,5,2023,2023-05-03,Wednesday,10.18,both
3,CYCLING,0.0,0.28,HIGH,2023-05-04 09:19:10,2023-05-04 09:21:12,4,5,2023,2023-05-04,Thursday,6.97,both
4,CYCLING,89.21,0.44,HIGH,2023-05-05 08:36:23,2023-05-05 08:40:31,5,5,2023,2023-05-05,Friday,11.61,both
5,CYCLING,0.0,1.23,HIGH,2023-05-06 07:54:11,2023-05-06 08:07:25,6,5,2023,2023-05-06,Saturday,5.66,both
6,CYCLING,95.46,1.09,HIGH,2023-05-07 07:53:50,2023-05-07 08:06:00,7,5,2023,2023-05-07,Sunday,5.25,both


In [56]:
df = df.merge(daily_metrics, how='inner', on=['month','day'], indicator = True)


In [57]:
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name,Total distance,_merge
0,CYCLING,87.31,2.42,HIGH,2023-05-01 06:42:25,2023-05-01 07:04:02,1,5,2023,2023-05-01,Monday,17.41,both
1,CYCLING,97.04,0.95,HIGH,2023-05-01 13:41:49,2023-05-01 13:51:20,1,5,2023,2023-05-01,Monday,17.41,both
2,CYCLING,99.27,5.60,HIGH,2023-05-01 13:59:43,2023-05-01 14:30:28,1,5,2023,2023-05-01,Monday,17.41,both
3,CYCLING,99.05,5.40,HIGH,2023-05-01 15:40:18,2023-05-01 16:15:45,1,5,2023,2023-05-01,Monday,17.41,both
4,CYCLING,96.38,1.01,HIGH,2023-05-01 16:32:35,2023-05-01 16:41:59,1,5,2023,2023-05-01,Monday,17.41,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,CYCLING,97.60,1.12,HIGH,2023-03-30 18:11:15,2023-03-30 18:24:21,30,3,2023,2023-03-30,Thursday,2.55,both
428,CYCLING,96.41,1.14,HIGH,2023-03-30 19:37:14,2023-03-30 19:47:02,30,3,2023,2023-03-30,Thursday,2.55,both
429,CYCLING,93.84,0.29,HIGH,2023-03-30 19:55:04,2023-03-30 19:57:35,30,3,2023,2023-03-30,Thursday,2.55,both
430,CYCLING,95.16,1.61,HIGH,2023-03-31 11:47:32,2023-03-31 11:57:20,31,3,2023,2023-03-31,Friday,3.28,both


In [58]:
df = df.groupby(['month','day']).head(1).reset_index(drop=True)
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name,Total distance,_merge
0,CYCLING,87.31,2.42,HIGH,2023-05-01 06:42:25,2023-05-01 07:04:02,1,5,2023,2023-05-01,Monday,17.41,both
1,CYCLING,89.07,2.67,HIGH,2023-05-02 14:28:33,2023-05-02 14:52:03,2,5,2023,2023-05-02,Tuesday,4.82,both
2,CYCLING,97.78,4.60,HIGH,2023-05-03 14:20:45,2023-05-03 14:45:51,3,5,2023,2023-05-03,Wednesday,9.44,both
3,CYCLING,0.00,0.28,HIGH,2023-05-04 09:19:10,2023-05-04 09:21:12,4,5,2023,2023-05-04,Thursday,9.34,both
4,CYCLING,89.21,0.44,HIGH,2023-05-05 08:36:23,2023-05-05 08:40:31,5,5,2023,2023-05-05,Friday,1.31,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,CYCLING,98.19,4.59,HIGH,2023-03-27 14:15:31,2023-03-27 14:41:48,27,3,2023,2023-03-27,Monday,9.58,both
112,CYCLING,93.62,1.19,HIGH,2023-03-28 10:07:31,2023-03-28 10:13:32,28,3,2023,2023-03-28,Tuesday,12.05,both
113,CYCLING,78.83,0.29,HIGH,2023-03-29 09:42:04,2023-03-29 09:45:33,29,3,2023,2023-03-29,Wednesday,12.46,both
114,CYCLING,97.60,1.12,HIGH,2023-03-30 18:11:15,2023-03-30 18:24:21,30,3,2023,2023-03-30,Thursday,2.55,both


In [59]:
pd.options.display.float_format = "{:.2f}".format

df['Total distance'].agg('mean')

8.256577586206896

In [60]:
df

Unnamed: 0,Activity_Type,Activity_Type_probability,distance,confidence,startTimestamp,endTimestamp,day,month,year,date,day_name,Total distance,_merge
0,CYCLING,87.31,2.42,HIGH,2023-05-01 06:42:25,2023-05-01 07:04:02,1,5,2023,2023-05-01,Monday,17.41,both
1,CYCLING,89.07,2.67,HIGH,2023-05-02 14:28:33,2023-05-02 14:52:03,2,5,2023,2023-05-02,Tuesday,4.82,both
2,CYCLING,97.78,4.60,HIGH,2023-05-03 14:20:45,2023-05-03 14:45:51,3,5,2023,2023-05-03,Wednesday,9.44,both
3,CYCLING,0.00,0.28,HIGH,2023-05-04 09:19:10,2023-05-04 09:21:12,4,5,2023,2023-05-04,Thursday,9.34,both
4,CYCLING,89.21,0.44,HIGH,2023-05-05 08:36:23,2023-05-05 08:40:31,5,5,2023,2023-05-05,Friday,1.31,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,CYCLING,98.19,4.59,HIGH,2023-03-27 14:15:31,2023-03-27 14:41:48,27,3,2023,2023-03-27,Monday,9.58,both
112,CYCLING,93.62,1.19,HIGH,2023-03-28 10:07:31,2023-03-28 10:13:32,28,3,2023,2023-03-28,Tuesday,12.05,both
113,CYCLING,78.83,0.29,HIGH,2023-03-29 09:42:04,2023-03-29 09:45:33,29,3,2023,2023-03-29,Wednesday,12.46,both
114,CYCLING,97.60,1.12,HIGH,2023-03-30 18:11:15,2023-03-30 18:24:21,30,3,2023,2023-03-30,Thursday,2.55,both
