In [1]:
!pip install gpxpy
!pip install xlwt



In [2]:
import os
import pandas as pd
import numpy as np
import json
import os
import gpxpy
from datetime import datetime
from dateutil import parser

In [3]:
workouts_folder_path1 = "/content/drive/MyDrive/Dane z aplikacji/Endomondo/Workouts"
workouts_folder_path2 = "/content/drive/MyDrive/Dane z aplikacji/MapMyRun/user184238914_workout_history.csv"
workouts_folder_path3 = "/content/drive/MyDrive/Dane z aplikacji/Sports Tracker/Running workouts"
workouts_folder_path4 = "/content/drive/MyDrive/Dane z aplikacji/Garmin Connect/Aktywności/Bieganie.csv"

**df1 => Endomondo export**

In [4]:
dfs = []

files = [f for f in os.listdir(workouts_folder_path1) if f.endswith(".json")]

if files:
    for file_name in files:
        file_path = os.path.join(workouts_folder_path1, file_name)

        with open(file_path, 'r') as file:
            try:
                data = json.load(file)
                if isinstance(data, list) and len(data) > 0:
                    sport = 'Running' if data[0].get('sport') == 'RUNNING' else data[0].get('sport')
                    if sport == 'Running':
                        source = data[1].get('source')
                        source = source.replace("INPUT_MANUAL", "Endomondo (Manual)").replace("TRACK_MOBILE", "Endomondo (GPS)") if pd.notna(source) else 'Endomondo'
                        start_time = pd.to_datetime(data[3].get('start_time'))
                        end_time = pd.to_datetime(data[4].get('end_time'))
                        speed_avg_kmh = round(float(data[8].get('speed_avg_kmh')), 2) if data[8].get('speed_avg_kmh') is not None else None
                        duration = (str(data[5].get('duration_s')).split('.')[0])if data[5].get('duration_s') is not None else None
                        distance = round(float(data[6].get('distance_km')), 2) if duration is not None else None

                        dfs.append({
                            'sport': sport,
                            'source': source,
                            'date': start_time.date(),
                            'start_time': start_time.time(),
                            'end_time': end_time.time(),
                            'duration_s': duration,
                            'distance_km': distance,
                            'speed_avg_kmh': speed_avg_kmh
                        })

            except Exception as e:
                print(f"Error processing JSON file {file_name}: {e}")

    df1 = pd.DataFrame(dfs).sort_values(by='date').reset_index(drop=True)
    df1['date'] = df1['date'].astype(str).str.strip()
    df1['date'] = pd.to_datetime(df1['date'])
    df1['distance_km'] = df1['distance_km'].astype(float).round(2)
    df1['duration_s'] = pd.to_numeric(df1['duration_s'], errors='coerce').astype('Int64')
else:
    print("No JSON files found in the specified folder.")

**df2 => Map My Run export**

In [5]:
df = pd.read_csv(workouts_folder_path2, header=None, skip_blank_lines=False)

df2 = df[df[2] == 'Run'].copy()
df2.rename(columns={1: 'date', 2: 'sport', 4: 'distance_km', 5: 'duration_s', 8: 'speed_avg_kmh', 13: 'source2', 14:'url'}, inplace=True)
df2['sport'] = df2['sport'].apply(lambda x: 'Running' if x == 'Run' else x)
df2['source'] = 'Map My Run app'
df2.drop(columns=[0, 3, 6, 7, 9, 10, 11, 12], inplace=True)

df2['date'] = df2['date'].apply(lambda x: parser.parse(x).strftime('%Y-%m-%d'))
df2['date'] = pd.to_datetime(df2['date'])
df2['distance_km'] = df2['distance_km'].astype(float).round(2)
df2['duration_s'] = df2['duration_s'].astype(int)
df2['speed_avg_kmh'] = df2['speed_avg_kmh'].astype(float).round(2)

df2 = df2[['sport', 'source', 'date', 'duration_s', 'distance_km', 'speed_avg_kmh']]

**df3 => export from Sports Tracker app**

In [6]:
gpx_data_list = []

for filename in os.listdir(workouts_folder_path3):
    if filename.endswith(".gpx"):
        gpx_file_path = os.path.join(workouts_folder_path3, filename)

        with open(gpx_file_path, "r") as gpx_file:
            gpx_data = gpxpy.parse(gpx_file)

            if gpx_data.tracks and gpx_data.tracks[0].segments and gpx_data.tracks[0].segments[0].points:
                start_time = gpx_data.tracks[0].segments[0].points[0].time

                date = start_time.strftime("%Y-%m-%d")
                start_time = start_time.strftime("%H:%M:%S")
                end_time = gpx_data.tracks[0].segments[0].points[-1].time
                duration_seconds = int((end_time - gpx_data.tracks[0].segments[0].points[0].time).total_seconds())

                distance_km = round((gpx_data.length_3d() / 1000), 2)
                speed_avg_kmh = round(distance_km / (duration_seconds / 3600), 2)

                gpx_data_list.append({
                    "sport" : "Running",
                    "source" : "Sports Tracker",
                    "title": np.nan,
                    "date": date,
                    "start_time": start_time,
                    "duration_s": duration_seconds,
                    "distance_km": distance_km,
                    "speed_avg_kmh": speed_avg_kmh
                })

df3 = pd.DataFrame(gpx_data_list)
df3['date'] = pd.to_datetime(df3['date'])
mask = df3['date'] > '2020-12-01'
df3 = df3[~mask]

**df4 => export from Garmin Connect till the end of 2023**

In [7]:
def convert_pace_to_speed(pace):
    pace_in_seconds = int(pace.split(':')[0]) * 60 + int(pace.split(':')[1])
    speed_in_kph = 3600 / pace_in_seconds
    return speed_in_kph

df4 = pd.read_csv(workouts_folder_path4, skip_blank_lines=False)
df4['source'] = 'Garmin Connect'
df4['Typ aktywności'] = df4['Typ aktywności'].replace('Bieganie', 'Running')
df4['Czas'] = pd.to_timedelta(df4['Czas']).dt.total_seconds()
df4['Czas'] = df4['Czas'].astype('int64')

df4['Data'] = pd.to_datetime(df4['Data'])
df4['start_time'] = df4['Data'].dt.time
df4['Data'] = df4['Data'].dt.date

df4.rename(columns={'Typ aktywności': 'sport', "Data": 'date', "Czas": "duration_s", "Dystans": "distance_km", "Średnie tempo": "speed_avg_kmh", "Tytuł": "title"}, inplace=True)
df4['speed_avg_kmh'] = df4['speed_avg_kmh'].apply(convert_pace_to_speed).round(2)
df4 = df4[['sport', 'source', 'title', 'date', 'start_time', 'duration_s', 'distance_km', 'speed_avg_kmh']]
df4['date'] = pd.to_datetime(df4['date'])
df4['title'] = df4['title'].apply(lambda x: np.nan if x == 'Bieganie' else x)

**df5 => manual load from Google Sheet file**

In [8]:
data = [
    ['17-11-2020', '18:47:00', 'Olsztyn', '10,00', '00:58:53', '10,19'],
    ['19-11-2020', '17:58:00', 'Olsztyn', '9,96', '01:01:05', '9,78'],
    ['22-11-2020', '11:09:00', 'Olsztyn', '9,62', '00:59:39', '9,68'],
    ['27-11-2020', '15:59:00', 'Olsztyn', '9,86', '01:00:18', '9,81'],
    ['29-11-2020', '12:46:00', 'Olsztyn', '7,17', '00:41:30', '10,37']
]

df = pd.DataFrame(data, columns=['date', 'start_time', 'location', 'distance_km', 'duration', 'speed_avg_kmh'])
df['date'] = pd.to_datetime(df['date'])
df['distance_km'] = df['distance_km'].str.replace(',', '.').astype(float)
df['duration_s'] = pd.to_timedelta(df['duration']).dt.total_seconds()
df['sport'] = 'Running'
df['source'] = 'Manual load'
df['title'] = np.nan
df['end_time'] = np.nan
df5 = df[['sport', 'source', 'title', 'date', 'start_time', 'end_time', 'distance_km', 'duration_s', 'speed_avg_kmh']]

  df['date'] = pd.to_datetime(df['date'])


**df12 => merged df1 with df2**

In [9]:
merged_df = pd.merge(df2, df1, on=['date', 'distance_km'], how='left')

df12 = pd.DataFrame()
df12['sport'] = merged_df['sport_x']
df12['source'] = merged_df['source_x'] + ', ' + merged_df['source_y']
df12['date'] = merged_df['date']
df12['start_time'] = merged_df['start_time']
df12['end_time'] = merged_df['end_time']
df12['distance_km'] = merged_df['distance_km']
df12['duration_s'] = merged_df['duration_s_x'].combine_first(merged_df['duration_s_y'])
df12['speed_avg_kmh'] = merged_df['speed_avg_kmh_x'].combine_first(merged_df['speed_avg_kmh_y'])
df12['mismatch'] = ''

mask_duration_not_nan = ~merged_df['duration_s_x'].isna() & ~merged_df['duration_s_y'].isna()
mask_speed_not_nan = ~merged_df['speed_avg_kmh_x'].isna() & ~merged_df['speed_avg_kmh_y'].isna()

tolerance_speed = 0.01

duration_mismatch = mask_duration_not_nan & (merged_df['duration_s_x'] != merged_df['duration_s_y'])
speed_mismatch = mask_speed_not_nan & (abs(merged_df['speed_avg_kmh_x'] - merged_df['speed_avg_kmh_y']) > tolerance_speed)

df12.loc[duration_mismatch, 'mismatch'] += 'duration_s; '
df12.loc[speed_mismatch, 'mismatch'] += f'speed_avg_kmh, {tolerance_speed}, values ({merged_df.loc[speed_mismatch, "speed_avg_kmh_x"].astype(str)}, {merged_df.loc[speed_mismatch, "speed_avg_kmh_y"].astype(str)}); '

mismatches = df12[df12['mismatch'] != '']

if not mismatches.empty:
  display(mismatches)
else:
  df12 = df12.drop('mismatch', axis=1)
  df12['title'] = np.nan

**df123 => merged df12 with df3**

In [10]:
df123 = pd.concat([df12, df3], ignore_index=True)

**df1234 => merged df123 with df4**

In [11]:
df1234 = pd.concat([df123, df4], ignore_index=True)

**df12345 => merged df1234 with df5**

In [12]:
df12345 = pd.concat([df1234, df5], ignore_index=True)

In [13]:
df12345['type'] = df12345['title'].apply(lambda x: 'Workout' if pd.isna(x) else 'Sport event')
df12345 = df12345[['source', 'sport', 'type', 'title', 'date', 'start_time', 'end_time', 'distance_km', 'duration_s', 'speed_avg_kmh']]
df12345 = df12345.sort_values(by='date')

In [14]:
selected_dates = df12345.loc[df12345['title'].notna() & ~df12345['title'].str.contains('Bieganie', na=False), 'date']
rows_to_remove = df12345[(df12345['date'].isin(selected_dates)) & pd.isna(df12345['source'])]
df12345 = df12345.drop(rows_to_remove.index)

In [15]:
selected_dates = ['2018-06-09', '2020-12-17']
df12345 = df12345.drop(df12345[(df12345['date'].isin(selected_dates)) & pd.isna(df12345['source'])].index)

In [16]:
selected_dates = ['2020-01-01','2020-01-11','2020-01-12','2020-01-19', '2020-01-28']
df12345 = df12345.drop(df12345[(
    (df12345['date'].isin(selected_dates))
    & (df12345['source'] == 'Garmin Connect')
    & ((df12345['title'].isna()) | (df12345['title'] == 'Olsztyn Bieg'))
)].index)

In [17]:
duplicate_rows = df12345[df12345.duplicated(subset=['date', 'duration_s', 'distance_km'], keep=False)]

if not duplicate_rows.empty:
  condition = pd.isna(duplicate_rows['title'])
  df12345 = df12345[df12345.index.isin(duplicate_rows[condition].index)]
  print(f"Deleted {count} rows.")
else:
    print("No duplicated rows.")

No duplicated rows.


**Checks**

In [18]:
x = df12345.loc[df12345['type']== 'Sport event']
display(x)

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
873,Garmin Connect,Running,Sport event,15. PKO Cracovia Maraton,2016-05-15,09:00:00,,42.2,13376.0,11.36
872,Garmin Connect,Running,Sport event,III Bieg Uniwersytecki,2016-06-04,12:00:00,,10.0,2767.0,13.0
871,Garmin Connect,Running,Sport event,I Spartakiada Samorządowa,2016-08-26,11:15:00,,5.8,1524.0,13.69
870,Garmin Connect,Running,Sport event,Bieg Niepodległości z PKO Bankiem Polskim,2016-11-11,15:12:00,,10.0,2808.0,12.81
869,Garmin Connect,Running,Sport event,Bieg Mikołajów 2017,2017-12-02,12:28:00,,12.5,3515.0,12.81
868,Garmin Connect,Running,Sport event,SkyWayRun 2018,2018-06-10,01:40:00,,5.0,1396.0,12.9
867,Garmin Connect,Running,Sport event,B2Run Gdańsk,2018-06-12,17:00:00,,5.8,1439.0,14.52
866,Garmin Connect,Running,Sport event,Enea IRONMAN 70.3 Gdynia,2018-08-04,13:00:00,,5.0,1401.0,12.86
865,Garmin Connect,Running,Sport event,XVII Bieg Jakubowy,2019-04-28,11:00:00,,10.0,3146.0,11.43
864,Garmin Connect,Running,Sport event,(Za)dyszka ZETO Software,2019-07-28,11:00:00,,10.0,3566.0,10.08


In [19]:
df12345['start_time'] = pd.to_datetime(df12345['start_time'], format='%H:%M:%S').dt.time
df12345['start_time_minutes'] = df12345['start_time'].apply(lambda x: x.hour * 60 + x.minute if not pd.isnull(x) else None)

mask = (df12345.duplicated(subset=['date', 'start_time'], keep=False) &
        (df12345['start_time_minutes'].diff().abs() <= 61) &
        (df12345['source'] == 'Garmin Connect'))
df12345 = df12345[~mask]

In [20]:
df12345['year_month'] = df12345['date'].dt.to_period('M')

result1 = df12345.groupby('year_month')['distance_km'].sum().reset_index()
result1['year_month'] = result1['year_month'].dt.to_timestamp().dt.strftime('%Y-%m')

In [21]:
google_sheets_link = 'https://docs.google.com/spreadsheets/d/1sKQRBWY5xvY-erhFxXo0bC0pTpZT6QUUMx1fjLngGk0/edit?usp=drive_link'
document_key = google_sheets_link.split('/')[-2]
worksheet_link = f'https://docs.google.com/spreadsheets/d/{document_key}/export?format=csv'

dfgoogle = pd.read_csv(worksheet_link, header=1)
dfgoogle = dfgoogle.drop(columns=['Unnamed: 0'])
dfgoogle = dfgoogle.iloc[:, :-1]
dfgoogle = dfgoogle.iloc[:-1,:]
dfgoogle.replace('-', np.nan, inplace=True)

In [22]:
result2 = pd.DataFrame(columns=['year_month', 'distance_km'])

for index, row in dfgoogle.iterrows():
    for col in dfgoogle.columns[1:]:
        year_month = f'{col}-{index + 1:02d}'
        distance_km = row[col]
        result2 = pd.concat([result2, pd.DataFrame({'year_month': [year_month], 'distance_km': [distance_km]})], ignore_index=True)

result2 = result2.sort_values(by='year_month')
result2 = result2[result2['year_month'].apply(lambda x: int(x.split('-')[1])) != 13]

result2['distance_km'] = result2['distance_km'].str.replace(',', '.').astype(float)
result2['distance_km'] = pd.to_numeric(result2['distance_km'])

In [23]:
result1 = result1.rename(columns={'distance_km': 'result1_distance_km'})
result2 = result2.rename(columns={'distance_km': 'result2_distance_km'})

result = pd.concat([result1.set_index('year_month'), result2.set_index('year_month')], axis=1, join='outer')
result['absolute_difference'] = abs(result['result2_distance_km'] - result['result1_distance_km'])
filtered_df = result[result['absolute_difference'] > 0.03].copy()
display(filtered_df)

Unnamed: 0_level_0,result1_distance_km,result2_distance_km,absolute_difference
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06,32.75,32.15,0.6
2020-12,63.55,63.38,0.17
2021-08,127.11,126.96,0.15
2021-09,120.99,120.55,0.44


In [24]:
#selected_rows = df12345[(df12345['date'] >= '2020-12-01') & (df12345['date'] <= '2020-12-31')]
#display(selected_rows)

In [25]:
x = df12345.loc[df12345['type']== 'Sport event']
display(x)

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,start_time_minutes,year_month
873,Garmin Connect,Running,Sport event,15. PKO Cracovia Maraton,2016-05-15,09:00:00,,42.2,13376.0,11.36,540.0,2016-05
872,Garmin Connect,Running,Sport event,III Bieg Uniwersytecki,2016-06-04,12:00:00,,10.0,2767.0,13.0,720.0,2016-06
871,Garmin Connect,Running,Sport event,I Spartakiada Samorządowa,2016-08-26,11:15:00,,5.8,1524.0,13.69,675.0,2016-08
870,Garmin Connect,Running,Sport event,Bieg Niepodległości z PKO Bankiem Polskim,2016-11-11,15:12:00,,10.0,2808.0,12.81,912.0,2016-11
869,Garmin Connect,Running,Sport event,Bieg Mikołajów 2017,2017-12-02,12:28:00,,12.5,3515.0,12.81,748.0,2017-12
868,Garmin Connect,Running,Sport event,SkyWayRun 2018,2018-06-10,01:40:00,,5.0,1396.0,12.9,100.0,2018-06
867,Garmin Connect,Running,Sport event,B2Run Gdańsk,2018-06-12,17:00:00,,5.8,1439.0,14.52,1020.0,2018-06
866,Garmin Connect,Running,Sport event,Enea IRONMAN 70.3 Gdynia,2018-08-04,13:00:00,,5.0,1401.0,12.86,780.0,2018-08
865,Garmin Connect,Running,Sport event,XVII Bieg Jakubowy,2019-04-28,11:00:00,,10.0,3146.0,11.43,660.0,2019-04
864,Garmin Connect,Running,Sport event,(Za)dyszka ZETO Software,2019-07-28,11:00:00,,10.0,3566.0,10.08,660.0,2019-07


In [26]:
excel_file_path = '/content/drive/MyDrive/Colab Notebooks/Tableau/Outputs/workouts.xls'
df12345 = df12345.drop('start_time_minutes', 'year_month', )

#df12345.to_excel(excel_file_path, index=False)

  df12345 = df12345.drop('start_time_minutes', 'year_month', )


ValueError: No axis named year_month for object type DataFrame