In [1]:
!pip install gpxpy
!pip install xlwt



In [2]:
import os
import pandas as pd
import numpy as np
import json
import os
import gpxpy
from datetime import datetime
from dateutil import parser

In [3]:
# Locations of the raw exports, one per tracking app.
# NOTE(review): the /content/drive prefix assumes Google Drive is already mounted
# (no mount call is visible in this notebook) — confirm before a fresh run.

# Endomondo: folder that is scanned for per-workout *.json files.
workouts_folder_path1 = "/content/drive/MyDrive/Dane z aplikacji/Endomondo/Workouts"
# Map My Run: a single CSV file with the workout history.
workouts_folder_path2 = "/content/drive/MyDrive/Dane z aplikacji/MapMyRun/user184238914_workout_history.csv"
# Sports Tracker: folder that is scanned for *.gpx files.
workouts_folder_path3 = "/content/drive/MyDrive/Dane z aplikacji/Sports Tracker/Running workouts"
# Garmin Connect: a single CSV file with the running activities.
workouts_folder_path4 = "/content/drive/MyDrive/Dane z aplikacji/Garmin Connect/Aktywności/Bieganie.csv"

**df1 => Endomondo export**

In [4]:
# Build df1 from the Endomondo export: one JSON file per workout, running only.
# Each file is expected to hold a JSON list of small dicts; the fields are read
# by fixed position (data[0] -> sport, data[1] -> source, data[3] -> start_time,
# data[4] -> end_time, data[5] -> duration_s, data[6] -> distance_km,
# data[8] -> speed_avg_kmh).
# NOTE(review): a file that orders its fields differently yields None for the
# affected values (or is skipped with an error message) — confirm against the
# export format.
endomondo_columns = [
    'sport', 'source', 'date', 'start_time', 'end_time',
    'duration_s', 'distance_km', 'speed_avg_kmh',
]
dfs = []

files = [f for f in os.listdir(workouts_folder_path1) if f.endswith(".json")]

if not files:
    print("No JSON files found in the specified folder.")

for file_name in files:
    file_path = os.path.join(workouts_folder_path1, file_name)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
            if isinstance(data, list) and len(data) > 0:
                sport = 'Running' if data[0].get('sport') == 'RUNNING' else data[0].get('sport')
                if sport == 'Running':
                    source = data[1].get('source')
                    source = source.replace("INPUT_MANUAL", "Endomondo (Manual)").replace("TRACK_MOBILE", "Endomondo (GPS)") if pd.notna(source) else 'Endomondo'
                    start_time = pd.to_datetime(data[3].get('start_time'))
                    end_time = pd.to_datetime(data[4].get('end_time'))

                    raw_speed = data[8].get('speed_avg_kmh')
                    raw_duration = data[5].get('duration_s')
                    raw_distance = data[6].get('distance_km')

                    speed_avg_kmh = round(float(raw_speed), 2) if raw_speed is not None else None
                    # Keep only the whole-second part of the duration (text before the decimal point).
                    duration = str(raw_duration).split('.')[0] if raw_duration is not None else None
                    # Each value is guarded by its own presence; the distance used to be
                    # gated on the duration, which dropped valid distances and raised a
                    # TypeError when only the distance was missing.
                    distance = round(float(raw_distance), 2) if raw_distance is not None else None

                    dfs.append({
                        'sport': sport,
                        'source': source,
                        'date': start_time.date(),
                        'start_time': start_time.time(),
                        'end_time': end_time.time(),
                        'duration_s': duration,
                        'distance_km': distance,
                        'speed_avg_kmh': speed_avg_kmh
                    })

        except Exception as e:
            # Best effort: report the broken file and continue with the others.
            print(f"Error processing JSON file {file_name}: {e}")

# Passing the column list keeps df1 well-formed (and defined) even when no
# running workout was found, so the later merge does not hit a NameError/KeyError.
df1 = pd.DataFrame(dfs, columns=endomondo_columns).sort_values(by='date').reset_index(drop=True)
df1['date'] = df1['date'].astype(str).str.strip()
df1['date'] = pd.to_datetime(df1['date'])
df1['distance_km'] = df1['distance_km'].astype(float).round(2)
# Nullable integer dtype so that missing durations stay <NA> instead of forcing floats.
df1['duration_s'] = pd.to_numeric(df1['duration_s'], errors='coerce').astype('Int64')

**df2 => Map My Run export**

In [5]:
# The Map My Run history is read without a header row, so columns are addressed
# by position; the header line itself is discarded by the 'Run' filter below.
df = pd.read_csv(workouts_folder_path2, header=None, skip_blank_lines=False)

mmr_column_names = {1: 'date', 2: 'sport', 4: 'distance_km', 5: 'duration_s', 8: 'speed_avg_kmh', 13: 'source2', 14:'url'}
mmr_unused_columns = [0, 3, 6, 7, 9, 10, 11, 12]

mmr_runs = (
    df.loc[df[2] == 'Run']
    .copy()
    .rename(columns=mmr_column_names)
    .drop(columns=mmr_unused_columns)
)

mmr_runs = mmr_runs.assign(
    sport=lambda frame: frame['sport'].replace('Run', 'Running'),
    source='Map My Run app',
    # Normalise whatever date format the export uses to a midnight timestamp.
    date=lambda frame: pd.to_datetime(
        frame['date'].map(lambda raw: parser.parse(raw).strftime('%Y-%m-%d'))
    ),
    distance_km=lambda frame: frame['distance_km'].astype(float).round(2),
    duration_s=lambda frame: frame['duration_s'].astype(int),
    speed_avg_kmh=lambda frame: frame['speed_avg_kmh'].astype(float).round(2),
)

df2 = mmr_runs[['sport', 'source', 'date', 'duration_s', 'distance_km', 'speed_avg_kmh']]

**df3 => export from Sports Tracker app**

In [6]:
# Build df3 from the Sports Tracker export: one GPX file per workout.
# Start time and duration come from the first and last point of the first
# segment of the first track; the distance is the 3D length of the whole file.
gpx_columns = [
    "sport", "source", "title", "date", "start_time",
    "duration_s", "distance_km", "speed_avg_kmh",
]
gpx_data_list = []

for filename in os.listdir(workouts_folder_path3):
    if not filename.endswith(".gpx"):
        continue

    gpx_file_path = os.path.join(workouts_folder_path3, filename)

    with open(gpx_file_path, "r") as gpx_file:
        gpx_data = gpxpy.parse(gpx_file)

    # Skip files without any recorded point.
    if not (gpx_data.tracks and gpx_data.tracks[0].segments and gpx_data.tracks[0].segments[0].points):
        continue

    segment_points = gpx_data.tracks[0].segments[0].points
    first_point_time = segment_points[0].time
    end_time = segment_points[-1].time

    date = first_point_time.strftime("%Y-%m-%d")
    start_time = first_point_time.strftime("%H:%M:%S")
    duration_seconds = int((end_time - first_point_time).total_seconds())

    distance_km = round((gpx_data.length_3d() / 1000), 2)
    # A track whose first and last point share a timestamp has no meaningful
    # speed; store NaN instead of failing with ZeroDivisionError.
    speed_avg_kmh = round(distance_km / (duration_seconds / 3600), 2) if duration_seconds else np.nan

    gpx_data_list.append({
        "sport" : "Running",
        "source" : "Sports Tracker",
        "title": np.nan,
        "date": date,
        "start_time": start_time,
        "duration_s": duration_seconds,
        "distance_km": distance_km,
        "speed_avg_kmh": speed_avg_kmh
    })

# Explicit columns keep the frame usable even when no GPX file produced a record.
df3 = pd.DataFrame(gpx_data_list, columns=gpx_columns)
df3['date'] = pd.to_datetime(df3['date'])
# Only workouts before 2020-12-19 are kept.
# NOTE(review): presumably later runs are covered by the Garmin export — confirm.
mask = df3['date'] >= '2020-12-19'
df3 = df3[~mask]


**df4 => export from Garmin Connect till the end of 2023**

In [7]:
def convert_pace_to_speed(pace):
    """Convert a pace given as ``"minutes:seconds"`` per kilometre to km/h.

    Parameters
    ----------
    pace : str
        Pace text such as ``"5:30"``. Only the first two colon-separated
        fields are used (minutes, then seconds).

    Returns
    -------
    float
        ``3600 / pace_in_seconds``. ``nan`` is returned when the pace is
        missing or not a string, cannot be parsed as two integers (for
        example a ``"--"`` placeholder), or is not positive — so one bad row
        no longer aborts a whole ``Series.apply``.
    """
    if not isinstance(pace, str):
        return float('nan')

    fields = pace.split(':')
    try:
        pace_in_seconds = int(fields[0]) * 60 + int(fields[1])
    except (ValueError, IndexError):
        return float('nan')

    # Zero would divide by zero; a negative pace has no physical meaning.
    if pace_in_seconds <= 0:
        return float('nan')

    speed_in_kph = 3600 / pace_in_seconds
    return speed_in_kph

# Build df4 from the Garmin Connect CSV. The export uses Polish column names,
# which are renamed below to the schema shared with the other sources.
df4 = pd.read_csv(workouts_folder_path4, skip_blank_lines=False)
df4['source'] = 'Garmin Connect'
df4['Typ aktywności'] = df4['Typ aktywności'].replace('Bieganie', 'Running')
# Elapsed time text -> whole seconds.
df4['Czas'] = pd.to_timedelta(df4['Czas']).dt.total_seconds().astype('int64')

# Split the activity timestamp into a time of day and a midnight-normalised date
# (same result as the former .dt.date -> pd.to_datetime round trip).
activity_start = pd.to_datetime(df4['Data'])
df4['start_time'] = activity_start.dt.time
df4['Data'] = activity_start.dt.normalize()

df4 = df4.rename(columns={'Typ aktywności': 'sport', "Data": 'date', "Czas": "duration_s", "Dystans": "distance_km", "Średnie tempo": "speed_avg_kmh", "Tytuł": "title"})
# The renamed column still holds a pace string at this point; convert it to km/h.
df4['speed_avg_kmh'] = df4['speed_avg_kmh'].apply(convert_pace_to_speed).round(2)
# .copy() makes the column subset an independent frame, so the assignment below
# does not write into a slice (SettingWithCopyWarning).
df4 = df4[['sport', 'source', 'title', 'date', 'start_time', 'duration_s', 'distance_km', 'speed_avg_kmh']].copy()
# The bare title 'Bieganie' carries no information; treat it as missing.
df4['title'] = df4['title'].apply(lambda x: np.nan if x == 'Bieganie' else x)

**df12 => df2 (Map My Run) left-joined with df1 (Endomondo) on date and distance**

In [8]:
# Left join: every Map My Run row (df2) is kept and enriched with the Endomondo
# row (df1) that has the same date and the same rounded distance.
merged_df = pd.merge(df2, df1, on=['date', 'distance_km'], how='left')

df12 = pd.DataFrame()
df12['sport'] = merged_df['sport_x']
# The result is NaN when there is no Endomondo match (string + NaN); later cells
# use pd.isna(source) to find those rows, so this is left as is on purpose.
df12['source'] = merged_df['source_x'] + ', ' + merged_df['source_y']
df12['date'] = merged_df['date']
df12['start_time'] = merged_df['start_time']
df12['end_time'] = merged_df['end_time']
df12['distance_km'] = merged_df['distance_km']
# Prefer the Map My Run value and fall back to Endomondo.
df12['duration_s'] = merged_df['duration_s_x'].combine_first(merged_df['duration_s_y'])
df12['speed_avg_kmh'] = merged_df['speed_avg_kmh_x'].combine_first(merged_df['speed_avg_kmh_y'])
df12['mismatch'] = ''

# Only rows where both sources report a value can disagree.
mask_duration_not_nan = ~merged_df['duration_s_x'].isna() & ~merged_df['duration_s_y'].isna()
mask_speed_not_nan = ~merged_df['speed_avg_kmh_x'].isna() & ~merged_df['speed_avg_kmh_y'].isna()

tolerance_speed = 0.01

duration_mismatch = mask_duration_not_nan & (merged_df['duration_s_x'] != merged_df['duration_s_y'])
speed_mismatch = mask_speed_not_nan & (abs(merged_df['speed_avg_kmh_x'] - merged_df['speed_avg_kmh_y']) > tolerance_speed)

df12.loc[duration_mismatch, 'mismatch'] += 'duration_s; '
if speed_mismatch.any():
    # Build the message per row. Formatting the Series inside a single f-string
    # pasted the repr of the whole column into every mismatching row.
    speed_left = merged_df.loc[speed_mismatch, 'speed_avg_kmh_x'].astype(str)
    speed_right = merged_df.loc[speed_mismatch, 'speed_avg_kmh_y'].astype(str)
    df12.loc[speed_mismatch, 'mismatch'] += (
        f'speed_avg_kmh, {tolerance_speed}, values (' + speed_left + ', ' + speed_right + '); '
    )

mismatches = df12[df12['mismatch'] != '']

if not mismatches.empty:
  # Show the conflicting rows for manual review; df12 keeps its 'mismatch' column.
  display(mismatches)
else:
  df12 = df12.drop('mismatch', axis=1)
  df12['title'] = np.nan

**df123 => df12 concatenated with df3**

In [9]:
# Stack the merged Map My Run/Endomondo rows on top of the Sports Tracker rows,
# keep the shared columns in a fixed order, and order the result by date.
df123_columns = ['sport', 'source', 'title', 'date', 'start_time', 'end_time', 'distance_km', 'duration_s', 'speed_avg_kmh']
df123 = (
    pd.concat([df12, df3], ignore_index=True)
    .loc[:, df123_columns]
    .sort_values(by='date')
)

**df1234 => df123 concatenated with df4**

In [10]:
# Append the Garmin Connect rows and order the combined table by date.
all_sources = [df123, df4]
df1234 = pd.concat(all_sources, ignore_index=True).sort_values(by='date')

In [11]:
# Dates that have a row with a real title (not missing, not containing 'Bieganie').
has_title = df1234['title'].notna()
has_generic_title = df1234['title'].str.contains('Bieganie', na=False)
selected_dates = df1234.loc[has_title & ~has_generic_title, 'date']

# On those dates, rows without a source are dropped.
on_selected_date = df1234['date'].isin(selected_dates)
rows_to_remove = df1234[on_selected_date & df1234['source'].isna()]
print("Rows to be removed:")
print(rows_to_remove)

df1234 = df1234.drop(index=rows_to_remove.index)

Rows to be removed:
       sport source title       date start_time end_time  distance_km  \
385  Running    NaN   NaN 2016-05-15        NaN      NaN         42.2   
389  Running    NaN   NaN 2016-06-04        NaN      NaN         10.0   
410  Running    NaN   NaN 2016-08-26        NaN      NaN          5.8   
422  Running    NaN   NaN 2016-11-11        NaN      NaN         10.0   
499  Running    NaN   NaN 2017-12-02        NaN      NaN         12.5   
535  Running    NaN   NaN 2018-06-12        NaN      NaN          5.2   
543  Running    NaN   NaN 2018-08-04        NaN      NaN          5.0   
572  Running    NaN   NaN 2019-04-28        NaN      NaN         10.0   
591  Running    NaN   NaN 2019-07-28        NaN      NaN         10.0   
628  Running    NaN   NaN 2020-01-12        NaN      NaN          1.0   
630  Running    NaN   NaN 2020-01-12        NaN      NaN          1.0   

     duration_s  speed_avg_kmh  
385       13376          11.36  
389        2767          13.01  
410 

In [12]:
# Drop the rows without a source on the hand-picked date below.
selected_dates = ['2018-06-09']
# Cast the strings explicitly: isin() on a datetime64 column with string values
# relies on implicit parsing that pandas has deprecated.
on_selected_date = df1234['date'].isin(pd.to_datetime(selected_dates))
df1234 = df1234.drop(df1234[on_selected_date & pd.isna(df1234['source'])].index)

In [13]:
# On the hand-picked dates below, drop the Garmin Connect rows that have no
# title or the title 'Olsztyn Bieg'.
selected_dates = ['2020-01-01','2020-01-11','2020-01-12','2020-01-19', '2020-01-28']
# Cast the strings explicitly: isin() on a datetime64 column with string values
# relies on implicit parsing that pandas has deprecated.
on_selected_date = df1234['date'].isin(pd.to_datetime(selected_dates))
df1234 = df1234.drop(df1234[(
    on_selected_date
    & (df1234['source'] == 'Garmin Connect')
    & ((df1234['title'].isna()) | (df1234['title'] == 'Olsztyn Bieg'))
)].index)


In [14]:
# List the distinct titles that are present and do not contain 'Bieganie'.
titled = df1234['title'].notna()
generic = df1234['title'].str.contains('Bieganie', na=False)
unique_titles = df1234.loc[titled & ~generic, 'title'].unique()
print(unique_titles)

['15. PKO Cracovia Maraton' 'III Bieg Uniwersytecki'
 'I Spartakiada Samorządowa' 'Bieg Niepodległości z PKO Bankiem Polskim'
 'Bieg Mikołajów 2017' 'SkyWayRun 2018' 'B2Run Gdańsk'
 'Enea IRONMAN 70.3 Gdynia' 'XVII Bieg Jakubowy'
 '(Za)dyszka ZETO Software' '8 Bieg WOŚP Policz się z cukrzycą (Bieg 7)'
 '8 Bieg WOŚP Policz się z cukrzycą (Bieg 8)'
 'Auto Idea XVIII Bieg Jakubowy' '43. Maraton Warszawski'
 'Poland Business Run 2023 (wirtualnie)']


**Checks**

In [15]:
# Bring every start time to a datetime.time value and derive minutes since midnight.
parsed_start = pd.to_datetime(df1234['start_time'], format='%H:%M:%S')
df1234['start_time'] = parsed_start.dt.time
df1234['start_time_minutes'] = df1234['start_time'].apply(
    lambda t: None if pd.isnull(t) else t.hour * 60 + t.minute
)

# A Garmin Connect row is removed when it shares date and start time with another
# row and starts within 61 minutes of the row directly above it in the current order.
same_date_and_start = df1234.duplicated(subset=['date', 'start_time'], keep=False)
close_to_previous_row = df1234['start_time_minutes'].diff().abs() <= 61
from_garmin = df1234['source'] == 'Garmin Connect'

mask = same_date_and_start & close_to_previous_row & from_garmin
df1234 = df1234[~mask]

In [16]:
# Monthly distance totals computed from the combined workouts table.
df1234['year_month'] = df1234['date'].dt.to_period('M')

monthly_distance = df1234.groupby('year_month')['distance_km'].sum()
result1 = monthly_distance.reset_index()
# Period -> 'YYYY-MM' text, the key format used for the comparison later on.
month_start = result1['year_month'].dt.to_timestamp()
result1['year_month'] = month_start.dt.strftime('%Y-%m')

In [17]:
# Download the reference sheet as CSV through the spreadsheet's export URL.
google_sheets_link = 'https://docs.google.com/spreadsheets/d/1sKQRBWY5xvY-erhFxXo0bC0pTpZT6QUUMx1fjLngGk0/edit?usp=drive_link'
link_parts = google_sheets_link.split('/')
# The document key is the path element just before the trailing 'edit?...' part.
document_key = link_parts[-2]
worksheet_link = f'https://docs.google.com/spreadsheets/d/{document_key}/export?format=csv'

dfgoogle = (
    pd.read_csv(worksheet_link, header=1)
    .drop(columns=['Unnamed: 0'])
    # Drop the last row and the last column of the sheet.
    .iloc[:-1, :-1]
    # '-' marks an empty cell in the sheet.
    .replace('-', np.nan)
)

In [18]:
# Reshape the reference sheet into one record per cell: the row position gives
# the month number (index + 1) and each column after the first gives the year label.
# Records are collected in a list and turned into a frame once; growing a
# DataFrame with pd.concat inside the loop is quadratic and starts from an
# empty frame, which newer pandas versions warn about.
monthly_records = []

for index, row in dfgoogle.iterrows():
    for col in dfgoogle.columns[1:]:
        monthly_records.append({
            'year_month': f'{col}-{index + 1:02d}',
            'distance_km': row[col],
        })

result2 = pd.DataFrame(monthly_records, columns=['year_month', 'distance_km'])
result2 = result2.sort_values(by='year_month')
# A 13th row is not a month; exclude it. .copy() keeps the assignment below
# from writing into a slice.
result2 = result2[result2['year_month'].apply(lambda x: int(x.split('-')[1])) != 13].copy()

# Decimal comma -> decimal point. astype(str) keeps this working when the sheet
# already delivered numeric values ('nan' is parsed back to NaN).
result2['distance_km'] = pd.to_numeric(
    result2['distance_km'].astype(str).str.replace(',', '.', regex=False)
)

In [19]:
# Put both monthly series side by side and show the months where they differ.
result1 = result1.rename(columns={'distance_km': 'result1_distance_km'})
result2 = result2.rename(columns={'distance_km': 'result2_distance_km'})

monthly_by_source = [result1.set_index('year_month'), result2.set_index('year_month')]
result = pd.concat(monthly_by_source, axis=1, join='outer')

distance_gap = result['result2_distance_km'] - result['result1_distance_km']
result['absolute_difference'] = distance_gap.abs()

# Differences of up to 0.03 km are ignored.
filtered_df = result.loc[result['absolute_difference'] > 0.03].copy()
display(filtered_df)

Unnamed: 0_level_0,result1_distance_km,result2_distance_km,absolute_difference
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06,32.75,32.15,0.6
2020-11,27.97,74.58,46.61
2020-12,65.16,63.38,1.78
2021-08,127.11,126.96,0.15
2021-09,120.99,120.55,0.44


In [20]:
# Inspect the workouts recorded in November 2020 (both bounds inclusive).
in_november_2020 = df1234['date'].between('2020-11-01', '2020-11-30')
selected_rows = df1234[in_november_2020]
display(selected_rows)

Unnamed: 0,sport,source,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,start_time_minutes,year_month
714,Running,"Map My Run app, Endomondo (GPS)",,2020-11-01,16:52:07,16:52:10,9.44,3239,10.49,1012.0,2020-11
715,Running,"Map My Run app, Endomondo (GPS)",,2020-11-04,19:53:52,19:53:55,9.37,3370,10.01,1193.0,2020-11
716,Running,"Map My Run app, Endomondo (Manual)",,2020-11-08,17:29:00,17:29:03,9.16,3163,10.43,1049.0,2020-11


In [21]:
# Find workouts recorded more than once (same date, duration and distance) and,
# after confirmation, delete the redundant untitled copies.
duplicate_keys = ['date', 'duration_s', 'distance_km']
duplicate_rows = df1234[df1234.duplicated(subset=duplicate_keys, keep=False)]

if not duplicate_rows.empty:
    # Rank titled rows first inside each duplicate group. Every row after the
    # first of its group is redundant, but only untitled rows are deleted, so a
    # group never loses all of its rows and titled rows are never touched.
    ranked = duplicate_rows.assign(_untitled=duplicate_rows['title'].isna()).sort_values('_untitled', kind='stable')
    redundant_rows = ranked[ranked.duplicated(subset=duplicate_keys, keep='first') & ranked['_untitled']]

    count = len(redundant_rows)
    print(f"{count} duplicated rows to be deleted")
    y = input("Confirm deletion ")

    if y.upper() == 'Y':
        # Remove the redundant rows. The previous mask was inverted: it kept ONLY
        # the untitled duplicates and discarded every other workout.
        df1234 = df1234.drop(index=redundant_rows.index)
        print(df1234)
        print(f"Deleted {count} rows.")
        duplicate_rows = df1234[df1234.duplicated(subset=duplicate_keys, keep=False)]
        if not duplicate_rows.empty:
          # Whatever is left here are duplicates that both carry a title.
          count = int(len(duplicate_rows) / 2)
          print(f"Found {count} duplicated rows.")
        else:
          print("No duplicated rows.")
          display(df1234)
else:
    print("No duplicated rows.")
    display(df1234)


No duplicated rows.


Unnamed: 0,sport,source,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,start_time_minutes,year_month
0,Running,"Map My Run app, Endomondo (Manual)",,2012-08-15,10:07:00,10:33:00,4.21,1560,9.72,607.0,2012-08
1,Running,"Map My Run app, Endomondo (Manual)",,2012-08-18,18:07:00,19:04:00,8.48,3420,8.93,1087.0,2012-08
2,Running,"Map My Run app, Endomondo (Manual)",,2012-08-20,18:30:00,19:19:00,6.73,2940,8.24,1110.0,2012-08
3,Running,"Map My Run app, Endomondo (Manual)",,2012-08-22,18:30:00,19:24:00,7.77,3240,8.63,1110.0,2012-08
4,Running,"Map My Run app, Endomondo (Manual)",,2012-08-26,18:21:00,19:13:00,7.77,3120,8.97,1101.0,2012-08
...,...,...,...,...,...,...,...,...,...,...,...
723,Running,Garmin Connect,Olsztyn Bieganie,2023-11-05,09:38:17,,5.65,2029,10.03,578.0,2023-11
722,Running,Garmin Connect,Olsztyn Bieganie,2023-12-16,15:46:52,,6.12,2389,9.21,946.0,2023-12
721,Running,Garmin Connect,Olsztyn Bieganie,2023-12-17,17:28:29,,5.99,2293,9.40,1048.0,2023-12
720,Running,Garmin Connect,,2023-12-20,12:48:45,,7.02,2775,9.11,768.0,2023-12


In [22]:
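# Optional export of the combined workouts table to Excel (currently disabled).
# NOTE: pandas 2.0 removed the xlwt writer, so the legacy .xls format can no longer
# be written there; switch the target to .xlsx before re-enabling these lines.
#excel_file_path = '/content/drive/MyDrive/Colab Notebooks/Tableau/Outputs/workouts.xls'
#df1234.to_excel(excel_file_path, index=False)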