In [1]:
# Install the gpxpy package for working with GPX files
!pip install gpxpy



In [2]:
import os
import pandas as pd
import numpy as np
import json
import os
import gpxpy
from datetime import datetime
from dateutil import parser

In [3]:
workouts_folder_path1 = "/content/drive/MyDrive/Dane z aplikacji/Endomondo/Workouts"
workouts_folder_path2 = "/content/drive/MyDrive/Dane z aplikacji/MapMyRun/user184238914_workout_history.csv"
workouts_folder_path3 = "/content/drive/MyDrive/Dane z aplikacji/Sports Tracker/Running workouts"
workouts_folder_path4 = "/content/drive/MyDrive/Dane z aplikacji/Garmin Connect/Aktywności/Bieganie.csv"
google_sheets_link_1 = 'https://docs.google.com/spreadsheets/d/1I6XjfT990f6rArjYedIf0nw5vsBye_L6vkMILD8YjZI/edit?usp=sharing'
google_sheets_link_2 = 'https://docs.google.com/spreadsheets/d/1sKQRBWY5xvY-erhFxXo0bC0pTpZT6QUUMx1fjLngGk0/edit?usp=drive_link'

**df1 => Endomondo export**

In [4]:
dfs = []

files = [f for f in os.listdir(workouts_folder_path1) if f.endswith(".json")]

if files:
    for file_name in files:
        file_path = os.path.join(workouts_folder_path1, file_name)

        with open(file_path, 'r') as file:
            try:
                data = json.load(file)
                if isinstance(data, list) and len(data) > 0:
                    sport = 'Running' if data[0].get('sport') == 'RUNNING' else data[0].get('sport')
                    if sport == 'Running':
                        source = data[1].get('source')
                        source = source.replace("INPUT_MANUAL", "Endomondo (Manual)").replace("TRACK_MOBILE", "Endomondo (GPS)") if pd.notna(source) else 'Endomondo'
                        start_time = pd.to_datetime(data[3].get('start_time'))
                        end_time = pd.to_datetime(data[4].get('end_time'))
                        speed_avg_kmh = round(float(data[8].get('speed_avg_kmh')), 2) if data[8].get('speed_avg_kmh') is not None else None
                        duration = (str(data[5].get('duration_s')).split('.')[0])if data[5].get('duration_s') is not None else None
                        distance = round(float(data[6].get('distance_km')), 2) if duration is not None else None

                        dfs.append({
                            'sport': sport,
                            'source': source,
                            'date': start_time.date(),
                            'start_time': start_time.time(),
                            'end_time': end_time.time(),
                            'duration_s': duration,
                            'distance_km': distance,
                            'speed_avg_kmh': speed_avg_kmh
                        })

            except Exception as e:
                print(f"Error processing JSON file {file_name}: {e}")

    df1 = pd.DataFrame(dfs).sort_values(by='date').reset_index(drop=True)
    df1['date'] = df1['date'].astype(str).str.strip()
    df1['date'] = pd.to_datetime(df1['date'])
    df1['distance_km'] = df1['distance_km'].astype(float).round(2)
    df1['duration_s'] = pd.to_numeric(df1['duration_s'], errors='coerce').astype('Int64')
else:
    print("No JSON files found in the specified folder.")


**df2 => Map My Run export**

In [5]:
df = pd.read_csv(workouts_folder_path2, header=None, skip_blank_lines=False)

df2 = df[df[2] == 'Run'].copy()
df2.rename(columns={1: 'date', 2: 'sport', 4: 'distance_km', 5: 'duration_s', 8: 'speed_avg_kmh', 13: 'source2', 14:'url'}, inplace=True)
df2['sport'] = df2['sport'].apply(lambda x: 'Running' if x == 'Run' else x)
df2['source'] = 'Map My Run app'
df2.drop(columns=[0, 3, 6, 7, 9, 10, 11, 12], inplace=True)

df2['date'] = df2['date'].apply(lambda x: parser.parse(x).strftime('%Y-%m-%d'))
df2['date'] = pd.to_datetime(df2['date'])
df2['distance_km'] = df2['distance_km'].astype(float).round(2)
df2['duration_s'] = df2['duration_s'].astype(int)
df2['speed_avg_kmh'] = df2['speed_avg_kmh'].astype(float).round(2)

df2 = df2[['sport', 'source', 'date', 'duration_s', 'distance_km', 'speed_avg_kmh']]

**df3 => export from Sports Tracker app**

In [6]:
gpx_data_list = []

for filename in os.listdir(workouts_folder_path3):
    if filename.endswith(".gpx"):
        gpx_file_path = os.path.join(workouts_folder_path3, filename)

        with open(gpx_file_path, "r") as gpx_file:
            gpx_data = gpxpy.parse(gpx_file)

            if gpx_data.tracks and gpx_data.tracks[0].segments and gpx_data.tracks[0].segments[0].points:
                start_time = gpx_data.tracks[0].segments[0].points[0].time

                date = start_time.strftime("%Y-%m-%d")
                start_time = start_time.strftime("%H:%M:%S")
                end_time = gpx_data.tracks[0].segments[0].points[-1].time
                duration_seconds = int((end_time - gpx_data.tracks[0].segments[0].points[0].time).total_seconds())

                distance_km = round((gpx_data.length_3d() / 1000), 2)
                speed_avg_kmh = round(distance_km / (duration_seconds / 3600), 2)

                gpx_data_list.append({
                    "sport" : "Running",
                    "source" : "Sports Tracker",
                    "title": np.nan,
                    "date": date,
                    "start_time": start_time,
                    "duration_s": duration_seconds,
                    "distance_km": distance_km,
                    "speed_avg_kmh": speed_avg_kmh
                })

df3 = pd.DataFrame(gpx_data_list)
df3['date'] = pd.to_datetime(df3['date'])

**df4 => export from Garmin Connect till the end of 2023**

In [7]:
def convert_pace_to_speed(pace):
    pace_in_seconds = int(pace.split(':')[0]) * 60 + int(pace.split(':')[1])
    speed_in_kph = 3600 / pace_in_seconds
    return speed_in_kph

df4 = pd.read_csv(workouts_folder_path4, skip_blank_lines=False)
df4['source'] = 'Garmin Connect'
df4['Typ aktywności'] = df4['Typ aktywności'].replace('Bieganie', 'Running')
df4['Czas'] = pd.to_timedelta(df4['Czas']).dt.total_seconds()
df4['Czas'] = df4['Czas'].astype('int64')

df4['Data'] = pd.to_datetime(df4['Data'])
df4['start_time'] = df4['Data'].dt.time
df4['Data'] = df4['Data'].dt.date

df4.rename(columns={'Typ aktywności': 'sport', "Data": 'date', "Czas": "duration_s", "Dystans": "distance_km", "Średnie tempo": "speed_avg_kmh", "Tytuł": "title"}, inplace=True)
df4['speed_avg_kmh'] = df4['speed_avg_kmh'].apply(convert_pace_to_speed).round(2)
df4 = df4[['sport', 'source', 'title', 'date', 'start_time', 'duration_s', 'distance_km', 'speed_avg_kmh']]
df4['date'] = pd.to_datetime(df4['date'])
df4['title'] = df4['title'].apply(lambda x: np.nan if 'Bieganie' in x else x)

**df5 => manual input from Google Sheet file**

In [8]:
data = [
    ['17-11-2020', '18:47:00', 'Olsztyn', '10,00', '00:58:53', '10,19'],
    ['19-11-2020', '17:58:00', 'Olsztyn', '9,96', '01:01:05', '9,78'],
    ['22-11-2020', '11:09:00', 'Olsztyn', '9,62', '00:59:39', '9,68'],
    ['27-11-2020', '15:59:00', 'Olsztyn', '9,86', '01:00:18', '9,81'],
    ['29-11-2020', '12:46:00', 'Olsztyn', '7,17', '00:41:30', '10,37'],
    ['01-12-2020', '20:06:00', 'Olsztyn', '10,08', '00:58:28', '10,34']
]

df = pd.DataFrame(data, columns=['date', 'start_time', 'location', 'distance_km', 'duration', 'speed_avg_kmh'])
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
df['distance_km'] = df['distance_km'].str.replace(',', '.').astype(float)
df['duration_s'] = pd.to_timedelta(df['duration']).dt.total_seconds()
df['sport'] = 'Running'
df['source'] = 'Manual input'
df['title'] = np.nan
df['end_time'] = np.nan
df5 = df[['sport', 'source', 'title', 'date', 'start_time', 'end_time', 'distance_km', 'duration_s', 'speed_avg_kmh']]

**df12 => merged df1 with df2**

In [9]:
merged_df = pd.merge(df2, df1, on=['date', 'distance_km'], how='left')

df12 = pd.DataFrame()
df12['sport'] = merged_df['sport_x']
df12['source'] = merged_df['source_x'] + ', ' + merged_df['source_y']
df12['date'] = merged_df['date']
df12['start_time'] = merged_df['start_time']
df12['end_time'] = merged_df['end_time']
df12['distance_km'] = merged_df['distance_km']
df12['duration_s'] = merged_df['duration_s_x'].combine_first(merged_df['duration_s_y'])
df12['speed_avg_kmh'] = merged_df['speed_avg_kmh_x'].combine_first(merged_df['speed_avg_kmh_y'])
df12['mismatch'] = ''

mask_duration_not_nan = ~merged_df['duration_s_x'].isna() & ~merged_df['duration_s_y'].isna()
mask_speed_not_nan = ~merged_df['speed_avg_kmh_x'].isna() & ~merged_df['speed_avg_kmh_y'].isna()

tolerance_speed = 0.01

duration_mismatch = mask_duration_not_nan & (merged_df['duration_s_x'] != merged_df['duration_s_y'])
speed_mismatch = mask_speed_not_nan & (abs(merged_df['speed_avg_kmh_x'] - merged_df['speed_avg_kmh_y']) > tolerance_speed)

df12.loc[duration_mismatch, 'mismatch'] += 'duration_s; '
df12.loc[speed_mismatch, 'mismatch'] += f'speed_avg_kmh, {tolerance_speed}, values ({merged_df.loc[speed_mismatch, "speed_avg_kmh_x"].astype(str)}, {merged_df.loc[speed_mismatch, "speed_avg_kmh_y"].astype(str)}); '

mismatches = df12[df12['mismatch'] != '']

if not mismatches.empty:
  display(mismatches)
else:
  df12 = df12.drop('mismatch', axis=1)
  df12['title'] = np.nan

**df123 => merged df12 with df3**

In [10]:
df123 = pd.concat([df12, df3], ignore_index=True)
df123 = df123[df123['source'] != 'Sports Tracker']

**df1234 => merged df123 with df4**

In [11]:
df1234 = pd.concat([df123, df4], ignore_index=True)

**df12345 => merged df1234 with df5**

In [12]:
df12345 = pd.concat([df1234, df5], ignore_index=True)
df12345['type'] = df12345['title'].apply(lambda x: 'Workout' if pd.isna(x) else 'Sport event')
df12345 = df12345[['source', 'sport', 'type', 'title', 'date', 'start_time', 'end_time', 'distance_km', 'duration_s', 'speed_avg_kmh']]
df12345 = df12345.sort_values(by='date')

In [13]:
selected_dates = df12345.loc[df12345['title'].notna() & ~df12345['title'].str.contains('Bieganie', na=False), 'date']
rows_to_remove = df12345[(df12345['date'].isin(selected_dates)) & pd.isna(df12345['source'])]
df123456 = df12345.drop(rows_to_remove.index)

In [14]:
selected_dates = ['2018-06-09', '2020-12-17']
df1234567 = df123456.drop(df123456[(df123456['date'].isin(selected_dates)) & pd.isna(df123456['source'])].index)

In [15]:
selected_dates = ['2020-01-01','2020-01-11','2020-01-12','2020-01-19', '2020-01-28']
df12345678 = df1234567.drop(df1234567[(
    (df1234567['date'].isin(selected_dates))
    & (df1234567['source'] == 'Garmin Connect')
    & ((df1234567['title'].isna()) | (df1234567['title'] == 'Olsztyn Bieg'))
)].index)

In [16]:
duplicate_rows = df12345678[df12345678.duplicated(subset=['date', 'duration_s', 'distance_km'], keep=False)]

if not duplicate_rows.empty:
  condition = pd.isna(duplicate_rows['title'])
  df123456789 = df12345678[df12345.index.isin(duplicate_rows[condition].index)]
else:
    df123456789 = df12345678

In [17]:
df123456789.loc[df123456789['title'] == '43. Maraton Warszawski', 'distance_km'] = 42.20
df123456789.loc[df123456789['title'] == '43. Maraton Warszawski', 'duration_s'] = 15849
df123456789.loc[df123456789['title'] == '43. Maraton Warszawski', 'source'] = df12345['source'] + ' | Manual input'
df123456789['speed_avg_kmh'] = (df123456789['distance_km'] / (df123456789['duration_s'] / 3600)).round(2)

In [18]:
document_key = google_sheets_link_1.split('/')[-2]
worksheet_link = f'https://docs.google.com/spreadsheets/d/{document_key}/export?format=csv'

dfgoogle_1 = pd.read_csv(worksheet_link, header=1)
dfgoogle_1['data'] = pd.to_datetime(dfgoogle_1['data'], format='%d-%m-%Y')

In [19]:
df_nans = df123456789.loc[df123456789['source'].isna()]
merged_df = pd.merge(df_nans, dfgoogle_1[['data', 'nazwa zawodów']], left_on='date', right_on='data', how='inner')
merged_df['title'] = merged_df['nazwa zawodów']
merged_df['source'] = "Google Sheets"
merged_df['type'] = "Sport event"
merged_df = merged_df.drop(['data', 'nazwa zawodów'], axis=1)


In [42]:
dates_to_remove = merged_df['date'].tolist()
df_filtered = df123456789[~((df123456789['date'].isin(dates_to_remove)) & (df123456789['source'].isna()))]
final = pd.concat([df_filtered, merged_df], ignore_index=True)
final.loc[(final['title'] == "(Za)dyszka ZETO Software") & (final['date'] == '2021-08-28'), 'distance_km'] = 10.00
final.loc[(final['title'] == "Auto Idea XVIII Bieg Jakubowy") & (final['date'] == '2021-08-28'), 'distance_km'] = 10.00
final.loc[(final['title'] == "Auto Idea XVIII Bieg Jakubowy") & (final['date'] == '2021-08-28'), 'duration_s'] = 2927

**Checks**

In [43]:
x = final.loc[final['type']== 'Sport event']
sorted_df = x.sort_values(by='date', ascending=False)
display(sorted_df)

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
824,Garmin Connect,Running,Sport event,Poland Business Run 2023 (wirtualnie),2023-09-03,18:16:03,,4.02,1276.0,11.34
759,Garmin Connect | Manual input,Running,Sport event,43. Maraton Warszawski,2021-09-26,08:30:00,,42.2,15849.0,9.59
754,Garmin Connect,Running,Sport event,Auto Idea XVIII Bieg Jakubowy,2021-09-05,11:00:48,,10.03,2929.0,12.33
749,Garmin Connect,Running,Sport event,(Za)dyszka ZETO Software,2021-08-28,11:05:00,,10.0,3227.0,11.32
599,Garmin Connect,Running,Sport event,8 Bieg WOŚP Policz się z cukrzycą (Bieg 8),2020-01-12,14:45:00,,1.0,254.0,14.17
600,Garmin Connect,Running,Sport event,8 Bieg WOŚP Policz się z cukrzycą (Bieg 7),2020-01-12,14:30:00,,1.0,246.0,14.63
563,Garmin Connect,Running,Sport event,(Za)dyszka ZETO Software,2019-07-28,11:00:00,,10.0,3566.0,10.1
544,Garmin Connect,Running,Sport event,XVII Bieg Jakubowy,2019-04-28,11:00:00,,10.0,3146.0,11.44
515,Garmin Connect,Running,Sport event,Enea IRONMAN 70.3 Gdynia,2018-08-04,13:00:00,,5.0,1401.0,12.85
507,Garmin Connect,Running,Sport event,B2Run Gdańsk,2018-06-12,17:00:00,,5.8,1439.0,14.51


In [44]:
final['start_time'] = pd.to_datetime(final['start_time'], format='%H:%M:%S').dt.time
final['start_time_minutes'] = final['start_time'].apply(lambda x: x.hour * 60 + x.minute if not pd.isnull(x) else None)

mask = (final.duplicated(subset=['date', 'start_time'], keep=False) &
        (final['start_time_minutes'].diff().abs() <= 61) &
        (final['source'] == 'Garmin Connect'))

final = final[~mask]

In [45]:
final['year_month'] = final['date'].dt.to_period('M')

result1 = final.groupby('year_month')['distance_km'].sum().reset_index()
result1['year_month'] = result1['year_month'].dt.to_timestamp().dt.strftime('%Y-%m')

In [46]:
document_key = google_sheets_link_2.split('/')[-2]
worksheet_link = f'https://docs.google.com/spreadsheets/d/{document_key}/export?format=csv'

dfgoogle2 = pd.read_csv(worksheet_link, header=1)
dfgoogle2 = dfgoogle2.drop(columns=['Unnamed: 0'])
dfgoogle2 = dfgoogle2.iloc[:, :-1]
dfgoogle2 = dfgoogle2.iloc[:-1,:]
dfgoogle2.replace('-', np.nan, inplace=True)

In [47]:
result2 = pd.DataFrame(columns=['year_month', 'distance_km'])

for index, row in dfgoogle2.iterrows():
    for col in dfgoogle2.columns[1:]:
        year_month = f'{col}-{index + 1:02d}'
        distance_km = row[col]
        result2 = pd.concat([result2, pd.DataFrame({'year_month': [year_month], 'distance_km': [distance_km]})], ignore_index=True)

result2 = result2.sort_values(by='year_month')
result2 = result2[result2['year_month'].apply(lambda x: int(x.split('-')[1])) != 13]

result2['distance_km'] = result2['distance_km'].str.replace(',', '.').astype(float)
result2['distance_km'] = pd.to_numeric(result2['distance_km'])

In [48]:
result1 = result1.rename(columns={'distance_km': 'result1_distance_km'})
result2 = result2.rename(columns={'distance_km': 'result2_distance_km'})

result = pd.concat([result1.set_index('year_month'), result2.set_index('year_month')], axis=1, join='outer')
result['absolute_difference'] = abs(result['result2_distance_km'] - result['result1_distance_km'])
filtered_df = result[result['absolute_difference'] > 0.02].copy()
display(filtered_df)

Unnamed: 0_level_0,result1_distance_km,result2_distance_km,absolute_difference
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06,32.75,32.15,0.6
2021-09,120.58,120.55,0.03


In [51]:
filtered_df = final[(final['date'].dt.year == 2021) & (final['date'].dt.month == 9)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,start_time_minutes,year_month
751,Garmin Connect,Running,Workout,,2021-09-02,20:18:40,,9.58,3270.0,10.55,1218.0,2021-09
752,Garmin Connect,Running,Workout,,2021-09-04,16:56:22,,9.0,3382.0,9.58,1016.0,2021-09
753,Garmin Connect,Running,Workout,,2021-09-04,16:47:49,,0.48,165.0,10.47,1007.0,2021-09
754,Garmin Connect,Running,Sport event,Auto Idea XVIII Bieg Jakubowy,2021-09-05,11:00:48,,10.03,2929.0,12.33,660.0,2021-09
755,Garmin Connect,Running,Workout,,2021-09-09,19:29:32,,9.19,3179.0,10.41,1169.0,2021-09
756,Garmin Connect,Running,Workout,,2021-09-12,07:40:45,,30.22,10781.0,10.09,460.0,2021-09
757,Garmin Connect,Running,Workout,,2021-09-19,16:29:00,,1.88,660.0,10.25,989.0,2021-09
758,Garmin Connect,Running,Workout,,2021-09-19,16:40:02,,8.0,2766.0,10.41,1000.0,2021-09
759,Garmin Connect | Manual input,Running,Sport event,43. Maraton Warszawski,2021-09-26,08:30:00,,42.2,15849.0,9.59,510.0,2021-09


In [50]:
filtered_df = df1[(df1['date'].dt.year == 2018) & (df1['date'].dt.month == 6)]
filtered_df

Unnamed: 0,sport,source,date,start_time,end_time,duration_s,distance_km,speed_avg_kmh
474,Running,Endomondo (GPS),2018-06-09,20:54:19,21:01:42,427,1.22,
475,Running,Endomondo (GPS),2018-06-17,07:09:20,07:09:23,3389,8.58,
476,Running,Endomondo (GPS),2018-06-28,18:24:31,19:41:49,4510,12.15,


In [29]:
filtered_df = df123[(df123['date'].dt.year == 2015) & (df123['date'].dt.month == 11)]
filtered_df

Unnamed: 0,sport,source,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,title
378,Running,"Map My Run app, Endomondo (GPS)",2015-11-29,20:17:47,21:03:39,8.21,2751,10.74,
379,Running,"Map My Run app, Endomondo (Manual)",2015-11-21,08:20:00,11:26:00,31.47,11160,10.15,
380,Running,"Map My Run app, Endomondo (GPS)",2015-11-15,08:20:09,10:06:14,19.42,6324,11.06,
381,Running,"Map My Run app, Endomondo (Manual)",2015-11-11,07:55:00,08:37:35,7.77,2555,10.95,
382,Running,"Map My Run app, Endomondo (Manual)",2015-11-08,08:24:00,10:20:13,21.95,6973,11.33,
383,Running,"Map My Run app, Endomondo (Manual)",2015-11-07,09:08:00,09:33:15,4.18,1515,9.93,
384,Running,,2015-11-07,,,5.0,1361,13.23,
385,Running,"Map My Run app, Endomondo (GPS)",2015-11-01,09:18:06,11:18:04,19.6,6981,10.11,


In [30]:
filtered_df = df1234[(df1234['date'].dt.year == 2015) & (df1234['date'].dt.month == 11)]
filtered_df

Unnamed: 0,sport,source,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,title
378,Running,"Map My Run app, Endomondo (GPS)",2015-11-29,20:17:47,21:03:39,8.21,2751,10.74,
379,Running,"Map My Run app, Endomondo (Manual)",2015-11-21,08:20:00,11:26:00,31.47,11160,10.15,
380,Running,"Map My Run app, Endomondo (GPS)",2015-11-15,08:20:09,10:06:14,19.42,6324,11.06,
381,Running,"Map My Run app, Endomondo (Manual)",2015-11-11,07:55:00,08:37:35,7.77,2555,10.95,
382,Running,"Map My Run app, Endomondo (Manual)",2015-11-08,08:24:00,10:20:13,21.95,6973,11.33,
383,Running,"Map My Run app, Endomondo (Manual)",2015-11-07,09:08:00,09:33:15,4.18,1515,9.93,
384,Running,,2015-11-07,,,5.0,1361,13.23,
385,Running,"Map My Run app, Endomondo (GPS)",2015-11-01,09:18:06,11:18:04,19.6,6981,10.11,


In [31]:
filtered_df = df12345[(df12345['date'].dt.year == 2015) & (df12345['date'].dt.month == 11)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
385,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-01,09:18:06,11:18:04,19.6,6981.0,10.11
384,,Running,Workout,,2015-11-07,,,5.0,1361.0,13.23
383,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-07,09:08:00,09:33:15,4.18,1515.0,9.93
382,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-08,08:24:00,10:20:13,21.95,6973.0,11.33
381,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-11,07:55:00,08:37:35,7.77,2555.0,10.95
380,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-15,08:20:09,10:06:14,19.42,6324.0,11.06
379,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-21,08:20:00,11:26:00,31.47,11160.0,10.15
378,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-29,20:17:47,21:03:39,8.21,2751.0,10.74


In [32]:
filtered_df = df123456[(df123456['date'].dt.year == 2015) & (df123456['date'].dt.month == 11)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
385,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-01,09:18:06,11:18:04,19.6,6981.0,10.11
384,,Running,Workout,,2015-11-07,,,5.0,1361.0,13.23
383,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-07,09:08:00,09:33:15,4.18,1515.0,9.93
382,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-08,08:24:00,10:20:13,21.95,6973.0,11.33
381,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-11,07:55:00,08:37:35,7.77,2555.0,10.95
380,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-15,08:20:09,10:06:14,19.42,6324.0,11.06
379,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-21,08:20:00,11:26:00,31.47,11160.0,10.15
378,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-29,20:17:47,21:03:39,8.21,2751.0,10.74


In [33]:
filtered_df = df1234567[(df1234567['date'].dt.year == 2015) & (df1234567['date'].dt.month == 11)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
385,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-01,09:18:06,11:18:04,19.6,6981.0,10.11
384,,Running,Workout,,2015-11-07,,,5.0,1361.0,13.23
383,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-07,09:08:00,09:33:15,4.18,1515.0,9.93
382,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-08,08:24:00,10:20:13,21.95,6973.0,11.33
381,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-11,07:55:00,08:37:35,7.77,2555.0,10.95
380,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-15,08:20:09,10:06:14,19.42,6324.0,11.06
379,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-21,08:20:00,11:26:00,31.47,11160.0,10.15
378,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-29,20:17:47,21:03:39,8.21,2751.0,10.74


In [34]:
filtered_df = df12345678[(df12345678['date'].dt.year == 2015) & (df12345678['date'].dt.month == 11)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
385,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-01,09:18:06,11:18:04,19.6,6981.0,10.11
384,,Running,Workout,,2015-11-07,,,5.0,1361.0,13.23
383,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-07,09:08:00,09:33:15,4.18,1515.0,9.93
382,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-08,08:24:00,10:20:13,21.95,6973.0,11.33
381,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-11,07:55:00,08:37:35,7.77,2555.0,10.95
380,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-15,08:20:09,10:06:14,19.42,6324.0,11.06
379,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-21,08:20:00,11:26:00,31.47,11160.0,10.15
378,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-29,20:17:47,21:03:39,8.21,2751.0,10.74


In [35]:
filtered_df = df123456789[(df123456789['date'].dt.year == 2015) & (df123456789['date'].dt.month == 11)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh
385,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-01,09:18:06,11:18:04,19.6,6981.0,10.11
384,,Running,Workout,,2015-11-07,,,5.0,1361.0,13.23
383,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-07,09:08:00,09:33:15,4.18,1515.0,9.93
382,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-08,08:24:00,10:20:13,21.95,6973.0,11.33
381,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-11,07:55:00,08:37:35,7.77,2555.0,10.95
380,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-15,08:20:09,10:06:14,19.42,6324.0,11.06
379,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-21,08:20:00,11:26:00,31.47,11160.0,10.15
378,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-29,20:17:47,21:03:39,8.21,2751.0,10.74


In [36]:
filtered_df = final[(final['date'].dt.year == 2015) & (final['date'].dt.month == 11)]
filtered_df

Unnamed: 0,source,sport,type,title,date,start_time,end_time,distance_km,duration_s,speed_avg_kmh,distance,start_time_minutes,year_month
306,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-01,09:18:06,11:18:04,19.6,6981.0,10.11,,558.0,2015-11
307,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-07,09:08:00,09:33:15,4.18,1515.0,9.93,,548.0,2015-11
308,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-08,08:24:00,10:20:13,21.95,6973.0,11.33,,504.0,2015-11
309,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-11,07:55:00,08:37:35,7.77,2555.0,10.95,,475.0,2015-11
310,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-15,08:20:09,10:06:14,19.42,6324.0,11.06,,500.0,2015-11
311,"Map My Run app, Endomondo (Manual)",Running,Workout,,2015-11-21,08:20:00,11:26:00,31.47,11160.0,10.15,,500.0,2015-11
312,"Map My Run app, Endomondo (GPS)",Running,Workout,,2015-11-29,20:17:47,21:03:39,8.21,2751.0,10.74,,1217.0,2015-11
858,Google Sheets,Running,Sport event,Grand Prix Olsztyn City Trail,2015-11-07,NaT,,5.0,1361.0,13.23,,,2015-11


In [37]:
excel_file_path = '/content/drive/MyDrive/Colab Notebooks/Tableau/Outputs/workouts.xls'

columns_to_drop = ['start_time_minutes', 'year_month']
existing_columns = set(df12345.columns)
columns_to_drop_exist = set(columns_to_drop).issubset(existing_columns)

if columns_to_drop_exist:
    df12345 = df12345.drop(columns_to_drop, axis=1)
    print("Dropped columns:", columns_to_drop)
else:
    print("Some columns to drop do not exist in the DataFrame.")
!pip install xlwt


df12345.to_excel(excel_file_path, index=False)



Some columns to drop do not exist in the DataFrame.


  df12345.to_excel(excel_file_path, index=False)
