<a href="https://colab.research.google.com/github/slazur83/Tableau/blob/main/Sport%20Activity/prepare_running_workouts_for_tableau.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install gpxpy garminconnect

import os
import json
import pandas as pd
import numpy as np
import gpxpy
from datetime import date, timedelta
from dateutil import parser
from garminconnect import Garmin

# =========================
# PATHS
# =========================
# Input locations. The three local inputs live under /content/drive/MyDrive.
# Directory scanned for Endomondo workout exports (*.json files).
ENDO_PATH = "/content/drive/MyDrive/Dane z aplikacji/Endomondo/Workouts"
# MapMyRun workout-history export (CSV file).
MAPMY_FILE = "/content/drive/MyDrive/Dane z aplikacji/MapMyRun/user184238914_workout_history.csv"
# Directory scanned for Sports Tracker tracks (*.gpx files).
GPX_PATH = "/content/drive/MyDrive/Dane z aplikacji/Sports Tracker/Running workouts"
# Google Sheets document that is read as an additional workout source.
GOOGLE_SHEET = "https://docs.google.com/spreadsheets/d/1I6XjfT990f6rArjYedIf0nw5vsBye_L6vkMILD8YjZI/edit"
# CSV download URL derived by replacing "/edit" with "/export?format=csv".
GSHEET_CSV = GOOGLE_SHEET.replace("/edit", "/export?format=csv")

# =========================
# ENDOMONDO
# =========================
# One dict per Endomondo running workout; converted to df_endo at the end.
endo_rows = []

# (raw Endomondo source code, label written to the "source" column)
endo_source_labels = (
    ("INPUT_MANUAL", "Endomondo (Manual)"),
    ("TRACK_MOBILE", "Endomondo (GPS)"),
)

for fn in (name for name in os.listdir(ENDO_PATH) if name.endswith(".json")):
    with open(os.path.join(ENDO_PATH, fn)) as f:
        data = json.load(f)
        # Fields are read from fixed positions of the loaded list:
        # [0] sport, [1] source, [3] start_time, [4] end_time,
        # [5] duration_s, [6] distance_km, [8] speed_avg_kmh.
        if data[0].get("sport") != "RUNNING":
            continue

        start = pd.to_datetime(data[3]["start_time"])
        end = pd.to_datetime(data[4]["end_time"])

        # Duration in whole seconds; None when absent or not convertible.
        try:
            if data[5].get("duration_s") is None:
                duration_s = None
            else:
                duration_s = int(data[5]["duration_s"])
        except Exception:
            duration_s = None

        # Distance rounded to two decimals. A missing key yields NaN, an
        # unreadable entry or unconvertible value yields None.
        try:
            distance_km = round(float(data[6].get("distance_km", np.nan)), 2)
        except Exception:
            distance_km = None

        # Recorded average speed, if position 8 holds one.
        try:
            raw_speed = data[8].get("speed_avg_kmh") if isinstance(data[8], dict) else None
        except Exception:
            raw_speed = None

        if raw_speed is None:
            # Nothing recorded: speed = distance (km) / (duration (s) / 3600),
            # provided both inputs are usable.
            if distance_km is not None and duration_s and duration_s > 0:
                speed_avg_kmh = round(distance_km / (duration_s / 3600.0), 2)
            else:
                speed_avg_kmh = None
        else:
            try:
                speed_avg_kmh = round(float(raw_speed), 2)
            except Exception:
                speed_avg_kmh = None

        # Translate the raw source code into its display label.
        source = data[1]["source"]
        for code, label in endo_source_labels:
            source = source.replace(code, label)

        endo_rows.append(
            dict(
                sport="Running",
                source=source,
                date=pd.to_datetime(start.date()),
                start_time=start.time(),
                end_time=end.time(),
                duration_s=duration_s,
                distance_km=distance_km,
                speed_avg_kmh=speed_avg_kmh,
                title=np.nan,
                type="Workout",
            )
        )

df_endo = pd.DataFrame(endo_rows)

# =========================
# MAP MY RUN
# =========================
# The export is read without a header row, so columns are addressed by
# position: 1 = workout date, 2 = activity type, 4 = distance,
# 5 = duration, 8 = average speed (as used below).
df = pd.read_csv(MAPMY_FILE, header=None)

# Keep only rows whose activity type is exactly "Run".
is_run = df[2] == "Run"
df = df.loc[is_run]

# Parse the date text with dateutil and drop the time-of-day part.
mapmy_dates = df[1].apply(parser.parse).dt.date

mapmy_columns = {
    "sport": "Running",
    "source": "Map My Run app",
    "date": pd.to_datetime(mapmy_dates),
    # No clock times are taken from this export.
    "start_time": None,
    "end_time": None,
    "distance_km": pd.to_numeric(df[4], errors="coerce").round(2),
    "duration_s": pd.to_numeric(df[5], errors="coerce").astype("Int64"),
    "speed_avg_kmh": pd.to_numeric(df[8], errors="coerce").round(2),
    "title": np.nan,
    "type": "Workout",
}
df_map = pd.DataFrame(mapmy_columns)

# =========================
# MERGE ENDOMONDO + MAPMYRUN (improved matching)
# =========================

def merge_endo_mapmy(a, b, tol_distance=0.01, tol_speed=0.01):
    """Concatenate two workout tables and collapse rows describing one workout.

    Rows are visited in order (all of ``a`` first, then ``b``). Each row that
    is not yet part of a group becomes an anchor and absorbs every later,
    still-unassigned row that matches it on all of:

    * ``distance_km`` within ``tol_distance``, or missing on exactly one side;
    * ``duration_s`` equal, or missing on either side;
    * ``speed_avg_kmh`` within ``tol_speed``, or missing on exactly one side;
    * ``date`` at most one day apart.

    Candidates are compared with the anchor only, and rows coming from the
    same input table can end up in one group as well.

    Args:
        a: Workout table; needs the columns ``source``, ``date``,
            ``start_time``, ``end_time``, ``duration_s``, ``distance_km``,
            ``speed_avg_kmh`` and ``title``. May be empty (even column-less).
        b: Second workout table with the same columns. May be empty.
        tol_distance: Largest accepted absolute distance difference.
        tol_speed: Largest accepted absolute average-speed difference.

    Returns:
        A DataFrame with one row per group. The row is the group's anchor
        with ``source`` replaced by the sorted, ``" | "``-joined set of the
        group's sources, ``date`` taken from the first Endomondo row of the
        group (if any), and missing detail fields filled from the first
        group member that has them. When both inputs are empty, an empty
        frame with the standard workout columns is returned.
    """
    frames = []
    for frame in (a, b):
        # An empty input (e.g. ``pd.DataFrame([])``) may have no columns at
        # all; indexing it would raise KeyError, and it adds no rows anyway.
        if frame is None or frame.empty:
            continue
        frame = frame.copy()
        # Normalize numeric precision to two decimals before matching.
        frame["distance_km"] = pd.to_numeric(frame["distance_km"], errors="coerce").round(2)
        frame["duration_s"] = pd.to_numeric(frame["duration_s"], errors="coerce")
        frame["speed_avg_kmh"] = pd.to_numeric(frame["speed_avg_kmh"], errors="coerce").round(2)
        frames.append(frame)

    if not frames:
        return pd.DataFrame(columns=[
            "sport", "source", "date", "start_time", "end_time",
            "duration_s", "distance_km", "speed_avg_kmh", "title", "type",
        ])

    combined = pd.concat(frames, ignore_index=True)

    # Materialize the rows and the compared fields once; calling iterrows()
    # inside the inner loop would rebuild every row Series for every anchor.
    rows = [row for _, row in combined.iterrows()]
    distances = [row["distance_km"] for row in rows]
    durations = [row["duration_s"] for row in rows]
    speeds = [row["speed_avg_kmh"] for row in rows]
    dates = [pd.to_datetime(row["date"]) for row in rows]

    def close_with_missing(x, y, tol):
        # True if both present and close, or exactly one is missing (so the
        # merge can enrich it). Two missing values are not treated as a match.
        if pd.isna(x) and pd.isna(y):
            return False
        if pd.isna(x) or pd.isna(y):
            return True
        return abs(float(x) - float(y)) <= tol

    used = set()
    out = []

    for i in range(len(rows)):
        if i in used:
            continue
        used.add(i)
        members = [i]

        # Every index below i is already in `used` (it was an anchor or was
        # absorbed by one), so only later rows can still join this group.
        for j in range(i + 1, len(rows)):
            if j in used:
                continue
            if (
                close_with_missing(distances[i], distances[j], tol_distance)
                and (
                    pd.isna(durations[i])
                    or pd.isna(durations[j])
                    or durations[i] == durations[j]
                )
                and close_with_missing(speeds[i], speeds[j], tol_speed)
                and abs((dates[i] - dates[j]).days) <= 1
            ):
                members.append(j)
                used.add(j)

        g = pd.DataFrame([rows[k] for k in members])
        base = g.iloc[0].copy()
        base["source"] = " | ".join(sorted(set(g["source"])))

        # The Endomondo date wins when the group contains an Endomondo row.
        endo = g[g["source"].str.contains("Endomondo", na=False)]
        if not endo.empty:
            base["date"] = endo.iloc[0]["date"]
        # Prefer populated fields when available.
        for col in ["duration_s", "distance_km", "speed_avg_kmh", "title", "start_time", "end_time"]:
            if pd.isna(base.get(col)) and g[col].notna().any():
                base[col] = g[col].dropna().iloc[0]

        out.append(base)

    return pd.DataFrame(out)

# Combine the Endomondo and MapMyRun tables, collapsing rows that match on
# distance, duration, speed and (within one day) date.
df_endo_map = merge_endo_mapmy(df_endo, df_map)

# =========================
# SPORTS TRACKER GPX
# =========================
# One dict per Sports Tracker GPX file; converted to df_gpx at the end.
gpx_rows = []

for fn in os.listdir(GPX_PATH):
    if not fn.endswith(".gpx"):
        continue
    with open(os.path.join(GPX_PATH, fn)) as f:
        gpx = gpxpy.parse(f)

    # Start and end are taken from the first segment of the first track.
    # Files without one, or without timestamps on its end points, are
    # skipped instead of aborting the whole import.
    if not gpx.tracks or not gpx.tracks[0].segments:
        continue
    pts = gpx.tracks[0].segments[0].points
    if not pts:
        continue
    start, end = pts[0].time, pts[-1].time
    if start is None or end is None:
        continue

    # NOTE(review): date/time are used exactly as stored in the file; if the
    # GPX timestamps are UTC they may differ from local time — confirm.
    dur = int((end - start).total_seconds())
    dist = round(gpx.length_3d() / 1000, 2)
    # A track spanning no time has no defined average speed; dividing by a
    # zero duration would raise ZeroDivisionError.
    speed = round(dist / (dur / 3600), 2) if dur > 0 else None

    gpx_rows.append({
        "sport": "Running",
        "source": "Sports Tracker",
        "date": pd.to_datetime(start.date()),
        "start_time": start.time(),
        "end_time": end.time(),
        "duration_s": dur,
        "distance_km": dist,
        "speed_avg_kmh": speed,
        "title": np.nan,
        "type": "Workout"
    })

df_gpx = pd.DataFrame(gpx_rows)

# =========================
# GARMIN (paged fetch to respect 1000 limit)
# =========================

def load_garmin(token_dir, start_date):
    """Download running activities from Garmin Connect as a workout table.

    Login is attempted with the token store in ``token_dir`` first. If that
    fails, the ``GARMIN_USER`` / ``GARMIN_PASS`` environment variables are
    used; when they are not both set, an empty table is returned.

    Activities are fetched in pages of 1000 and scanned in the order the API
    returns them. Scanning stops at the first running activity dated before
    ``start_date`` (this presumes newest-first ordering — TODO confirm).

    Args:
        token_dir: Directory containing the saved Garmin login tokens.
        start_date: ``datetime.date``; running activities before it are not
            collected.

    Returns:
        DataFrame with the columns sport, source, date, start_time, end_time,
        duration_s, distance_km, speed_avg_kmh, title and type (possibly with
        no rows).

    Raises:
        Whatever the credential login raises when the fallback login fails.
    """
    columns = [
        "sport", "source", "date", "start_time", "end_time",
        "duration_s", "distance_km", "speed_avg_kmh", "title", "type",
    ]

    g = Garmin()
    # Try token login first (requires a prior login that saved the tokens).
    try:
        g.login(tokenstore=token_dir)
    except Exception:
        # Fallback: standard login via env vars if provided.
        user = os.environ.get("GARMIN_USER")
        pwd = os.environ.get("GARMIN_PASS")
        if not (user and pwd):
            # Cannot login; return empty.
            return pd.DataFrame(columns=columns)
        # Credentials are constructor arguments of the client; login() only
        # takes an optional token store, so login(user, pwd) cannot work.
        g = Garmin(user, pwd)
        g.login()

    rows = []
    start_idx = 0
    page_size = 1000  # MAX per API

    while True:
        try:
            activities = g.get_activities(start_idx, page_size)
        except Exception as exc:
            # Keep what was fetched so far, but make the truncation visible.
            print("Garmin fetch stopped at offset", start_idx, "-", exc)
            break
        if not activities:
            break

        stop = False
        for act in activities:
            # Best effort: a malformed activity is skipped, not fatal.
            try:
                if act.get("activityType", {}).get("typeKey") != "running":
                    continue

                st = pd.to_datetime(act.get("startTimeLocal"))
                if st.date() < start_date:
                    stop = True
                    break

                # assumes duration is in seconds and distance in metres
                # (the code divides distance by 1000 for km) — TODO confirm
                dur = int(act.get("duration", 0))
                dist = round(float(act.get("distance", 0.0)) / 1000.0, 2)
                speed = round(dist / (dur / 3600.0), 2) if dur > 0 else None

                rows.append({
                    "sport": "Running",
                    "source": "Garmin Connect",
                    "date": pd.to_datetime(st.date()),
                    "start_time": st.time(),
                    # End time is derived: start + duration.
                    "end_time": (st + pd.to_timedelta(dur, unit='s')).time() if dur else None,
                    "duration_s": dur,
                    "distance_km": dist,
                    "speed_avg_kmh": speed,
                    "title": act.get("activityName"),
                    "type": "Workout",
                })
            except Exception:
                continue

        if stop:
            break
        start_idx += page_size

    # Passing the column list keeps the schema even when nothing was found,
    # matching the table returned when login is impossible.
    return pd.DataFrame(rows, columns=columns)


# Fetch Garmin running activities dated 2015-01-01 or later, logging in with
# the token store at the given path.
df_garmin = load_garmin(
    "/content/drive/MyDrive/Moje projekty/Garmin Connect/Garmin tokens",
    date(2015, 1, 1),
)

# =========================
# GOOGLE SHEETS (CSV export)
# =========================
def normalize_gsheet(raw):
    """Coerce a raw Google Sheets export into the common workout schema.

    Expected columns: date, start_time, end_time, duration_s, distance_km,
    speed_avg_kmh, title, type, source (and optionally sport). A column that
    is absent from ``raw`` is filled with a default for every row instead of
    failing the whole import.

    Args:
        raw: DataFrame as returned by ``pd.read_csv`` for the sheet.

    Returns:
        DataFrame with the columns sport, source, date, start_time, end_time,
        duration_s, distance_km, speed_avg_kmh, title and type. Unparseable
        dates, times and numbers become missing values.
    """
    def column(name, default):
        # A per-row Series (never a bare scalar) keeps `.dt` and `.round`
        # usable on the result even when the column does not exist.
        if name in raw.columns:
            return raw[name]
        return pd.Series(default, index=raw.index)

    return pd.DataFrame({
        "sport": column("sport", "Running"),
        "source": column("source", "Google Sheets"),
        "date": pd.to_datetime(column("date", pd.NaT), errors="coerce").dt.date,
        "start_time": pd.to_datetime(column("start_time", pd.NaT), errors="coerce").dt.time,
        "end_time": pd.to_datetime(column("end_time", pd.NaT), errors="coerce").dt.time,
        "duration_s": pd.to_numeric(column("duration_s", np.nan), errors="coerce"),
        "distance_km": pd.to_numeric(column("distance_km", np.nan), errors="coerce").round(2),
        "speed_avg_kmh": pd.to_numeric(column("speed_avg_kmh", np.nan), errors="coerce").round(2),
        "title": column("title", np.nan),
        "type": column("type", "Workout"),
    })


try:
    df_gsheet_raw = pd.read_csv(GSHEET_CSV)
    df_gsheet = normalize_gsheet(df_gsheet_raw)
except Exception as exc:
    # Best effort: the sheet is an optional source, so carry on without it,
    # but say so instead of dropping the data silently.
    print("Google Sheets data skipped:", exc)
    df_gsheet = pd.DataFrame(columns=["sport","source","date","start_time","end_time","duration_s","distance_km","speed_avg_kmh","title","type"])

# =========================
# FINAL MERGE
# =========================
# Empty sources are left out of the concat: they contribute no rows, and
# pandas deprecates letting empty entries take part in result-dtype inference.
workout_frames = [
    frame
    for frame in (df_endo_map, df_gpx, df_garmin, df_gsheet)
    if not frame.empty
]
if workout_frames:
    final = pd.concat(workout_frames, ignore_index=True)
else:
    # pd.concat([]) raises ValueError, so fall back to an empty table.
    final = pd.DataFrame(columns=[
        "sport", "source", "date", "start_time", "end_time",
        "duration_s", "distance_km", "speed_avg_kmh", "title", "type",
    ])

# Ensure date has no time component
final["date"] = pd.to_datetime(final["date"], errors="coerce").dt.date

# Enforce two-decimal precision for distance and avg speed
final["distance_km"] = pd.to_numeric(final["distance_km"], errors="coerce").round(2)
final["speed_avg_kmh"] = pd.to_numeric(final["speed_avg_kmh"], errors="coerce").round(2)

# Remove same-day duplicates when Garmin exists (keep Garmin and Google Sheets; drop others)
is_garmin = final["source"].astype(str).str.contains("Garmin Connect", case=False, na=False)
keep_google = final["source"].astype(str).str.contains("Google Sheets", case=False, na=False)

# Every date on which at least one Garmin row exists.
garmin_dates = set(final.loc[is_garmin, "date"].dropna().unique().tolist())

final = final[ keep_google | is_garmin | (~final["date"].isin(garmin_dates)) ]

final = final.sort_values(["date", "start_time"]).reset_index(drop=True)

# =========================
# TYPE RULE: title empty or contains 'Bieganie' => 'Workout', else 'Sport Event'
# =========================
title_text = final["title"].astype(str)
# "Empty" covers missing values as well as blank / whitespace-only strings.
title_is_empty = final["title"].isna() | title_text.str.strip().eq("")
final["type"] = np.where(
    title_is_empty | title_text.str.contains("Bieganie", case=False, na=False),
    "Workout",
    "Sport Event",
)

# =========================
# SAVE
# =========================
out = "/content/drive/MyDrive/Moje projekty/Tableau/Source data/workouts.xlsx"
final.to_excel(out, index=False)
print("Zapisano:", out)



  final = pd.concat(


Zapisano: /content/drive/MyDrive/Moje projekty/Tableau/Source data/workouts.xlsx
