# In [None]:
import pandas as pd
import glob
import os
from datetime import datetime


# Folder that receives the final merged CSV.
OUTPUT_DIR = "/content/drive/MyDrive/Moje projekty/Tableau/Source data"
# Folder scanned for raw exports; the enriched per-range files are stored here too.
EXPORT_DIR = "/content/drive/MyDrive/Moje projekty/Garmin Connect/Garmin exports"


# Make sure both folders exist, announcing each one that had to be created.
for _folder, _created_msg in (
    (OUTPUT_DIR, f"Created output directory → {OUTPUT_DIR}"),
    (EXPORT_DIR, f"Created export directory → {EXPORT_DIR}"),
):
    if os.path.exists(_folder):
        continue
    os.makedirs(_folder)
    print(_created_msg)


def get_season(d):
    """Return the season name for a date-like object.

    ``d`` only needs ``month`` and ``day`` attributes. The ranges are:
    Winter 21 Dec - 20 Mar, Spring 21 Mar - 20 Jun,
    Summer 21 Jun - 22 Sep, Autumn 23 Sep - 20 Dec.
    A value that matches none of the ranges yields 'Unknown'.
    """
    month_no, day_no = d.month, d.day

    # Months that lie entirely inside a single season.
    whole_months = {
        1: 'Winter', 2: 'Winter',
        4: 'Spring', 5: 'Spring',
        7: 'Summer', 8: 'Summer',
        10: 'Autumn', 11: 'Autumn',
    }
    if month_no in whole_months:
        return whole_months[month_no]

    # Months in which the season changes:
    # month -> (first day of the new season, season before, season after).
    turning_points = {
        3: (21, 'Winter', 'Spring'),
        6: (21, 'Spring', 'Summer'),
        9: (23, 'Summer', 'Autumn'),
        12: (21, 'Autumn', 'Winter'),
    }
    if month_no in turning_points:
        first_day, before, after = turning_points[month_no]
        if day_no >= first_day:
            return after
        if day_no < first_day:
            return before

    return 'Unknown'


# Raw exports are every CSV in EXPORT_DIR whose file name does not contain
# "enriched" (files with that marker are written by this script itself).
raw_files = [f for f in glob.glob(os.path.join(EXPORT_DIR, "*.csv"))
             if "enriched" not in os.path.basename(f)]


if raw_files:
    print("\n=== RAW FILES FOUND — PROCESSING ===")
else:
    print("\n=== NO RAW FILES — SKIPPING ===")


# Turn each raw export into an enriched CSV: one row per calendar day between
# the first and last valid date, plus Month / Weekday / Season columns.
# The raw file is deleted only after its enriched file has been written.
for f in raw_files:
    name = os.path.basename(f)
    print(f"\nProcessing RAW file → {name}")

    df = pd.read_csv(f)
    # Validate both required columns up front so a malformed export fails
    # with a clear message instead of a bare KeyError further down.
    if "Date" not in df.columns:
        raise ValueError(f"RAW file {f} must contain 'Date'")
    if "Steps" not in df.columns:
        raise ValueError(f"RAW file {f} must contain 'Steps'")
    df = df.rename(columns={"Date": "New_date"})

    # Keep only rows with a parseable date and a positive step count;
    # when a date occurs more than once, one row per date is kept.
    df["New_date"] = pd.to_datetime(df["New_date"], errors="coerce")
    df = df[df["New_date"].notna()]
    df = df[df["Steps"] > 0]
    df = df.sort_values("New_date").drop_duplicates("New_date", keep="last")

    if df.empty:
        # min()/max() would be NaT here and pd.date_range would raise,
        # aborting the whole run. Skip the file and leave it in place.
        print(f"No valid rows in {name} — skipping (RAW file kept)")
        continue

    # Insert a NaN-steps placeholder row for every calendar day without data.
    min_d, max_d = df["New_date"].min(), df["New_date"].max()
    full = pd.date_range(min_d, max_d, freq="D")
    missing = full.difference(df["New_date"])
    if len(missing) > 0:
        print(f"Missing {len(missing)} days — filling with NaN")
        missing_df = pd.DataFrame({
            "New_date": missing,
            "Steps": [float("nan")] * len(missing),
        })
        df = pd.concat([df, missing_df])

    df = df.sort_values("New_date").reset_index(drop=True)

    # Derive the calendar columns after gap-filling so the placeholder rows
    # get Month / Weekday / Season too (they were left empty before).
    df["Month"] = df["New_date"].dt.month_name()
    df["Weekday"] = df["New_date"].dt.day_name()
    df["Season"] = df["New_date"].apply(get_season)

    min_date, max_date = df["New_date"].min(), df["New_date"].max()

    # The short "<year>" name is only valid for exactly one calendar year;
    # a range running from 1 Jan of one year to 31 Dec of a later year must
    # use the explicit start/end form or it would be mislabelled.
    is_single_full_year = (
        min_date.year == max_date.year
        and (min_date.month, min_date.day) == (1, 1)
        and (max_date.month, max_date.day) == (12, 31)
    )
    if is_single_full_year:
        enriched_name = f"steps_export_{min_date.year}_enriched.csv"
    else:
        enriched_name = f"steps_export_{min_date.strftime('%Y%m%d')}_{max_date.strftime('%Y%m%d')}_enriched.csv"

    enriched_path = os.path.join(EXPORT_DIR, enriched_name)
    df.to_csv(enriched_path, index=False)
    print(f"Saved enriched file → {enriched_name}")
    os.remove(f)
    print(f"Removed RAW file → {name}")

# Every enriched file currently in EXPORT_DIR. Sorted so files are processed
# (and later merged) in a reproducible order; glob order is arbitrary.
enriched_files = sorted(glob.glob(os.path.join(EXPORT_DIR, "*_enriched.csv")))

print("\n=== CLEANING enriched FILES ===")

# One cleaned DataFrame per enriched file, collected for the final merge.
dfs = []
for f in enriched_files:
    name = os.path.basename(f)
    df = pd.read_csv(f)
    df["New_date"] = pd.to_datetime(df["New_date"])

    print("\n-------------------------------------------")
    print("FILE:", name)
    print("Rows:", len(df))
    print("Unique dates:", df["New_date"].nunique())

    if not df["New_date"].notna().any():
        # With no valid date, min()/max() are NaT and pd.date_range raises,
        # which would abort the whole run. The file has nothing to contribute.
        print("!!! WARNING: no valid dates — file skipped")
        continue

    # Report duplicated dates; when any exist, keep one row per date and
    # write the de-duplicated data back to the same file.
    duplicates = df[df.duplicated("New_date", keep=False)]
    if len(duplicates) > 0:
        print(f"!!! WARNING: {len(duplicates)} duplicates")
        df = df.sort_values("New_date").drop_duplicates("New_date", keep="last")
        df.to_csv(f, index=False)
        print(f"Duplicates removed → overwritten {name}")
    else:
        print("No duplicates inside file.")

    min_date, max_date = df["New_date"].min(), df["New_date"].max()
    print("Date range:", min_date.date(), "→", max_date.date())

    # List calendar days absent from the file and add NaN-steps placeholder
    # rows for them. This happens in memory only; the file is not rewritten.
    full_range = pd.date_range(min_date, max_date, freq="D")
    missing = full_range.difference(df["New_date"])
    if len(missing) == 0:
        print("Missing dates: NONE")
    else:
        print(f"Missing dates ({len(missing)}):")
        for d in missing:
            print(" -", d.date())
        missing_df = pd.DataFrame({
            "New_date": missing,
            "Steps": [float("nan")] * len(missing),
        })
        df = pd.concat([df, missing_df])

    dfs.append(df)

print("\n=== REMOVING narrower enriched files ===")

from collections import defaultdict

# Collect (path, start, end) for every file named
# steps_export_<YYYYMMDD>_<YYYYMMDD>_enriched.csv.
range_files = []
for f in enriched_files:
    base = os.path.basename(f)

    if not (base.startswith("steps_export_") and base.endswith("_enriched.csv")):
        continue

    part = base.replace("steps_export_", "").replace("_enriched.csv", "")
    try:
        start_s, end_s = part.split("_")
        start_d = datetime.strptime(start_s, "%Y%m%d")
        end_d = datetime.strptime(end_s, "%Y%m%d")
    except ValueError:
        # The name does not hold exactly two YYYYMMDD dates (for example the
        # single-year form steps_export_<year>_enriched.csv), so it takes no
        # part in the comparison. Only ValueError is expected from the
        # unpacking and from strptime; anything else should surface.
        continue

    range_files.append((f, start_d, end_d))

# Group the ranged files by start date.
groups = defaultdict(list)
for f, start_d, end_d in range_files:
    groups[start_d].append((f, end_d))

# Among files sharing a start date, keep the one with the latest end date
# and delete the rest from disk.
for start_d, file_list in groups.items():
    if len(file_list) <= 1:
        continue

    file_list_sorted = sorted(file_list, key=lambda x: x[1], reverse=True)
    keep_file, keep_end = file_list_sorted[0]

    print(f"\nKEEP: {os.path.basename(keep_file)}")

    for f_rm, end_rm in file_list_sorted[1:]:
        print(f"REMOVE: {os.path.basename(f_rm)}")
        os.remove(f_rm)

# Refresh the list of enriched files, since some may have been deleted above.
enriched_files = glob.glob(os.path.join(EXPORT_DIR, "*_enriched.csv"))

print("\n=== MERGING ALL enriched FILES ===")

if not dfs:
    # pd.concat raises ValueError on an empty list, e.g. on a first run when
    # the export folder holds no files yet. Report it instead of crashing.
    print("No enriched data to merge — nothing written.")
else:
    final = pd.concat(dfs, ignore_index=True)

    # A day that is only a NaN placeholder in one file may have a real step
    # count in another. Drop such placeholders first so they can never win
    # the de-duplication below.
    has_steps = final["Steps"].notna()
    dates_with_steps = final.loc[has_steps, "New_date"]
    final = final[has_steps | ~final["New_date"].isin(dates_with_steps)]

    # A stable sort preserves load order among rows with the same date, so
    # keep="last" picks a deterministic row (the default sort is not stable).
    final = (
        final.sort_values("New_date", kind="stable")
        .drop_duplicates("New_date", keep="last")
        .reset_index(drop=True)
    )

    # Recompute the calendar columns for every row, placeholders included.
    final["Month"] = final["New_date"].dt.month_name()
    final["Weekday"] = final["New_date"].dt.day_name()
    final["Season"] = final["New_date"].apply(get_season)

    # Drop the stray index column, if one is present.
    if "Unnamed: 0" in final.columns:
        final = final.drop(columns=["Unnamed: 0"])

    # Sequential 1-based row identifier as the first column.
    final.insert(0, "ID", range(1, len(final) + 1))

    # NOTE(review): today_str is not used in this section; kept in case other
    # notebook cells read it — confirm before removing.
    today_str = datetime.today().strftime("%Y-%m-%d")
    final_path = os.path.join(OUTPUT_DIR, "steps_all_years_merged.csv")
    final.to_csv(final_path, index=False)

    start_date = final["New_date"].min().strftime("%Y-%m-%d")
    end_date = final["New_date"].max().strftime("%Y-%m-%d")

    print("FINAL DATASET CREATED")
    print("Date range:", start_date, "→", end_date)
    print("Total days:", len(final))
    print("Total steps:", final["Steps"].sum())

    print("\nAverage steps per year:")
    # "Year" is added after the CSV was written, so it is used for this
    # summary only and does not appear in the saved file.
    final["Year"] = final["New_date"].dt.year
    for yr, grp in final.groupby("Year"):
        print(f"{yr}: {int(grp['Steps'].mean()) if not grp['Steps'].isna().all() else 'NaN'}")

    print("\nSaved to:")
    print(final_path)
