# Description
In 2024 some of the the dataset from month 9 does not include Itime in their attributes 
in the following code i wiil correct the dataset

In [None]:
# File system and configuration management
import os
from netCDF4 import Dataset
import netCDF4
# Data manipulation
import xarray as xr
import os, glob, re, sys
from datetime import datetime, timezone
import numpy as np
FVCOM_DIR = '/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/2024_corrected'



In [None]:
# first rename the file that start with nos.lmhofs to lmhofs

for fname in os.listdir(FVCOM_DIR):
    if fname.startswith("nos.lmhofs") and fname.endswith(".nc"):
        old_path = os.path.join(FVCOM_DIR, fname)
        new_name = fname.replace("nos.", "", 1)  # remove only the first 'nos.'
        new_path = os.path.join(FVCOM_DIR, new_name)

        # Rename the file
        os.rename(old_path, new_path)
        print(f"‚úÖ Renamed: {fname} ‚Üí {new_name}")

print("üéØ Done! All 'nos.lmhofs...' files renamed to 'lmhofs...'")

# Add Itime unit to the nc files does not include Itime

In [None]:
# correct Itime attribute in the nc files
for fname in sorted(os.listdir(FVCOM_DIR)):
    if fname.endswith(".nc"):
        fpath = os.path.join(FVCOM_DIR, fname)
        print(f"Fixing time in: {fname}")

        try:
            ds = xr.open_dataset(fpath, decode_times=False)

            if "time" not in ds.variables:
                print("  ‚ö†Ô∏è Skipping, no 'time' variable.")
                ds.close()
                continue

            # --- Get original time info ---
            units = ds.time.attrs["units"]
            calendar = ds.time.attrs.get("calendar", "standard")
            time_vals = ds.time.values

            # --- Convert to numeric days since 2018-01-01 (same as original) ---
            times = netCDF4.num2date(time_vals, units, calendar=calendar)
            numeric_days = netCDF4.date2num(times, units, calendar=calendar)

            # --- Split into integer + fractional parts ---
            itime_vals = np.floor(numeric_days).astype(int)
            itime2_vals = (numeric_days - np.floor(numeric_days)) * 86400.0  # seconds

            # --- Overwrite or create ---
            ds["Itime"] = ("time", itime_vals)
            ds["Itime2"] = ("time", itime2_vals)

            # --- Correct attributes ---
            ds["Itime"].attrs["units"] = units  # keep same epoch as 'time'
            ds["Itime"].attrs["long_name"] = "integer days since 2018-01-01 00:00:00"
            ds["Itime2"].attrs["units"] = "seconds since start of day"
            ds["Itime2"].attrs["long_name"] = "fractional seconds of the day"

            # --- Save safely ---
            tmp = fpath + ".tmp"
            ds.to_netcdf(tmp)
            ds.close()
            os.replace(tmp, fpath)
            print("  ‚úÖ Fixed and saved.")

        except Exception as e:
            print(f"  ‚ùå Error fixing {fname}: {e}")

In [36]:
import os
import re
import numpy as np
import xarray as xr
import netCDF4
from datetime import datetime, timezone

# === CONFIG ===
FVCOM_DIR = "/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename"
DRY_RUN = False  # set True to preview without writing

# Patterns:
# 1) lmhofs.fields.n000.20240831.t18z.nc  -> date=20240831, hour=18
PAT1 = re.compile(r"lmhofs\.fields\.n\d+\.(\d{8})\.t(\d{2})z\.nc$")
# 2) lmhofs.t00z.20241031.fields.n000.nc  -> date=20241031, hour=00
PAT2 = re.compile(r"lmhofs\.t(\d{2})z\.(\d{8})\.fields\.n\d+\.nc$")

def infer_dt_from_name(fname: str) -> datetime | None:
    """Return a timezone-aware UTC datetime from an FVCOM filename."""
    m1 = PAT1.search(fname)
    if m1:
        ymd, hh = m1.group(1), m1.group(2)
        y, m, d = int(ymd[:4]), int(ymd[4:6]), int(ymd[6:8])
        return datetime(y, m, d, int(hh), 0, 0, tzinfo=timezone.utc)
    m2 = PAT2.search(fname)
    if m2:
        hh, ymd = m2.group(1), m2.group(2)
        y, m, d = int(ymd[:4]), int(ymd[4:6]), int(ymd[6:8])
        return datetime(y, m, d, int(hh), 0, 0, tzinfo=timezone.utc)
    return None

def fix_file_times(fpath: str) -> None:
    fname = os.path.basename(fpath)
    target_dt = infer_dt_from_name(fname)
    if target_dt is None:
        print(f"  ‚ö†Ô∏è  Skip (no recognized timestamp in name): {fname}")
        return

    print(f"Fixing: {fname}  ‚Üí  {target_dt.isoformat()}")

    # Open without decoding so we fully control values
    with xr.open_dataset(fpath, decode_times=False) as ds:
        # Ensure there is a time coordinate (size 1 is fine)
        if "time" not in ds.variables:
            # Create an empty time variable with length 1 if missing
            ds = ds.assign_coords(time=("time", np.zeros(1, dtype=np.float64)))

        # Get time units & calendar (fallbacks if missing)
        time_var = ds["time"]
        units = time_var.attrs.get("units", "days since 2018-01-01 00:00:00")
        calendar = time_var.attrs.get("calendar", "standard")

        # Convert expected datetime to numeric using the dataset's units/calendar
        new_time_val = netCDF4.date2num(target_dt, units=units, calendar=calendar)

        # Guarantee shape matches (most of your files have ntime=1)
        if ds.sizes.get("time", 0) == 0:
            raise ValueError(f"{fname}: time dimension has size 0.")
        if ds.sizes["time"] != 1:
            # If multiple frames exist, we set all frames to same timestamp (rare).
            print(f"  ‚ö†Ô∏è  time dimension is {ds.sizes['time']}; setting all to {target_dt.isoformat()}")

        new_time = np.full(ds.sizes["time"], new_time_val, dtype=np.float64)

        # Compute Itime/Itime2 from the new time numeric value (days since epoch)
        # Itime = floor(days), Itime2 = seconds since start of day (here 0 exactly)
        # We purposely snap exactly to the top of the hour ‚Üí seconds = 0.
        # (Already guaranteed by building from filename)
        # But compute via date2num split to follow CF convention precisely.
        # Convert back to days to avoid binary drift:
        day_counts = new_time  # numeric "days since <epoch>"
        itime_vals = np.floor(day_counts).astype("i4")
        # Seconds in day: (fractional day) * 86400, snapped to int
        itime2_vals = np.rint((day_counts - np.floor(day_counts)) * 86400.0).astype("i4")

        # For safety: force seconds to multiples of 3600 if you want 6-hour cadence only.
        # Comment out if not desired.
        # itime2_vals = (itime2_vals // 3600) * 3600

        # Ensure seconds==0 for your requirement:
        itime2_vals[:] = (itime2_vals // 1) * 1  # no-op, kept for clarity
        # Explicitly zero seconds (your requirement):
        itime2_vals[:] = 0

        # Prepare encodings (keep time dtype; ensure integer for Itime/Itime2)
        enc = {k: {} for k in ds.variables}
        enc["time"] = {"dtype": "float64"}
        enc["Itime"] = {"dtype": "int32"}
        enc["Itime2"] = {"dtype": "int32"}

        # Assign variables and attributes
        ds = ds.assign_coords(time=("time", new_time))
        ds["time"].attrs.update({"units": units, "calendar": calendar})

        ds["Itime"] = ("time", itime_vals)
        ds["Itime"].attrs.update({
            "long_name": f"integer {units}",
            "units": units,
        })

        ds["Itime2"] = ("time", itime2_vals)
        ds["Itime2"].attrs.update({
            "long_name": "seconds since start of day",
            "units": "seconds since start of day",
        })

        # Write out
        if DRY_RUN:
            print("  (dry-run) would write fixed time/Itime/Itime2")
            return

        tmp = fpath + ".tmp"
        ds.to_netcdf(tmp, encoding=enc)
    os.replace(tmp, fpath)
    print("  ‚úÖ saved")

def main():
    nc_files = sorted(
        f for f in os.listdir(FVCOM_DIR)
        if f.endswith(".nc")
    )
    if not nc_files:
        print(f"No .nc files under {FVCOM_DIR}")
        return

    fixed, skipped, failed = 0, 0, 0
    for fname in nc_files:
        fpath = os.path.join(FVCOM_DIR, fname)
        try:
            fix_file_times(fpath)
            fixed += 1
        except Exception as e:
            failed += 1
            print(f"  ‚ùå {fname}: {e}")
    print(f"\nDone. Fixed: {fixed}  |  Failed: {failed}  |  Skipped: {skipped}")

if __name__ == "__main__":
    main()


Fixing: lmhofs.fields.n000.20240801.t00z.nc  ‚Üí  2024-08-01T00:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240801.t06z.nc  ‚Üí  2024-08-01T06:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240801.t12z.nc  ‚Üí  2024-08-01T12:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240801.t18z.nc  ‚Üí  2024-08-01T18:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240802.t00z.nc  ‚Üí  2024-08-02T00:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240802.t06z.nc  ‚Üí  2024-08-02T06:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240802.t12z.nc  ‚Üí  2024-08-02T12:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240802.t18z.nc  ‚Üí  2024-08-02T18:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240803.t00z.nc  ‚Üí  2024-08-03T00:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240803.t06z.nc  ‚Üí  2024-08-03T06:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240803.t12z.nc  ‚Üí  2024-08-03T12:00:00+00:00
  ‚úÖ saved
Fixing: lmhofs.fields.n000.20240

In [None]:
# check the attrtibutes
ds = xr.open_dataset(FVCOM_DIR +'/lmhofs.fields.n000.20240905.t00z.nc',decode_times=False)

print(ds["Itime"])
print(ds["Itime"].attrs)

In [40]:
# Pick the first and last files you have
files = sorted([f for f in os.listdir(FVCOM_DIR) if f.endswith(".nc")])
first_file = os.path.join(FVCOM_DIR, 'lmhofs.fields.n000.20240803.t00z.nc')
last_file  = os.path.join(FVCOM_DIR, 'lmhofs.fields.n000.20240930.t00z.nc')

for f in [first_file, last_file]:
    ds = xr.open_dataset(f, decode_times=False)
    units = ds.time.attrs['units']
    calendar = ds.time.attrs.get('calendar', 'standard')
    datetimes = netCDF4.num2date(ds.time.values, units, calendar)
    print(f"{f} ‚Üí {datetimes}")

/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/lmhofs.fields.n000.20240803.t00z.nc ‚Üí [cftime.DatetimeGregorian(2024, 8, 2, 18, 0, 0, 0, has_year_zero=False)]
/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/lmhofs.fields.n000.20240930.t00z.nc ‚Üí [cftime.DatetimeGregorian(2024, 9, 30, 0, 0, 0, 0, has_year_zero=False)]


In [24]:
"""
Build a chronologically sorted FVCOM file list and verify time coverage.

- Handles mixed filename orders:
  * lmhofs.fields.n000.YYYYMMDD.t18z.nc
  * lmhofs.t00z.YYYYMMDD.fields.n006.nc

- Robust time decoding:
  * CF-compliant 'time' if present
  * Fallback to FVCOM 'Itime' (days since epoch) + 'Itime2' (sec/msec since midnight)

Outputs:
  - fvcom_files.txt : sorted list of full paths
  - Prints coverage, timestep, gaps, duplicates, and safe simulation end time.

Run inside your PyLag env (no internet needed).
"""

import os, glob, re, sys
from datetime import datetime, timezone, timedelta
import numpy as np

# ========== SETTINGS ==========

OUT_LIST  = os.path.join(FVCOM_DIR, "fvcom_files.txt")
VERIFY_INFILE_TIMES = True      # requires xarray (but we open with decode_times=False)
GAP_FACTOR = 1.5                # dt > median*GAP_FACTOR => considered a gap
# ==============================

# ---------- 1) Collect files from both patterns ----------
pat1 = os.path.join(FVCOM_DIR, "lmhofs.fields.n???.????????.t??z.nc")
pat2 = os.path.join(FVCOM_DIR, "lmhofs.t??z.????????.fields.n???.nc")
candidates = sorted(set(glob.glob(pat1) + glob.glob(pat2)))
if not candidates:
    sys.exit(f"No FVCOM files found under: {FVCOM_DIR}")

# ---------- 2) Parse datetime from filename (works for both orders) ----------
rx1 = re.compile(r"lmhofs\.fields\.n(?P<n>\d{3})\.(?P<date>\d{8})\.t(?P<hour>\d{2})z\.nc$")
rx2 = re.compile(r"lmhofs\.t(?P<hour>\d{2})z\.(?P<date>\d{8})\.fields\.n(?P<n>\d{3})\.nc$")

def fname_to_dt(path: str):
    name = os.path.basename(path)
    m = rx1.match(name) or rx2.match(name)
    if not m:
        return None
    date = m.group("date")  # YYYYMMDD
    hour = m.group("hour")  # HH
    try:
        return datetime.strptime(date + hour, "%Y%m%d%H").replace(tzinfo=timezone.utc)
    except Exception:
        return None

parsed = []
skipped = []
for p in candidates:
    dt = fname_to_dt(p)
    if dt is None:
        skipped.append(p)
    else:
        parsed.append((dt, p))

if not parsed:
    sys.exit("No filenames matched expected patterns/time parsing.")

# De-dupe by datetime (keep first path seen)
by_dt = {}
for dt, p in parsed:
    by_dt.setdefault(dt, p)

# Sort by datetime
items = sorted(by_dt.items(), key=lambda x: x[0])
print(f"Discovered {len(items)} files spanning {items[0][0]} ‚Üí {items[-1][0]} UTC")
if skipped:
    print(f"Skipped {len(skipped)} non-matching names (showing up to 5):")
    for s in skipped[:5]:
        print("  -", os.path.basename(s))

# ---------- 3) Write the sorted list ----------
with open(OUT_LIST, "w") as f:
    for _, p in items:
        f.write(p + "\n")
print(f"‚úÖ Wrote sorted file list: {OUT_LIST}  (lines: {len(items)})")

# ---------- 4) Robust time verification (FVCOM-friendly) ----------
if VERIFY_INFILE_TIMES:
    try:
        import xarray as xr

        def _epoch_from_units(units: str) -> datetime:
            """
            Parse 'days since YYYY-MM-DD [HH:MM:SS]'.
            If missing/unknown, default to Modified Julian Date epoch (1858-11-17).
            """
            if not isinstance(units, str):
                return datetime(1858, 11, 17, tzinfo=timezone.utc)
            m = re.search(
                r"since\s+(\d{4})-(\d{2})-(\d{2})(?:[ T](\d{2}):(\d{2}):(\d{2}))?",
                units,
            )
            if m:
                Y, M, D = map(int, m.groups()[:3])
                hh = int(m.group(4) or 0)
                mm = int(m.group(5) or 0)
                ss = int(m.group(6) or 0)
                return datetime(Y, M, D, hh, mm, ss, tzinfo=timezone.utc)
            return datetime(1858, 11, 17, tzinfo=timezone.utc)

        def _decode_times(ds: xr.Dataset) -> np.ndarray:
            """
            Returns np.array of datetime64[ns] UTC.
            Priority: CF 'time' ‚Üí fallback to Itime/Itime2.
            Open dataset with decode_times=False before calling this.
            """
            # 1) Try CF 'time'
            if "time" in ds:
                try:
                    t = xr.conventions.decode_cf_datetime(
                        ds["time"],
                        ds["time"].attrs.get("units"),
                        ds["time"].attrs.get("calendar", "standard"),
                    )
                    return np.array(t, dtype="datetime64[ns]")
                except Exception:
                    pass  # fall through

            # 2) FVCOM Itime (+ optional Itime2)
            if "Itime" in ds:
                it = ds["Itime"].values  # integer days
                epoch = _epoch_from_units(ds["Itime"].attrs.get("units", ""))

                if "Itime2" in ds:
                    it2 = ds["Itime2"].values
                    # Unit detection or magnitude inference
                    u2 = (ds["Itime2"].attrs.get("units", "") or "").lower()
                    if "msec" in u2 or "millisecond" in u2:
                        sec_of_day = it2 / 1000.0
                    elif "sec" in u2:
                        sec_of_day = it2.astype("float64")
                    else:
                        vmax = np.nanmax(it2) if np.size(it2) else 0
                        sec_of_day = (it2 / 1000.0) if vmax > 10 * 86400 else it2.astype("float64")
                else:
                    sec_of_day = np.zeros_like(it, dtype="float64")

                py_times = [
                    epoch + timedelta(days=float(d)) + timedelta(seconds=float(s))
                    for d, s in zip(it, sec_of_day)
                ]
                return np.array(py_times, dtype="datetime64[ns]")

            raise RuntimeError("No decodable 'time' or 'Itime/Itime2' found.")

        all_times = []
        cadence_s = []

        for _, fp in items:
            with xr.open_dataset(fp, decode_times=False) as ds:  # IMPORTANT
                tt = _decode_times(ds)  # datetime64[ns] UTC
                if tt.size:
                    all_times.append(tt)
                    if tt.size >= 2:
                        dt = np.diff(tt).astype("timedelta64[s]").astype(int)
                        if dt.size:
                            cadence_s.append(int(np.median(dt)))

        if not all_times:
            print("‚ö†Ô∏è Could not read any time coordinates from files.")
        else:
            times = np.concatenate(all_times)
            tmin, tmax = times.min(), times.max()
            print(f"FVCOM in-file coverage (robust): {tmin} ‚Üí {tmax} (UTC)")

            safe_end_msg = ""
            if cadence_s:
                med = int(np.median(cadence_s))
                print(f"Median timestep ‚âà {med} s (~{med/3600:.2f} h)")
                dts = np.diff(times).astype("timedelta64[s]").astype(int)

                gaps = np.where(dts > med * GAP_FACTOR)[0]
                dups = np.where(dts == 0)[0]

                if gaps.size:
                    i = gaps[0]
                    print(f"‚ö†Ô∏è Detected {gaps.size} gap(s); e.g., {times[i]} ‚Üí {times[i+1]}")
                else:
                    print("No obvious gaps across file boundaries.")

                if dups.size:
                    i = dups[0]
                    print(f"‚ö†Ô∏è Detected {dups.size} duplicate step(s); e.g., {times[i]} == {times[i+1]}")

                # Latest safe end is one cadence BEFORE tmax (for interpolation)
                latest_safe_end = (tmax.astype("datetime64[s]").astype("int") - med)
                latest_safe_end = np.datetime64(latest_safe_end, "s")
                print(f"Latest SAFE simulation end (tmax - cadence): {latest_safe_end} UTC")
            else:
                print("‚ö†Ô∏è Could not infer cadence (only single time per file?).")
                print("Tip: ensure files contain multiple time steps to estimate cadence.")

    except ImportError:
        print("‚è≠Ô∏è Skipping verification: xarray not installed.")
    except Exception as e:
        print("Time verification failed:", repr(e))

print("\nNext steps:")
print("1) Point PyLag to fvcom_files.txt for chronological forcing.")
print("2) Ensure all release/start times ‚â• coverage start (tmin).")
print("3) Ensure final requested time < (coverage end - cadence) to avoid 'Time out of range'.")

Discovered 272 files spanning 2024-08-01 00:00:00+00:00 ‚Üí 2024-12-31 00:00:00+00:00 UTC
‚úÖ Wrote sorted file list: /mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/fvcom_files.txt  (lines: 272)


  return np.array(py_times, dtype="datetime64[ns]")


Time verification failed: OverflowError('date value out of range')

Next steps:
1) Point PyLag to fvcom_files.txt for chronological forcing.
2) Ensure all release/start times ‚â• coverage start (tmin).
3) Ensure final requested time < (coverage end - cadence) to avoid 'Time out of range'.


In [8]:
# Replace your _decode_times() with this vectorized, tz-safe version
def _decode_times(ds):
    """
    Returns np.array of datetime64[ns] UTC.
    Priority: CF 'time' ‚Üí fallback to FVCOM 'Itime'+'Itime2'.
    Open with decode_times=False before calling.
    """
    import xarray as xr
    import numpy as np
    import re
    from datetime import datetime, timezone

    # 1) Try CF 'time'
    if "time" in ds:
        try:
            t = xr.conventions.decode_cf_datetime(
                ds["time"],
                ds["time"].attrs.get("units"),
                ds["time"].attrs.get("calendar", "standard"),
            )
            # ensure ns precision and naive (UTC) numpy datetime64
            t64 = np.array(t, dtype="datetime64[ns]")
            return t64
        except Exception:
            pass  # fall back

    # 2) FVCOM Itime (+ optional Itime2)
    if "Itime" in ds:
        it = np.asarray(ds["Itime"].values, dtype="int64")  # days
        # Parse epoch from units: 'days since YYYY-MM-DD[ HH:MM:SS]'
        units = (ds["Itime"].attrs.get("units", "") or "")
        m = re.search(r"since\s+(\d{4})-(\d{2})-(\d{2})(?:[ T](\d{2}):(\d{2}):(\d{2}))?", units)
        if m:
            Y, M, D = map(int, m.groups()[:3])
            hh = int(m.group(4) or 0); mm = int(m.group(5) or 0); ss = int(m.group(6) or 0)
            epoch_str = f"{Y:04d}-{M:02d}-{D:02d}T{hh:02d}:{mm:02d}:{ss:02d}"
        else:
            # MJD epoch fallback (UTC)
            epoch_str = "1858-11-17T00:00:00"

        epoch64 = np.datetime64(epoch_str, "s")  # base unit seconds (safe for adding sec)
        # Itime2: seconds (or msec) since start of day
        if "Itime2" in ds:
            it2 = np.asarray(ds["Itime2"].values)
            u2 = (ds["Itime2"].attrs.get("units", "") or "").lower()
            if "msec" in u2 or "millisecond" in u2:
                sec_of_day = (it2.astype("float64") / 1000.0)
            elif "sec" in u2:
                sec_of_day = it2.astype("float64")
            else:
                vmax = np.nanmax(it2) if it2.size else 0
                sec_of_day = (it2.astype("float64") / 1000.0) if vmax > 10 * 86400 else it2.astype("float64")
        else:
            sec_of_day = np.zeros_like(it, dtype="float64")

        # Vectorized compose: epoch + days + seconds  (all as numpy timedeltas)
        t_days = it.astype("timedelta64[D]")
        t_secs = sec_of_day.astype("timedelta64[s]")
        t64 = (epoch64 + t_days + t_secs).astype("datetime64[ns]")
        return t64

    raise RuntimeError("No decodable 'time' or 'Itime/Itime2' found.")


In [29]:
#!/usr/bin/env python3
import re, os, sys
from datetime import datetime, timedelta, timezone

# --- set these to your config values ---
FILELIST = "/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/fvcom_files.txt"
start_datetime = datetime(2024, 9, 1, 12, 0, 0, tzinfo=timezone.utc)
end_datetime   = datetime(2024, 9, 29, 19, 0, 0, tzinfo=timezone.utc)
particle_release_interval_h = 6
number_of_particle_releases = 1   # if >1, releases every interval starting at start_datetime
dt_model_s = 100                  # your Adv timestep; adjust if you want to simulate stepping
# ---------------------------------------

RX1 = re.compile(r".*lmhofs\.fields\.n\d{3}\.(\d{8})\.t(\d{2})z\.nc$")
RX2 = re.compile(r".*lmhofs\.t(\d{2})z\.(\d{8})\.fields\.n\d{3}\.nc$")

def parse_dt(path):
    m1 = RX1.match(path)
    m2 = RX2.match(path)
    if m1:
        date, hour = m1.group(1), m1.group(2)
    elif m2:
        hour, date = m2.group(1), m2.group(2)
    else:
        return None
    return datetime.strptime(date + hour, "%Y%m%d%H").replace(tzinfo=timezone.utc)

# 1) Load file times (chronologically)
with open(FILELIST, "r") as f:
    paths = [ln.strip() for ln in f if ln.strip().endswith(".nc")]
times = [parse_dt(p) for p in paths]
times = [t for t in times if t is not None]
times.sort()

tmin, tmax = times[0], times[-1]
# smallest interval
dtmin = min((times[i+1]-times[i] for i in range(len(times)-1)), key=lambda d: d.total_seconds())

print(f"Files coverage: {tmin} ‚Üí {tmax}  | smallest Œît = {dtmin}")

# 2) Build requested "key" times to test: all releases + your end time
key_times = []
if number_of_particle_releases <= 1:
    key_times.append(start_datetime)
else:
    k = 0
    t = start_datetime
    while t <= end_datetime:
        key_times.append(t)
        k += 1
        t = start_datetime + timedelta(hours=k*particle_release_interval_h)

# Always include the end time you asked for
key_times.append(end_datetime)

# 3) For each key time, ensure: t >= tmin and t < tmax, and has a NEXT frame
def find_bracket(t):
    # find largest file time <= t
    lo_idx = None
    hi_idx = None
    # binary search
    lo, hi = 0, len(times)-1
    if t < times[0]:
        return None, 0
    if t >= times[-1]:
        return len(times)-1, None
    while lo <= hi:
        mid = (lo + hi)//2
        if times[mid] <= t:
            lo_idx = mid
            lo = mid + 1
        else:
            hi_idx = mid
            hi = mid - 1
    if hi_idx is None and lo_idx is not None and lo_idx+1 < len(times):
        hi_idx = lo_idx + 1
    return lo_idx, hi_idx

problems = []
for t in key_times:
    lo, hi = find_bracket(t)
    if lo is None:
        problems.append((t, "before_coverage", None))
    elif hi is None:
        problems.append((t, "at_or_beyond_last_frame", times[lo]))
    else:
        # ok; optionally report how far from brackets
        d_lo = (t - times[lo]).total_seconds()
        d_hi = (times[hi] - t).total_seconds()
        print(f"{t} OK  | bracketing: {times[lo]}  ..  {times[hi]}  (‚àí{d_lo/3600:.2f}h / +{d_hi/3600:.2f}h)")

if problems:
    print("\n‚ö†Ô∏è Issues:")
    for t, kind, ref in problems:
        if kind == "before_coverage":
            print(f"  {t} is before first available frame {tmin}")
        elif kind == "at_or_beyond_last_frame":
            print(f"  {t} is ‚â• last available frame {tmax}; last lower bracket {ref}")
else:
    print("\nAll release/end times have valid bracketing frames (good for interpolation).")

# 4) Optional: safe upper bound for end time
safe_last = tmax - dtmin
print(f"\nLatest SAFE end time (tmax - smallest Œît): {safe_last}")


Files coverage: 2024-08-01 00:00:00+00:00 ‚Üí 2024-12-31 00:00:00+00:00  | smallest Œît = 6:00:00
2024-09-01 12:00:00+00:00 OK  | bracketing: 2024-09-01 12:00:00+00:00  ..  2024-09-01 18:00:00+00:00  (‚àí0.00h / +6.00h)
2024-09-29 19:00:00+00:00 OK  | bracketing: 2024-09-29 00:00:00+00:00  ..  2024-09-30 00:00:00+00:00  (‚àí19.00h / +5.00h)

All release/end times have valid bracketing frames (good for interpolation).

Latest SAFE end time (tmax - smallest Œît): 2024-12-30 18:00:00+00:00


In [42]:
# save as: inspect_infile_times.py
import os, re, sys
import numpy as np
import xarray as xr
from datetime import timedelta

FILELIST = "/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/fvcom_files.txt"

def decode_times(ds):
    # Try CF 'time'
    if "time" in ds:
        try:
            t = xr.conventions.decode_cf_datetime(
                ds["time"], ds["time"].attrs.get("units"),
                ds["time"].attrs.get("calendar","standard")
            )
            return np.array(t, dtype="datetime64[ns]")
        except Exception:
            pass
    # Fallback to FVCOM Itime/Itime2
    import re
    from datetime import datetime, timezone, timedelta
    if "Itime" in ds:
        it = np.asarray(ds["Itime"].values, dtype="int64")
        units = (ds["Itime"].attrs.get("units","") or "")
        m = re.search(r"since\s+(\d{4})-(\d{2})-(\d{2})(?:[ T](\d{2}):(\d{2}):(\d{2}))?", units)
        if m:
            Y,M,D = map(int, m.groups()[:3])
            hh = int(m.group(4) or 0); mm = int(m.group(5) or 0); ss = int(m.group(6) or 0)
            epoch = f"{Y:04d}-{M:02d}-{D:02d}T{hh:02d}:{mm:02d}:{ss:02d}"
        else:
            epoch = "1858-11-17T00:00:00"  # MJD fallback
        epoch64 = np.datetime64(epoch,"s")
        if "Itime2" in ds:
            it2 = np.asarray(ds["Itime2"].values)
            u2 = (ds["Itime2"].attrs.get("units","") or "").lower()
            if "msec" in u2 or "millisecond" in u2:
                sec_of_day = it2.astype("float64")/1000.0
            elif "sec" in u2:
                sec_of_day = it2.astype("float64")
            else:
                vmax = np.nanmax(it2) if it2.size else 0
                sec_of_day = (it2.astype("float64")/1000.0) if vmax>864000 else it2.astype("float64")
        else:
            sec_of_day = np.zeros_like(it, dtype="float64")
        t = (epoch64 + it.astype("timedelta64[D]") + sec_of_day.astype("timedelta64[s]")).astype("datetime64[ns]")
        return t
    raise RuntimeError("No decodable time var found")

def main():
    with open(FILELIST) as f:
        files = [ln.strip() for ln in f if ln.strip().endswith(".nc")]
    print(f"Files: {len(files)}\n")
    rows = []
    for fp in files:
        try:
            with xr.open_dataset(fp, decode_times=False) as ds:
                t = decode_times(ds)
            n = t.size
            t0 = str(t.min()) if n else "NA"
            t1 = str(t.max()) if n else "NA"
            rows.append((fp, n, t0, t1))
        except Exception as e:
            rows.append((fp, 0, "ERR", f"{type(e).__name__}: {e}"))

    # Print a concise summary; look particularly after 2024-09-09
    for fp, n, t0, t1 in rows:
        print(f"{os.path.basename(fp):40s}  ntime={n:2d}  first={t0}  last={t1}")

if __name__ == "__main__":
    main()


Files: 820

lmhofs.fields.n000.20240801.t00z.nc       ntime= 1  first=2024-08-01T00:00:00.000000000  last=2024-08-01T00:00:00.000000000
lmhofs.fields.n000.20240801.t06z.nc       ntime= 1  first=2024-08-01T06:00:00.000000000  last=2024-08-01T06:00:00.000000000
lmhofs.fields.n000.20240801.t12z.nc       ntime= 1  first=2024-08-01T12:00:00.000000000  last=2024-08-01T12:00:00.000000000
lmhofs.fields.n000.20240801.t18z.nc       ntime= 1  first=2024-08-01T18:00:00.000000000  last=2024-08-01T18:00:00.000000000
lmhofs.fields.n000.20240802.t00z.nc       ntime= 1  first=2024-08-02T00:00:00.000000000  last=2024-08-02T00:00:00.000000000
lmhofs.fields.n000.20240802.t06z.nc       ntime= 1  first=2024-08-02T06:00:00.000000000  last=2024-08-02T06:00:00.000000000
lmhofs.fields.n000.20240802.t12z.nc       ntime= 1  first=2024-08-02T12:00:00.000000000  last=2024-08-02T12:00:00.000000000
lmhofs.fields.n000.20240802.t18z.nc       ntime= 1  first=2024-08-02T18:00:00.000000000  last=2024-08-02T18:00:00.000000

In [39]:
#!/usr/bin/env python3
import os, re, sys
import numpy as np
import xarray as xr
from collections import Counter

# ---- config ----
FILELIST = "/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/fvcom_files.txt"
TOL_SEC = 1.0  # allow tiny numeric tolerance (seconds)

# ---- helpers ----
FN_PATTS = [
    re.compile(r".*?(\d{8})\.t(\d{2})z\.nc$", re.IGNORECASE),         # lmhofs.fields.n000.20240801.t06z.nc
    re.compile(r".*?t(\d{2})z\.(\d{8})\..*\.nc$", re.IGNORECASE),     # lmhofs.t00z.20241001.fields.n000.nc
]

def expected_time_from_name(path):
    bn = os.path.basename(path)
    for patt in FN_PATTS:
        m = patt.match(bn)
        if m:
            if patt is FN_PATTS[0]:
                ymd, hh = m.group(1), m.group(2)
            else:
                hh, ymd = m.group(1), m.group(2)
            Y, M, D = int(ymd[0:4]), int(ymd[4:6]), int(ymd[6:8])
            H = int(hh)
            # Use naive ISO string (assume UTC everywhere); avoid timezone-aware parsing
            iso = f"{Y:04d}-{M:02d}-{D:02d}T{H:02d}:00:00"
            return np.datetime64(iso, "s")
    return None

def decode_cf_time(ds):
    # Prefer CF 'time'
    if "time" in ds:
        try:
            t = xr.conventions.decode_cf_datetime(
                ds["time"], ds["time"].attrs.get("units"),
                ds["time"].attrs.get("calendar","standard")
            )
            return np.array(t, dtype="datetime64[ns]")
        except Exception:
            pass
    return None

def decode_from_itime(ds):
    if "Itime" not in ds:
        return None
    it = np.asarray(ds["Itime"].values, dtype="int64")
    units = (ds["Itime"].attrs.get("units","") or "")
    m = re.search(r"since\s+(\d{4})-(\d{2})-(\d{2})(?:[ T](\d{2}):(\d{2}):(\d{2}))?", units)
    if m:
        Y,M,D = map(int, m.groups()[:3])
        hh = int(m.group(4) or 0); mm = int(m.group(5) or 0); ss = int(m.group(6) or 0)
        iso_epoch = f"{Y:04d}-{M:02d}-{D:02d}T{hh:02d}:{mm:02d}:{ss:02d}"
    else:
        iso_epoch = "1858-11-17T00:00:00"  # MJD fallback
    # Construct as naive ISO; avoid tz-aware -> datetime64 conversion
    epoch64 = np.datetime64(iso_epoch, "s")

    if "Itime2" in ds:
        it2 = np.asarray(ds["Itime2"].values)
        u2 = (ds["Itime2"].attrs.get("units","") or "").lower()
        if "msec" in u2 or "millisecond" in u2:
            sec_of_day = it2.astype("float64")/1000.0
        elif "sec" in u2:
            sec_of_day = it2.astype("float64")
        else:
            vmax = np.nanmax(it2) if it2.size else 0
            sec_of_day = (it2.astype("float64")/1000.0) if vmax>864000 else it2.astype("float64")
    else:
        sec_of_day = np.zeros_like(it, dtype="float64")

    t = (epoch64 +
         it.astype("timedelta64[D]") +
         sec_of_day.astype("timedelta64[s]")).astype("datetime64[ns]")
    return t

def sec_of_day_from_dt64(t64):
    t = np.datetime64(t64, "s")
    day = t.astype("datetime64[D]")
    sod = (t - day).astype("timedelta64[s]").astype(int)
    return sod

def main():
    if not os.path.exists(FILELIST):
        print(f"Missing file list: {FILELIST}", file=sys.stderr)
        sys.exit(1)

    with open(FILELIST) as f:
        files = [ln.strip() for ln in f if ln.strip().endswith(".nc")]

    print(f"Found {len(files)} files\n")
    issues = []
    all_times = []

    for fp in files:
        bn = os.path.basename(fp)
        exp = expected_time_from_name(fp)
        if exp is None:
            issues.append((bn, "NAME_PARSE", "Could not parse date/hour from filename"))
            continue

        try:
            with xr.open_dataset(fp, decode_times=False) as ds:
                ntime = int(ds["time"].size) if "time" in ds else (int(ds["Itime"].size) if "Itime" in ds else 0)
                if ntime != 1:
                    issues.append((bn, "NTIME", f"ntime={ntime}, expected 1"))
                t_cf = decode_cf_time(ds)
                t_it = decode_from_itime(ds)
        except Exception as e:
            issues.append((bn, "OPEN", f"{type(e).__name__}: {e}"))
            continue

        t_primary = t_cf[0] if (t_cf is not None and t_cf.size) else (t_it[0] if (t_it is not None and t_it.size) else None)
        method = "CF" if (t_cf is not None and t_cf.size) else ("ITIME" if (t_it is not None and t_it.size) else "NA")

        if t_primary is None:
            issues.append((bn, "DECODE", "Could not decode time from CF or Itime"))
            continue

        # Compare at second precision
        dt_sec = abs((t_primary.astype("datetime64[s]") - exp).astype("timedelta64[s]").astype(int))
        if dt_sec > TOL_SEC:
            issues.append((bn, "MISMATCH", f"{method} decoded {str(t_primary)} vs expected {str(exp)} (|Œî|={dt_sec}s)"))

        # Require exact HH:MM:SS = 00
        sod = sec_of_day_from_dt64(t_primary)
        if (sod % 60) != 0:
            issues.append((bn, "SECONDS", f"seconds-of-day={sod} (not exact minute)"))
        elif (sod % 3600) != 0:
            issues.append((bn, "SECONDS", f"seconds-of-day={sod} (not on exact hour)"))

        if t_cf is not None and t_it is not None and t_cf.size and t_it.size:
            dd = abs((t_cf[0].astype("datetime64[s]") - t_it[0].astype("datetime64[s]")).astype("timedelta64[s]").astype(int))
            if dd > TOL_SEC:
                issues.append((bn, "CFvsIT", f"CF {str(t_cf[0])} ‚â† Itime {str(t_it[0])} (Œî={dd}s)"))

        all_times.append((bn, t_primary.astype("datetime64[s]"), fp))

    # Cadence & coverage
    if all_times:
        all_times.sort(key=lambda x: x[1])
        tvals = [t for _, t, _ in all_times]
        first, last = tvals[0], tvals[-1]
        print(f"Coverage: {str(first)}  ‚Üí  {str(last)}  (files: {len(tvals)})")

        dts = []
        for i in range(1, len(tvals)):
            d = (tvals[i] - tvals[i-1]).astype("timedelta64[s]").astype(int)
            dts.append(d)
        if dts:
            from collections import Counter
            counts = Counter(dts)
            print("\nUnique intervals between consecutive files (seconds ‚Üí count):")
            for sec, cnt in sorted(counts.items()):
                hrs = sec/3600.0
                print(f"  {sec:7d} s  (~{hrs:4.2f} h) : {cnt}")
        else:
            print("\nOnly one timestamp found; skipping interval analysis.")

        if len(set(tvals)) != len(tvals):
            issues.append(("<all>", "DUPLICATES", "Duplicate timestamps exist across files"))
        if any(d <= 0 for d in dts):
            issues.append(("<all>", "MONOTONIC", "Non-increasing time steps detected"))

    if issues:
        print("\nIssues found:")
        for bn, kind, msg in issues:
            print(f"  [{kind:9s}] {bn}: {msg}")
    else:
        print("\n‚úÖ All checked files look consistent: decoded time matches filename, seconds=0, cadence is monotonic.")

if __name__ == "__main__":
    main()


Found 272 files

Coverage: 2024-08-01T00:00:00  ‚Üí  2040-04-04T14:41:51  (files: 272)

Unique intervals between consecutive files (seconds ‚Üí count):
        0 s  (~0.00 h) : 1
    21600 s  (~6.00 h) : 156
    43200 s  (~12.00 h) : 2
    86400 s  (~24.00 h) : 110
   172800 s  (~48.00 h) : 1
  481560111 s  (~133766.70 h) : 1

Issues found:
  [MISMATCH ] lmhofs.fields.n000.20240803.t00z.nc: ITIME decoded 2024-08-02T18:00:00.000000000 vs expected 2024-08-03T00:00:00 (|Œî|=21600s)
  [MISMATCH ] lmhofs.fields.n000.20240930.t00z.nc: ITIME decoded 2040-04-04T14:41:51.577038848 vs expected 2024-09-30T00:00:00 (|Œî|=489508911s)
  [SECONDS  ] lmhofs.fields.n000.20240930.t00z.nc: seconds-of-day=52911 (not exact minute)
  [DUPLICATES] <all>: Duplicate timestamps exist across files
  [MONOTONIC] <all>: Non-increasing time steps detected


In [46]:
import xarray as xr, numpy as np, re, os
p="/mnt/hydroglg/Data/External_Models/Outputs/GLCFS/LakeHuron/rename/lmhofs.fields.n000.20241010.t00z.nc"
ds=xr.open_dataset(p, decode_times=False, engine="netcdf4")
it=int(ds["Itime"].values[0]); it2=int(ds["Itime2"].values[0])  # it2 must be millis
epoch=np.datetime64("1858-11-17T00:00:00","s")
t = (epoch + np.timedelta64(it,"D") + np.timedelta64(it2,"ms")).astype("datetime64[s]")
print("decoded:", str(t))  # expect 2024-09-01T06:00:00

decoded: 2024-10-10T00:00:00
