# METBK Intercomparison (Irminger 12)
## Data Processing

This data comes from the Revelle (not the Armstrong)

In [1]:
import numpy as np
import pandas as pd
import os, sys
import matplotlib.pyplot as plt
%matplotlib inline
sys.path.append("../..")
from utils import Ship, Buoy, md2vect, vect2md, bpr_adjust, rh2q
import re
import requests
import csv

In [2]:
# Cruise start and end dates
T1 = pd.to_datetime("2025-07-18")
T2 = pd.to_datetime("2025-08-07")

In [3]:
# Platforms
ship = Ship()
sumo11 = Buoy()
sumo12 = Buoy()

## Ship Data 

* located on the Raw Data Repo: cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/
* The metbk files are organized differently than they are when from the Armstrong - there is a 3 file series per day
    1) .DCC files - decoded SAMOS data from the ship's met package
    2) .COR files - corrected SAMOS data
    3) .LOG files

In [5]:
BASE_URL = "https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/"
DOWNLOAD_DIR = "ship_files/dcc_files"
OUTPUT_CSV = "ship_files/merged_dcc.csv"

def get_dcc_file_list(base_url: str):
    """
    Fetch the directory listing at base_url and return a list of .DCC filenames.
    Assumes an Apache-style (or similar) directory listing with hrefs.
    """
    resp = requests.get(base_url)
    resp.raise_for_status()
    html = resp.text

    # Find href="something.DCC"
    pattern = r'href="([^"]+\.DCC)"'
    files = re.findall(pattern, html, flags=re.IGNORECASE)

    # Deduplicate, preserve order
    seen = set()
    dcc_files = []
    for f in files:
        if f not in seen:
            seen.add(f)
            dcc_files.append(f)
    return dcc_files


def download_dcc_files(base_url: str, filenames, download_dir: str):
    """
    Download each .DCC file if it doesn't already exist in download_dir.
    """
    os.makedirs(download_dir, exist_ok=True)

    for fname in filenames:
        local_path = os.path.join(download_dir, fname)
        if os.path.exists(local_path):
            print(f"Already exists, skipping: {fname}")
            continue

        url = base_url + fname
        print(f"Downloading: {url}")
        resp = requests.get(url, stream=True)
        resp.raise_for_status()

        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        print(f"Saved to {local_path}")


def parse_dcc_line(line: str, source_file: str):
    """
    Parse one .DCC line like:
    $SAMOS:001,CS:KAOU,YMD:20250716,...

    Returns a dict of key -> value including:
    - 'record_type' for the first token (e.g., '$SAMOS:001')
    - 'source_file' with the original filename
    """
    line = line.strip()
    if not line:
        return None
    if not line.startswith("$"):
        return None

    parts = line.split(",")
    if not parts:
        return None

    row = {}
    # first token: e.g. '$SAMOS:001'
    row["record_type"] = parts[0]
    row["source_file"] = os.path.basename(source_file)

    for token in parts[1:]:
        if ":" not in token:
            continue
        key, value = token.split(":", 1)
        key = key.strip()
        value = value.strip()
        if key:
            row[key] = value

    return row


def merge_dcc_to_csv(download_dir: str, output_csv: str):
    """
    Read all .DCC files in download_dir, parse each line,
    and write a merged CSV with all unique keys as columns.
    """
    all_rows = []
    all_keys = set()

    # Find all .DCC files
    for fname in sorted(os.listdir(download_dir)):
        if not fname.upper().endswith(".DCC"):
            continue
        path = os.path.join(download_dir, fname)
        print(f"Parsing {path}")

        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                row = parse_dcc_line(line, source_file=path)
                if row:
                    all_rows.append(row)
                    all_keys.update(row.keys())

    if not all_rows:
        print("No data rows parsed; CSV will not be created.")
        return

    # Ensure a stable column order: record_type, source_file, then sorted rest
    base_cols = ["record_type", "source_file"]
    other_cols = sorted(k for k in all_keys if k not in base_cols)
    fieldnames = base_cols + other_cols

    print(f"Writing {len(all_rows)} rows to {output_csv}")
    with open(output_csv, "w", newline="", encoding="utf-8") as out_f:
        writer = csv.DictWriter(out_f, fieldnames=fieldnames)
        writer.writeheader()
        for row in all_rows:
            writer.writerow(row)

    print("Done.")

In [6]:
dcc_files = get_dcc_file_list(BASE_URL)
print(f"Found {len(dcc_files)} .DCC files")

download_dcc_files(BASE_URL, dcc_files, DOWNLOAD_DIR)
merge_dcc_to_csv(DOWNLOAD_DIR, OUTPUT_CSV)

print(f"\nAll done. Merged CSV: {OUTPUT_CSV}")

Found 23 .DCC files
Downloading: https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/250716.DCC
Saved to dcc_files/250716.DCC
Downloading: https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/250717.DCC
Saved to dcc_files/250717.DCC
Downloading: https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/250718.DCC
Saved to dcc_files/250718.DCC
Downloading: https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/250719.DCC
Saved to dcc_files/250719.DCC
Downloading: https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_Sea/Irminger_Sea-12_RR2505_2025-07-18/Ship_Data/metacq/data/250720.DCC
Saved to dcc_files/250720.DCC
Downloading: https://rawdata.oceanobservatories.org/files/cruise_data/Irminger_S

In [8]:
# Validation of the resulting CSV

def count_dcc_data_lines(file_path):
    """
    Count lines beginning with '$' in a .DCC file (true data lines).
    """
    count = 0
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if line.strip().startswith("$"):
                count += 1
    return count


def validate_merged_csv(dcc_dir, merged_csv):
    df = pd.read_csv(merged_csv)

    # Group merged CSV rows by source file
    merged_counts = df["source_file"].value_counts().to_dict()

    print("=== Validation Report ===\n")

    all_good = True

    for fname in sorted(os.listdir(dcc_dir)):
        if not fname.upper().endswith(".DCC"):
            continue

        dcc_path = os.path.join(dcc_dir, fname)
        dcc_line_count = count_dcc_data_lines(dcc_path)
        csv_line_count = merged_counts.get(fname, 0)

        print(f"File: {fname}")
        print(f"  Lines in DCC:       {dcc_line_count}")
        print(f"  Rows in merged CSV: {csv_line_count}")

        if dcc_line_count != csv_line_count:
            all_good = False
            print("  ‚ùå MISMATCH ‚Äî some rows may be missing")
        else:
            print("  ‚úÖ OK ‚Äî all rows accounted for")

        print()

    if all_good:
        print("üéâ All DCC files are fully represented in the merged CSV!")
    else:
        print("‚ö†Ô∏è Some files appear incomplete. Investigate mismatches above.")


# Run the validation
validate_merged_csv(DOWNLOAD_DIR, OUTPUT_CSV)

=== Validation Report ===

File: 250716.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250717.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250718.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250719.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250720.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250721.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250722.DCC
  Lines in DCC:       1439
  Rows in merged CSV: 1439
  ‚úÖ OK ‚Äî all rows accounted for

File: 250723.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

File: 250724.DCC
  Lines in DCC:       1440
  Rows in merged CSV: 1440
  ‚úÖ OK ‚Äî all rows accounted for

F

## Buoy Data

In [10]:
def get_metbk_file_list(base_url, sensor_tag, T1, T2):
    """
    Fetch the directory listing at base_url and return a list of
    (filename, date) for files like YYYYMMDD.<sensor_tag>.log
    within the [T1, T2] date range.

    sensor_tag is usually "metbk1" or "metbk2".
    T1, T2 should be datetime.date objects.
    """
    resp = requests.get(base_url)
    resp.raise_for_status()
    html = resp.text

    # href="YYYYMMDD.metbkX.log"
    href_pattern = rf'href="([^"]+\.{sensor_tag}\.log)"'
    candidates = re.findall(href_pattern, html, flags=re.IGNORECASE)

    # Regex to extract date from filename
    fname_pattern = re.compile(rf'(\d{{8}})\.{sensor_tag}\.log$', re.IGNORECASE)

    selected = []
    for fname in candidates:
        m = fname_pattern.search(fname)
        if not m:
            continue
        datestr = m.group(1)
        file_date = datetime.strptime(datestr, "%Y%m%d").date()
        if T1 <= file_date <= T2:
            selected.append((fname, file_date))

    # Deduplicate and sort by date
    selected = sorted(set(selected), key=lambda x: x[1])
    return selected


def download_files(base_url, file_list, download_dir):
    """
    Download each file in file_list if not already in download_dir.
    file_list: list of (filename, date)
    """
    os.makedirs(download_dir, exist_ok=True)

    for fname, fdate in file_list:
        local_path = os.path.join(download_dir, fname)
        if os.path.exists(local_path):
            print(f"[skip] {fname} already exists")
            continue

        url = base_url + fname
        print(f"[download] {url}")
        resp = requests.get(url, stream=True)
        resp.raise_for_status()

        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        print(f"  -> saved to {local_path}")


def parse_metbk_line(line, source_file):
    """
    Parse a METBK ASIMET non-label mode line:

    yyyy/mm/dd HH:MM:SS.sss  BP  RH  RH_T  LWR  PRC  SeaT  Cond  SWR  We  Wn  Bat1  Bat2

    Returns a dict with:
      datetime, BP, RH, RH_T, LWR, PRC, SeaT, Cond, SWR, We, Wn, Bat1, Bat2, source_file
    or None if the line doesn't match expectations.
    """
    line = line.strip()
    if not line:
        return None

    parts = line.split()
    if len(parts) < 14:
        # not enough tokens to be a full record
        return None

    date_str = parts[0]
    time_str = parts[1]
    try:
        dt = datetime.strptime(date_str + " " + time_str, "%Y/%m/%d %H:%M:%S.%f")
    except ValueError:
        # fallback if no milliseconds
        try:
            dt = datetime.strptime(date_str + " " + time_str, "%Y/%m/%d %H:%M:%S")
        except ValueError:
            return None

    # The next 12 values should be numeric fields in the documented order
    numeric_vals = parts[2:14]
    if len(numeric_vals) != 12:
        return None

    try:
        (
            BP,      # mbar
            RH,      # %
            RH_T,    # degC
            LWR,     # W/m^2
            PRC,     # mm
            SeaT,    # degC
            Cond,    # S/m
            SWR,     # W/m^2
            We,      # m/s
            Wn,      # m/s
            Bat1,    # V
            Bat2,    # V
        ) = map(float, numeric_vals)
    except ValueError:
        return None

    return {
        "datetime": dt,
        "BP_mbar": BP,
        "RH_pct": RH,
        "RH_T_degC": RH_T,
        "LWR_W_m2": LWR,
        "PRC_mm": PRC,
        "SeaT_degC": SeaT,
        "Cond_S_m": Cond,
        "SWR_W_m2": SWR,
        "We_m_s": We,
        "Wn_m_s": Wn,
        "Bat1_V": Bat1,
        "Bat2_V": Bat2,
        "source_file": os.path.basename(source_file),
    }


def parse_all_metbk_logs(download_dir, sensor_tag):
    """
    Parse all *.{sensor_tag}.log files in download_dir into a single pandas DataFrame.
    """
    rows = []

    for fname in sorted(os.listdir(download_dir)):
        if not fname.lower().endswith(f".{sensor_tag}.log"):
            continue

        path = os.path.join(download_dir, fname)
        print(f"[parse] {path}")
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                rec = parse_metbk_line(line, source_file=path)
                if rec is not None:
                    rows.append(rec)

    if not rows:
        print("No valid METBK records parsed.")
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    df = df.sort_values("datetime").reset_index(drop=True)
    return df


# -------------------------------------------------------------------
# High-level function you can call for *any* buoy & metbk{1,2}
# -------------------------------------------------------------------

def fetch_metbk_range(
    base_url,
    sensor_tag,
    T1,
    T2,
    download_dir,
    output_csv=None,
):
    """
    Fetch, download, and parse METBK non-label mode logs for a given sensor_tag
    ("metbk1" or "metbk2") between dates T1 and T2 (inclusive).

    Parameters
    ----------
    base_url : str
        Directory URL where the log files live.
    sensor_tag : str
        e.g., "metbk1" or "metbk2".
    T1, T2 : pandas.Timestamp or datetime.date
        Start and end of date range (inclusive).
    download_dir : str
        Local directory to store logs.
    output_csv : str or None
        If provided, save merged DataFrame to this CSV.

    Returns
    -------
    df : pandas.DataFrame
        Parsed data for that sensor & date range.
    """
    # Normalize T1/T2 to date objects
    T1_date = pd.to_datetime(T1).date()
    T2_date = pd.to_datetime(T2).date()

    print(f"\n=== {sensor_tag} | {T1_date} ‚Üí {T2_date} ===")

    files = get_metbk_file_list(base_url, sensor_tag, T1_date, T2_date)
    print(f"Found {len(files)} {sensor_tag} log files in range:")
    for fname, fdate in files:
        print(f"  {fname}  ({fdate})")

    if not files:
        print("No matching files; returning empty DataFrame.")
        return pd.DataFrame()

    download_files(base_url, files, download_dir)

    df = parse_all_metbk_logs(download_dir, sensor_tag)
    print(f"\nParsed {len(df)} METBK records for {sensor_tag}.")

    if output_csv and not df.empty:
        df.to_csv(output_csv, index=False)
        print(f"Saved merged CSV to: {output_csv}")

    return df


## Recovered buoy (SUMO-11) metbk data

In [12]:
R_metbk1_base = "https://rawdata.oceanobservatories.org/files/GI01SUMO/R00011/cg_data/dcl11/metbk1/"  
R_metbk2_base = "https://rawdata.oceanobservatories.org/files/GI01SUMO/R00011/cg_data/dcl12/metbk2/"  

df_R_metbk1 = fetch_metbk_range(
    base_url=R_metbk1_base,
    sensor_tag="metbk1",
    T1=T1,
    T2=T2,
    download_dir="buoy_files/recovered/metbk1",
    output_csv="R_metbk1_20250718_20250807.csv",
)

df_R_metbk2 = fetch_metbk_range(
    base_url=R_metbk2_base,
    sensor_tag="metbk2",
    T1=T1,
    T2=T2,
    download_dir="buoy_files/recovered/metbk2",
    output_csv="R_metbk2_20250718_20250807.csv",
)


=== metbk1 | 2025-07-18 ‚Üí 2025-08-07 ===
Found 3 metbk1 log files in range:
  20250730.metbk1.log  (2025-07-30)
  20250731.metbk1.log  (2025-07-31)
  20250801.metbk1.log  (2025-08-01)
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/R00011/cg_data/dcl11/metbk1/20250730.metbk1.log
  -> saved to buoy_files/recovered/metbk1/20250730.metbk1.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/R00011/cg_data/dcl11/metbk1/20250731.metbk1.log
  -> saved to buoy_files/recovered/metbk1/20250731.metbk1.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/R00011/cg_data/dcl11/metbk1/20250801.metbk1.log
  -> saved to buoy_files/recovered/metbk1/20250801.metbk1.log
[parse] buoy_files/recovered/metbk1/20250730.metbk1.log
[parse] buoy_files/recovered/metbk1/20250731.metbk1.log
[parse] buoy_files/recovered/metbk1/20250801.metbk1.log

Parsed 3106 METBK records for metbk1.
Saved merged CSV to: R_metbk1_20250718_20250807.csv

=== metbk2 | 2025-07-18 ‚Üí 

## Deployed Buoy (SUMO-12) metbk data

In [13]:
D_metbk1_base = "https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl11/metbk1/"
D_metbk2_base = "https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl12/metbk2/"

df_B_metbk1 = fetch_metbk_range(
    base_url=D_metbk1_base,
    sensor_tag="metbk1",
    T1=T1,
    T2=T2,
    download_dir="buoy_files/deployed/metbk1",
    output_csv="D_metbk1_20250718_20250807.csv",
)

df_B_metbk2 = fetch_metbk_range(
    base_url=D_metbk2_base,
    sensor_tag="metbk2",
    T1=T1,
    T2=T2,
    download_dir="buoy_files/deployed/metbk1",
    output_csv="D_metbk2_20250718_20250807.csv",
)


=== metbk1 | 2025-07-18 ‚Üí 2025-08-07 ===
Found 21 metbk1 log files in range:
  20250718.metbk1.log  (2025-07-18)
  20250719.metbk1.log  (2025-07-19)
  20250720.metbk1.log  (2025-07-20)
  20250721.metbk1.log  (2025-07-21)
  20250722.metbk1.log  (2025-07-22)
  20250723.metbk1.log  (2025-07-23)
  20250724.metbk1.log  (2025-07-24)
  20250725.metbk1.log  (2025-07-25)
  20250726.metbk1.log  (2025-07-26)
  20250727.metbk1.log  (2025-07-27)
  20250728.metbk1.log  (2025-07-28)
  20250729.metbk1.log  (2025-07-29)
  20250730.metbk1.log  (2025-07-30)
  20250731.metbk1.log  (2025-07-31)
  20250801.metbk1.log  (2025-08-01)
  20250802.metbk1.log  (2025-08-02)
  20250803.metbk1.log  (2025-08-03)
  20250804.metbk1.log  (2025-08-04)
  20250805.metbk1.log  (2025-08-05)
  20250806.metbk1.log  (2025-08-06)
  20250807.metbk1.log  (2025-08-07)
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl11/metbk1/20250718.metbk1.log
  -> saved to buoy_files/deployed/metbk1/20250718.m

  -> saved to buoy_files/deployed/metbk1/20250727.metbk2.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl12/metbk2/20250728.metbk2.log
  -> saved to buoy_files/deployed/metbk1/20250728.metbk2.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl12/metbk2/20250729.metbk2.log
  -> saved to buoy_files/deployed/metbk1/20250729.metbk2.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl12/metbk2/20250730.metbk2.log
  -> saved to buoy_files/deployed/metbk1/20250730.metbk2.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl12/metbk2/20250731.metbk2.log
  -> saved to buoy_files/deployed/metbk1/20250731.metbk2.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D00012/cg_data/dcl12/metbk2/20250801.metbk2.log
  -> saved to buoy_files/deployed/metbk1/20250801.metbk2.log
[download] https://rawdata.oceanobservatories.org/files/GI01SUMO/D000