In [2]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
from datetime import datetime

### Retrieve recovered .hdr files from the Raw Data Archive

In [3]:
# User-defined variables
# recovery_string is substituted into both the local download folder name and the archive URL below.
recovery_string = "R00001"
ifcb_number     = "sn199"     # as the ifcb number appears in the raw data archive directory structure for ifcb data (format: snXXX)
# Local folder the .hdr files are saved into; created up front so downloads can write to it.
download_folder = f"Recovered_PLIMS_Data/{recovery_string}_HDRs/"
os.makedirs(download_folder, exist_ok=True)

# Parent directory listing that is searched for PLIMS* subdirectories (no wildcard in URL)
parent_url = f"https://rawdata.oceanobservatories.org/files/CP10CNSM/{recovery_string}/instruments/dcl27/"


In [4]:
# Matches a name made of exactly four ASCII digits (a year directory such as "2024").
YEAR_RE = re.compile(r"^\d{4}$", re.ASCII)

# Basenames of .hdr files appended by _recurse_and_download_hdrs; reset to empty
# every time this cell runs and read later when the metadata rows are built.
filenames = []

def _fetch_soup(url, session, timeout=30):
    """
    GET `url` with `session` and return the response body parsed as HTML.

    Raises whatever `raise_for_status()` raises when the response status
    indicates an error.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "html.parser")

def _list_entries(url, session):
    """
    Return (dirs, files) as absolute URLs found on an autoindex page.

    Only links that resolve to something *below* the listing are returned.
    Skipped links:
      - empty hrefs, sort/query links ("?..."), site-absolute links ("/...")
        and in-page anchors ("#...");
      - anything that resolves to the listing itself or outside it
        ("./", "../", external absolute URLs, "mailto:" ...), so a crawler
        recursing over the result can never climb out of, or loop on, the
        directory it started from;
      - directories whose href contains "beads" (case-insensitive).

    An href ending in "/" is classified as a directory, everything else as
    a file.
    """
    soup = _fetch_soup(url, session)
    # Directory that relative hrefs are resolved against by urljoin.
    base = urljoin(url, "./")
    dirs, files = [], []
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href or href.startswith(("?", "/", "#")):
            continue
        abs_url = urljoin(url, href)
        # Keep strict descendants of the listing only.
        if abs_url == base or not abs_url.startswith(base):
            continue
        if href.endswith("/"):
            if "beads" in href.lower():
                continue
            dirs.append(abs_url)
        else:
            files.append(abs_url)
    return dirs, files

def _basename(url):
    path = urlparse(url).path.rstrip("/")
    return path.rsplit("/", 1)[-1]

def _find_plims_dirs(session):
    """
    List the directories under parent_url whose name starts with "PLIMS".

    When ifcb_number is non-empty (e.g. 'sn199') the result is narrowed to
    the directory named exactly PLIMS_<ifcb_number>.
    """
    listed_dirs, _files = _list_entries(parent_url, session)
    wanted = f"PLIMS_{ifcb_number}" if ifcb_number else None

    matches = []
    for candidate in listed_dirs:
        name = _basename(candidate)
        if not name.startswith("PLIMS"):
            continue
        if wanted is not None and name != wanted:
            continue
        matches.append(candidate)
    return matches

def _iter_year_dirs(plims_dir, session):
    """
    Generate the subdirectory URLs of `plims_dir` whose basename matches
    YEAR_RE (a 4-digit year).
    """
    subdirs = _list_entries(plims_dir, session)[0]
    yield from (sub for sub in subdirs if YEAR_RE.match(_basename(sub)))

def _recurse_and_download_hdrs(root_url, session, download_dir):
    """
    DFS: list files here, download .hdr, then recurse into subdirs.

    root_url     -- directory listing URL to start from.
    session      -- session object used for every HTTP request.
    download_dir -- local folder; files are saved flat under their basename.

    Every .hdr file found is recorded once in the module-level `filenames`
    list, whether it was downloaded now or was already on disk, so that a
    re-run after an interruption still yields the complete list.

    Each download is streamed to "<name>.part" and renamed into place only
    once complete; an interrupted transfer therefore never leaves a
    truncated .hdr behind that a later run would mistake for a finished one.

    Raises whatever `raise_for_status()` raises on an HTTP error response.
    """
    dirs, files = _list_entries(root_url, session)

    # Download .hdr files at this level
    for f in files:
        # Test the path basename, not the raw URL, so a query string or
        # fragment on the link does not hide the extension.
        fname = _basename(f)
        if not fname.endswith(".hdr"):
            continue

        fpath = os.path.join(download_dir, fname)
        if os.path.exists(fpath):
            print(f"✅ Already exists: {fname}")
        else:
            print(f"⬇️  Downloading: {fname}")
            tmp_path = fpath + ".part"
            try:
                with session.get(f, timeout=60, stream=True) as r:
                    r.raise_for_status()
                    with open(tmp_path, "wb") as out:
                        for chunk in r.iter_content(chunk_size=1 << 15):
                            if chunk:
                                out.write(chunk)
                # Same-directory rename: the final name appears only when complete.
                os.replace(tmp_path, fpath)
            except BaseException:
                # BaseException so that a KeyboardInterrupt also cleans up.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                raise

        if fname not in filenames:
            filenames.append(fname)

    # Recurse into all subdirs under this node
    for d in dirs:
        _recurse_and_download_hdrs(d, session, download_dir)
        

def run():
    """
    Crawl the archive under parent_url and download every .hdr file.

    Opens one HTTP session, finds the PLIMS directories, then walks each
    4-digit year directory below them, saving .hdr files into
    download_folder. Prints progress as it goes and returns None; returns
    early with a warning when no PLIMS directory matches.
    """
    with requests.Session() as s:
        s.headers["User-Agent"] = "ooi-plims-crawler/1.0 (+local)"
        plims_dirs = _find_plims_dirs(s)

        if not plims_dirs:
            print("⚠️  No PLIMS directories found matching your criteria.")
            return

        for plims in plims_dirs:
            print(f"🔎 Scanning PLIMS dir: {plims}")
            for ydir in _iter_year_dirs(plims, s):
                print(f"📁 Entering year dir: {ydir}")
                _recurse_and_download_hdrs(ydir, s, download_folder)

    print("✅ All .hdr files downloaded.")

# Start the crawl when this cell is executed directly (the guard keeps it
# from running if the code is imported as a module instead).
if __name__ == "__main__":
    run()

üîé Scanning PLIMS dir: https://rawdata.oceanobservatories.org/files/CP10CNSM/R00001/instruments/dcl27/PLIMS_sn199/
üìÅ Entering year dir: https://rawdata.oceanobservatories.org/files/CP10CNSM/R00001/instruments/dcl27/PLIMS_sn199/2024/
‚¨áÔ∏è  Downloading: D20240403T141609_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240403T152620_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240403T182514_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240403T212511_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T002511_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T032512_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T092532_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T122511_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T152510_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T182510_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240404T212511_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240405T002510_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240405T032510_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240405T062512_IFCB199.hdr
‚¨áÔ∏è  Downloading: D20240405T122530_IFCB199.hdr
‚¨áÔ∏è  Down

KeyboardInterrupt: 

### Set up the variables and structure for the metadata CSV

In [5]:
# these are the variables that will need to be updated per recovery
cnsm_surveyed_anchor_position_lat = 35.949883
cnsm_surveyed_anchor_position_lon = -75.119433
cruise = 'AR87'
ifcb_instrument_num = 'IFCB199' # as the ifcb sn appears in the metadata log (format: IFCBXXX)
metadata_rows = []

# Column names and order of the metadata CSV.
columns_in_metadata_csv = ['filename', 'Latitude', 'Longitude', 'Depth', 'sample_type', 'Cruise', 'Instrument', 'tag1', 'tag2']

# One row per .hdr file collected in `filenames`; every row carries the same
# position / depth / instrument values.
for fname in filenames:
    row = {
        # Strip only a trailing ".hdr" (not an occurrence elsewhere in the name).
        'filename': fname[:-len('.hdr')] if fname.endswith('.hdr') else fname,
        'Latitude': cnsm_surveyed_anchor_position_lat,
        'Longitude': cnsm_surveyed_anchor_position_lon,
        'Depth': 7,
        'sample_type': 'moored',
        # NOTE(review): left blank even though `cruise` is set above -- confirm
        # this is intended for moored samples.
        'Cruise': '',
        'Instrument': ifcb_instrument_num,
        'tag1': 'site_CP10CNSM',
        'tag2': 'targetdepth_7m'
    }
    metadata_rows.append(row)

# Passing the column list pins the column order and keeps the header present
# even when `filenames` is empty.
plims_metadata_df = pd.DataFrame(metadata_rows, columns=columns_in_metadata_csv)

plims_metadata_df

Unnamed: 0,filename,Latitude,Longitude,Depth,sample_type,Cruise,Instrument,tag1,tag2
0,D20240403T141609_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
1,D20240403T152620_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
2,D20240403T182514_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
3,D20240403T212511_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
4,D20240404T002511_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
...,...,...,...,...,...,...,...,...,...
164,D20240427T032509_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
165,D20240427T062507_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
166,D20240427T092509_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m
167,D20240427T122509_IFCB199,35.949883,-75.119433,7,moored,,IFCB199,site_CP10CNSM,targetdepth_7m


In [6]:
# Timestamp (to the second) in the file name keeps exports from separate runs apart.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"Metadata_CSVs/{cruise}_moored_ifcb_dashboard_metadata_{timestamp}.csv"
# to_csv does not create missing folders, so make sure the output folder exists first.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
plims_metadata_df.to_csv(output_filename, index=False)