In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

### Retrieve recovered hdr files from the Raw Data Archive

In [None]:
# --- User-configurable settings -------------------------------------------
# Remote directory listing to crawl, and the local folder the files land in.
base_url = "https://rawdata.oceanobservatories.org/files/CP10CNSM/R00001/instruments/dcl27/PLIMS_sn199/"
download_folder = "Recovered_PLIMS_Data/R00001_HDRs/"

# Accumulator for .hdr file names; get_hdr_files() appends to it and the
# metadata cell further down iterates over it.
filenames = list()

# Create the local target directory up front (no error if it is already there).
os.makedirs(name=download_folder, exist_ok=True)

def get_hdr_files(url, timeout=60):
    """Recursively download .hdr files from a URL, excluding 'beads' folders.

    Fetches the HTML directory listing at ``url``, descends into every
    sub-directory link whose href does not contain 'beads', and saves each
    linked ``.hdr`` file into the module-level ``download_folder`` (flat,
    keyed by the file's base name). Files already on disk are not fetched
    again.

    Every .hdr file found -- newly downloaded or already present -- is
    recorded once in the module-level ``filenames`` list, so re-running the
    crawl against an already-populated folder still yields the complete list
    for the metadata step.

    Parameters
    ----------
    url : str
        URL of a directory listing page.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 60).

    Raises
    ------
    requests.RequestException
        On connection failures, timeouts, or an HTTP error status for either
        a listing page or a file (via ``raise_for_status``).
    """
    # Relative hrefs only resolve *beneath* this directory if it ends in '/'.
    if not url.endswith('/'):
        url += '/'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')
        # Skip anchors without a target, column-sort links ('?...') and
        # absolute-path links ('/...').
        if href is None or href.startswith('?') or href.startswith('/'):
            continue

        # Check if it's a directory
        if href.endswith('/'):
            if 'beads' in href.lower():
                continue
            new_url = urljoin(url, href)
            # Only ever descend: './' would recurse forever and '../' (or a
            # link to another host) would walk out of the requested tree.
            if new_url != url and new_url.startswith(url):
                get_hdr_files(new_url, timeout=timeout)

        # Check if it's an .hdr file
        elif href.endswith('.hdr'):
            file_url = urljoin(url, href)
            filename = os.path.basename(href)
            file_path = os.path.join(download_folder, filename)

            if not os.path.exists(file_path):
                print(f"⬇️  Downloading: {filename}")
                r = requests.get(file_url, timeout=timeout)
                # Without this an HTML error page would be saved as the .hdr
                # file and then treated as "already exists" on every re-run.
                r.raise_for_status()
                # Write to a temporary name and rename, so an interrupted
                # download never leaves a truncated file under the final name.
                part_path = file_path + '.part'
                with open(part_path, 'wb') as f:
                    f.write(r.content)
                os.replace(part_path, file_path)
            else:
                print(f"✅ Already exists: {filename}")

            # Record the file whether or not it had to be fetched this run.
            if filename not in filenames:
                filenames.append(filename)

# Kick off the crawl at the top-level instrument directory; sub-directories
# are visited by get_hdr_files itself.
get_hdr_files(url=base_url)
print("✅ All .hdr files downloaded.")


⬇️  Downloading: D20240403T141609_IFCB199.hdr
⬇️  Downloading: D20240403T152620_IFCB199.hdr
⬇️  Downloading: D20240403T182514_IFCB199.hdr
⬇️  Downloading: D20240403T212511_IFCB199.hdr
⬇️  Downloading: D20240404T002511_IFCB199.hdr
⬇️  Downloading: D20240404T032512_IFCB199.hdr
⬇️  Downloading: D20240404T092532_IFCB199.hdr
⬇️  Downloading: D20240404T122511_IFCB199.hdr
⬇️  Downloading: D20240404T152510_IFCB199.hdr
⬇️  Downloading: D20240404T182510_IFCB199.hdr
⬇️  Downloading: D20240404T212511_IFCB199.hdr
⬇️  Downloading: D20240405T002510_IFCB199.hdr
⬇️  Downloading: D20240405T032510_IFCB199.hdr
⬇️  Downloading: D20240405T062512_IFCB199.hdr
⬇️  Downloading: D20240405T122530_IFCB199.hdr
⬇️  Downloading: D20240405T152511_IFCB199.hdr
⬇️  Downloading: D20240405T182512_IFCB199.hdr
⬇️  Downloading: D20240405T212510_IFCB199.hdr
⬇️  Downloading: D20240406T002509_IFCB199.hdr
⬇️  Downloading: D20240406T032510_IFCB199.hdr
⬇️  Downloading: D20240406T062512_IFCB199.hdr
⬇️  Downloading: D20240406T092510_

In [8]:
# Column names, in order, of the metadata table / CSV.
columns_in_metadata_csv = ['filename', 'Latitude', 'Longitude', 'Depth', 'sample_type', 'Cruise', 'Instrument', 'tag1', 'tag2']

# These are the variables that will need to be updated per recovery
cnsm_surveyed_anchor_position_lat = 35.95097
cnsm_surveyed_anchor_position_lon = -75.13082
cruise = 'AR87'
ifcb_instrument_num = 'IFCB199'

# One metadata row per .hdr file name collected by the download step; every
# row carries the same position, cruise and instrument values.
metadata_rows = []

for fname in filenames:
    row = {
        # Drop only the trailing extension (str.replace would also remove a
        # '.hdr' occurring elsewhere in the name).
        'filename': os.path.splitext(fname)[0],
        'Latitude': cnsm_surveyed_anchor_position_lat,
        'Longitude': cnsm_surveyed_anchor_position_lon,
        'Depth': 7,  # keep in sync with the 'targetdepth_7m' tag below
        'sample_type': 'moored',
        'Cruise': cruise,
        'Instrument': ifcb_instrument_num,
        'tag1': 'site_cp10cnsm',
        'tag2': 'targetdepth_7m'
    }
    metadata_rows.append(row)

# Passing the column list pins the column order and keeps the header present
# even when `filenames` is empty (otherwise the frame would have no columns).
plims_metadata_df = pd.DataFrame(metadata_rows, columns=columns_in_metadata_csv)

plims_metadata_df

Unnamed: 0,filename,Latitude,Longitude,Depth,sample_type,Cruise,Instrument,tag1,tag2
0,D20240403T141609_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1,D20240403T152620_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
2,D20240403T182514_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
3,D20240403T212511_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
4,D20240404T002511_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
...,...,...,...,...,...,...,...,...,...
1420,D20250402T182401_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1421,D20250403T122400_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1422,D20250404T002400_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1423,D20250404T122401_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m


In [9]:
# Destination of the metadata table (a relative path, so it is resolved
# against the notebook's working directory)
output_csv_path = "PLIMS_R00001_metadata-NEW.csv"  # <- change this filename if needed

# Write the table without the DataFrame index column
plims_metadata_df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ Metadata saved to {output_csv_path}")

✅ Metadata saved to PLIMS_R00001_metadata-NEW.csv
