In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

### Retrieve recovered .hdr files from the Raw Data Archive

In [6]:
# User-defined variables
# download_folder: local directory (relative to the working directory) that receives the .hdr files
# base_url: directory listing on the raw data server where the recursive crawl starts
download_folder = "Recovered_PLIMS_Data/R00001_HDRs/"
base_url = "https://rawdata.oceanobservatories.org/files/CP10CNSM/R00001/instruments/dcl27/PLIMS_sn199/"

# Make sure the folder exists (exist_ok=True makes re-running this cell safe)
os.makedirs(download_folder, exist_ok=True)

# Store filenames: get_hdr_files() appends to this list, and the metadata cell
# iterates over it. Re-running this cell resets the list to empty.
filenames = []

def get_hdr_files(url):
    """Recursively download .hdr files from a URL, excluding 'beads' folders.

    Fetches the HTML listing at ``url``, descends into every linked
    sub-directory whose href does not contain 'beads' (case-insensitive), and
    saves each linked ``.hdr`` file into the module-level ``download_folder``.
    A file that already exists on disk is not fetched again.

    Every .hdr file encountered (newly downloaded or already on disk) is
    recorded once in the module-level ``filenames`` list, so re-running the
    crawl against a populated download folder still yields the complete list
    instead of an empty one.

    Args:
        url: URL of a directory listing page to crawl.

    Raises:
        requests.HTTPError: if the listing or a file request returns an HTTP
            error status (nothing is written to disk for that file).
        requests.Timeout: if the server does not respond within the timeout.
    """
    request_timeout = 60  # seconds; prevents the crawl from hanging forever

    response = requests.get(url, timeout=request_timeout)
    # Fail loudly rather than parsing an error page as an (empty) listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Directory that relative links on this page resolve against.
    listing_root = urljoin(url, '.')

    for link in soup.find_all('a'):
        href = link.get('href')
        # Skip missing hrefs, column-sort links ('?...') and absolute paths ('/...').
        if href is None or href.startswith('?') or href.startswith('/'):
            continue

        target_url = urljoin(url, href)

        # Check if it's a directory
        if href.endswith('/') and 'beads' not in href.lower():
            # Only descend into real sub-directories: links such as './' or
            # '../' would otherwise loop forever or climb out of the subtree.
            if target_url == listing_root or not target_url.startswith(listing_root):
                continue
            get_hdr_files(target_url)

        # Check if it's an .hdr file
        elif href.endswith('.hdr'):
            filename = os.path.basename(href)
            file_path = os.path.join(download_folder, filename)

            if not os.path.exists(file_path):
                print(f"⬇️  Downloading: {filename}")
                r = requests.get(target_url, timeout=request_timeout)
                # Check the status before opening the file so an error body is
                # never saved under a .hdr name.
                r.raise_for_status()
                with open(file_path, 'wb') as f:
                    f.write(r.content)
            else:
                print(f"✅ Already exists: {filename}")

            # Record the file whether or not it had to be downloaded; the
            # membership check keeps the list free of duplicates.
            if filename not in filenames:
                filenames.append(filename)

# Start the recursive crawl at base_url; get_hdr_files() saves files into
# download_folder and appends to filenames as it goes
get_hdr_files(base_url)

# Printed once the crawl call above has returned
print("✅ All .hdr files downloaded.")


⬇️  Downloading: D20240403T141609_IFCB199.hdr
⬇️  Downloading: D20240403T152620_IFCB199.hdr
⬇️  Downloading: D20240403T182514_IFCB199.hdr
⬇️  Downloading: D20240403T212511_IFCB199.hdr
⬇️  Downloading: D20240404T002511_IFCB199.hdr
⬇️  Downloading: D20240404T032512_IFCB199.hdr
⬇️  Downloading: D20240404T092532_IFCB199.hdr
⬇️  Downloading: D20240404T122511_IFCB199.hdr
⬇️  Downloading: D20240404T152510_IFCB199.hdr
⬇️  Downloading: D20240404T182510_IFCB199.hdr
⬇️  Downloading: D20240404T212511_IFCB199.hdr
⬇️  Downloading: D20240405T002510_IFCB199.hdr
⬇️  Downloading: D20240405T032510_IFCB199.hdr
⬇️  Downloading: D20240405T062512_IFCB199.hdr
⬇️  Downloading: D20240405T122530_IFCB199.hdr
⬇️  Downloading: D20240405T152511_IFCB199.hdr
⬇️  Downloading: D20240405T182512_IFCB199.hdr
⬇️  Downloading: D20240405T212510_IFCB199.hdr
⬇️  Downloading: D20240406T002509_IFCB199.hdr
⬇️  Downloading: D20240406T032510_IFCB199.hdr
⬇️  Downloading: D20240406T062512_IFCB199.hdr
⬇️  Downloading: D20240406T092510_

⬇️  Downloading: D20240429T092509_IFCB199.hdr
⬇️  Downloading: D20240429T122511_IFCB199.hdr
⬇️  Downloading: D20240429T152509_IFCB199.hdr
⬇️  Downloading: D20240429T182508_IFCB199.hdr
⬇️  Downloading: D20240429T212508_IFCB199.hdr
⬇️  Downloading: D20240430T002510_IFCB199.hdr
⬇️  Downloading: D20240430T062507_IFCB199.hdr
⬇️  Downloading: D20240430T092508_IFCB199.hdr
⬇️  Downloading: D20240430T122510_IFCB199.hdr
⬇️  Downloading: D20240430T152508_IFCB199.hdr
⬇️  Downloading: D20240430T182508_IFCB199.hdr
⬇️  Downloading: D20240430T212508_IFCB199.hdr
⬇️  Downloading: D20240501T002509_IFCB199.hdr
⬇️  Downloading: D20240501T032508_IFCB199.hdr
⬇️  Downloading: D20240501T092528_IFCB199.hdr
⬇️  Downloading: D20240501T122508_IFCB199.hdr
⬇️  Downloading: D20240501T152507_IFCB199.hdr
⬇️  Downloading: D20240501T182507_IFCB199.hdr
⬇️  Downloading: D20240501T212507_IFCB199.hdr
⬇️  Downloading: D20240502T002507_IFCB199.hdr
⬇️  Downloading: D20240502T032508_IFCB199.hdr
⬇️  Downloading: D20240502T062507_

⬇️  Downloading: D20240531T122504_IFCB199.hdr
⬇️  Downloading: D20240531T162503_IFCB199.hdr
⬇️  Downloading: D20240531T202503_IFCB199.hdr
⬇️  Downloading: D20240601T002503_IFCB199.hdr
⬇️  Downloading: D20240601T042503_IFCB199.hdr
⬇️  Downloading: D20240601T122522_IFCB199.hdr
⬇️  Downloading: D20240601T162505_IFCB199.hdr
⬇️  Downloading: D20240601T202504_IFCB199.hdr
⬇️  Downloading: D20240602T002503_IFCB199.hdr
⬇️  Downloading: D20240602T042503_IFCB199.hdr
⬇️  Downloading: D20240602T082504_IFCB199.hdr
⬇️  Downloading: D20240602T122505_IFCB199.hdr
⬇️  Downloading: D20240602T162504_IFCB199.hdr
⬇️  Downloading: D20240603T002523_IFCB199.hdr
⬇️  Downloading: D20240603T042505_IFCB199.hdr
⬇️  Downloading: D20240603T082502_IFCB199.hdr
⬇️  Downloading: D20240603T122504_IFCB199.hdr
⬇️  Downloading: D20240603T162501_IFCB199.hdr
⬇️  Downloading: D20240603T202503_IFCB199.hdr
⬇️  Downloading: D20240604T002503_IFCB199.hdr
⬇️  Downloading: D20240604T042502_IFCB199.hdr
⬇️  Downloading: D20240604T122523_

⬇️  Downloading: D20240705T122458_IFCB199.hdr
⬇️  Downloading: D20240705T162457_IFCB199.hdr
⬇️  Downloading: D20240706T002515_IFCB199.hdr
⬇️  Downloading: D20240706T042458_IFCB199.hdr
⬇️  Downloading: D20240706T082455_IFCB199.hdr
⬇️  Downloading: D20240706T122458_IFCB199.hdr
⬇️  Downloading: D20240706T162458_IFCB199.hdr
⬇️  Downloading: D20240706T202458_IFCB199.hdr
⬇️  Downloading: D20240707T002456_IFCB199.hdr
⬇️  Downloading: D20240707T042457_IFCB199.hdr
⬇️  Downloading: D20240707T122456_IFCB199.hdr
⬇️  Downloading: D20240707T162458_IFCB199.hdr
⬇️  Downloading: D20240707T202458_IFCB199.hdr
⬇️  Downloading: D20240708T002457_IFCB199.hdr
⬇️  Downloading: D20240708T042455_IFCB199.hdr
⬇️  Downloading: D20240708T082457_IFCB199.hdr
⬇️  Downloading: D20240708T122458_IFCB199.hdr
⬇️  Downloading: D20240708T162456_IFCB199.hdr
⬇️  Downloading: D20240709T002514_IFCB199.hdr
⬇️  Downloading: D20240709T042456_IFCB199.hdr
⬇️  Downloading: D20240709T082456_IFCB199.hdr
⬇️  Downloading: D20240709T122458_

⬇️  Downloading: D20240903T002447_IFCB199.hdr
⬇️  Downloading: D20240903T122448_IFCB199.hdr
⬇️  Downloading: D20240904T122445_IFCB199.hdr
⬇️  Downloading: D20240904T162448_IFCB199.hdr
⬇️  Downloading: D20240904T202445_IFCB199.hdr
⬇️  Downloading: D20240905T002447_IFCB199.hdr
⬇️  Downloading: D20240905T042445_IFCB199.hdr
⬇️  Downloading: D20240905T082447_IFCB199.hdr
⬇️  Downloading: D20240905T122447_IFCB199.hdr
⬇️  Downloading: D20240905T162446_IFCB199.hdr
⬇️  Downloading: D20240906T002446_IFCB199.hdr
⬇️  Downloading: D20240906T042447_IFCB199.hdr
⬇️  Downloading: D20240906T082446_IFCB199.hdr
⬇️  Downloading: D20240906T122447_IFCB199.hdr
⬇️  Downloading: D20240906T162446_IFCB199.hdr
⬇️  Downloading: D20240906T202446_IFCB199.hdr
⬇️  Downloading: D20240907T002447_IFCB199.hdr
⬇️  Downloading: D20240907T042447_IFCB199.hdr
⬇️  Downloading: D20240907T122505_IFCB199.hdr
⬇️  Downloading: D20240907T162445_IFCB199.hdr
⬇️  Downloading: D20240907T202445_IFCB199.hdr
⬇️  Downloading: D20240908T002447_

⬇️  Downloading: D20241031T202438_IFCB199.hdr
⬇️  Downloading: D20241101T042501_IFCB199.hdr
⬇️  Downloading: D20241101T082439_IFCB199.hdr
⬇️  Downloading: D20241101T122440_IFCB199.hdr
⬇️  Downloading: D20241101T162439_IFCB199.hdr
⬇️  Downloading: D20241101T202440_IFCB199.hdr
⬇️  Downloading: D20241102T002442_IFCB199.hdr
⬇️  Downloading: D20241102T042439_IFCB199.hdr
⬇️  Downloading: D20241102T082441_IFCB199.hdr
⬇️  Downloading: D20241102T162459_IFCB199.hdr
⬇️  Downloading: D20241102T202438_IFCB199.hdr
⬇️  Downloading: D20241103T002439_IFCB199.hdr
⬇️  Downloading: D20241103T042439_IFCB199.hdr
⬇️  Downloading: D20241103T082439_IFCB199.hdr
⬇️  Downloading: D20241103T122440_IFCB199.hdr
⬇️  Downloading: D20241103T162439_IFCB199.hdr
⬇️  Downloading: D20241103T202439_IFCB199.hdr
⬇️  Downloading: D20241104T042459_IFCB199.hdr
⬇️  Downloading: D20241104T082439_IFCB199.hdr
⬇️  Downloading: D20241104T122441_IFCB199.hdr
⬇️  Downloading: D20241104T162441_IFCB199.hdr
⬇️  Downloading: D20241104T202439_

⬇️  Downloading: D20241221T062452_IFCB199.hdr
⬇️  Downloading: D20241221T122433_IFCB199.hdr
⬇️  Downloading: D20241221T182431_IFCB199.hdr
⬇️  Downloading: D20241222T002433_IFCB199.hdr
⬇️  Downloading: D20241222T062433_IFCB199.hdr
⬇️  Downloading: D20241222T122431_IFCB199.hdr
⬇️  Downloading: D20241222T182431_IFCB199.hdr
⬇️  Downloading: D20241223T002433_IFCB199.hdr
⬇️  Downloading: D20241223T122449_IFCB199.hdr
⬇️  Downloading: D20241223T182430_IFCB199.hdr
⬇️  Downloading: D20241224T002433_IFCB199.hdr
⬇️  Downloading: D20241224T062431_IFCB199.hdr
⬇️  Downloading: D20241224T122431_IFCB199.hdr
⬇️  Downloading: D20241224T182430_IFCB199.hdr
⬇️  Downloading: D20241225T002431_IFCB199.hdr
⬇️  Downloading: D20241225T062431_IFCB199.hdr
⬇️  Downloading: D20241225T182450_IFCB199.hdr
⬇️  Downloading: D20241226T002430_IFCB199.hdr
⬇️  Downloading: D20241226T062430_IFCB199.hdr
⬇️  Downloading: D20241226T122431_IFCB199.hdr
⬇️  Downloading: D20241226T182429_IFCB199.hdr
⬇️  Downloading: D20241227T062434_

⬇️  Downloading: D20250211T002436_IFCB199.hdr
⬇️  Downloading: D20250211T062416_IFCB199.hdr
⬇️  Downloading: D20250211T122416_IFCB199.hdr
⬇️  Downloading: D20250211T182416_IFCB199.hdr
⬇️  Downloading: D20250212T002416_IFCB199.hdr
⬇️  Downloading: D20250212T062416_IFCB199.hdr
⬇️  Downloading: D20250212T122415_IFCB199.hdr
⬇️  Downloading: D20250212T182416_IFCB199.hdr
⬇️  Downloading: D20250213T062435_IFCB199.hdr
⬇️  Downloading: D20250213T122415_IFCB199.hdr
⬇️  Downloading: D20250213T182415_IFCB199.hdr
⬇️  Downloading: D20250214T002415_IFCB199.hdr
⬇️  Downloading: D20250214T062416_IFCB199.hdr
⬇️  Downloading: D20250214T122415_IFCB199.hdr
⬇️  Downloading: D20250214T182415_IFCB199.hdr
⬇️  Downloading: D20250215T002415_IFCB199.hdr
⬇️  Downloading: D20250215T122434_IFCB199.hdr
⬇️  Downloading: D20250215T182413_IFCB199.hdr
⬇️  Downloading: D20250216T002415_IFCB199.hdr
⬇️  Downloading: D20250216T122418_IFCB199.hdr
⬇️  Downloading: D20250216T182412_IFCB199.hdr
⬇️  Downloading: D20250217T002413_

In [8]:
# Column names, in order, of the metadata table / CSV. Passed to the DataFrame
# constructor below so the header is present even when `filenames` is empty.
columns_in_metadata_csv = ['filename', 'Latitude', 'Longitude', 'Depth', 'sample_type', 'Cruise', 'Instrument', 'tag1', 'tag2']

# These are the variables that will need to be updated per recovery
cnsm_surveyed_anchor_position_lat = 35.95097
cnsm_surveyed_anchor_position_lon = -75.13082
cruise = 'AR87'
ifcb_instrument_num = 'IFCB199'

# One row per .hdr file collected by the download step. Every row carries the
# same position, depth, cruise and tags; only the filename differs.
metadata_rows = [
    {
        # Drop only the trailing extension (str.replace would also strip
        # '.hdr' occurring elsewhere in the name).
        'filename': os.path.splitext(fname)[0],
        'Latitude': cnsm_surveyed_anchor_position_lat,
        'Longitude': cnsm_surveyed_anchor_position_lon,
        'Depth': 7,  # presumably metres, matching the 'targetdepth_7m' tag; confirm
        'sample_type': 'moored',
        'Cruise': cruise,
        'Instrument': ifcb_instrument_num,
        'tag1': 'site_cp10cnsm',
        'tag2': 'targetdepth_7m'
    }
    for fname in filenames
]

# Apply the declared schema so column order is explicit and an empty input
# still produces a frame (and later a CSV) with the expected header.
plims_metadata_df = pd.DataFrame(metadata_rows, columns=columns_in_metadata_csv)

plims_metadata_df

Unnamed: 0,filename,Latitude,Longitude,Depth,sample_type,Cruise,Instrument,tag1,tag2
0,D20240403T141609_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1,D20240403T152620_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
2,D20240403T182514_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
3,D20240403T212511_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
4,D20240404T002511_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
...,...,...,...,...,...,...,...,...,...
1420,D20250402T182401_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1421,D20250403T122400_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1422,D20250404T002400_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m
1423,D20250404T122401_IFCB199,35.95097,-75.13082,7,moored,AR87,IFCB199,site_cp10cnsm,targetdepth_7m


In [9]:
# Define output path for the metadata CSV (a relative path, so it is resolved
# against the current working directory)
output_csv_path = "PLIMS_R00001_metadata-NEW.csv"  # <- change this filename if needed

# Save the DataFrame to CSV; index=False leaves the row index out of the file
plims_metadata_df.to_csv(output_csv_path, index=False)

print(f"✅ Metadata saved to {output_csv_path}")

✅ Metadata saved to PLIMS_R00001_metadata-NEW.csv
