# Data Download


This notebook copies data from an OOINET download site to localhost. The directory structure is

```
ooidata/
        array/
              platform/
                       density/
                               year_instrument/
                                               long_filename_for_dataset.nc
```


where...


- `ooidata` is verbatim (root)
- `array` = "rca" for regional cabled array, to begin with
- `platform` = "sb" for (Oregon) Slope Base, to begin with
- `density` = "scalar" or alternative "vector"
- `year` = "2018", etcetera; avoiding more than one year per folder
- `instrument` = "ctd" to begin with
- `long_filename.nc` is constructed by OOINET; and it indicates time range



In [9]:
import requests
from bs4 import BeautifulSoup
import os
from pathlib import Path

def download_ooi_data(download_link, target_folder):

    # expand target folder to full path if it uses ~
    target_folder = os.path.expanduser(target_folder)
    
    # Create folder if it doesn't exist
    Path(target_folder).mkdir(parents=True, exist_ok=True)
    
    # Get directory listing
    response = requests.get(download_link)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all file links (exclude directories ending with /)
    files = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('?') and href != '../' and not href.endswith('/'):
            files.append(href)
    
    # Get file sizes - Initial Download Manifest
    file_info = []
    total_size = 0
    nc_files = []
    
    for filename in files:
        file_url = f"{download_link.rstrip('/')}/{filename}"
        head = requests.head(file_url)
        size = int(head.headers.get('content-length', 0))
        file_info.append((filename, size))
        total_size += size
        
        if filename.endswith('.nc'):
            nc_files.append((filename, size))
    
    # Print initial statistics
    print(f"Total files: {len(files)}")
    print(f"Total data volume: {total_size / (1024**3):.2f} GB")
    
    if nc_files:
        mean_nc_size = sum(s for _, s in nc_files) / len(nc_files)
        print(f"NetCDF files: {len(nc_files)}")
        print(f"Mean NetCDF file size: {mean_nc_size / (1024**2):.2f} MB")
    
    # Modify Download Manifest
    userskip = input("\nSkip CTD files? (y/n): ")
    if userskip.lower() != "n":
        file_info = [(f, s) for f, s in file_info if not ('ctdpf' in f or 'CTDPF' in f)]
        print(f"Removed CTD files from manifest")
    
    # Print final statistics
    final_size = sum(s for _, s in file_info)
    print(f"\nFinal Download Manifest:")
    print(f"Total files: {len(file_info)}")
    print(f"Total data volume: {final_size / (1024**3):.2f} GB")
    
    # Confirm download
    confirm = input("\nProceed with download? (y/n): ")
    if confirm.lower() != 'y':
        print("Download cancelled")
        return

    print("\n\n\n\n")
    print(file_info)
    print("\n\n\n\n")


    # Download files
    for filename, size in file_info:
        file_url = f"{download_link.rstrip('/')}/{filename}"
        target_path = os.path.join(target_folder, filename)
        print("filename is \n\n")
        print(filename)
        print("\n\nfile_url is \n\n")
        print(file_url)
        print("\n\n\ntarget_path is \n\n")
        print(target_path)
        
        print(f"Downloading {filename} ({size / (1024**2):.2f} MB)...")
        
        r = requests.get(file_url, stream=True)
        with open(target_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    
    print(f"\nDownload complete. Files saved to {target_folder}")

if __name__ == "__main__":
    # download_link = input("Enter download link: ")
    # download_link = "https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260103T003554734Z-RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample"
    # 2019 OSB PS CTD download_link = "https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260205T190055511Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample"
    # 2020 OSB PS CTD
    download_link = "https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260205T191657736Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample"

    # target_folder = input("Enter target folder path: ")
    target_folder = "/home/kilroy/ooidata/rca/sb/scalar/2019_ctd"
    download_ooi_data(download_link, target_folder)

Total files: 14
Total data volume: 4.26 GB
NetCDF files: 10
Mean NetCDF file size: 435.91 MB



Skip CTD files? (y/n):  n



Final Download Manifest:
Total files: 14
Total data volume: 4.26 GB



Proceed with download? (y/n):  y







[('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample.ncml', 2579), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20200803T164413.888856-20200824T235959.129850.nc', 491985122), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20200825T000000.129850-20200913T235959.627555.nc', 490287476), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20200914T000000.627765-20201002T115959.289144.nc', 482824481), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201002T120000.289146-20201020T235959.042914.nc', 492198051), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201021T000000.043437-20201110T115959.760754.nc', 492834729), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201110T120000.760652-20201128T235959.413385.nc', 492154961), ('deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201129T000000.41

In [5]:
!ls ~/ooidata/rca/sb/scalar/2019_ctd

deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample.ncml
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190101T000001.742366-20190118T235959.845587.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190119T000000.845285-20190206T115959.792782.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190206T120000.792895-20190224T235959.207768.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190225T000000.207466-20190315T115959.271513.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190315T120000.271524-20190402T235959.171888.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190403T000000.172626-20190421T115959.199853.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190421T120000.199654-20190510T115959.483280.nc
deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190510T1200

In [8]:
import xarray as xr
ds = xr.open_dataset('~/ooidata/rca/sb/scalar/2019_ctd/deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190315T120000.271524-20190402T235959.171888.nc')
ds