# Data Download


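This notebook transfers datasets from OOINET (and possibly other
sources, such as PO.DAAC) to the local machine. The code cell below lists the
files at a download URL, reports the total data volume, and downloads them
after interactive confirmation.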

In [None]:
import requests
from bs4 import BeautifulSoup
import os
from pathlib import Path

def download_ooi_data(download_link, target_folder, timeout=60):
    """Download the files listed at an HTTP directory index into a local folder.

    The function scrapes the directory listing at ``download_link``, issues one
    HEAD request per file to build a size manifest, prints summary statistics,
    optionally drops CTD files (names containing "ctdpf", any letter case) and,
    after interactive confirmation, streams every remaining file into
    ``target_folder``.

    Args:
        download_link: URL of the directory listing. A trailing slash is
            optional.
        target_folder: Destination directory. A leading ``~`` is expanded and
            the directory (including parents) is created if it does not exist.
        timeout: Seconds ``requests`` waits for the server on each request
            before giving up. Defaults to 60.

    Returns:
        None. Progress and statistics are printed; files are written to disk.

    Raises:
        requests.exceptions.HTTPError: If the directory listing or a file
            download returns an HTTP error status.
        requests.exceptions.RequestException: On connection failures or
            timeouts.
        OSError: If the target folder or a file cannot be written.
    """
    # Expand a leading ~ so callers can pass home-relative paths.
    target_folder = os.path.expanduser(target_folder)
    Path(target_folder).mkdir(parents=True, exist_ok=True)

    base_url = download_link.rstrip('/')

    # One session for the whole run so the connection is reused across the
    # (potentially hundreds of) HEAD and GET requests.
    with requests.Session() as session:
        files = _list_remote_files(session, download_link, timeout)
        if not files:
            print(f"No files found at {download_link}")
            return

        # Get file sizes - Initial Download Manifest
        file_info = []
        for filename in files:
            # HEAD does not follow redirects unless asked to.
            head = session.head(
                f"{base_url}/{filename}", allow_redirects=True, timeout=timeout
            )
            if head.ok:
                size = int(head.headers.get('content-length', 0))
            else:
                # Keep the file in the manifest, but say why its size is 0.
                print(f"Warning: HEAD {filename} returned HTTP {head.status_code}; size unknown")
                size = 0
            file_info.append((filename, size))

        total_size = sum(size for _, size in file_info)
        nc_files = [(name, size) for name, size in file_info if name.endswith('.nc')]

        # Print initial statistics
        print(f"Total files: {len(files)}")
        print(f"Total data volume: {total_size / (1024**3):.2f} GB")

        if nc_files:
            mean_nc_size = sum(s for _, s in nc_files) / len(nc_files)
            print(f"NetCDF files: {len(nc_files)}")
            print(f"Mean NetCDF file size: {mean_nc_size / (1024**2):.2f} MB")

        # Modify Download Manifest. CTD files are skipped unless the user
        # explicitly declines ("n" or "no"); an empty answer keeps the default.
        userskip = input("\nSkip CTD files? (y/n): ").strip().lower()
        if userskip not in ('n', 'no'):
            file_info = [(f, s) for f, s in file_info if 'ctdpf' not in f.lower()]
            print("Removed CTD files from manifest")

        # Print final statistics
        final_size = sum(s for _, s in file_info)
        print("\nFinal Download Manifest:")
        print(f"Total files: {len(file_info)}")
        print(f"Total data volume: {final_size / (1024**3):.2f} GB")

        # Confirm download: only an explicit yes starts the transfer.
        confirm = input("\nProceed with download? (y/n): ").strip().lower()
        if confirm not in ('y', 'yes'):
            print("Download cancelled")
            return

        # Download files
        for filename, size in file_info:
            # Use only the final path component so an href such as "/x" or
            # "../x" cannot place a file outside target_folder.
            local_name = os.path.basename(filename)
            if not local_name:
                print(f"Skipping {filename!r}: no usable file name")
                continue
            target_path = os.path.join(target_folder, local_name)

            print(f"Downloading {filename} ({size / (1024**2):.2f} MB)...")
            _download_file(session, f"{base_url}/{filename}", target_path, timeout)

    print(f"\nDownload complete. Files saved to {target_folder}")


def _list_remote_files(session, download_link, timeout):
    """Return the hrefs of the non-directory entries in the listing at download_link."""
    response = session.get(download_link, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep file links only: drop sort-order links ("?..."), the parent
    # directory link and sub-directories (hrefs ending with "/").
    files = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('?') and href != '../' and not href.endswith('/'):
            files.append(href)
    return files


def _download_file(session, file_url, target_path, timeout):
    """Stream file_url to target_path, leaving no partial file behind on failure."""
    # Write to a sibling temporary name and rename on success, so an
    # interrupted transfer never looks like a complete file.
    part_path = f"{target_path}.part"
    try:
        with session.get(file_url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as data.
            response.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(part_path, target_path)
    except BaseException:
        # Also covers a kernel interrupt (KeyboardInterrupt) mid-transfer.
        if os.path.exists(part_path):
            os.remove(part_path)
        raise

if __name__ == "__main__":
    # The defaults reproduce the original run. Set OOI_DOWNLOAD_LINK and/or
    # OOI_TARGET_FOLDER in the environment to fetch a different request or to
    # save elsewhere without editing this cell.
    download_link = os.environ.get(
        "OOI_DOWNLOAD_LINK",
        "https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260103T003554734Z-RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample",
    )
    target_folder = os.environ.get(
        "OOI_TARGET_FOLDER",
        "/home/rob/ooidata/rca/sb/scalar/2015_2025_par",
    )
    download_ooi_data(download_link, target_folder)

Total files: 155
Total data volume: 70.86 GB
NetCDF files: 125
Mean NetCDF file size: 580.45 MB



Skip CTD files? (y/n):  y


Removed CTD files from manifest

Final Download Manifest:
Total files: 110
Total data volume: 55.41 GB



Proceed with download? (y/n):  y







[('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample.ncml', 2185), ('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample_20150101T173429.059605-20150205T145959.761242.nc', 691440176), ('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample_20150205T150000.053521-20150312T025959.778731.nc', 690818270), ('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample_20150312T030000.060683-20150421T145959.885758.nc', 688910602), ('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample_20150421T150000.167701-20150524T145959.808200.nc', 690762066), ('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample_20150524T150000.100172-20150628T085959.734947.nc', 692086824), ('deployment0001_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample_20150628T090000.027704-20150705T235959.767406.nc', 158898090), ('deployment0002_RS01SBPS-SF01A-3C-PARADA101-streamed-parad_sa_sample.ncml', 3292), ('deployment0002_RS01SBPS-SF