# Data Download


This notebook deals with transferring datasets from OOINET (and possibly other
resources such as the PO-DAAC) to localhost.


Placing the data orders manually is easy, although it gets a bit tedious when done in 1-year blocks.
The resulting download links are placed in a text file with one URL per line (annotated here with the year and a `done` marker):

```
2015 done https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235914738Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2016 done https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235958368Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2017      https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260208T000033389Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2018 done https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/?
2019 done https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/?
2020 done https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/?
2021 done https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235434950Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2022      https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235535508Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2023      https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235629121Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2024      https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235701343Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
2025      https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235802626Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
```

The code in the cell below reads this file and visits each link in succession,
downloading the NetCDF (`.nc`) files listed there to the corresponding per-year folder on localhost.

In [None]:
import requests
from pathlib import Path
from bs4 import BeautifulSoup
import re

# Matches the trailing "_<start>-<end>.nc" time range of a data file name,
# where each timestamp is YYYYMMDDTHHMMSS with an optional ".<fraction>".
# Only the start year is captured.
_FILENAME_YEAR_RE = re.compile(
    r'_(\d{4})\d{2}\d{2}T\d{6}(?:\.\d+)?-\d{8}T\d{6}(?:\.\d+)?\.nc$'
)


def parse_year_from_filename(filename):
    """Extract the start year from a NetCDF filename.

    The name must end in ``_<start>-<end>.nc`` where both timestamps look
    like ``YYYYMMDDTHHMMSS``. The fractional-second part of either timestamp
    is optional, so names whose timestamps fall on a whole second are still
    recognised instead of being silently ignored.

    Args:
        filename: File name (or href) to inspect.

    Returns:
        The four-digit year of the start timestamp as an int, or None when
        the name does not end in the expected time-range pattern.
    """
    match = _FILENAME_YEAR_RE.search(filename)
    if match:
        return int(match.group(1))
    return None

def is_file_complete(filepath):
    """Report whether ``filepath`` is present on disk and holds at least one byte.

    A missing path and an empty (zero-byte) file both count as incomplete.
    """
    # Short-circuit: stat() is only reached once the path is known to exist.
    return filepath.exists() and filepath.stat().st_size > 0

def download_file(url, destination):
    """Download a file from URL to destination.

    The response body is streamed into a sibling file with a ``.tmp`` suffix
    and moved over ``destination`` only after the transfer has finished, so
    an interrupted download never leaves a partial file under the final name.

    Args:
        url: Address of the file to fetch.
        destination: ``pathlib.Path`` of the final file.

    Returns:
        True if the file was downloaded and moved into place; False if any
        step failed, in which case the error is printed and the temporary
        file (if one was created) is removed.
    """
    # Derived before the ``try`` so the cleanup in the handler can always
    # refer to it, even when the request itself is what failed.
    temp_file = destination.with_suffix('.tmp')

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()

            with open(temp_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # replace() overwrites an existing (e.g. zero-byte) destination on
        # every platform, whereas rename() refuses to on Windows.
        temp_file.replace(destination)
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort step and the caller
        # relies on a False return rather than an exception.
        print(f"    Error: {e}")
        # Clean up temp file if it exists
        if temp_file.exists():
            temp_file.unlink()
        return False

def _extract_url(line):
    """Return the URL on a link-list line, ignoring any other tokens on it."""
    for token in line.split():
        if token.startswith(("http://", "https://")):
            return token
    # No recognisable URL: hand the line back unchanged so the per-URL error
    # handling reports it instead of it silently disappearing.
    return line.strip()


def _list_nc_files(url):
    """Return the hrefs ending in .nc that are linked from the page at url."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (link.get('href', '') for link in soup.find_all('a'))
    # A name ending in ".nc" can never end in ".ncml", so one test suffices.
    return [href for href in hrefs if href.endswith('.nc')]


def bulk_download():
    """Bulk download CTD files from URL list with restart tolerance.

    Reads ``~/argosy/download_link_list.txt`` (one link per non-blank line;
    any extra whitespace-separated tokens around the URL are ignored), fetches
    the directory listing behind each link and downloads every ``.nc`` file
    into ``~/ooidata/rca/sb/scalar/<year>_ctd/``, where ``<year>`` comes from
    the file name. Files already present with non-zero size are not fetched
    again, so the function can simply be re-run after an interruption.

    Progress and a final summary are printed. Nothing is downloaded if the
    link list is missing or if any of the 2014-2026 year folders is absent.

    Returns:
        None.
    """
    # Read URL list
    url_list_file = Path("~/argosy/download_link_list.txt").expanduser()

    if not url_list_file.exists():
        print(f"File not found: {url_list_file}")
        return

    with open(url_list_file, 'r') as f:
        urls = [_extract_url(line) for line in f if line.strip()]

    print(f"Found {len(urls)} URLs to process\n")

    # Base folder and the years that get a "<year>_ctd" sub-folder
    base_folder = Path("~/ooidata/rca/sb/scalar").expanduser()
    years = range(2014, 2027)

    # Check that year folders exist
    for year in years:
        year_folder = base_folder / f"{year}_ctd"
        if not year_folder.exists():
            print(f"Destination folder does not exist: {year_folder}")
            print("Please create all required year folders before running download")
            return

    total_downloaded = 0
    total_skipped = 0
    total_already_complete = 0

    # Process each URL
    for url_idx, url in enumerate(urls, 1):
        print(f"=== URL {url_idx}/{len(urls)} ===")
        print(url)

        # One failing link must not abort the whole run, hence the broad
        # handler around everything done for a single URL.
        try:
            nc_files = _list_nc_files(url)

            # Split the listing into files already on disk and files to fetch.
            # Names without a recognisable year are left out of both groups.
            already_downloaded = 0
            to_download = []

            for filename in nc_files:
                year = parse_year_from_filename(filename)
                if year is None:
                    continue

                dest_file = base_folder / f"{year}_ctd" / filename

                if is_file_complete(dest_file):
                    already_downloaded += 1
                else:
                    to_download.append((filename, year, dest_file))

            # Print diagnostics
            print(f"  Total .nc files: {len(nc_files)}")
            print(f"  Already downloaded: {already_downloaded}")
            print(f"  Remaining to download: {len(to_download)}")

            total_already_complete += already_downloaded

            if not to_download:
                print("  All files complete, skipping\n")
                continue

            # Download remaining files
            for file_idx, (filename, year, dest_file) in enumerate(to_download, 1):
                file_url = url.rstrip('/') + '/' + filename
                print(f"  [{file_idx}/{len(to_download)}] Downloading {filename} to {year}_ctd/")

                if download_file(file_url, dest_file):
                    total_downloaded += 1
                    file_size = dest_file.stat().st_size / (1024*1024)
                    print(f"    Complete: {file_size:.1f} MB")
                else:
                    total_skipped += 1

            print()

        except Exception as e:
            print(f"  Error processing URL: {e}\n")
            continue

    print("=== Download Summary ===")
    print(f"Files already complete: {total_already_complete}")
    print(f"Files newly downloaded: {total_downloaded}")
    print(f"Files failed/skipped: {total_skipped}")

    # Report files by year
    print("\nTotal files by year:")
    for year in years:
        year_folder = base_folder / f"{year}_ctd"
        if year_folder.exists():
            count = sum(1 for _ in year_folder.glob("*.nc"))
            if count > 0:
                print(f"  {year}: {count} files")

# Run the bulk download when this cell executes. Safe to re-run: files that
# already exist with non-zero size are counted as complete and not re-fetched.
bulk_download()


Found 8 URLs to process

=== URL 1/8 ===
https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235914738Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
  Total .nc files: 23
  Already downloaded: 23
  Remaining to download: 0
  All files complete, skipping

=== URL 2/8 ===
https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235958368Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
  Total .nc files: 24
  Already downloaded: 24
  Remaining to download: 0
  All files complete, skipping

=== URL 3/8 ===
https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260208T000033389Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
  Total .nc files: 13
  Already downloaded: 13
  Remaining to download: 0
  All files complete, skipping

=== URL 4/8 ===
https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235434950Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdp