In [None]:
# ================================================================================================
#   HARVARD DATAVERSE DATASET DOWNLOADER (Filtered by Keyword)
# ================================================================================================
#
#   ❖ PURPOSE:
#     Downloads only files containing a specific keyword (e.g., "ERA5L") from a Harvard Dataverse dataset.
#
#   ❖ FEATURES:
#     - Keyword-based filtering (case-insensitive)
#     - Resumable downloads with HTTP Range support
#     - Automatic retry on transient errors
#     - Clean, single-line progress bar with logical output
#     - Professional-style status notifications
#
# ================================================================================================


# STEP 1: IMPORT DEPENDENCIES
# ---------------------------
import requests
import os
import sys
from urllib.parse import quote
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


# STEP 2: USER-EDITABLE OPTIONS
# -----------------------------
# ❖ Customize these variables to suit your dataset and local storage preferences.
# Which dataset to read, where downloaded files are written, and which filenames to keep.
dataset_doi = "doi:10.7910/DVN/V2C6G2"  # persistent identifier (DOI) of the target dataset
keyword_filter = "ERA5L"  # substring looked for in each filename, ignoring case
download_directory = "Observed_daily_dscharge/Raw_netcdf"  # destination folder for the files


# STEP 3: CONFIGURE RESILIENT HTTP SESSION
# ----------------------------------------

def configure_session():
    """Create a ``requests.Session`` whose adapters retry on selected HTTP statuses.

    The same adapter is mounted for both "https://" and "http://". Its retry
    policy allows 5 retries in total, uses a backoff factor of 1, and forces a
    retry on status codes 429, 500, 502, 503 and 504.

    Returns:
        The configured session object.
    """
    print("\n========== STEP 3: CONFIGURING HTTP SESSION ==========\n")
    print(">> Subprocess: Setting up retry strategy for network resilience\n")

    http_session = requests.Session()

    # One adapter instance carries the retry policy for every mounted scheme.
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=1,
        )
    )
    for scheme in ("https://", "http://"):
        http_session.mount(scheme, retrying_adapter)

    print(">> ✅ Session configured to retry up to 5 times on transient errors.\n")
    return http_session


# STEP 4: RETRIEVE AND FILTER FILES BY KEYWORD (WITH DEBUGGING)
# --------------------------------------------------------------

def get_filtered_files(persistent_id, session, keyword="ERA5L"):
    """Fetch the dataset's file list and keep entries whose filename contains ``keyword``.

    Args:
        persistent_id: Dataset persistent identifier, e.g. "doi:10.7910/DVN/V2C6G2".
            It is percent-encoded before being placed in the query string.
        session: Object exposing ``get(url, timeout=...)`` (a ``requests.Session``
            in this script).
        keyword: Substring to look for in each filename; the comparison is
            case-insensitive.

    Returns:
        list: The matching file entries, unchanged from the API response (each
        is a dict with a ``dataFile`` sub-dict). An empty list is returned when
        the request fails, the response is not JSON or has an unexpected shape,
        the dataset has no files, or nothing matches.
    """
    print("\n========== STEP 4: RETRIEVING AND FILTERING FILES ==========\n")
    print(">> Subprocess: Accessing Dataverse API and filtering by keyword\n")

    try:
        encoded_doi = quote(persistent_id, safe='')
        files_api_url = f"https://dataverse.harvard.edu/api/datasets/:persistentId/versions/:latest/files?persistentId={encoded_doi}"
        print(f">> Requesting file list for DOI: {persistent_id}")
        print(f">> API Endpoint: {files_api_url}\n")

        response = session.get(files_api_url, timeout=90)

        # Debugging aid: show the status code and the start of the body before
        # raise_for_status() so error responses remain visible in the output.
        print(f">> HTTP Status Code: {response.status_code}")
        print(f">> Raw Response Preview:\n{response.text[:500]}\n")

        response.raise_for_status()

        try:
            json_data = response.json()
        except ValueError:
            print(">> ❌ ERROR: Response is not valid JSON.")
            print(">> Raw response:\n", response.text[:500])
            return []

        # A payload that is not a JSON object carries no 'data' list to read.
        files_data = json_data.get('data', []) if isinstance(json_data, dict) else []

        if not files_data:
            print(">> ❌ No files found in the dataset.\n")
            return []

        # Case-insensitive keyword filtering. Entries without a usable
        # dataFile.filename are skipped instead of aborting the whole listing.
        needle = keyword.lower()
        filtered_files = []
        for entry in files_data:
            data_file = entry.get('dataFile') if isinstance(entry, dict) else None
            filename = data_file.get('filename') if isinstance(data_file, dict) else None
            if isinstance(filename, str) and needle in filename.lower():
                filtered_files.append(entry)

        print(f">> ✅ Found {len(filtered_files)} file(s) containing keyword '{keyword}' (case-insensitive match)\n")
        return filtered_files

    except requests.exceptions.RequestException as e:
        print(f">> ❌ ERROR: Failed to retrieve file list. {e}\n")
        return []


# STEP 5: DOWNLOAD FILES WITH RESUME AND STATUS BAR
# -------------------------------------------------

def download_selected_files(files_to_download, download_dir, session):
    """Download each listed file into ``download_dir``, resuming partial files.

    For every entry the local file size is compared with the size reported in
    the entry: a complete file is skipped, a shorter one is resumed with an
    HTTP ``Range`` request, and a longer one is downloaded again from scratch.
    A failure on one file is reported and the loop moves on to the next file.

    Args:
        files_to_download: Sequence of file entries; each is read through
            ``entry['dataFile']`` for the keys ``id``, ``filename`` and
            ``filesize``.
        download_dir: Destination directory; created if it does not exist.
        session: Object exposing ``get(url, stream=..., headers=..., timeout=...)``
            usable as a context manager (a ``requests.Session`` in this script).

    Returns:
        None. Progress and errors are written to stdout.
    """
    print("\n========== STEP 5: INITIATING DOWNLOADS ==========\n")
    print(">> Subprocess: Downloading files with resume support and progress tracking\n")

    os.makedirs(download_dir, exist_ok=True)

    for i, file_info in enumerate(files_to_download):
        file_id     = file_info['dataFile']['id']
        filename    = file_info['dataFile']['filename']
        total_size  = file_info['dataFile']['filesize']
        full_path   = os.path.join(download_dir, filename)

        print(f"\n[{i+1}/{len(files_to_download)}] Starting: {filename}")

        try:
            download_url   = f"https://dataverse.harvard.edu/api/access/datafile/{file_id}"
            headers        = {}
            file_mode      = 'wb'
            downloaded_size = 0

            # Resume logic
            if os.path.exists(full_path):
                downloaded_size = os.path.getsize(full_path)
                if downloaded_size < total_size:
                    file_mode = 'ab'
                    headers['Range'] = f'bytes={downloaded_size}-'
                    print(f">> Resuming from {downloaded_size / (1024*1024):.2f} MB (Partial download detected)\n")
                elif downloaded_size == total_size:
                    print(">> File already complete. Skipping.\n")
                    continue
                else:
                    print(">> Corrupt file detected (larger than expected). Restarting download.\n")
                    downloaded_size = 0

            # Begin download
            with session.get(download_url, stream=True, headers=headers, timeout=90) as r:
                r.raise_for_status()

                # Only a 206 response contains just the requested tail. Any other
                # success status means the body is the whole file, so appending
                # it to the partial file would corrupt it: start over instead.
                if file_mode == 'ab' and r.status_code != 206:
                    print(">> Server did not honor the resume request. Restarting download from the beginning.\n")
                    file_mode = 'wb'
                    downloaded_size = 0

                with open(full_path, file_mode) as file:
                    for chunk in r.iter_content(chunk_size=8192):
                        downloaded_size += len(chunk)
                        file.write(chunk)
                        percent = (downloaded_size / total_size) * 100 if total_size > 0 else 100
                        status = (
                            f">> Downloaded: {downloaded_size / (1024*1024):.2f} / "
                            f"{total_size / (1024*1024):.2f} MB ({percent:.1f}%)"
                        )
                        sys.stdout.write(f"\r{status}")
                        sys.stdout.flush()

            # Claim integrity only when the byte count actually matches.
            if downloaded_size == total_size:
                print(f"\n>> ✅ Completed: {filename}")
                print(">> Reasoning: File size matched expected total. Download integrity confirmed.\n")
            else:
                print(
                    f"\n>> ⚠️ Size mismatch for {filename}: got {downloaded_size} bytes, "
                    f"expected {total_size}. File may be incomplete; re-run to retry.\n"
                )

        except requests.exceptions.RequestException as e:
            print(f"\n>> ❌ Network error during download of {filename}: {e}\n")
        except Exception as e:
            print(f"\n>> ❌ Unexpected error with {filename}: {e}\n")

    print("\n========== ✅ ALL FILTERED DOWNLOADS COMPLETED ==========\n")


# MAIN EXECUTION BLOCK
# --------------------

if __name__ == "__main__":
    # Start-up banner.
    print("\n===================================================")
    print("        HARVARD DATAVERSE DOWNLOADER - STARTED     ")
    print("===================================================\n")

    # Build the HTTP session, then list the dataset files matching the keyword.
    session = configure_session()
    files_to_download = get_filtered_files(dataset_doi, session, keyword=keyword_filter)

    # An empty list covers both "nothing matched" and "listing failed".
    if not files_to_download:
        print(">> No matching files found or error occurred. Exiting.\n")
    else:
        download_selected_files(files_to_download, download_directory, session)


In [None]:
# =================================================================================================
#
#   ** Harvard Dataverse Dataset Downloader (Bulk download) **
#
#   **Description:**
#   This script downloads files from a specific Harvard Dataverse dataset. It automatically
#   retrieves the complete file list and prompts the user to download either the first or
#   second half of the files, allowing for large downloads to be split into two sessions.
#
#   **Key Features:**
#   - **Direct Download:** No search or manual file selection required.
#   - **Two-Step Downloading:** Automatically splits the full file list into two halves.
#   - **Text-Based Progress:** Displays a clean, single-line download status for each file.
#   - **Automatic Retries:** Automatically retries failed or timed-out connections.
#   - **Resumable Downloads:** If a file download is interrupted, it can be resumed.
#
# =================================================================================================

# 1. ===== Import Dependencies =====
# ===================================
import requests
import os
import time
import sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# =================================================================================================
# 2. ===== User-Definable Options =====
# ======================================
# --- Instructions ---
# Please modify the variables in this section to suit your needs.
#
# `dataset_doi`: The persistent identifier (DOI) for the target dataset.
# `download_directory`: Path to the folder where the files will be saved (absolute, or relative to the current working directory).
#
# Destination folder for the downloaded files.
download_directory = "Observed_daily_dscharge/Raw_netcdf"
# Persistent identifier of the dataset; corresponds to "High-resolution gridded streamflow..."
dataset_doi = "doi:10.7910/DVN/V2C6G2"
# =================================================================================================


def get_files_and_choose_batch(persistent_id, session):
    """Fetch the dataset's file list and ask the user which half to download.

    The list is split at ``(len + 1) // 2``, so with an odd number of files the
    first half holds the extra one. The user is prompted until they enter
    ``1`` or ``2`` (surrounding whitespace is ignored).

    Args:
        persistent_id: Dataset persistent identifier, e.g. "doi:10.7910/DVN/V2C6G2".
            It is sent as the ``persistentId`` query parameter so the HTTP
            layer takes care of URL-encoding it.
        session: Object exposing ``get(url, params=..., timeout=...)``
            (a ``requests.Session`` in this script).

    Returns:
        list: The chosen half of the file entries. An empty list is returned
        when the request fails, the response is not in the expected
        ``{"data": [...]}`` form, the dataset has no files, or no input can be
        read from stdin.
    """
    print("STEP 3: RETRIEVING AND PREPARING FILE LIST")
    print("---------------------------------------------")
    try:
        files_api_url = "https://dataverse.harvard.edu/api/datasets/:persistentId/versions/:latest/files"
        print(f"  > Requesting file list for DOI: {persistent_id}")
        meta_response = session.get(
            files_api_url,
            params={"persistentId": persistent_id},
            timeout=90,
        )
        meta_response.raise_for_status()

        # A non-JSON body or a payload without a 'data' list is reported
        # instead of escaping as an uncaught exception.
        try:
            files_data = meta_response.json()['data']
        except (ValueError, KeyError, TypeError):
            print("  > ❌ ERROR: Unexpected response format from the Dataverse API.")
            return []

        if not files_data:
            print("  > ❌ This dataset contains no files.")
            return []

        if not isinstance(files_data, list):
            print("  > ❌ ERROR: Unexpected response format from the Dataverse API.")
            return []

        print(f"  ✅  Found a total of {len(files_data)} files in the dataset.")

        # Split the file list into two halves
        midpoint = (len(files_data) + 1) // 2
        first_half = files_data[:midpoint]
        second_half = files_data[midpoint:]

        print(f"    - First Half contains: {len(first_half)} files.")
        print(f"    - Second Half contains: {len(second_half)} files.")

        while True:
            try:
                batch_choice = input("\n  > Which batch do you want to download? (Enter 1 or 2): ").strip()
            except EOFError:
                # No interactive stdin: give up rather than crash.
                print("\n  > ❌ No input available. Cannot choose a batch.")
                return []

            if batch_choice == '1':
                print(f"\n  > Preparing to download the first half ({len(first_half)} files).\n")
                return first_half
            elif batch_choice == '2':
                print(f"\n  > Preparing to download the second half ({len(second_half)} files).\n")
                return second_half
            else:
                print("  > Invalid input. Please enter 1 or 2.")

    except requests.exceptions.RequestException as e:
        print(f"\n  > ❌ ERROR: Could not retrieve file list. {e}")
        return []


def download_selected_files(files_to_download, download_dir, session):
    """
    Downloads a list of selected files one by one, resuming partial files.

    For every entry the local file size is compared with the size reported in
    the entry: a complete file is skipped, a shorter one is resumed with an
    HTTP ``Range`` request, and a longer one is downloaded again from scratch.
    A failure on one file is reported and the loop moves on to the next file.

    Args:
        files_to_download: Sequence of file entries; each is read through
            ``entry['dataFile']`` for the keys ``id``, ``filename`` and
            ``filesize``.
        download_dir: Destination directory; created if it does not exist.
        session: Object exposing ``get(url, stream=..., headers=..., timeout=...)``
            usable as a context manager (a ``requests.Session`` in this script).

    Returns:
        None. Progress and errors are written to stdout.
    """
    print("STEP 4: INITIATING DOWNLOADS")
    print("------------------------------")
    os.makedirs(download_dir, exist_ok=True)

    for i, file_info in enumerate(files_to_download):
        file_id = file_info['dataFile']['id']
        filename = file_info['dataFile']['filename']
        total_size = file_info['dataFile']['filesize']
        full_path = os.path.join(download_dir, filename)

        print(f"\n[{i+1}/{len(files_to_download)}] Starting download for: {filename}")

        try:
            download_url = f"https://dataverse.harvard.edu/api/access/datafile/{file_id}"
            headers = {}
            file_mode = 'wb'
            downloaded_size = 0

            if os.path.exists(full_path):
                downloaded_size = os.path.getsize(full_path)
                if downloaded_size < total_size:
                    file_mode = 'ab'
                    headers['Range'] = f'bytes={downloaded_size}-'
                    print(f"  > Resuming from {downloaded_size / (1024*1024):.2f} MB...")
                elif downloaded_size == total_size:
                    print("  > File already complete. Skipping.")
                    continue
                else:
                    print("  > Corrupt local file found. Restarting download.")
                    downloaded_size = 0

            with session.get(download_url, stream=True, headers=headers, timeout=90) as r:
                r.raise_for_status()

                # Only a 206 response contains just the requested tail. Any other
                # success status means the body is the whole file, so appending
                # it to the partial file would corrupt it: start over instead.
                if file_mode == 'ab' and r.status_code != 206:
                    print("  > Server did not honor the resume request. Restarting download.")
                    file_mode = 'wb'
                    downloaded_size = 0

                with open(full_path, file_mode) as file:
                    for chunk in r.iter_content(chunk_size=8192):
                        downloaded_size += len(chunk)
                        file.write(chunk)
                        percent = (downloaded_size / total_size) * 100 if total_size > 0 else 100
                        status = f"> Downloaded {downloaded_size / (1024*1024):.2f} / {total_size / (1024*1024):.2f} MB ({percent:.1f}%)"
                        sys.stdout.write(f"\r{status}")
                        sys.stdout.flush()

            # Report success only when the byte count matches the expected size.
            if downloaded_size == total_size:
                print(f"\n  ✅  Download complete: {filename}")
            else:
                print(
                    f"\n  > ⚠️ Size mismatch for {filename}: got {downloaded_size} bytes, "
                    f"expected {total_size}. File may be incomplete; re-run to retry."
                )

        except requests.exceptions.RequestException as e:
            print(f"\n  > ❌ DOWNLOAD FAILED for {filename}. {e}")
        except Exception as e:
            print(f"\n  > ❌ An UNEXPECTED ERROR occurred with {filename}: {e}")

if __name__ == "__main__":
    # Start-up banner.
    print("\n=============================================")
    print("   HARVARD DATAVERSE DOWNLOADER - STARTED")
    print("=============================================\n")

    print("STEP 2: CONFIGURING RESILIENT HTTP SESSION")
    print("--------------------------------------------")
    # Session whose adapters retry on the listed HTTP status codes.
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    print("  > Session configured to retry up to 5 times on connection errors.\n")

    # Fetch the file list and let the user pick which half to download.
    files_to_download = get_files_and_choose_batch(dataset_doi, session)

    # An empty selection covers both "nothing chosen" and "listing failed".
    if not files_to_download:
        print("\nNo files were selected for download or an error occurred. Exiting script.")
    else:
        download_selected_files(files_to_download, download_directory, session)
        print("\n\nAll selected downloads have been processed.")

