# Pre-Cruise Data Aggregator

**Goal:** Aggregate oceanographic datasets (Biology, Geology, Chemistry, Bathymetry) for a specific geographic area of interest (AOI).
**Output:** A single GeoPackage (`.gpkg`) importable into QGIS/ArcGIS.

**Datasets:**
- TODO: **GBIF** (Biological Occurrences)
- TODO: **OBIS** (eDNA & Occurrences)
- TODO: **IMLGS** (Geological Samples)
- TODO: **GLODAP** (Water Chemistry)
- TODO: **WCSD** (Water Column Sonar Footprints)
- **GMRT Bathymetry** (Seafloor Bathymetry)

In [None]:
import os
import sys

# Detect if running in Google Colab
try:
    import google.colab
    IN_COLAB = True
    print("Running in Google Colab. Installing dependencies...")
    # Install required packages
    !pip install -q geopandas pygbif pyobis
except ImportError:
    IN_COLAB = False
    print("Running locally.")

import requests
import wget
import pandas as pd
import geopandas as gpd
from shapely.geometry import box, Polygon
from pygbif import occurrences as gbif_occ
from pyobis import occurrences as obis_occ
import json
import io
import zipfile
import xml.etree.ElementTree as ET

# Choose where downloaded data is stored:
#   - Colab: the current working directory
#   - local: a ./data folder
DATA_DIR = "." if IN_COLAB else "./data"

# Only the local data folder may need to be created.
if not IN_COLAB:
    os.makedirs(DATA_DIR, exist_ok=True)

## Define Area of Interest (AOI)
Enter the bounding box coordinates for your cruise region.

In [None]:
# EXAMPLE: BLAKE PLATEAU / SOUTHEAST US
# Bounding box of the cruise region, in decimal degrees.
MIN_LAT, MAX_LAT = 28.0, 32.0
MIN_LON, MAX_LON = -80.0, -76.0

# Path of the output GeoPackage inside the data directory
OUTPUT_FILENAME = os.path.join(DATA_DIR, "PreCruise_Data_Package.gpkg")

# Build the AOI as a Shapely rectangle (x = longitude, y = latitude)
# and keep its WKT representation alongside it.
aoi_polygon = box(minx=MIN_LON, miny=MIN_LAT, maxx=MAX_LON, maxy=MAX_LAT)
aoi_wkt = aoi_polygon.wkt

print(f"Area of Interest defined: {aoi_wkt}")

## Get Bathymetry
Get available high-resolution (masked and unmasked) bathymetry from GMRT synthesis using the GMRT GridServer API at: https://www.gmrt.org/services/gridserverinfo.php#!/services/getGMRTGrid

Masked and unmasked bathymetry files are added to the geodatabase. Set the resolution using the `RESOLUTION` variable below. Note that requests which would produce too large a file will fail.

In [None]:

# Download masked and unmasked GMRT bathymetry grids for the AOI, plus the
# grid metadata used to name each saved file by its node spacing.

# Create bathymetry landing directory
BATHY_DIR = os.path.join(DATA_DIR, "bathymetry")
os.makedirs(BATHY_DIR, exist_ok=True)

# GMRT resolution entries can be low/default, med, high, max
RESOLUTION = 'default'

# GMRT file types can vary, but recommend GeoTiff or Coards NetCDF grid
# NOTE: this name shadows the builtin format(); it is kept unchanged because
# other cells may reference it.
format = 'coards'  # geotiff, coards

# Resolve the output file extension up front so that an unsupported format
# fails immediately with a clear error, rather than leaving `extension`
# undefined and failing with a NameError after the downloads.
_GMRT_EXTENSIONS = {'coards': 'grd', 'geotiff': 'tiff'}
if format not in _GMRT_EXTENSIONS:
    raise ValueError(
        f"Unsupported GMRT format {format!r}; "
        f"expected one of {sorted(_GMRT_EXTENSIONS)}"
    )
extension = _GMRT_EXTENSIONS[format]

# Create url for GMRT API call from coordinates - unmasked
gmrt_url_unmasked = (f'https://www.gmrt.org/services/GridServer?north={MAX_LAT}'
                     f'&west={MIN_LON}&east={MAX_LON}&south={MIN_LAT}'
                     f'&layer=topo&format={format}&resolution={RESOLUTION}')

# Create url for GMRT API call from coordinates - unmasked metadata
gmrt_url_unmasked_metadata = (f'https://www.gmrt.org/services/GridServer/metadata?north={MAX_LAT}'
                              f'&west={MIN_LON}&east={MAX_LON}&south={MIN_LAT}'
                              f'&format={format}&mformat=json&resolution={RESOLUTION}')

# Create url for GMRT API call from coordinates - masked
gmrt_url_masked = (f'https://www.gmrt.org/services/GridServer?north={MAX_LAT}'
                   f'&west={MIN_LON}&east={MAX_LON}&south={MIN_LAT}'
                   f'&layer=topo-mask&format={format}&resolution={RESOLUTION}')

# Create url for GMRT API call from coordinates - masked metadata
# Uses the same query parameters as the unmasked metadata request: the
# selected grid format plus `mformat=json` for a JSON metadata response.
gmrt_url_masked_metadata = (f'https://www.gmrt.org/services/GridServer/metadata?north={MAX_LAT}'
                            f'&west={MIN_LON}&east={MAX_LON}&south={MIN_LAT}'
                            f'&format={format}&mformat=json&resolution={RESOLUTION}')

print(f"Downloading bathymetry from\n{gmrt_url_unmasked}\n{gmrt_url_masked}\nto bathymetry folder")

# Use requests library to get the data
bathy_umask = requests.get(gmrt_url_unmasked, timeout=10)
bathy_mask = requests.get(gmrt_url_masked, timeout=10)
bathy_umask_metadata = requests.get(gmrt_url_unmasked_metadata, timeout=10)
bathy_mask_metadata = requests.get(gmrt_url_masked_metadata, timeout=10)

# Check for success and write out file
if bathy_umask.status_code == 200:
    print("Retrieved umasked bathymetry")

    # parse the metadata for the file to a Python dictionary
    umask_metadata = bathy_umask_metadata.json()

    # save the file, naming it by the grid's node spacing rounded to whole meters
    with open(f"{BATHY_DIR}/bathymetry_unmasked_{float(umask_metadata['meters_per_node']):.0f}m.{extension}", "wb") as f:
        f.write(bathy_umask.content)
else:
    print(f"Error getting unmasked bathymetry: {bathy_umask.status_code}")

# Check for success and write out file
if bathy_mask.status_code == 200:
    print("Retrieved masked bathymetry")

    # parse the masked grid's own metadata response to a Python dictionary
    mask_metadata = bathy_mask_metadata.json()

    # save the file, naming it by the grid's node spacing rounded to whole meters
    with open(f"{BATHY_DIR}/bathymetry_masked_{float(mask_metadata['meters_per_node']):.0f}m.{extension}", "wb") as f:
        f.write(bathy_mask.content)
else:
    print(f"Error getting masked bathymetry: {bathy_mask.status_code}")


## Cruise Tracks

Download previous cruise tracks logged in GMRT and clip them to the AOI.

In [None]:

# Download the global GMRT cruise-track shapefile archive, extract it, and
# clip the tracks to the AOI polygon.
track_url = "https://www.gmrt.org/shapefiles/gmrt_cruise_tracks.zip"
filename = "gmrt_cruise_tracks.zip"

# Single location for the extracted and the trimmed shapefiles, so the read
# below looks in the same folder the archive was extracted to (DATA_DIR is
# not the working directory when running locally).
tracks_dir = os.path.join(DATA_DIR, "cruise_tracks")

wget.download(track_url, out=filename)

with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall(tracks_dir)

# trim cruise track shapefile to bounding coordinates
gdf = gpd.read_file(os.path.join(tracks_dir, "gmrt_cruise_tracks.shp"))

trimmed_gdf = gdf.clip(aoi_polygon)
trimmed_gdf.to_file(os.path.join(tracks_dir, "trimmed_cruise_tracks.shp"))

# TODO either delete the larger global cruise tracks file or zip them back up


## Attributions

A separate code block downloads the appropriate data attributions for each of the aggregated sources above.

In [None]:

# TODO bathymetry attributions XML parsing

# Destination text file for the bathymetry attribution report.
# NOTE(review): nothing visible here writes to this path yet -- confirm
# whether the write happens further down.
report_path = os.path.join(BATHY_DIR, "data_attribution.txt")

# GMRT attribution query for the AOI bounding box
gmrt_url_attrib = (
    'https://www.gmrt.org/services/GridServer/attribution'
    f'?north={MAX_LAT}&west={MIN_LON}&east={MAX_LON}&south={MIN_LAT}'
    f'&layer=topo&format=geotiff&resolution={RESOLUTION}'
)

# Request the bathymetry attributions from GMRT
bathy_attrib = requests.get(gmrt_url_attrib, timeout=10)

