# getBCODMOinfo
original code from Adam Shepard, Woods Hole Oceanographic Institution\
Notes added by Krista Longnecker, 29 June 2025
Edit this to use with the DeepDOM project, 18 July 2025

In [8]:
#only need to run this once
# %%capture
# !pip install frictionless
# !pip install frictionless[excel]
# !pip install -q sparqlwrapper

In [9]:
import pandas as pd
import requests
import os
import json

from datetime import datetime, timedelta, timezone

from SPARQLWrapper import SPARQLWrapper, POST, JSON

from frictionless import describe, Package

In [10]:
"""CONSTANTS"""

SPARQL_ENDPOINT = 'https://lod.bco-dmo.org/sparql'

PROJECT_URI = 'http://lod.bco-dmo.org/id/project/2204'

In [11]:
""" HELPER FUNCTIONS """
def rfc3339_datetime_str():
    """
    Construct an RFC3339-compliant datetime
    """
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

def get_sparql_dataframe(service, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    sparql = SPARQLWrapper(service)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()
    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)

    return pd.DataFrame(out, columns=cols)



In [12]:
"""Get the Projects Datasets"""

PROJECT_DATASETS_QUERY = """
SELECT ?dataset ?title ?doi
WHERE {
  VALUES ?project { <""" + PROJECT_URI + """>}
  ?project a odo:Project .
  ?project odo:hasDataset ?dataset .
  ?dataset a odo:Dataset .
  OPTIONAL { ?dataset odo:datasetTitle ?title }
  OPTIONAL { ?dataset bibo:doi ?doi }
}
ORDER BY ?dataset ?data_file ?data_file_type"""

metadata = get_sparql_dataframe(SPARQL_ENDPOINT, PROJECT_DATASETS_QUERY)
metadata.head(10)

Unnamed: 0,dataset,title,doi
0,http://lod.bco-dmo.org/id/dataset/4008,Scientific sampling event log from R/V Knorr c...,10.1575/1912/bco-dmo.4008.1
1,http://lod.bco-dmo.org/id/dataset/473296,Inorganic and organic nutrient data from Niski...,10.1575/1912/bco-dmo.473296.1
2,http://lod.bco-dmo.org/id/dataset/481164,Processed CTD data from all sensors mounted on...,10.1575/1912/bco-dmo.481164.1
3,http://lod.bco-dmo.org/id/dataset/528510,Viral and bacterial counts from filtered water...,10.26008/1912/bco-dmo.528510.1
4,http://lod.bco-dmo.org/id/dataset/700024,Targeted dissolved metabolite abundances from ...,
5,http://lod.bco-dmo.org/id/dataset/700038,Targeted particulate metabolite abundances fro...,
6,http://lod.bco-dmo.org/id/dataset/745536,Total organic sulfur (TOS) collected from Nisk...,10.1575/1912/bco-dmo.745536.1
7,http://lod.bco-dmo.org/id/dataset/875622,Sample information for metaproteomic samples t...,


In [13]:
"""QUERIES for dataset data files and dataset parameters"""

DATASET_FILES_QUERY = """
SELECT ?url (STR(?is_primary_data_file) as ?is_primary_data_file)  ?bytesize ?type ?mimetype ?type_abbreviation ?type_name
WHERE {
  VALUES ?dataset { <{dataset_uri}>}
  ?dataset a odo:Dataset .
  ?dataset odo:dataFile ?data_file .
  OPTIONAL { ?data_file odo:isPrimaryDataFile ?is_primary_data_file }
  ?data_file odo:usesFileDescriptor [
    odo:bytesize ?bytesize ;
    odo:downloadUrl ?url ;
    odo:fileType ?type ;
  ] .
  ?type skos:prefLabel ?type_name .
  OPTIONAL { ?type odo:mimetype ?mimetype }
  OPTIONAL { ?type skos:altLabel ?type_abbreviation }
}
ORDER BY ?data_file ?data_file_type"""

DATASET_PARAMS_QUERY = """
SELECT ?supplied_name ?supplied_definition ?datatype ?units ?format
WHERE {
  VALUES ?dataset { <{dataset_uri}>}
  ?dataset a odo:Dataset .
  ?dataset odo:storesValuesFor ?dataset_param .
  ?dataset_param skos:prefLabel ?supplied_name .
    OPTIONAL { ?dataset_param skos:definition ?supplied_definition }
    OPTIONAL { ?dataset_param odo:hasUnitOfMeasure/rdf:value ?units }
    OPTIONAL { ?dataset_param odo:datatype/odo:frictionlessdataDatatype ?datatype }
    OPTIONAL { ?dataset_param odo:valueFormat ?format }
}
ORDER BY ?dataset_param"""

In [14]:
""" Create a Frictionless Data Package """

oneProject = Package(name='biosscope-bcodmo-datasets', profile='data-package')
oneProject.title = 'Dissolved Organic Matter Composition in the Deep Atlantic Ocean'
oneProject.description = 'DeepDOM datasets from BCO-DMO'
oneProject.created = rfc3339_datetime_str()
oneProject.sources = []

for index, dataset in metadata.iterrows():

  # Save the dataset as a 'source' in the Package
  source = {
    'path': dataset['dataset'],
    'title': dataset['title'],
    'doi': dataset['doi']
  }
  oneProject.sources.append(source)


  # Get the BCO-DMO parameters
  parameters = get_sparql_dataframe(SPARQL_ENDPOINT, DATASET_PARAMS_QUERY.replace('{dataset_uri}', dataset['dataset']))
  schema = []
  for param_index, parameter in parameters.iterrows():
    param = {}
    param['bcodmo:name'] = parameter['supplied_name']
    if parameter['supplied_definition'] is not None:
      param['bcodmo:description'] = parameter['supplied_definition']
    if parameter['datatype'] is not None:
      param['bcodmo:datatype'] = parameter['datatype']
    if parameter['units'] is not None:
      param['bcodmo:units'] = parameter['units']
    if parameter['format'] is not None:
      param['bcodmo:valueFormat'] = parameter['format']
    schema.append(param)

  # Get the 'data' files for a Dataset (skip any supplemental documentation)
  files = get_sparql_dataframe(SPARQL_ENDPOINT, DATASET_FILES_QUERY.replace('{dataset_uri}', dataset['dataset']))
  for file_index, file in files.iterrows():

    # Use Frictionless to describe the file
    datafile = describe(file['url'])

    # Get Table stats
    if datafile.type == 'table':
      datafile.infer(stats=True)

    # Specify which dataset this file belongs to
    datafile.sources = [source]

    # If the file is marked as the primary file for the dataset, attach the parameters to the file
    if schema is not None and file['is_primary_data_file'] == 'true':
      datafile.custom['bcodmo:parameters'] = schema

    # Add the file to the package
    oneProject.add_resource(datafile)

# Save the package
print(oneProject.to_json())
oneProject.to_json('datapackage.json')

{
  "name": "biosscope-bcodmo-datasets",
  "title": "Dissolved Organic Matter Composition in the Deep Atlantic Ocean",
  "description": "DeepDOM datasets from BCO-DMO",
  "profile": "data-package",
  "sources": [],
  "created": "2025-07-18T23:09:54.599651Z",
  "resources": [
    {
      "name": "event_log",
      "type": "table",
      "sources": [
        {
          "path": "http://lod.bco-dmo.org/id/dataset/4008",
          "title": "Scientific sampling event log from R/V Knorr cruise KN210-04 in the Western Atlantic Ocean between Uruguay and Barbados in 2013 (Deep Atlantic DOM project)",
          "doi": "10.1575/1912/bco-dmo.4008.1"
        }
      ],
      "path": "https://datadocs.bco-dmo.org/file/3YY3qMjTpR1EVL/event_log.csv",
      "scheme": "https",
      "format": "csv",
      "mediatype": "text/csv",
      "encoding": "utf-8",
      "hash": "sha256:527a02054c9b67362046f2075b755971595ed9e14a147f7f8f522d47b6b5a37c",
      "bytes": 259721,
      "fields": 12,
      "rows": 184

'{\n  "name": "biosscope-bcodmo-datasets",\n  "title": "Dissolved Organic Matter Composition in the Deep Atlantic Ocean",\n  "description": "DeepDOM datasets from BCO-DMO",\n  "profile": "data-package",\n  "sources": [],\n  "created": "2025-07-18T23:09:54.599651Z",\n  "resources": [\n    {\n      "name": "event_log",\n      "type": "table",\n      "sources": [\n        {\n          "path": "http://lod.bco-dmo.org/id/dataset/4008",\n          "title": "Scientific sampling event log from R/V Knorr cruise KN210-04 in the Western Atlantic Ocean between Uruguay and Barbados in 2013 (Deep Atlantic DOM project)",\n          "doi": "10.1575/1912/bco-dmo.4008.1"\n        }\n      ],\n      "path": "https://datadocs.bco-dmo.org/file/3YY3qMjTpR1EVL/event_log.csv",\n      "scheme": "https",\n      "format": "csv",\n      "mediatype": "text/csv",\n      "encoding": "utf-8",\n      "hash": "sha256:527a02054c9b67362046f2075b755971595ed9e14a147f7f8f522d47b6b5a37c",\n      "bytes": 259721,\n      "fiel