# convertBCODMOtoCMAP
Krista Longnecker, 18 July 2025\
Run this after running ```getBCODMOinfo.ipynb```\
This script will make the format required by CMAP.\
Note: this version puts most of the code into separate python scripts (convert*.py) to make it easier to run through multiple data files.\
Also note that you end up running through this twice in order to get all the metadata needed for the _variables_. There are too many details needed by CMAP to automate that step.\
Editing to use this framework to get the DeepDOM data into CMAP.

In [1]:
# IPython magic (only valid in a notebook/IPython session): start from a clean namespace.
# NOTE(review): -f presumably suppresses the confirmation prompt - confirm against the IPython docs.
%reset -f

In [2]:
#some of these are residual from assembling the data file, keep for now.
import json
import os
import pdb
import re
import sys
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests
from frictionless import describe, Package
from SPARQLWrapper import SPARQLWrapper, POST, JSON

In [3]:
# Make a function that searches for bcodmo:name and returns bcodmo:description and bcodmo:units
# input: md --> the list of parameters for one dataset
def getDetails(md,bcodmo_name):
    """
    Look up one parameter in the BCO-DMO parameter list and return its description and units.

    Parameters
    ----------
    md : iterable of dict-like entries, each with a 'bcodmo:name' and a
        'bcodmo:description' key and, optionally, a 'bcodmo:units' key.
    bcodmo_name : the value of 'bcodmo:name' to search for.

    Returns
    -------
    (description, units) : units is 'not applicable' when the matching entry
        has no 'bcodmo:units' key.

    Raises
    ------
    ValueError : if no entry in md has the requested name (previously this
        surfaced as an UnboundLocalError on the return statement).
    """
    match = None
    for item in md:
        if item['bcodmo:name'] == bcodmo_name:
            #do not stop at the first hit: if a name is repeated the last entry wins, as before
            match = item

    if match is None:
        raise ValueError(f"No BCO-DMO parameter named {bcodmo_name!r} in the metadata list")

    #actually want the description, so return that (plus the units when they exist)
    description = match['bcodmo:description']
    units = match['bcodmo:units'] if 'bcodmo:units' in match else 'not applicable'

    return description, units

In [4]:
#Check that the Excel file with the variable metadata exists before doing anything else:
if not os.path.exists('CMAP_variableMetadata_additions.xlsx'):
    #You cannot proceed without the file, so stop the script if it's not found
    print("No Excel file with metadata found")
    sys.exit(1)  #non-zero status marks the run as failed; needs sys to be imported

print('Found Excel file with metadata')
    

Found Excel file with metadata


In [5]:
#read in the package that was already made (using getBCODMOinfo.ipynb)
#oneProject.resources is what the later cells loop over to find each data file
oneProject = Package('datapackage.json')

In [6]:
#gather the name of every resource in the package
out = [item.name for item in oneProject.resources]

out #this will be a list, leave here so I see the filenames

['event_log',
 'nutrients',
 'ctd',
 'viral_bact_counts',
 'metabolites_dissolved',
 'metabolites_particulate',
 'tos',
 'deepdom_sample_metadata_for_opp']

In [9]:
#Lookup tables that decide how each data file is handled further down.
#Most datasets need extra work to get depth and/or time, and the work differs
#by PI because each PI has different project metadata.
#  toSkip       --> files that are not converted at all
#  discreteData --> files that are handed to convert.py
toSkip = {'deepdom_sample_metadata_for_opp'}
discreteData = {'nutrients'}

# Work through all of the data files

In [10]:
for idx in range(len(oneProject.resources)):
    data_url = oneProject.resources[idx].path
    if data_url.endswith('.csv'):
        checkFile = re.split('/',data_url).pop().replace('.csv','')
        #have a few options and trying to group these based on added steps needed to make the data file ready
        if checkFile in toSkip:
            print('skip ' + checkFile)
        # elif checkFile in pumpData:
        #     %run convert_pumpData.py {idx}   
        # elif checkFile in zoopData:
        #     %run convert_zoopData.py {idx}   
        elif checkFile in discreteData:
            %run convert.py {idx}   
        else:
            print('no match ' + checkFile)

no match event_log


KeyError: 'decy'

no match CTD
no match viral_bact_counts
no match metabolites_dissolved
no match metabolites_particulate
no match TOS
skip deepdom_sample_metadata_for_opp


In [11]:
#Pull one resource out of the package by hand to inspect it
idx_json = 1
oneProject = Package('datapackage.json')

#the BCO-DMO parameter list lives under 'custom' (easy to forget!) and is a list
md = oneProject.resources[idx_json].custom['bcodmo:parameters']
data_url = oneProject.resources[idx_json].path

#short name taken from the end of data_url; becomes part of the name of the final Excel file
exportFile = data_url.split('/')[-1].replace('.csv','')

#the URL can be read directly as a CSV; 'nd' entries become NaN
#(author's note: those NaN get dropped when writing the file)
bcodmo = pd.read_csv(data_url, na_values=['nd'])
 

In [12]:
#show the URL of the CSV file selected in the previous cell
data_url

'https://datadocs.bco-dmo.org/file/WWW1EMYcyGpEYx/nutrients.csv'

In [13]:
#preview the first rows to see which columns this data file provides
bcodmo.head()

Unnamed: 0,cast,station,date_start_utc,time_start_utc,event_start,lat_start,lon_start,niskin,depth,press,bots,PO4,NO3_NO2,silicate,NO2,NH4,NPOC,TN
0,2,2,20130327,1609,20130327.1609.001,-37.997297,-45.000042,10,5108.562,5209.693,10,2.4173,33.3321,126.0667,0.0498,0.021,47.04342,37.323593
1,2,2,20130327,1609,20130327.1609.001,-37.997297,-45.000042,11,4001.029,4069.818,11,2.1939,30.1202,91.6798,0.0406,0.0196,46.527814,33.007255
2,2,2,20130327,1609,20130327.1609.001,-37.997297,-45.000042,12,3002.155,3046.643,12,1.9267,26.3843,60.5519,0.0363,0.0283,46.202492,28.239207
3,2,2,20130327,1609,20130327.1609.001,-37.997297,-45.000042,23,2501.794,2535.868,23,1.7849,25.5304,50.0437,0.0271,0.0169,44.600432,27.436167
4,3,2,20130328,1230,20130328.1230.001,-37.9963,-44.998833,1,503.511,507.93,1,1.4213,19.4291,6.7561,0.0302,0.0315,51.21737,20.778466


In [None]:

#index of the resource to convert, passed on the command line (e.g. %run convert.py 1)
idx_json = int(sys.argv[1])
#to do: figure out a better way to do this so I am not reading in the json file every time
oneProject = Package('datapackage.json')

data_url = oneProject.resources[idx_json].path
md = oneProject.resources[idx_json].custom['bcodmo:parameters'] #this is a list, don't forget 'custom' (!!)

#make a short name out of the data_url, will use this as part of the name for the final Excel file
exportFile = re.split('/',data_url).pop().replace('.csv','')

#super easy to work with the CSV file once I have the URL
bcodmo = pd.read_csv(data_url,na_values = ['nd']) #now I have NaN...but they get dropped when writing the file

# Required variables are time, lat, lon, depth
df = pd.DataFrame(columns=['time','lat','lon','depth'])

# time --> CMAP requirement is this: #< Format  %Y-%m-%dT%H:%M:%S,  Time-Zone:  UTC,  example: 2014-02-28T14:25:55 >
# The nutrients file has no 'decy' column (that lookup raised KeyError); it has
# date_start_utc (e.g. 20130327) and time_start_utc (e.g. 1609) instead.
# NOTE(review): assumes date_start_utc is YYYYMMDD and time_start_utc is HHMM, both
# fully populated integers - confirm against the BCO-DMO parameter descriptions.
# Do this in steps so I can check the output more easily
temp = bcodmo.copy()
date_part = temp['date_start_utc'].astype(str)
time_part = temp['time_start_utc'].astype(str).str.zfill(4) #restore leading zeros lost when read as a number (930 --> 0930)
temp['date'] = pd.to_datetime(date_part + time_part, format='%Y%m%d%H%M')
temp['date_cmap'] = temp['date'].dt.strftime("%Y-%m-%dT%H:%M:%S")
df['time'] = temp['date_cmap']

# lat (-90 to 90) and lon (-180 to 180); use variable names at BCO-DMO
df['lat'] = bcodmo['lat_start']
df['lon'] = bcodmo['lon_start']  #BCO-DMO already has this as negative
df['depth'] = bcodmo['depth']

# all remaining columns in bcodmo can be considered data
#remember: bcodmo_trim will have the list of variables that I will use later to get metadata about the variables
bcodmo_trim = bcodmo.drop(columns=['lat_start', 'lon_start', 'depth'])
nVariables = bcodmo_trim.shape[1] #remember in Python indexing starts with 0 (rows, 1 is the columns)
# and then add to the datafile I am assembling (essentially re-order columns)
df = pd.concat([df, bcodmo_trim], axis=1)
       