# convertBCODMOtoCMAP
Krista Longnecker, 7 August 2025\
Run this after running ```getBCODMOinfo.ipynb```\
This notebook reformats the BCO-DMO data into the format required by CMAP by running a script called ```convert.py```.\
Also note that you end up running through this twice in order to get all the metadata needed for the _variables_. There are too many details needed by CMAP to automate that step.\

In [23]:
#Bring in the libraries
import pandas as pd
import requests
import os
import json
import re
import pdb

from datetime import datetime, timedelta, timezone, date
from SPARQLWrapper import SPARQLWrapper, POST, JSON
from frictionless import describe, Package

In [24]:
# Look up one parameter by its bcodmo:name and return its bcodmo:description and bcodmo:units
# input: md --> the list of parameters for one dataset
def getDetails(md,bcodmo_name):
    """
    Take the list of information from BCO-DMO, search for a name, and return the description and units for that name

    Parameters:
        md: list of dictionaries, one per parameter; each one is read through its
            'bcodmo:name' and 'bcodmo:description' keys, and 'bcodmo:units' when present
        bcodmo_name: the value of 'bcodmo:name' to search for

    Returns:
        (description, units); units is 'not applicable' when the matching entry
        has no 'bcodmo:units' key

    Raises:
        LookupError: if no entry in md has the requested name
    """
    match = None
    for item in md:
        if item['bcodmo:name'] == bcodmo_name:
            #keep going after a hit: if a name is listed more than once, the last entry is used
            match = item

    if match is None:
        #without this check the return below would fail with a confusing UnboundLocalError
        raise LookupError(f"No BCO-DMO parameter named {bcodmo_name!r} was found")

    #actually want the description, so return that (plus the units)
    description = match['bcodmo:description']
    units = match.get('bcodmo:units', 'not applicable')

    return description, units

In [25]:
#Check that an Excel file exists for the variable metadata:
import sys  #needed for sys.exit below; sys is not part of the imports in the first cell

if os.path.exists('CMAP_variableMetadata_additions.xlsx'):
    print('Found Excel file with metadata')
else:
    #You cannot proceed without the file, so stop the script if it's not found
    print("No Excel file with metadata found")
    sys.exit(1)

Found Excel file with metadata


In [26]:
#read in the package that was already made (using getBCODMOinfo.ipynb)
#oneProject is what the later cells use to list the resources and to loop over the data files
oneProject = Package('datapackage.json')

In [27]:
#gather the name of every resource in the package
out = [resource.name for resource in oneProject.resources]

out #this will be a list, leave here so I see the filenames

['event_log',
 'nutrients',
 'ctd',
 'viral_bact_counts',
 'metabolites_dissolved',
 'metabolites_particulate',
 'tos',
 'deepdom_sample_metadata_for_opp']

In [28]:
#Start with the event log, as I will need that for other samples (probably some at BCO-DMO
#and definitely the MTBLS1752 dataset at MetaboLights)

idx_json = 0
#to do: figure out a better way to do this so I am not reading in the json file every time
biosscope = Package('datapackage.json')
eventResource = biosscope.resources[idx_json]

data_url = eventResource.path
#the parameter list is stored under 'custom' (!!) and is a list
md = eventResource.custom['bcodmo:parameters']

#short name = last piece of data_url without '.csv'; will use this as part of the name for the final Excel file
exportFile = data_url.rsplit('/', 1)[-1].replace('.csv','')

#with the URL in hand the CSV file is easy to read; 'nd' entries are read in as NaN
eventLog = pd.read_csv(data_url, na_values=['nd']) #now I have NaN...but they get dropped when writing the file

# The event log is not submitted to CMAP, it is only used for merging to other datastreams.
# zfill pads the time to four digits, otherwise the parsing fails; the format string
# describes how the numbers are stored (not the format I want in the end)
dateText = eventLog['date_utc'].apply(str)
timeText = eventLog['time_utc'].apply(str).str.zfill(4)
eventLog['date_column'] = pd.to_datetime(dateText + ' ' + timeText, format="%Y%m%d %H%M")
eventLog['time_cmap'] = eventLog['date_column'].dt.strftime("%Y-%m-%dT%H:%M:%S")

##later realized that the MetaboLights data are also at BCO-DMO, so I don't have to mess around with MetaboLights

# Work through all of the data files

In [29]:
#two options for DeepDOM: run the script or skip the file
#files to skip (not ready)
toSkip = {
    'deepdom_sample_metadata_for_opp',
    'event_log',
}
#files that get run through the script
discreteData = {
    'nutrients',
    'viral_bact_counts',
    'CTD',
    'metabolites_dissolved',
    'metabolites_particulate',
    'TOS',
}

In [30]:
for idx in range(len(oneProject.resources)):
    data_url = oneProject.resources[idx].path
    if data_url.endswith('.csv'):
        checkFile = re.split('/',data_url).pop().replace('.csv','')
        #look for a match, a file to skip, or an error:
        if checkFile in toSkip:
            print('skip ' + checkFile)  
        elif checkFile in discreteData:
            %run convert.py {idx}   
        else:
            print('no match ' + checkFile)



skip event_log
Data here: d:\dropbox\github_niskin\DeepDOM\data\nutrients
Data here: d:\dropbox\github_niskin\DeepDOM\data\CTD
Data here: d:\dropbox\github_niskin\DeepDOM\data\viral_bact_counts
Data here: d:\dropbox\github_niskin\DeepDOM\data\metabolites_dissolved
Data here: d:\dropbox\github_niskin\DeepDOM\data\metabolites_particulate
Data here: d:\dropbox\github_niskin\DeepDOM\data\TOS
skip deepdom_sample_metadata_for_opp


In [114]:
#I could not get this to work, but the online option is great (though files get submitted one at a time)
## %run SimonsCMAP_QCAPI.py

In [13]:
#figure out best way to shorten the var_unit that is too long
idx_json = 2
biosscope = Package('datapackage.json')
oneResource = biosscope.resources[idx_json]
data_url = oneResource.path
#the parameter list is stored under 'custom' (!!) and is a list
md = oneResource.custom['bcodmo:parameters']

#short name = last piece of data_url without '.csv'; will use this as part of the name for the final Excel file
exportFile = data_url.rsplit('/', 1)[-1].replace('.csv','')

#with the URL in hand the CSV file is easy to read; 'nd' entries are read in as NaN
bcodmo = pd.read_csv(data_url, na_values=['nd']) #now I have NaN...but they get dropped when writing the file

# Required variables are time, lat, lon, depth
df = pd.DataFrame(columns=['time','lat','lon','depth'])

# time --> CMAP requirement is this: #< Format  %Y-%m-%dT%H:%M:%S,  Time-Zone:  UTC,  example: 2014-02-28T14:25:55 >
temp = bcodmo.copy()

#In the TOS data, multiple variables have different labels :(
if exportFile == 'TOS':
    useDate, useTime, useDepth = 'date_utc_YYYYMMDD_start', 'time_utc_HHMM_start', 'depth_m'
else:
    useDate, useTime, useDepth = 'date_start_utc', 'time_start_utc', 'depth'

#build one "date time" string per row; zfill pads the time out to four digits before parsing
dateText = temp[useDate].apply(str)
timeText = temp[useTime].apply(str).str.zfill(4)
temp['date'] = pd.to_datetime(dateText + ' ' + timeText, format="%Y%m%d %H%M")
temp['date_cmap'] = temp['date'].dt.strftime("%Y-%m-%dT%H:%M:%S" + "+00:00") #update to add 0 offset from UTC
df['time'] = temp['date_cmap']

# lat (-90 to 90) and lon (-180 to 180);
df['lat'] = bcodmo['lat_start']
df['lon'] = bcodmo['lon_start']  #BCO-DMO already has this as negative
df['depth'] = bcodmo[useDepth]

# everything left in bcodmo once lat/lon/depth are removed is treated as data
#remember: bcodmo_trim will have the list of variables that I will use later to get metadata about the variables
bcodmo_trim = bcodmo.drop(columns=['lat_start', 'lon_start', useDepth])
nVariables = len(bcodmo_trim.columns) #number of columns, i.e. number of variables
# and then add to the datafile I am assembling
df = pd.concat([df, bcodmo_trim], axis=1)

# work on the second sheet: metadata about the variables;
# use the CMAP dataset template to setup the dataframe so I get the column headers right
templateName = 'datasetTemplate.xlsx'
sheet_name = 'vars_meta_data'
# NOTE(review): 'vars' hides Python's builtin vars() for the rest of the session; name kept as-is here
vars = pd.read_excel(templateName, sheet_name=sheet_name)
metaVarColumns = list(vars.columns)
#df2 will be the dataframe with the metadata about the variables, set it up empty here
#one row per variable, numbered from 0 (remember, Python is 0 indexed)
df2 = pd.DataFrame(columns=metaVarColumns, index=pd.RangeIndex(nVariables))

#the variables I need to search for are here: bcodmo_trim.columns, put them in the first column
df2['var_short_name'] = bcodmo_trim.columns
    

In [11]:
md

[{'bcodmo:name': 'cast',
  'bcodmo:description': '<p>Consecutive cast number for the instrument.</p>',
  'bcodmo:units': 'dimensionless'},
 {'bcodmo:name': 'station',
  'bcodmo:description': '<p>Identification number of the sampling station.</p>',
  'bcodmo:units': 'dimensionless'},
 {'bcodmo:name': 'date_start_utc',
  'bcodmo:description': '<p>Date (UTC) given as 4-digit year -- 2-digit month -- 2-digit day in&nbsp;YYYYmmdd format.</p>',
  'bcodmo:units': 'unitless'},
 {'bcodmo:name': 'time_start_utc',
  'bcodmo:description': '<p>Time (UTC) given as hour -- minute.</p>',
  'bcodmo:units': 'HHMM'},
 {'bcodmo:name': 'event_start',
  'bcodmo:description': '<p>The event number from the ELOG maintained during the cruise.</p>',
  'bcodmo:units': 'dimensionless'},
 {'bcodmo:name': 'lat_start',
  'bcodmo:description': '<p>Latitude at the time the event started (from the cruise event log).</p>',
  'bcodmo:units': 'decimal degrees'},
 {'bcodmo:name': 'lon_start',
  'bcodmo:description': '<p>Lon

In [17]:
#pick one row of df2 by position and look up its details; this is the variable inspected in the next cells
idx = 19
#a is the description and b is the units, in the order getDetails returns them
a,b = getDetails(md,df2.loc[idx,'var_short_name']) #getDetails is the function I wrote (see above)


In [18]:
a


'<p>SPAR / surface irradiance.</p>'

In [19]:
b

'microEinsteins per square meter per second (μE/m2-sec)'

In [22]:
#swap the long units text for the shortened version; any other value of b is left as it is
longUnits = 'microEinsteins per square meter per second (μE/m2-sec)'
shortUnits = 'microEinsteins per square meter per sec(μE/m2-sec)'
b = shortUnits if b == longUnits else b

b

'microEinsteins per square meter per sec(μE/m2-sec)'