In [3]:
import pandas as pd
import numpy as np
from ckanapi import RemoteCKAN
NaN = np.nan
from pyobistools.validation.check_eventids import check_eventids, check_extension_eventids
import plotly.express as px
import requests
# Display full cell contents in rendered frames (None = no column-width limit).
# The fully qualified option key is used instead of the 'max_colwidth' shorthand:
# pandas resolves shorthand keys by pattern matching, which can become ambiguous
# if a similarly named option is added in a later release.
pd.set_option('display.max_colwidth', None)

### Info about this notebook series

This series of notebooks is meant to serve as an educational tool for learning how to use the PyObis biodiversity data validation package:
+ https://github.com/cioos-siooc/pyobistools

Command to install PyObis (currently not hosted on Pypi)
+ pip install git+https://github.com/cioos-siooc/pyobistools@main#egg=pyobistools

Darwin Core documentation: 
+ https://dwc.tdwg.org/

Darwin Core file types required fields: 
+ https://ioos.github.io/bio_mobilization_workshop/01-introduction/index.html
+ https://ioos.github.io/bio_mobilization_workshop/04-create-schema/index.html

### Notebook to test Pyobis' functions 'check_eventids' and 'check_extension_eventids'

##### Function 'check_eventids' description
The 'check_eventids' function is used with a single Darwin Core file of type 'event_core' or 'occurrence_core'. It reports:

+ the absence of the fields 'eventID' and 'parentEventID'
+ duplicate values in 'eventID'
+ any 'parentEventID' that has no corresponding 'eventID' in the given file

##### Function 'check_eventids' arguments
+ data: Dataframe of the data to evaluate


##### Function 'check_extension_eventids' description
The 'check_extension_eventids' function is used with two Darwin Core files at a time: one must be an 'event_core' file, and the other must be either an 'occurrence_extension' or an 'extended_measurement_or_fact_extension' (event or occurrence in the latter case). The function reports whether all eventIDs in the extension file have corresponding eventIDs in the core file.

##### Function 'check_extension_eventids' arguments
+ event: Dataframe of the 'event_core' file to evaluate
+ extension: Dataframe of the extension file to evaluate ('occurrence_extension' or 'extended_measurement_or_fact_extension')
+ field (default = 'eventID'): The name of the 'eventID' field in the extension file.

##### Relation to obistools package in R:
These functions are the Python equivalents of check_eventids() and check_extension_eventids() in the R obistools package.
See the documentation: https://github.com/iobis/obistools#check-eventid-and-parenteventid

Load different types of DWC files:

In [10]:
# Fetch the Placentia diadromous-fish event file over HTTP and keep it as the
# core frame used by the checks further down.
event_core = pd.read_csv(
    'https://catalogue.preprod.ogsl.ca/data/mun/ca-cioos_d006cd88-6d08-409a-9c11-2ca63a20ca1b/event_placentia_diadromous-fish.csv'
)
# Preview the first three records.
event_core.head(n=3)

Unnamed: 0,eventID,eventDate,waterBody,cardinal_direction,decimalLatitude,decimalLongitude,distance_ocean_m,locality,site_number,station_number,...,p1_starting_time,p1_ending_time,p2_starting_volts,p2_ending_volts,p2_starting_time,p2_ending_time,p3_starting_volts,p3_ending_volts,p3_starting_time,p3_ending_time
0,MUN-EF-1-1-2020-001,2020-07-07,Plancentia Bay,east,472713249,-538442982,200,North-east Placentia,1,1,...,945,1120,425,445,1210,1305,,,,
1,MUN-EF-1-3-2020-002,2020-08-05,Plancentia Bay,east,4716323,-5350182,970,North-east Placentia,1,3,...,800,920,545,545,950,1125,,,,
2,MUN-EF-1-2-2020-003,2020-07-10,Plancentia Bay,east,472732485,-538394947,650,North-east Placentia,1,2,...,925,1045,525,525,1055,1200,,,,


#### ONC Event Core Example

In [4]:
# Load the ONC event core from a local CSV export.
# A raw string is used because the Windows path contains backslash sequences
# ('\D', '\o', '\e') that are invalid escapes in a normal string literal and
# trigger a DeprecationWarning/SyntaxWarning on current Python versions; the
# resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path, and the cell rebinds
# `event_core`, replacing the frame loaded from the URL in the earlier cell --
# confirm which frame each of the later checks is meant to receive.
event_core = pd.read_csv(r'Z:\Desktop\obis-diveid-5040\obis-diveid-5040_0vote0pp\event.csv')
event_core.head(10)

Unnamed: 0,eventID,parentEventID,eventRemarks,eventDate,modified,type,habitat,decimalLatitude,decimalLongitude,minimumDepthInMeters,maximumDepthInMeters,verbatimCoordinateSystem,geodeticDatum,footprintWKT
0,1120,,Cruise NEPTUNE/VENUS Maintenance 2021-03,2021-03-18T00:00:00Z/2021-03-30T00:00:00Z,2021-11-10T23:55:57Z,Event,,,,,,EPSG:26910,WGS84,
1,5040,1120.0,Dive OE0253 Saanich Inlet,2021-03-29T14:25:00Z/2021-03-29T19:47:48Z,2021-11-10T23:56:11Z,Event,,48.653529,-123.484401,,,EPSG:26910,WGS84,"POLYGON((-123.4944416666667 48.65638883333333, -123.4743601666667 48.65638883333333, -123.4944416666667 48.6506685, -123.4743601666667 48.6506685))"
2,7350290,5040.0,,2021-03-29T14:33:50Z,2021-05-28T21:29:38Z,Text,,48.650686,-123.487217,101.0,101.0,EPSG:26910,WGS84,
3,7350320,5040.0,,2021-03-29T14:37:27Z,2021-05-28T22:04:15Z,Text,,48.650866,-123.486683,100.0,100.0,EPSG:26910,WGS84,
4,7350380,5040.0,,2021-03-29T14:48:04Z,2021-05-28T22:05:03Z,Text,,48.650905,-123.486683,100.0,100.0,EPSG:26910,WGS84,
5,7350400,5040.0,,2021-03-29T14:48:07Z,2022-02-24T19:20:00Z,Text,,48.650904,-123.486683,100.0,100.0,EPSG:26910,WGS84,
6,7350410,5040.0,,2021-03-29T14:48:55Z,2022-02-24T22:06:17Z,Text,,48.650904,-123.486684,100.0,100.0,EPSG:26910,WGS84,
7,7350450,5040.0,,2021-03-29T14:51:57Z,2021-05-28T22:09:55Z,Text,,48.650899,-123.486686,100.0,100.0,EPSG:26910,WGS84,
8,7350520,5040.0,,2021-03-29T14:59:30Z,2021-05-28T22:11:00Z,Text,,48.650896,-123.486687,100.0,100.0,EPSG:26910,WGS84,
9,7350550,5040.0,,2021-03-29T15:05:10Z,2021-05-28T22:11:40Z,Text,,48.650894,-123.486687,100.0,100.0,EPSG:26910,WGS84,


In [3]:
# Fetch the measurement-or-fact (eMoF) extension file that accompanies the
# Placentia diadromous-fish events.
emof1 = pd.read_csv(
    'https://catalogue.preprod.ogsl.ca/data/mun/ca-cioos_d006cd88-6d08-409a-9c11-2ca63a20ca1b/event_emof_placentia_diadromous-fish.csv'
)
# Preview the first three records.
emof1.head(n=3)

Unnamed: 0,eventID,site_number,station_number,habitat_number,habitatID,measurementID,measurementType,measurementValue,measurementUnit
0,MUN-EF-10-1-2020-022,10,1,1,habitat-10-1-1,MUN-2020-habitat-10-1-1-01,Habitat type,riffle,
1,MUN-EF-10-1-2020-022,10,1,1,habitat-10-1-1,MUN-2020-habitat-10-1-1-02,Habitat length,481,m
2,MUN-EF-10-1-2020-022,10,1,1,habitat-10-1-1,MUN-2020-habitat-10-1-1-03,Habitat width,232,m


Try the check_eventids function:

In [5]:
# Run the eventID / parentEventID checks on the frame currently bound to
# `event_core`. The result is a report frame with the columns
# field, level, row and message; a report with no rows means nothing was flagged.
check_eventids(event_core)

Unnamed: 0,field,level,row,message


Try the check_extension_eventids function:

In [5]:
# Compare the eventIDs of the extension frame `emof1` against the core frame
# `event_core`; `field` names the eventID column in the extension.
# .head() limits the displayed report to its first five rows.
check_extension_eventids(event_core, emof1, field = 'eventID').head()

Unnamed: 0,field,level,row,message
1461,eventid,error,1461,Field MUN-EF-7-2-2020-019 has no corresponding eventID in the core
1462,eventid,error,1462,Field MUN-EF-7-2-2020-019 has no corresponding eventID in the core
1463,eventid,error,1463,Field MUN-EF-7-2-2020-019 has no corresponding eventID in the core
1464,eventid,error,1464,Field MUN-EF-7-2-2020-019 has no corresponding eventID in the core
1465,eventid,error,1465,Field MUN-EF-7-2-2020-019 has no corresponding eventID in the core
