# pyobistools: Tools for data enhancement and quality control - for python!

## Installation
---
To install `pyobistools`, follow the instructions in the project repository: https://github.com/cioos-siooc/pyobistools/

In [3]:
import sys
import pandas as pd
import numpy as np
from ckanapi import RemoteCKAN
NaN = np.nan
from pyobistools.taxa import *
from pyobistools.validation.check_fields import check_fields
from pyobistools.validation.check_eventids import *
from pyobistools.validation.check_onland import *
from pyobistools.validation.check_scientificname_and_ids import *
import plotly.express as px
import requests
# Show full cell contents instead of truncating long strings (None = no limit).
# The fully qualified option name is used because pandas resolves abbreviated
# names by pattern matching, which can break if a similarly named option is
# added in a later release.
pd.set_option('display.max_colwidth', None)

The on-land example further down also uses `pyobis` to fetch occurrence records from OBIS. Installation instructions: https://github.com/iobis/pyobis/blob/main/README.md

In [8]:
from pyobis import dataset
from pyobis import occurrences

## Taxon matching
---
`search_worms()` looks up each scientific name in a list and returns the matching records as a standardized pandas DataFrame. Names that cannot be matched are still returned, with `matched` set to `False`.

In [2]:
# Names to look up: a misspelling ("Abra alva"), one name repeated three
# times, and a nonsense string that is not expected to match anything.
names = [
    "Abra alva",
    "Buccinum fusiforme",
    "Buccinum fusiforme",
    "Buccinum fusiforme",
    "hlqsdkf",
]

# Last expression of the cell, so the resulting DataFrame is displayed.
search_worms(names)

Unnamed: 0,url,scientificname,authority,status,unacceptreason,rank,valid_name,valid_authority,kingdom,phylum,...,taxon_rank_id,is_extinct,parent_name_usage_id,is_fresh_water,is_terrestrial,is_marine,is_brackish,match_input,matched,match_from
0,https://www.marinespecies.org/aphia.php?p=taxdetails&id=141433,Abra alba,"(W. Wood, 1802)",accepted,,Species,Abra alba,"(W. Wood, 1802)",Animalia,Mollusca,...,220.0,False,138474.0,False,False,True,False,Abra alva,True,worms
1,https://www.marinespecies.org/aphia.php?p=taxdetails&id=531014,Buccinum fusiforme,"Kiener, 1834",unaccepted,"Invalid: junior homonym of <i>Buccinum fusiforme</i> Borson, 1820; <i>Buccinum kieneri</i> is a replacement name",Species,Buccinum humphreysianum,"Bennett, 1824",Animalia,Mollusca,...,220.0,False,137701.0,False,False,True,False,Buccinum fusiforme,True,worms
2,https://www.marinespecies.org/aphia.php?p=taxdetails&id=510389,Buccinum fusiforme,"Broderip, 1830",unaccepted,"invalid: junior homonym of <i>Buccinum fusiforme</i> Borson, 1822",Species,Turrisipho fenestratus,"(W. Turton, 1834)",Animalia,Mollusca,...,220.0,False,137701.0,False,False,True,False,Buccinum fusiforme,True,worms
3,https://www.marinespecies.org/aphia.php?p=taxdetails&id=531014,Buccinum fusiforme,"Kiener, 1834",unaccepted,"Invalid: junior homonym of <i>Buccinum fusiforme</i> Borson, 1820; <i>Buccinum kieneri</i> is a replacement name",Species,Buccinum humphreysianum,"Bennett, 1824",Animalia,Mollusca,...,220.0,False,137701.0,False,False,True,False,Buccinum fusiforme,True,worms
4,https://www.marinespecies.org/aphia.php?p=taxdetails&id=510389,Buccinum fusiforme,"Broderip, 1830",unaccepted,"invalid: junior homonym of <i>Buccinum fusiforme</i> Borson, 1822",Species,Turrisipho fenestratus,"(W. Turton, 1834)",Animalia,Mollusca,...,220.0,False,137701.0,False,False,True,False,Buccinum fusiforme,True,worms
5,https://www.marinespecies.org/aphia.php?p=taxdetails&id=531014,Buccinum fusiforme,"Kiener, 1834",unaccepted,"Invalid: junior homonym of <i>Buccinum fusiforme</i> Borson, 1820; <i>Buccinum kieneri</i> is a replacement name",Species,Buccinum humphreysianum,"Bennett, 1824",Animalia,Mollusca,...,220.0,False,137701.0,False,False,True,False,Buccinum fusiforme,True,worms
6,https://www.marinespecies.org/aphia.php?p=taxdetails&id=510389,Buccinum fusiforme,"Broderip, 1830",unaccepted,"invalid: junior homonym of <i>Buccinum fusiforme</i> Borson, 1822",Species,Turrisipho fenestratus,"(W. Turton, 1834)",Animalia,Mollusca,...,220.0,False,137701.0,False,False,True,False,Buccinum fusiforme,True,worms
7,,,,,,,,,,,...,,False,,False,False,False,False,hlqsdkf,False,worms


## Check required fields
---
`check_fields(data, level, analysis_type, accepted_name_usage_id_check)` checks that all fields required by OBIS are present for a given core or extension.
- **data** = the input data as a pandas DataFrame
- **level** = `error` or `warning`: `error` reports requirements that are not met, `warning` reports recommendations that are not met
- **analysis_type** = `event_core`, `occurrence_core`, `occurrence_extension`, or `extended_measurement_or_fact_extension`
- **accepted_name_usage_id_check** = `True` or `False` (a boolean, not a string); when `True`, unaccepted scientific name IDs are filtered out

In [15]:
# Small example occurrence table that deliberately lacks most of the fields
# OBIS requires, so that check_fields has something to report.
# Building the frame from a dict defines every column exactly once, which
# avoids a misspelled column name leaving a stray empty column behind.
# NOTE(review): "NA" below is a literal string, not a missing value; use
# None / np.nan instead if blank scientific names are what is intended.
data = pd.DataFrame(
    {
        "occurrenceID": [1, 2, 3],
        "scientificName": ["Abra alba", "NA", "NA"],
        "locality": ["North Sea", "English Channel", "Flemish Banks"],
        "minimumDepthInMeters": [10, None, 5],
    }
)

# Pass a real boolean for accepted_name_usage_id_check: the string 'False'
# is a non-empty string and therefore truthy in Python.
check_fields(data, 'error', 'occurrence_core', False)

Unnamed: 0,field,level,row,message
1,basisofrecord,error,,Required field basisofrecord is missing
3,scientificnameid,error,,Required field scientificnameid is missing
4,eventdate,error,,Required field eventdate is missing
5,decimallatitude,error,,Required field decimallatitude is missing
6,decimallongitude,error,,Required field decimallongitude is missing
7,occurrencestatus,error,,Required field occurrencestatus is missing
8,countrycode,error,,Required field countrycode is missing
9,kingdom,error,,Required field kingdom is missing
10,geodeticdatum,error,,Required field geodeticdatum is missing


## Plot points on a map
---
`not found` — no `pyobistools` equivalent of this feature is demonstrated in this notebook yet.

## Check on-land
---
`check_onland(data, land, report, buffer, offline)` checks whether the given points are located on land.

- **data** = the input data as a pandas DataFrame
- **land** = a custom land polygon to check against. If not provided, use Natural Earth.
- **report** = if `True`, errors are returned instead of records
- **buffer** = how far inland a point may lie and still be deemed valid
- **offline** = if True, a local simplified shoreline is used, otherwise an OBIS webservice is used. Default is False

In [6]:
## Disabled example, kept for reference: uncomment the three lines below to
## query pyobis for datasets matching 'Mola mola' and display the result.
## NOTE(review): the output shown under this cell presumably comes from an
## earlier run made while these lines were still active - confirm or clear it.
##query = dataset.search(scientificname = 'Mola mola')
##data = query.execute() # or query.data
##data

{'total': 213,
 'results': [{'id': '2101d4c5-c20b-49c0-a44b-3d6484c4c891',
   'url': 'http://ipt.env.duke.edu/resource?r=zd_1404',
   'archive': 'http://ipt.env.duke.edu/archive.do?r=zd_1404',
   'published': '2022-01-20T16:06:56.000Z',
   'created': None,
   'updated': '2022-07-21T12:35:11.886Z',
   'core': 'occurrence',
   'extensions': [],
   'statistics': {'Event': 0,
    'absence': 0,
    'dropped': 13,
    'Occurrence': 133369,
    'DNADerivedData': 0,
    'MeasurementOrFact': 0},
   'extent': 'POLYGON((-179.978292 -25.86469,-179.978292 51.571235,179.986607 51.571235,179.986607 -25.86469,-179.978292 -25.86469))',
   'title': 'Observatoire Pelagis aerial surveys 2002-2021',
   'citation': 'Van Canneyt, O. 2022. Observatoire Pelagis aerial surveys 2002-2021. Data downloaded from OBIS-SEAMAP (http://seamap.env.duke.edu/dataset/1404) on yyyy-mm-dd.',
   'citation_id': None,
   'abstract': "Original provider:\nObservatoire PELAGIS UAR 3462 University La Rochelle - CNRS\n\nDataset cred

In [11]:
# Fetch the Mola mola occurrence records from the OBIS web server.
# The query is built first and only sent when .execute() is called.
data = (
    occurrences
    .search(scientificname = 'Mola mola')
    .execute()
)

Fetching: [████████████████████████████████████████████████████████████████████████████████████████████████████] 21360/21360
Fetched 21360 records.


In [14]:
# Preview the first two records as an example of what the data package contains.
data.head(n=2)

Unnamed: 0,infraphylum,date_year,scientificNameID,scientificName,individualCount,associatedReferences,dropped,gigaclassid,aphiaID,decimalLatitude,...,disposition,originalNameUsage,associatedMedia,acceptedNameUsage,acceptedNameUsageID,identificationID,verbatimSRS,previousIdentifications,dataGeneralizations,geometry
0,Gnathostomata,2007.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,1.0,"[{""crossref"":{""citeinfo"":{""origin"":""Halpin, P.N., A.J. Read, E. Fujioka, B.D. Best, B. Donnelly, L.J. Hazen, C. Kot, K. Urian, E. LaBrecque, A. Dimatteo, J. Cleary, C. Good, L.B. Crowder, and K.D. Hyrenbach"",""pubdate"":""2009"",""title_html"":""OBIS-SEAMAP: The world data center for marine mammal, sea bird, and sea turtle distributions"",""title"":""OBIS-SEAMAP: The world data center for marine mammal, sea bird, and sea turtle distributions"",""serinfo"":{""sername"":""Oceanography"",""issue"":""22(2):104-115""},""onlink"":""http:\/\/www.tos.org\/oceanography\/article\/obis-seamap-the-world-data-center-for-marine-mammal-sea-bird-and-sea-turtle""}}}]",False,10194,127405,42.38,...,,,,,,,,,,POINT (-65.37000 42.38000)
1,Gnathostomata,2019.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,"[{""crossref"":{""citeinfo"":{""origin"":""APEM and Normandeau Associates prepared for the Bureau of Ocean Energy Management (BOEM)"",""pubdate"":""2021"",""title_html"":""Ecological Baseline Studies of the U.S. Outer Continental Shelf Option Year 2"",""title"":""Ecological Baseline Studies of the U.S. Outer Continental Shelf Option Year 2""}}},{""crossref"":{""citeinfo"":{""origin"":""Halpin, P.N., A.J. Read, E. Fujioka, B.D. Best, B. Donnelly, L.J. Hazen, C. Kot, K. Urian, E. LaBrecque, A. Dimatteo, J. Cleary, C. Good, L.B. Crowder, and K.D. Hyrenbach"",""pubdate"":""2009"",""title_html"":""OBIS-SEAMAP: The world data center for marine mammal, sea bird, and sea turtle distributions"",""title"":""OBIS-SEAMAP: The world data center for marine mammal, sea bird, and sea turtle distributions"",""serinfo"":{""sername"":""Oceanography"",""issue"":""22(2):104-115""},""onlink"":""http:\/\/www.tos.org\/oceanography\/article\/obis-seamap-the-world-data-center-for-marine-mammal-sea-bird-and-sea-turtle""}}}]",False,10194,127405,34.5414,...,,,,,,,,,,POINT (-77.15940 34.54140)


In [13]:
## Run the OBIS occurrence records through check_onland to find positions
## that may lie on land. Only the data is passed, so the function's default
## settings apply for land, report, buffer and offline.
check_onland(data) # the original run flagged potentially 235 observations as being on land

  return GeometryArray(vectorized.points_from_xy(x, y, z), crs=crs)
  super(GeoDataFrame, self).__setitem__(key, value)
  super(GeoDataFrame, self).__setitem__(key, value)


Unnamed: 0,infraphylum,date_year,scientificNameID,scientificName,individualCount,associatedReferences,dropped,gigaclassid,aphiaID,decimalLatitude,...,originalNameUsage,associatedMedia,acceptedNameUsage,acceptedNameUsageID,identificationID,verbatimSRS,previousIdentifications,dataGeneralizations,geometry,on_land
212,Gnathostomata,2002.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,1,,False,10194,127405,33.380000,...,,,,,,,,,POINT (-118.42000 33.38000),True
291,Gnathostomata,1990.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,,False,10194,127405,50.100000,...,,,,,,,,,POINT (-5.63333 50.10000),True
382,Gnathostomata,1959.0,,Mola mola,1,,False,10194,127405,37.635168,...,,,,,,,,,POINT (-122.49467 37.63517),True
433,Gnathostomata,2010.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,,False,10194,127405,51.886170,...,,,,,,,,,POINT (-10.39283 51.88617),True
1007,Gnathostomata,2001.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,,False,10194,127405,51.735901,...,,,,,,,,,POINT (-5.22091 51.73590),True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21173,Gnathostomata,2001.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,,False,10194,127405,51.166000,...,,,,,,,,,POINT (-4.66903 51.16600),True
21251,Gnathostomata,2003.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,,False,10194,127405,50.219019,...,,,,,,,,,POINT (-5.47764 50.21902),True
21287,Gnathostomata,1959.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,1,,False,10194,127405,38.097839,...,,,,,,,,,POINT (-122.26863 38.09784),True
21338,Gnathostomata,1968.0,urn:lsid:marinespecies.org:taxname:127405,Mola mola,,,False,10194,127405,-28.083330,...,,,,,,,,,POINT (153.45000 -28.08333),True
