
## Example of a data lifecycle using Digital Specimens and API integration

<b>First, we import the necessary Python libraries for working with JSON, along with a dedicated library for the ENA API.</b>

In [6]:
import json
import requests
from IPython.display import Markdown as md
import enasearch

<b>Read a JSON file containing a Digital Specimen record and print the raw data.</b>

In [7]:
# Load the Digital Specimen record from disk. The encoding is given explicitly:
# the file is expected to be UTF-8 (the JSON interchange default), whereas a
# bare open() would decode it with whatever the platform locale happens to be.
with open('DO/castex.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Pretty-print the parsed record so its structure is visible in the cell output.
print(json.dumps(data, indent=2))

{
  "id": "20.5000.1025/6e2b07784b8f608d9e37",
  "type": "DigitalSpecimen",
  "attributes": {
    "content": {
      "id": "20.5000.1025/6e2b07784b8f608d9e37",
      "creationdatetime": "2019-05-24T17:17:44.436Z",
      "creator": "admin",
      "midslevel": 2,
      "scientificName": "Holorchis castex Bray & Justine",
      "country": "New Caledonia",
      "locality": "Rocher a la voile",
      "decimalLat/Long": [
        -22.3,
        166.42
      ],
      "recordedBy": "J L. Justine",
      "collectionDate": "2006-06-01",
      "catalogNumber": "2006.12.6.40-41",
      "otherCatalogNumbers": "NHMUK:ecatalogue:7072219",
      "institutionCode": "NHMUK",
      "collectionCode": "ZOO, Parasitic worms",
      "stableIdentifier": "https://data.nhm.ac.uk/object/e90b81bc-1642-47ca-b587-6aa8885cd6a0/1558569600000",
      "physicalSpecimenId": "013258549",
      "Annotations": "Type status = paratype. Holotype =  MNHN JNC 1848 \u2013D 1",
      "gbifId": "https://www.gbif.org/occurrence/1

<b>Grab the data elements we need. For this example, we will use the accession ID from the ENA database.</b>

In [9]:
# Every field used below lives under attributes -> content of the record.
specimen_content = data['attributes']['content']

stableIdentifier = specimen_content['stableIdentifier']  # link to the specimen
catalogNumber = specimen_content['catalogNumber']        # catalogue number
enaLink = specimen_content['enaSequence']                # link to the ENA sequence record


In [10]:
# Build the label first, then hand it to the Markdown display helper so the
# rendered text is the cell's output.
specimen_link_text = "<b>Link to the specimen</b> {} ".format(stableIdentifier)
md(specimen_link_text)

<b>Link to the specimen</b> https://data.nhm.ac.uk/object/e90b81bc-1642-47ca-b587-6aa8885cd6a0/1558569600000 

In [11]:
# Build the label first, then hand it to the Markdown display helper so the
# rendered text is the cell's output.
catalog_number_text = "<b>Catalog Number</b> {} ".format(catalogNumber)
md(catalog_number_text)

<b>Catalog Number</b> 2006.12.6.40-41 

In [12]:
# Build the label first, then hand it to the Markdown display helper so the
# rendered text is the cell's output.
sequence_link_text = "<b>Link to the sequence</b> {} ".format(enaLink)
md(sequence_link_text)

<b>Link to the sequence</b> https://www.ebi.ac.uk/ena/data/view/FJ788436 

<b>Using the enasearch Python library (which uses the BioPython SeqRecord data class), grab the sequence.</b>

In [13]:
# Take the accession from the Digital Specimen record instead of repeating it as
# a literal: it is the last path segment of the ENA link
# (e.g. ".../ena/data/view/FJ788436" -> "FJ788436").
accession_id = enaLink.rstrip("/").rsplit("/", 1)[-1]

# Request the record from ENA in FASTA form; all other arguments are passed
# through exactly as before.
enaresult = enasearch.retrieve_data(
    ids=accession_id,
    download=None,
    display="fasta",
    file=None,
    offset=0,
    header=None,
)

In [14]:
# Take the first entry of the retrieval result and print its `seq` attribute.
first_record = enaresult[0]
mysequence = first_record.seq
print(mysequence)

GGTTTATTGCAGAGGTTTGCGGACTTATTGANGTTGATTATTAAGTTTAAGATTGCTTTTTTTCAGGCGCGTAGGTGGCTGTCTTGGGGAGGGGTGCTTTTGTTAGTTTTCTTGTCTTGTAGGTATTGTCTTATTTTTGCTTTTTGTCAGAGGGGTCAGAGTAATCAGACTGTGCTGTTGTGGCTTTTGGTTATTACTAGTTTAACTGGTTATAGATTGCTTAGTGTTGGGTGAGGATCATATAATAAGTATGCTTTGGTGAGTTGTGTTCGGTCTGCGTTTGGCTCTATAAGGTTTGAGGCTGTTTTTATGTGTGTTGTTATTATGGTTGGGCTGCTGTGAGGGGGTTATTATTCTTTTCCCGGGGTTGAGCATTCTTGAATGTTGCTGTTGGTTTCACCGCTGTTGTATGCTGTTTGGCTTGTAGGTATACTTTGTGAGTGCAACCGAACTCCATTG


In [46]:
# Narrative text rendered through the Markdown display helper.
scidata_intro_text = "<b>Now fetch the same sequence using SciData integrated file. This file contains Digital Specimen and all the related records</b>"
md(scidata_intro_text)

<b>Now fetch the same sequence using SciData integrated file. This file contains Digital Specimen and all the related records</b>

In [47]:
# Load the SciData JSON-LD document. The encoding is given explicitly: the file
# is expected to be UTF-8 (the JSON interchange default), whereas a bare open()
# would decode it with whatever the platform locale happens to be.
with open('DO/fair_holocris_castex_scidata_related.jsonld', encoding='utf-8') as jsonld_file:
    jsonlddata = json.load(jsonld_file)

In [None]:
# Narrative text rendered through the Markdown display helper.
grab_elements_text = "Grab the elements. "
md(grab_elements_text)

In [39]:
# Both collections hang off the top-level '@graph' node of the JSON-LD document.
graph = jsonlddata['@graph']

scidata_array = graph['scidata'][0]['system']['facets']  # facets of the first scidata entry
source_array = graph['sources']

In [41]:
# List each facet's type followed by its source reference, one value per line.
for facet in scidata_array:
    print(facet['@type'], facet['source'], sep="\n")

dissco:specimenRecord
source/2/
dissco:occurrenceRecord
source/3/
dissco:speciesRecord
source/4/
dissco:treatmentRecord
source/5/
dissco:sequenceRecord
source/6/


In [45]:
# Narrative text rendered through the Markdown display helper.
grab_sequence_text = "<b>Grab the sequence</b>"
md(grab_sequence_text)

<b>Grab the sequence</b>

In [43]:
# Locate the sequence facet by its '@type' rather than by its position in the
# list, so the lookup does not depend on the order of the facets in the file.
sequence_facets = [
    facet
    for facet in jsonlddata['@graph']['scidata'][0]['system']['facets']
    if facet['@type'] == 'dissco:sequenceRecord'
]
if not sequence_facets:
    raise LookupError("no 'dissco:sequenceRecord' facet found in the SciData graph")

# Use the first matching facet and print its sequence string.
sequence = sequence_facets[0]['sequence']
print(sequence)

ggtttattgcagaggtttgcggacttattgangttgattattaagtttaagattgctttttttcaggcgcgtaggtggctgtcttggggaggggtgcttttgttagttttcttgtcttgtaggtattgtcttatttttgctttttgtcagaggggtcagagtaatcagactgtgctgttgtggcttttggttattactagtttaactggttatagattgcttagtgttgggtgaggatcatataataagtatgctttggtgagttgtgttcggtctgcgtttggctctataaggtttgaggctgtttttatgtgtgttgttattatggttgggctgctgtgagggggttattattcttttcccggggttgagcattcttgaatgttgctgttggtttcaccgctgttgtatgctgtttggcttgtaggtatactttgtgagtgcaaccgaactccattg


In [48]:
# Narrative text rendered through the Markdown display helper.
taxon_intro_text = "<b>Using the same json ld we can grab taxon information</b>"
md(taxon_intro_text)

<b>Using the same json ld we can grab taxon information</b>

In [51]:
# Locate the treatment facet (the one carrying the taxon) by its '@type' rather
# than by its position in the list, so the lookup does not depend on the order
# of the facets in the file.
treatment_facets = [
    facet
    for facet in jsonlddata['@graph']['scidata'][0]['system']['facets']
    if facet['@type'] == 'dissco:treatmentRecord'
]
if not treatment_facets:
    raise LookupError("no 'dissco:treatmentRecord' facet found in the SciData graph")

# Use the first matching facet and pretty-print its taxon entry.
taxon = treatment_facets[0]['taxon']
print(json.dumps(taxon, indent=2))

{
  "@id": "taxon/1/",
  "@type": [
    "plazi:definesTaxonConcept",
    "http://filteredpush.org/ontologies/oa/dwcFP#Taxon"
  ],
  "box": "[151,354,658,684]",
  "class": "Trematoda",
  "family": "Lepocreadiidae",
  "genus": "Holorchis",
  "kingdom": "Animalia",
  "order": "Plagiorchiida",
  "pageId": "1",
  "pageNumber": "52",
  "phylum": "Platyhelminthes",
  "rank": "species",
  "species": "castex",
  "status": "n. sp."
}
