In [14]:
# %load annotatorcode.py
# bioportal
import os
import json
import requests
import urllib
from urllib import parse, request

REST_URL = "http://data.bioontology.org"
API_KEY = '3b00793b-f3cc-489c-9d6e-7a888f4b656e'
PATH_TO_PAPERS = 'kaggle/'

def parse_directory():
    '''For parse and join paths in PATH_TO_PAPERS.
       Output:path_files: list with all paths to record 
    '''
    global path_files
    path_files = []

    for root, dirs, files in os.walk(PATH_TO_PAPERS):
        for file in files:
            path_files.append(os.path.join(root, file))

    return path_files

def load_json_record(path_to_record):
    ''' Load entire json record for a paper.
        Input: path to record.
        Output: json_record.
    '''
    global json_record
    with open(path_to_record, 'r') as json_handler:
        json_record = json.load(json_handler)
    return json_record

def make_url_query(text_to_annotate):
    global URL 
    URL = REST_URL + "/annotator/?text=%s"%(parse.quote(text_to_annotate))
    return URL

def get_json(url):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())

def call_endpoint_annotator(URL, PARAMS):
    r = requests.get(url = URL, params = PARAMS)
    global json_api_results
    json_api_results = r.json()
    return json_api_results

def store_annotations(annotations,get_class=True):
    anns = []
    for result in annotations:
        ann = {}
        class_details = result["annotatedClass"]
        if get_class:
            try:
                class_details = get_json(result["annotatedClass"]["links"]["self"])
            except urllib.error.HTTPError:
                print(f"Error retrieving {result['annotatedClass']['@id']}")
                continue
        ann['id'] = class_details["@id"]
        ann['preflabel'] = class_details["prefLabel"]
        ann['ontology'] = class_details["links"]["ontology"]
        ann['annotation_details'] =  []
        for annotation in result['annotations']:
            ann['annotation_details'].append(
                {
                    'from': str(annotation["from"]),
                    'to': str(annotation["to"]),
                    'match_type': annotation["matchType"],
                    'text': annotation['text']
                })
        if result["hierarchy"]:
            ann['hierarchy'] = []
            for annotation in result["hierarchy"]:
                try:
                    class_details = get_json(annotation["annotatedClass"]["links"]["self"])
                except urllib.error.HTTPError:
                    print(f"Error retrieving {annotation['annotatedClass']['@id']}")
                    continue
                ann['hierarchy'].append(
                    {
                     'id':class_details["@id"],
                     'preflabel':class_details["prefLabel"],
                     'ontology':class_details["links"]["ontology"],
                     'distance_from_original_class':str(annotation["distance"])
                    })
        anns.append(ann)
    return anns

def get_annotations(text_to_annotate,ontologies,expand_class_hierarchy='false',class_hierarchy_max_level=0):
    
    PARAMS = {'apikey':API_KEY,'ontologies':ontologies,
              'expand_class_hierarchy':expand_class_hierarchy,
              'class_hierarchy_max_level':0,
              'longest_only':'true'
             }
    make_url_query(text_to_annotate)
    call_endpoint_annotator(URL,PARAMS)
    return store_annotations(json_api_results)

In [2]:
ontologies = "PREMEDONTO,MESH,COVID19,SNOMEDCT" # ontoportal id ontology.

In [3]:
text_to_annotate = "According to current live statistics at the time of editing this letter, Russia has been the third country in the world to be affected by COVID-19 with both new cases and death rates rising. It remains in a position of advantage due to the later onset of the viral spread within the country since the worldwide disease outbreak."

In [4]:
ann = get_annotations(text_to_annotate,ontologies)

In [5]:
ann

[{'id': 'http://purl.bioontology.org/ontology/SNOMEDCT/15240007',
  'preflabel': 'Current',
  'ontology': 'https://data.bioontology.org/ontologies/SNOMEDCT',
  'annotation_details': [{'from': '14',
    'to': '20',
    'match_type': 'PREF',
    'text': 'CURRENT'}]},
 {'id': 'http://purl.obolibrary.org/obo/NCIT_C25471',
  'preflabel': 'Current',
  'ontology': 'https://data.bioontology.org/ontologies/PREMEDONTO',
  'annotation_details': [{'from': '14',
    'to': '20',
    'match_type': 'PREF',
    'text': 'CURRENT'}]},
 {'id': 'http://purl.bioontology.org/ontology/MESH/D020500',
  'preflabel': 'Statistics',
  'ontology': 'https://data.bioontology.org/ontologies/MESH',
  'annotation_details': [{'from': '27',
    'to': '36',
    'match_type': 'PREF',
    'text': 'STATISTICS'}]},
 {'id': 'http://purl.bioontology.org/ontology/SNOMEDCT/410669006',
  'preflabel': 'Time',
  'ontology': 'https://data.bioontology.org/ontologies/SNOMEDCT',
  'annotation_details': [{'from': '45',
    'to': '48',
   

In [None]:
count = {}
for i in ann:
    onto = i['ontology'].split('/')[-1]
    key = onto + "_Matched_Term"
    if onto not in count.keys():
        count[onto] = 1
        count[key] = [i['preflabel']]
    else:
        count[onto] += 1
        count[key].append(i['preflabel'])
        
    print(i['ontology'].split('/')[-1])

In [26]:
def count_annotations(ann):
    count = {}
    for i in ann:
        onto = i['ontology'].split('/')[-1]
        key = onto + "_Matched_Term"
        if onto not in count.keys():
            count[onto] = 1
            count[key] = [i['preflabel']]
        else:
            count[onto] += 1
            count[key].append(i['preflabel'])
    return count

In [None]:
parse_directory()

In [18]:
pmc_art = load_json_record(path_files[0])

In [29]:
pmc_art.keys()

dict_keys(['paper_id', 'metadata', 'body_text', 'ref_entries', 'back_matter', 'bib_entries'])

In [46]:
pmc_art_ann = {
    'id':'',
    'annotations':{},
    'annotations_count':[]
}

In [67]:
for i in pmc_art['body_text']:
    print(i['text']+'\n')

hnRNP A1 is an RNA-binding protein that contains two RNA-binding domains (RBDs) and a glycine-rich domain responsible for protein–protein interaction. It is involved in pre-mRNA splicing and transport of cellular RNAs (reviewed by Dreyfuss et al., 1993). It is predominantly located in the nucleus, but also shuttles between the nucleus and the cytoplasm (Piñol-Roma and Dreyfuss, 1992). The signal that mediates shuttling has been identified as a 38 amino acid sequence, termed M9, located near the C-terminus of hnRNP A1 between amino acids 268 and 305 (Michael et al., 1995; Siomi and Dreyfuss, 1995; Weighardt et al., 1995). Yeast two-hybrid screening with M9 as bait resulted in the discovery of a novel transportin-mediated pathway for nuclear import of hnRNP A1 (Pollard et al., 1996; Fridell et al., 1997; Siomi et al., 1997). The function of the cytoplasmic hnRNP A1 has not been well defined. Studies have shown that cytoplasmic and nuclear hnRNP A1 exhibit different RNA-binding profiles. 

In [68]:
with open('cuerpo_articulo.txt','a') as file:
    for i in pmc_art['body_text']:
        file.write(i['text']+'\n')

In [69]:
pmc_art['paper_id']

'PMC302072'

In [32]:
pmc_art_ann['id'] = pmc_art['paper_id']

In [47]:
for chunk in pmc_art['body_text']:
    # get annotation for chunk
    ann = get_annotations(chunk['text'],ontologies)
    # store annotation
    section = chunk['section'].split(':::')[-1]
    if section not in pmc_art_ann['annotations'].keys():
        pmc_art_ann['annotations'][section] = ann
    else:
        pmc_art_ann['annotations'][section].append(ann)
    # count annotations
    ann_count=count_annotations(ann)
    #store annotation count
    pmc_art_ann['annotations_count'].append(ann_count)        

In [59]:
conteo = pmc_art_ann['annotations_count']

In [61]:
with open('conteo_anotaciones.json', 'w') as json_handler:
    json_handler.write(json.dumps(conteo))

In [45]:
for k in pmc_art_ann['annotations'].keys():
    pmc_art_ann['annotations'][k.split(':')[-1]] = pmc_art_ann['annotations'][k]
    del pmc_art_ann['annotations'][k]

RuntimeError: dictionary changed size during iteration

In [43]:
for i in pmc_art['body_text']:
    print(k['section'].split(':')[-1])

Introduction
Introduction
Introduction
Introduction
 Results
 Results
 Results
 Results
 Results
 Results
 Results
 Results
 Results
 Results
 Results
Discussion
Discussion
Discussion
Discussion
Discussion
Discussion
Discussion
Discussion
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods
 Materials and methods


In [None]:
# metadata PMC
for file in path_files:
    if 'PMC' in file:
        print(load_json_record(path_files[0]).keys())

In [None]:
# falto nombre clase.

In [None]:
# make class for handle general db
# METHODS
# create db
# store things in db
# load db in memory
# write db in disk

In [None]:
# make class for handle bioportal api service
# METHODS
# handle web servise
# handle virtual machine service
# call annotator

In [None]:
# proccess json
# read file
# map keys
# iterate over keys
# call api with section content
# store annotations for section
# count annotations for each annotated section
# store count
# store annotations and count annotation for file in db

In [None]:
# json article annotations

{
    'id':str(id_db),
    'annotations':{
        'section_x':[],
        'section_y':[],
        ...
        'section_z':[]
    }
    count_annotations:[
        
    ]
}

In [3]:
d = {'c':1,'a':2,'b':3}

In [6]:
list(d.keys())

['c', 'a', 'b']

In [None]:
# data management plan
# -> intelectual property right management
#    -> identification of data
#    -> funder requeriments
#    -> authority
#    -> composite
#    -> sharing requeriments
#        ->permisions
#        conditions
#        restrictions
#        select and grouping
#    -> identifiying license more suitable
#    -> depositing data in data verse
#    -> setting up permissions in dataverse
#       -> diligence metadata fields related to license
#       ->