### CONTENT NOTEBOOK

IMPORTS

VARIABLES

FUNCTIONS

CLASS

MAIN CODE

BUILD AND TESTS

### ----------------------------------------------------------------

In [None]:
#%%writefile sectionPaperAnnotator.py
### 
#IMPORTS

import os
import sys
import json
import pandas
import urllib
import requests
from urllib import parse, request

#VARIABLES

# Root of the BioPortal REST API; every endpoint URL is built from it.
REST_URL = "http://data.bioontology.org"
# SECURITY: a real API key is committed here in plain text. Prefer exporting
# BIOPORTAL_API_KEY in the environment; the literal only remains as a
# backward-compatible fallback and should be rotated and removed.
API_KEY = os.environ.get('BIOPORTAL_API_KEY', '3b00793b-f3cc-489c-9d6e-7a888f4b656e')
# Directory that parse_directory() walks to find the paper records.
PATH_TO_PAPERS = 'kaggle/'
# CSV file read by load_select_onts(); it must contain an 'Ontology' column.
PATH_TO_ONTS_SELECT = 'data/onts_select.csv'



#FUNCTIONS

def parse_directory(path_to_papers=None):
    '''Collect the path of every file found below a papers directory.

       Input: path_to_papers: root directory to walk recursively. When it is
              omitted the module constant PATH_TO_PAPERS is used.
       Output: path_files: list with the path of every file found. The same
               list is also stored in the module-level name ``path_files``,
               which other notebook cells read directly.
    '''
    global path_files
    root_dir = PATH_TO_PAPERS if path_to_papers is None else path_to_papers
    path_files = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(root_dir)
        for file_name in files
    ]
    return path_files

def get_metadata_record(path_to_record):
    '''Get the top-level keys of a JSON paper record.
       Input: path_to_record: path to the JSON file.
       Output: record.keys() (a dict_keys view of the top-level keys).
    '''
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path_to_record, 'r', encoding='utf-8') as json_handler:
        record = json.load(json_handler)
    return record.keys()

def load_json_record(path_to_record):
    ''' Load the entire JSON record of a paper.
        Input: path_to_record: path to the JSON file.
        Output: json_record: the parsed record. It is also stored in the
                module-level name ``json_record``.
    '''
    global json_record
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path_to_record, 'r', encoding='utf-8') as json_handler:
        json_record = json.load(json_handler)
    return json_record

def load_select_onts(path_to_onts=None):
    ''' Read and clean the list of selected ontologies.

        The 'Ontology' column of the CSV is read, tabs, spaces and newlines
        are removed from every value, and duplicates are dropped while the
        first-seen order is kept.
        Input: path_to_onts: CSV file to read. When it is omitted the module
               constant PATH_TO_ONTS_SELECT is used.
        Output: select_onts: list of cleaned, unique values. It is also
                stored in the module-level name ``select_onts``.
    '''
    global select_onts
    csv_path = PATH_TO_ONTS_SELECT if path_to_onts is None else path_to_onts
    ont_sel = pandas.read_csv(csv_path)
    # Empty cells come back as NaN (a float); keep only real strings.
    cleaned = (
        x.replace('\t', '').replace(' ', '').replace('\n', '')
        for x in ont_sel['Ontology'].values
        if isinstance(x, str)
    )
    # dict.fromkeys drops duplicates and preserves insertion order.
    select_onts = list(dict.fromkeys(cleaned))
    return select_onts

def get_json(url):
    '''
    General util: call a BioPortal endpoint and decode its JSON body.
    Authentication is sent in the Authorization header (API_KEY).
    Input: url: BioPortal URL to request.
    Output: the decoded JSON response.
    '''
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    # Use the response as a context manager so the connection is closed
    # even when reading or decoding fails.
    with opener.open(url) as response:
        return json.loads(response.read())

def load_bio_onts():
    '''Retrieve every ontology listed by BioPortal.

       The API root is requested first; its "ontologies" link is then
       followed to obtain the listing.
       Output: bio_onts: one "<name>\\n<@id>\\n" string per ontology. It is
               also stored in the module-level name ``bio_onts``.
    '''
    global bio_onts

    api_root = get_json(REST_URL + "/")
    listing = get_json(api_root["links"]["ontologies"])

    bio_onts = [f"{entry['name']}\n{entry['@id']}\n" for entry in listing]
    return bio_onts

def onts():
    '''
       Merge select_onts and bio_onts to build the ``ontologies`` URL query
       parameter: a comma-separated string with every selected ontology
       whose text also appears in some BioPortal entry.
       Output: ontologies: the comma-separated string. It is also stored in
               the module-level name ``ontologies``.
    '''
    select_onts = load_select_onts()  # selected ontologies (cleaned, unique)
    bio_onts = load_bio_onts()        # "<name>\n<@id>\n" per BioPortal ontology

    global ontologies
    matched = []
    for ontosel in select_onts:
        # An empty value would be a substring of everything; skip it.
        if not ontosel or ontosel in matched:
            continue
        # str.find() returns 0 for a match at the start and -1 for no match.
        # The earlier `find(...) > 1` / `find(...) < 1` checks therefore
        # ignored matches at index 0 or 1 and treated the first stored
        # ontology (index 0) as missing, appending it more than once.
        # Substring containment plus list membership avoids both problems.
        if any(ontosel in ontobio for ontobio in bio_onts):
            matched.append(ontosel)

    ontologies = ','.join(matched)
    return ontologies

def str_list_onts(ontologie_list):
    '''General util.
       Turn a list of ontology names into a comma-separated string.
       Input: ontologie_list: list of strings.
       Output: ontologies url query parameter ("A,B,C").
       Note: the module-level name ``ontologies`` is set to the same text
       with a leading comma (",A,B,C").
    '''
    global ontologies
    ontologies = "".join("," + ont for ont in ontologie_list)
    # Drop the leading comma for the returned value only.
    return ontologies[1:]

'''
#### Note: the onts_select.csv file needs to be standardised:
- the official acronym of every listed ontology must be recorded.
- only ontologies that exist in BioPortal should be included.


For "TheCOVID-19InfectiousDiseaseOntology" the official acronym is IDO-COVID-19. See: https://bioportal.bioontology.org/ontologies/IDO-COVID-19

"NHC" is not found among the BioPortal ontologies. See:
https://bioportal.bioontology.org/ontologies
'''

def make_url_query(text_to_annotate):
    '''Build the annotator URL for one text fragment.
       Input: text_to_annotate: text to send; it is percent-encoded.
       Output: URL: REST_URL + "/annotator/?text=<encoded text>". It is also
               stored in the module-level name ``URL``.
    '''
    global URL
    encoded_text = parse.quote(text_to_annotate)
    URL = f"{REST_URL}/annotator/?text={encoded_text}"
    return URL

#Call api V.2
def call_endpoint_annotator(URL, PARAMS, timeout=60):
    '''Call the annotator endpoint and return its decoded JSON response.
       Input: URL: request URL (see make_url_query).
              PARAMS: dict of extra query parameters.
              timeout: seconds to wait for the server before giving up.
       Output: json_api_results: the decoded response. It is also stored in
               the module-level name ``json_api_results``.
       Raises: requests.HTTPError when the server answers with an error
               status, instead of handing the error payload to the caller
               as if it were a list of annotations.
    '''
    global json_api_results
    # Without a timeout a stalled connection would block the run forever.
    r = requests.get(url=URL, params=PARAMS, timeout=timeout)
    r.raise_for_status()
    json_api_results = r.json()
    return json_api_results

### 
#CLASS

class dbMain:

    '''Collect the annotations of one paper and persist all papers.

    Each instance keeps three nested schemas:
    s1: item_collection -> {'paper_id': str, 'chunks_paper': list}
    s2: chunks          -> {'texts': list, 'sections': list, 'annotations': list}
    s3: annotations     -> {'matchTerm': list, 'ontology': list, 'annotatedClass': list}

    Methods:
    restart_annotations()      start a fresh, empty annotations dict.
    update_annotations(...)    append one annotation to the current dict.
    update_chunks(...)         append one text fragment and its annotations.
    update_item_collection(..) set the paper id and append its chunks.
    update_db_main()           append this item to the shared ``db`` list.
    store_db_main()            write ``db`` to annotationsDBlist.json.
    '''

    # List of item_collection dicts. It is a class attribute, so every
    # instance appends to the same list.
    db = []

    def __init__(self):
        self.item_collection = {
                                'paper_id': '',
                                'chunks_paper': []
                                }

        self.chunks = {
                        'texts': [],
                        'sections': [],
                        'annotations': []
                        }

        self.restart_annotations()

    #methods
    def restart_annotations(self):
        '''Bind self.annotations to a new empty dict.

        A new object is created (rather than clearing the old one) so that
        dicts already stored through update_chunks keep their content.
        '''
        self.annotations = {
                            'matchTerm': [],
                            'ontology': [],
                            'annotatedClass': []
                            }

    def update_annotations(self, matchTerm,
                                 ontology,
                                 annotatedClass):
        '''Append one annotation to the three parallel lists.'''
        self.annotations['matchTerm'].append(matchTerm)
        self.annotations['ontology'].append(ontology)
        self.annotations['annotatedClass'].append(annotatedClass)

    def update_chunks(self, text, section, annotations):
        '''Append one text fragment, its section and its annotations.'''
        self.chunks['texts'].append(text)
        self.chunks['sections'].append(section)
        self.chunks['annotations'].append(annotations)

    def update_item_collection(self, paper_id, chunks_paper):
        '''Set the paper id and append chunks_paper to the item.'''
        # Assign instead of `+=`: an item describes one paper, so a repeated
        # call must not concatenate ids ("PMC1PMC1").
        self.item_collection['paper_id'] = paper_id
        self.item_collection['chunks_paper'].append(chunks_paper)

    def update_db_main(self):
        '''Append this instance's item_collection to the shared db list.'''
        dbMain.db.append(self.item_collection)

    def store_db_main(self):
        '''Write the shared db list as JSON to annotationsDBlist.json.'''
        # `db` lives on the class, so no extra instance is needed to read it.
        with open('annotationsDBlist.json', 'w', encoding='utf-8') as json_file:
            json.dump(dbMain.db, json_file)

In [None]:
#MAIN CODE

ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO,NCIT,CIDO,VO,IOBC,BAO,COVID19,MEDDRA,COVIDCRFRAPID,EFO,CODO"

PARAMS = {'apikey':API_KEY,'ontologies':ontologies,
          'expand_class_hierarchy':'false','class_hierarchy_max_level':0,
           'longest_only':'true'
         }


def _annotate_fragment(item, text, section):
    '''Send one text fragment to the annotator and store it as a chunk of item.'''
    url = make_url_query(text)
    print(url)
    results = call_endpoint_annotator(url, PARAMS)
    # Start from a fresh annotations dict so chunks stored earlier keep theirs.
    item.restart_annotations()
    for result in results:
        item.update_annotations(result['annotations'][0]['text'],
                                result['annotatedClass']['links']['ontology'],
                                result['annotatedClass']['@id']
                               )
    item.update_chunks(text, section, item.annotations)


def _annotate_chunks(item, chunks, empty_positions):
    '''Annotate every non-empty fragment of chunks; record the index of empty ones.'''
    for chunk_idx, _chunk in enumerate(chunks):
        text_to_annotate = _chunk['text']
        print(text_to_annotate)
        if text_to_annotate:
            _annotate_fragment(item, text_to_annotate, _chunk['section'])
        else:
            empty_positions.append(chunk_idx)


for paper_idx, path in enumerate(parse_directory()):

    record = load_json_record(path)
    emty_sections = []
    item = dbMain()

    # The pmc_json sample has no 'abstract' key while the pdf_json sample
    # does. Test for the key explicitly: catching KeyError around the whole
    # loop also caught unrelated KeyErrors and re-ran the body text.
    if 'abstract' in record:
        _annotate_chunks(item, record['abstract'], emty_sections)
        print('ok abstract')

    _annotate_chunks(item, record['body_text'], emty_sections)

    # The title is stored with its own text under the section label 'title'
    # (previously the last body chunk's text and section were stored again).
    title = record['metadata']['title']
    if title:
        _annotate_fragment(item, title, 'title')

    item.update_item_collection(record['paper_id'], item.chunks)
    # Report the paper index, not the index of the last chunk processed.
    print('paper index:', paper_idx, 'processed')
    item.update_db_main()

# dbMain.db is shared by all instances, so this also works when no paper
# was found (an empty list is written).
dbMain().store_db_main()

#### ---------------------------------------------------
BUILD AND TESTS


### load data

In [11]:
#example parse directory
parse_directory()

samp_r1=path_files[0]# pmc 
samp_r2=path_files[49] #pdf_json
samp_r1

#load_json_record(samp_r1)
#load_json_record(samp_r2)

'kaggle/document_parses/pmc_json/PMC1072807.xml.json'

In [13]:
load_json_record(samp_r1)

{'paper_id': 'PMC1072807',
 'metadata': {'title': 'Towards standardization of RNA quality assessment using user-independent classifiers of microcapillary electrophoresis traces',
  'authors': [{'first': 'Sandrine',
    'middle': [],
    'last': 'Imbeaud',
    'suffix': '',
    'email': None,
    'affiliation': {}},
   {'first': 'Esther',
    'middle': [],
    'last': 'Graudens',
    'suffix': '',
    'email': None,
    'affiliation': {}},
   {'first': 'Virginie',
    'middle': [],
    'last': 'Boulanger',
    'suffix': '',
    'email': None,
    'affiliation': {}},
   {'first': 'Xavier',
    'middle': [],
    'last': 'Barlet',
    'suffix': '',
    'email': None,
    'affiliation': {}},
   {'first': 'Patrick',
    'middle': [],
    'last': 'Zaborski',
    'suffix': '',
    'email': None,
    'affiliation': {}},
   {'first': 'Eric',
    'middle': [],
    'last': 'Eveno',
    'suffix': '',
    'email': None,
    'affiliation': {}},
   {'first': 'Odilo',
    'middle': [],
    'last': 'Mue

### See JSON schemas

In [8]:
get_metadata_record(samp_r1) #for pmc_json

dict_keys(['paper_id', 'metadata', 'body_text', 'ref_entries', 'back_matter', 'bib_entries'])

In [None]:
#schema json for pmc_json
{
    'paper_id':'str',
    'metadata': {'title':'str', 'authors':'list'},
    #'abstract': {'text': 'str' ,'section':'str'},
    'bodytext': [{'text': 'str', 'section':'str'}]
}

In [9]:
get_metadata_record(samp_r2) #for pdf_json

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [10]:
#eschema json for pdf_json
{
    'paper_id':'str',
    'metadata': {'title':'str', 'authors':'list'},
    'abstract': [{'text': 'str' ,'section':'str'}],
    'bodytext': [{'text': 'str', 'section':'str'}]
}

{'paper_id': 'str',
 'metadata': {'title': 'str', 'authors': 'list'},
 'abstract': [{'text': 'str', 'section': 'str'}],
 'bodytext': [{'text': 'str', 'section': 'str'}]}

In [None]:
#data['paper_id']            ####pmc
#data['metadata']['title']
#data['body_text'] #list of paragraphs

#data['paper_id']            #### pdf_json
#data['metadata']['title']
#data['abstract'] #list of paragraps 
#data['body_text'] #list of paragraps

### Proposed NoSQL DB schemas

In [None]:
#Nosql schemaDB proposal V.2:  schema COLLECTION of texts annotated from bioportal
db = list(Item_collection)

In [None]:
#Implementation Nosql schemaDB proposal V.2
item_collection = {
    'paper_id':'',
    'chunks_paper':[{'text':[], 
                    'section': [], 
                    'annotations':[{'matchTerm':[],
                                         'ontology':[],
                                         'annotatedClass':[]}
                                       ]
                        }]    
} 

In [None]:
#schema for each chunk of document for annotate
chunks = {'texts':[], 'sections':[], 'annotations': []}

In [None]:
#schema for results Bioportal api: annotations for each chunk.
annotations = {'matchTerm':[],             
                     'ontology':[],
                     'annotatedClass':[]}

In [None]:
#implementations of methods for organization and store data
#schema_chunks['text'].append('text_to_annotate')
#schema_chunks['section'].append('section_text_to_annotate')
#schema_chunks['annotations'].append(schema_annotations)
#item_collection['chunks_paper'].append(schema_chunks)
#item_collection['paper_id'] = 'paper_id'

## Tests

### Test zero

### call api and retrieve annotations for one fragment of text

In [14]:
parse_directory()

['kaggle/document_parses/pmc_json/PMC1072807.xml.json',
 'kaggle/document_parses/pmc_json/PMC1402269.xml.json',
 'kaggle/document_parses/pmc_json/PMC302072.xml.json',
 'kaggle/document_parses/pmc_json/PMC59580.xml.json',
 'kaggle/document_parses/pmc_json/PMC549431.xml.json',
 'kaggle/document_parses/pmc_json/PMC59549.xml.json',
 'kaggle/document_parses/pmc_json/PMC1373654.xml.json',
 'kaggle/document_parses/pmc_json/PMC1351155.xml.json',
 'kaggle/document_parses/pmc_json/PMC1072806.xml.json',
 'kaggle/document_parses/pmc_json/PMC1090610.xml.json',
 'kaggle/document_parses/pmc_json/PMC1083427.xml.json',
 'kaggle/document_parses/pmc_json/PMC1459127.xml.json',
 'kaggle/document_parses/pmc_json/PMC306617.xml.json',
 'kaggle/document_parses/pmc_json/PMC468896.xml.json',
 'kaggle/document_parses/pmc_json/PMC137267.xml.json',
 'kaggle/document_parses/pmc_json/PMC302018.xml.json',
 'kaggle/document_parses/pmc_json/PMC420498.xml.json',
 'kaggle/document_parses/pmc_json/PMC521691.xml.json',
 'ka

In [15]:
#prepared url params
text_to_annotate = load_json_record(path_files[0])['body_text'][0]['text']
text_to_annotate 

#for test param
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"
ontologies

#make url2
URL=make_url_query(text_to_annotate)
URL

#longest_only={true|false}
PARAMS = {'apikey':API_KEY,'ontologies':ontologies,
          'expand_class_hierarchy':'false','class_hierarchy_max_level':0,
           'longest_only':'true'
         }
#PARAMS

In [None]:
#load_json_record(path_files[0]).keys()

In [16]:
#init item_collection schema
item_collection = {
    'paper_id':'',
    'chunks_paper':[]    
} 

call_endpoint_annotator(URL,PARAMS)

#json_api_results
#len(json_api_results)
#json_api_results[0].keys()

chunks = {'texts':[], 'sections':[], 'annotations': []}

# init schema_annotations
annotations= {'matchTerm':[],             
                     'ontology':[],
                     'annotatedClass':[]}

In [60]:
json_api_results

[{'annotatedClass': {'@id': 'http://purl.bioontology.org/ontology/SNOMEDCT/246224005',
   '@type': 'http://www.w3.org/2002/07/owl#Class',
   'links': {'self': 'http://data.bioontology.org/ontologies/SNOMEDCT/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F246224005',
    'ontology': 'http://data.bioontology.org/ontologies/SNOMEDCT',
    'children': 'http://data.bioontology.org/ontologies/SNOMEDCT/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F246224005/children',
    'parents': 'http://data.bioontology.org/ontologies/SNOMEDCT/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F246224005/parents',
    'descendants': 'http://data.bioontology.org/ontologies/SNOMEDCT/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F246224005/descendants',
    'ancestors': 'http://data.bioontology.org/ontologies/SNOMEDCT/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F246224005/ancestors',
    'instances': 'http://data.bio

In [33]:
json_api_results[2]['annotatedClass']['links']['ontology']

'http://data.bioontology.org/ontologies/OCHV'

In [38]:
json_api_results[3]['annotatedClass']['@id']

'http://sbmi.uth.tmc.edu/ontology/ochv#20452'

In [None]:
# process results api V.1: fill the plain-dict schemas from the annotator response
for api_hit in json_api_results:
    annotations['matchTerm'].append(api_hit['annotations'][0]['text'])
    annotations['ontology'].append(api_hit['annotatedClass']['links']['ontology'])
    annotations['annotatedClass'].append(api_hit['annotatedClass']['@id'])

# one chunk: the first body_text fragment of the first record
chunks['annotations'].append(annotations)
chunks['texts'].append(text_to_annotate)
first_record = load_json_record(path_files[0])
chunks['sections'].append(first_record['body_text'][0]['section'])

item_collection['chunks_paper'].append(chunks)
item_collection['paper_id'] += first_record['paper_id']

In [None]:
test = item_collection.copy()

In [None]:
# Some tests for the annotations schema built by "process results api v.1":
# rebuild each list independently from the raw response and compare.

# match terms
test_list_matchTerm = [hit['annotations'][0]['text'] for hit in json_api_results]

# ontology links
test_list_ontology = [hit['annotatedClass']['links']['ontology'] for hit in json_api_results]

# annotated class ids
test_list_annotated_class = [hit['annotatedClass']['@id'] for hit in json_api_results]

assert annotations['matchTerm'] == test_list_matchTerm
assert annotations['ontology'] == test_list_ontology
assert annotations['annotatedClass'] == test_list_annotated_class

In [None]:
#test class db
item = dbMain()

In [None]:
# process results api v.2: same steps as v.1, through the dbMain class
item.restart_annotations()
for api_hit in json_api_results:
    item.update_annotations(
        api_hit['annotations'][0]['text'],
        api_hit['annotatedClass']['links']['ontology'],
        api_hit['annotatedClass']['@id'],
    )

first_section = load_json_record(path_files[0])['body_text'][0]['section']
item.update_chunks(text_to_annotate, first_section, item.annotations)

first_paper_id = load_json_record(path_files[0])['paper_id']
item.update_item_collection(first_paper_id, item.chunks)

In [None]:
# Some tests for the annotations schema built by "process results api v.2"

# Rebuild each list independently from the raw response.
test_list_matchTerm = [result['annotations'][0]['text'] for result in json_api_results]
test_list_ontology = [result['annotatedClass']['links']['ontology'] for result in json_api_results]
test_list_annotated_class = [result['annotatedClass']['@id'] for result in json_api_results]

# v.2 (class) must agree with v.1 (plain dicts) ...
assert annotations['matchTerm'] == item.annotations['matchTerm']
assert annotations['ontology'] == item.annotations['ontology']
assert annotations['annotatedClass'] == item.annotations['annotatedClass']

# ... and with the raw response. These lists used to be built but never
# checked, so v.1 and v.2 could have been wrong in the same way.
assert item.annotations['matchTerm'] == test_list_matchTerm
assert item.annotations['ontology'] == test_list_ontology
assert item.annotations['annotatedClass'] == test_list_annotated_class

In [None]:
item.annotations

In [None]:
item.chunks

In [None]:
item.item_collection['chunks_paper']

In [None]:
test['chunks_paper']

In [None]:
#test for results belong process results api v.1 v.2
assert test['paper_id'] == item.item_collection['paper_id']
assert test['chunks_paper'] == item.item_collection['chunks_paper']

### Test one

### call api for each fragment of text in one pmc_json record

In [None]:
# Annotate every body_text fragment of one pmc_json record (plain-dict schemas)
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"

PARAMS = {
    'apikey': API_KEY,
    'ontologies': ontologies,
    'expand_class_hierarchy': 'false',
    'class_hierarchy_max_level': 0,
    'longest_only': 'true',
}

# init db schemas
emty_sections = []
item_collection = {'paper_id': '', 'chunks_paper': []}
chunks = {'texts': [], 'sections': [], 'annotations': []}

for idx, _chunk in enumerate(load_json_record(samp_r1)['body_text']):

    text_to_annotate = _chunk['text']
    print(text_to_annotate)

    # empty fragments are only recorded by position
    if not text_to_annotate:
        emty_sections.append(idx)
        continue

    # build the url, then call the api
    make_url_query(text_to_annotate)
    print(URL)
    call_endpoint_annotator(URL, PARAMS)

    # fresh annotations schema for this fragment
    annotations = {'matchTerm': [], 'ontology': [], 'annotatedClass': []}
    for api_hit in json_api_results:
        annotations['matchTerm'].append(api_hit['annotations'][0]['text'])
        annotations['ontology'].append(api_hit['annotatedClass']['links']['ontology'])
        annotations['annotatedClass'].append(api_hit['annotatedClass']['@id'])

    chunks['texts'].append(_chunk['text'])
    chunks['sections'].append(_chunk['section'])
    chunks['annotations'].append(annotations)

item_collection['paper_id'] += load_json_record(samp_r1)['paper_id']
item_collection['chunks_paper'].append(chunks)

In [None]:
emty_sections

In [None]:
test = load_json_record(samp_r1)
test['body_text'][17]

In [None]:
item_collection['chunks_paper'][0]['texts'][1]

In [None]:
item_collection['chunks_paper'][0]['sections']

In [None]:
item_collection['chunks_paper'][0]['annotations']

In [None]:
# Annotate every body_text fragment of one pmc_json record (dbMain class)
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"

PARAMS = {
    'apikey': API_KEY,
    'ontologies': ontologies,
    'expand_class_hierarchy': 'false',
    'class_hierarchy_max_level': 0,
    'longest_only': 'true',
}

# init db class
emty_sections = []
item = dbMain()

for idx, _chunk in enumerate(load_json_record(samp_r1)['body_text']):

    text_to_annotate = _chunk['text']
    print(text_to_annotate)

    # empty fragments are only recorded by position
    if not text_to_annotate:
        emty_sections.append(idx)
        continue

    # build the url, then call the api
    make_url_query(text_to_annotate)
    print(URL)
    call_endpoint_annotator(URL, PARAMS)

    # fresh annotations schema for this fragment
    item.restart_annotations()
    for api_hit in json_api_results:
        item.update_annotations(
            api_hit['annotations'][0]['text'],
            api_hit['annotatedClass']['links']['ontology'],
            api_hit['annotatedClass']['@id'],
        )
    item.update_chunks(_chunk['text'], _chunk['section'], item.annotations)

item.update_item_collection(load_json_record(samp_r1)['paper_id'],
                            item.chunks)

In [None]:
item.item_collection['chunks_paper'][0]['annotations']

### Test two

### call api for each fragment of text in pdf_json record

In [None]:
#test for pdf_json
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"

PARAMS = {'apikey':API_KEY,'ontologies':ontologies,
          'expand_class_hierarchy':'false','class_hierarchy_max_level':0,
           'longest_only':'true'
         }

#init dbs schemas
chunks = {'texts':[], 'sections':[], 'annotations': []}
record = load_json_record(samp_r2)

# The abstract and the body text are processed by exactly the same steps.
for part in ('abstract', 'body_text'):
    for _chunk in record[part]:

        text_to_annotate = _chunk['text']
        print(text_to_annotate)

        # Skip empty fragments entirely. Their text and section used to be
        # stored while no annotations entry was, which left the three
        # parallel lists of `chunks` out of step.
        if not text_to_annotate:
            continue

        #make url here
        make_url_query(text_to_annotate)
        print(URL)

        #call api here
        call_endpoint_annotator(URL,PARAMS)

        #init schema annotations
        annotations = {'matchTerm':[],
                       'ontology':[],
                       'annotatedClass':[]}
        #process and store results for this fragment
        for result in json_api_results:
            annotations['matchTerm'].append(result['annotations'][0]['text'])
            annotations['ontology'].append(result['annotatedClass']['links']['ontology'])
            annotations['annotatedClass'].append(result['annotatedClass']['@id'])

        chunks['texts'].append(_chunk['text'])
        chunks['sections'].append(_chunk['section'])
        chunks['annotations'].append(annotations)

    if part == 'abstract':
        print('ok abstract')

In [None]:
# Expected section labels of the pdf_json sample: abstract, body, then title
test_list_sections = [chunk_['section'] for chunk_ in load_json_record(samp_r2)['abstract']]
test_list_sections += [chunk_['section'] for chunk_ in load_json_record(samp_r2)['body_text']]
if load_json_record(samp_r2)['metadata']['title']:
    test_list_sections.append('title')
len(test_list_sections)

In [None]:
#test for pdf_json
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"

PARAMS = {'apikey':API_KEY,'ontologies':ontologies,
          'expand_class_hierarchy':'false','class_hierarchy_max_level':0,
           'longest_only':'true'
         }

#init dbs schemas
emty_sections = []
item = dbMain()
record = load_json_record(samp_r2)

# The abstract and the body text are processed by exactly the same steps.
for part in ('abstract', 'body_text'):
    for idx, _chunk in enumerate(record[part]):

        text_to_annotate = _chunk['text']
        print(text_to_annotate)

        # empty fragments are only recorded by position
        if not text_to_annotate:
            emty_sections.append(idx)
            continue

        #make url here
        make_url_query(text_to_annotate)
        print(URL)

        #call api here
        call_endpoint_annotator(URL,PARAMS)

        #init db schema annotations
        item.restart_annotations()

        #process and store results for this fragment
        for result in json_api_results:
            item.update_annotations(result['annotations'][0]['text'],
                                    result['annotatedClass']['links']['ontology'],
                                    result['annotatedClass']['@id']
                                   )
        item.update_chunks(_chunk['text'],
                           _chunk['section'],
                           item.annotations
                          )

    if part == 'abstract':
        print('ok abstract')

# Attach the paper id and its chunks before publishing the item; without
# this step an item with an empty id and no chunks was appended to the db.
item.update_item_collection(record['paper_id'], item.chunks)
item.update_db_main()

In [None]:
# Expected section labels of the pdf_json sample: abstract, then body
test_list_sections = [chunk_['section'] for chunk_ in load_json_record(samp_r2)['abstract']]
test_list_sections += [chunk_['section'] for chunk_ in load_json_record(samp_r2)['body_text']]
'''if load_json_record(samp_r2)['metadata']['title']:
    test_list_sections.append('title')'''
len(test_list_sections)

In [None]:
assert test_list_sections == item.chunks['sections']

### Test three

### Call api for each fragment of text in pmc_json and pdf_json records

In [None]:
# Build `db`: one entry per paper, holding the paper id plus three
# index-aligned lists (texts, sections, annotations) for every fragment of
# the abstract, the body and the title.
db = []

# Comma-separated ontology acronyms sent to the annotator endpoint.
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"

PARAMS = {
    'apikey': API_KEY,
    'ontologies': ontologies,
    'expand_class_hierarchy': 'false',
    'class_hierarchy_max_level': 0,
    'longest_only': 'true',
}

#parse_directory()

# NOTE: this rebinds the global `path_files` to just the two sample records.
path_files = [samp_r1, samp_r2]

for idx, path in enumerate(path_files):

    # Read the record once and reuse it (the original re-read the file for
    # every section of the paper).
    record = load_json_record(path)

    # (text, section) pairs in reading order. 'abstract' is looked up with
    # .get() so a record without that key still gets its body annotated;
    # this replaces the former bare `except:` fallback, which also restarted
    # the whole body on unrelated errors (network failures, Ctrl-C, ...).
    fragments = [(_chunk['text'], _chunk['section'])
                 for _chunk in record.get('abstract', [])]
    fragments += [(_chunk['text'], _chunk['section'])
                  for _chunk in record['body_text']]

    # The title is stored as one more fragment, but only when it is non-empty.
    title = record['metadata']['title']
    if title:
        fragments.append((title, 'title'))

    schema_chunks = {'texts': [], 'sections': [], 'annotations': []}

    for text_to_annotate, section in fragments:
        print(text_to_annotate)

        # Fresh annotation schema per fragment. It stays empty for fragments
        # without text so texts/sections/annotations keep the same length
        # and can be indexed together.
        schema_annotations = {'matchTerm': [],
                              'ontology': [],
                              'annotatedClass': []}

        if text_to_annotate:
            # Build the query; URL is read right after this call, so
            # make_url_query is presumably what refreshes it.
            make_url_query(text_to_annotate)
            print(URL)

            # Call the API; results are read from json_api_results.
            call_endpoint_annotator(URL, PARAMS)

            for api_result in json_api_results:
                schema_annotations['matchTerm'].append(
                    api_result['annotations'][0]['text'])
                schema_annotations['ontology'].append(
                    api_result['annotatedClass']['links']['ontology'])
                schema_annotations['annotatedClass'].append(
                    api_result['annotatedClass']['@id'])

        schema_chunks['texts'].append(text_to_annotate)
        schema_chunks['sections'].append(section)
        schema_chunks['annotations'].append(schema_annotations)

    # Only a fully processed paper is stored; a failure above propagates
    # instead of leaving a partial entry in `db`.
    item_collection = {
        'paper_id': record['paper_id'],
        'chunks_paper': [schema_chunks],
    }
    print('paper index:', idx, 'processed')
    db.append(item_collection)

In [None]:
# Inspect the entry collected for the second paper (index 1).
db[1]

In [None]:
# List the entries of the current working directory.
os.listdir('.')

In [None]:
# Walk PATH_TO_PAPERS and rebuild the global `path_files` list.
parse_directory()

In [None]:
# Pick two sample records by position in `path_files`.
# NOTE(review): index 48 needs the full list built by parse_directory();
# other cells rebind `path_files` to a two-element list, after which this
# cell would fail — re-run parse_directory() first.
samp_r1 = path_files[0]
samp_r2 = path_files[48]

In [None]:
# Annotate every paper in `path_files` with the dbMain helper: abstract
# fragments, body fragments and finally the title are sent to the annotator
# and stored as chunks on a fresh dbMain item.

# Comma-separated ontology acronyms sent to the annotator endpoint.
ontologies = "MESH,COVID-19,HPIO,SNOMEDCT,OCHV,PREMEDONTO"

PARAMS = {
    'apikey': API_KEY,
    'ontologies': ontologies,
    'expand_class_hierarchy': 'false',
    'class_hierarchy_max_level': 0,
    'longest_only': 'true',
}

#parse_directory()

# NOTE: this rebinds the global `path_files` to just the two sample records.
path_files = [samp_r1, samp_r2]

for idx, path in enumerate(path_files):

    record = load_json_record(path)
    # Positions (within `fragments` below) of fragments that had no text.
    emty_sections = []
    item = dbMain()

    # (text, section) pairs in reading order. 'abstract' is looked up with
    # .get() so a record without that key still gets its body annotated.
    # The former `except KeyError` fallback also re-ran the whole body when a
    # KeyError happened mid-body, which duplicated already stored chunks.
    fragments = [(_chunk['text'], _chunk['section'])
                 for _chunk in record.get('abstract', [])]
    fragments += [(_chunk['text'], _chunk['section'])
                  for _chunk in record['body_text']]

    # The title is stored under its own text and the 'title' section label
    # (previously it was saved with the last body chunk's text and section).
    title = record['metadata']['title']
    if title:
        fragments.append((title, 'title'))

    # A separate index name keeps the paper index `idx` intact for the
    # progress message at the end.
    for chunk_idx, (text_to_annotate, section) in enumerate(fragments):
        print(text_to_annotate)

        # Empty fragments are never sent to the API; only their position
        # is recorded.
        if not text_to_annotate:
            emty_sections.append(chunk_idx)
            continue

        # Build the query; URL is read right after this call, so
        # make_url_query is presumably what refreshes it.
        make_url_query(text_to_annotate)
        print(URL)

        # Call the API; results are read from json_api_results.
        call_endpoint_annotator(URL, PARAMS)

        # Start from a clean annotation schema for this fragment.
        item.restart_annotations()

        # One update per API result: matched text, ontology link, class id.
        for api_result in json_api_results:
            item.update_annotations(
                api_result['annotations'][0]['text'],
                api_result['annotatedClass']['links']['ontology'],
                api_result['annotatedClass']['@id'],
            )

        item.update_chunks(text_to_annotate, section, item.annotations)

    # Only a fully processed paper is stored; a failure above propagates
    # instead of pushing a partial item into the db.
    item.update_item_collection(record['paper_id'], item.chunks)
    print('paper index:', idx, 'processed')
    item.update_db_main()

In [None]:
# make some test her
one_item = dbMain.db[0]
one_item.keys()

In [None]:
# Keys of the first chunk schema stored for that paper.
one_item['chunks_paper'][0].keys()

In [None]:
# Text of the first stored fragment.
one_item['chunks_paper'][0]['texts'][0]

In [None]:
# Section label of the first stored fragment.
one_item['chunks_paper'][0]['sections'][0]

In [None]:
# Annotations stored for the first fragment.
one_item['chunks_paper'][0]['annotations'][0]

In [None]:
# Presumably persists the collected db (store_db_main is defined on dbMain,
# outside this section — confirm where it writes).
item.store_db_main()

In [None]:
# Number of annotated classes recorded for the first fragment of the
# second stored paper.
len(item.db[1]['chunks_paper'][0]['annotations'][0]['annotatedClass'])

In [None]:
# Paper id of the first sample record (this also refreshes the global
# json_record, as load_json_record always does).
load_json_record(samp_r1)['paper_id']

In [None]:
# Show the path of the second sample record.
samp_r2

### FINAL CODE TO ANNOTATE PAPERS

In [None]:
# Number of paper entries accumulated in item.db.
len(item.db)