In [257]:
# %load annotatorcode.py
# bioportal
import os
import json
import requests
import urllib
from urllib import parse, request

import nltk
from nltk.corpus import stopwords

REST_URL = "http://data.bioontology.org"
API_KEY = '3b00793b-f3cc-489c-9d6e-7a888f4b656e'
PATH_TO_PAPERS = 'kaggle/'

def parse_directory():
    '''For parse and join paths in PATH_TO_PAPERS.
       Output:path_files: list with all paths to record 
    '''
    global path_files
    path_files = []

    for root, dirs, files in os.walk(PATH_TO_PAPERS):
        for file in files:
            path_files.append(os.path.join(root, file))

    return path_files

def load_json_record(path_to_record):
    ''' Load entire json record for a paper.
        Input: path to record.
        Output: json_record.
    '''
    global json_record
    with open(path_to_record, 'r') as json_handler:
        json_record = json.load(json_handler)
    return json_record

def make_url_query(text_to_annotate):
    global URL 
    URL = REST_URL + "/annotator/?text=%s"%(parse.quote(text_to_annotate))
    return URL

def get_json(url):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())

def call_endpoint_annotator(URL, PARAMS):
    r = requests.get(url = URL, params = PARAMS)
    global json_api_results
    json_api_results = r.json()
    return json_api_results

def store_annotations(annotations,get_class=True):
    anns = []
    for result in annotations:
        ann = {}
        class_details = result["annotatedClass"]
        if get_class:
            try:
                class_details = get_json(result["annotatedClass"]["links"]["self"])
            except urllib.error.HTTPError:
                print(f"Error retrieving {result['annotatedClass']['@id']}")
                continue
        ann['id'] = class_details["@id"]
        ann['preflabel'] = class_details["prefLabel"]
        ann['ontology'] = class_details["links"]["ontology"]
        ann['annotation_details'] =  []
        for annotation in result['annotations']:
            ann['annotation_details'].append(
                {
                    'from': str(annotation["from"]),
                    'to': str(annotation["to"]),
                    'match_type': annotation["matchType"],
                    'text': annotation['text']
                })
        if result["hierarchy"]:
            ann['hierarchy'] = []
            for annotation in result["hierarchy"]:
                try:
                    class_details = get_json(annotation["annotatedClass"]["links"]["self"])
                except urllib.error.HTTPError:
                    print(f"Error retrieving {annotation['annotatedClass']['@id']}")
                    continue
                ann['hierarchy'].append(
                    {
                     'id':class_details["@id"],
                     'preflabel':class_details["prefLabel"],
                     'ontology':class_details["links"]["ontology"],
                     'distance_from_original_class':str(annotation["distance"])
                    })
        anns.append(ann)
    return anns

def get_annotations(text_to_annotate,ontologies,expand_class_hierarchy='false',class_hierarchy_max_level=0):
    '''get annotations from bioPortal for one fragment of text'''
    PARAMS = {'apikey':API_KEY,'ontologies':ontologies,
              'expand_class_hierarchy':expand_class_hierarchy,
              'class_hierarchy_max_level':0,
              'longest_only':'true'
             }
    make_url_query(text_to_annotate)
    call_endpoint_annotator(URL,PARAMS)
    return store_annotations(json_api_results)

def count_annotations(ann):
    count = {}
    for i in ann:
        onto = i['ontology'].split('/')[-1]
        key = onto + "_Matched_Term"
        if onto not in count.keys():
            count[onto] = 1
            count[key] = [i['preflabel']]
        else:
            count[onto] += 1
            count[key].append(i['preflabel'])
    return count

In [258]:
ontologies = "PREMEDONTO,MESH,COVID19,SNOMEDCT" # ontoportal id ontology.

In [259]:
parse_directory()

['kaggle/document_parses/pdf_json/06ced00a5fc04215949aa72528f2eeaae1d58927.json',
 'kaggle/document_parses/pdf_json/30a4842a2e257f725cc041e32fd682c495111a50.json',
 'kaggle/document_parses/pdf_json/3bb07ea10432f7738413dff9816809cc90f03f99.json',
 'kaggle/document_parses/pdf_json/95aec306660c1b7519e9f019d92cc3a8206481e4.json',
 'kaggle/document_parses/pdf_json/faaf1022ccfe93b032c5608097a53543ba24aedb.json',
 'kaggle/document_parses/pdf_json/a1b6c2b3b808e995697f94d9676d5d5c85180177.json',
 'kaggle/document_parses/pdf_json/0a1533470817bc5ef0d0d0af56386a96b505dc0d.json',
 'kaggle/document_parses/pdf_json/40e41f0c52b39669dee24e37875d7a9fabc38636.json',
 'kaggle/document_parses/pdf_json/184aded923f0ac3cbdbcf74d2a5b42cda0f414c2.json',
 'kaggle/document_parses/pdf_json/d706e82e2ebc8571e70654d1acf0ffd3969ab2d3.json',
 'kaggle/document_parses/pdf_json/5806726a24dc91de3954001effbdffd7a82d54e2.json',
 'kaggle/document_parses/pdf_json/5b44feca5d6ffaaeb66501fa84cc6dd44d06660a.json',
 'kaggle/documen

In [260]:
pmc_art = load_json_record(path_files[1])

In [261]:
pmc_art['body_text']

[{'text': 'There are hundreds of viruses that infect different human organs and cause diseases. Some fatal emerging viral infections have become serious public health issues worldwide. Early diagnosis and subsequent treatment are therefore essential for fighting viral infections. Current diagnostic techniques frequently employ polymerase chain reaction (PCR)-based methods to quickly detect the pathogenic viruses and establish the etiology of the disease or illness. However, the fast PCR method suffers from many drawbacks such as a high false-positive rate and the ability to detect only one or a few gene targets at a time. Microarray technology solves the problems of the PCR limitations and can be effectively applied to all fields of molecular medicine. Recently, a report in Retrovirology described a multi-virus DNA array that contains more than 250 open reading frames from eight human viruses including human immunodeficiency virus type 1. This array can be used to detect multiple viral

In [None]:
# falto nombre clase.

In [None]:
# make class for handle general db
# METHODS
# create db
# store things in db
# load db in memory
# write db in disk

In [None]:
# make class for handle bioportal api service
# METHODS
# handle web servise
# handle virtual machine service
# call annotator

In [None]:
# proccess json
# read file
# map keys
# iterate over keys
# call api with section content
# store annotations for section
# count annotations for each annotated section
# store count
# store annotations and count annotation for file in db

In [None]:
# json article annotations

{
    'id':str(id_db),
    'annotations':{
        'section_x':[],
        'section_y':[],
        ...
        'section_z':[]
    }
    count_annotations:[
        
    ]
}

In [None]:
# data management plan
# -> intelectual property right management
#    -> identification of data
#    -> funder requeriments
#    -> authority
#    -> composite
#    -> sharing requeriments
#        ->permisions
#        conditions
#        restrictions
#        select and grouping
#    -> identifiying license more suitable
#    -> depositing data in data verse
#    -> setting up permissions in dataverse
#       -> diligence metadata fields related to license
#       ->

In [336]:
class articleProcess:
    def __init__(self,record):
        
        self.record = record
        
        self.pmc_art_ann = {
                            'id':record['paper_id'],
                            'annotations':{},
                            'annotations_count':[],
                            'annotations_count_resume':{}, # total
                            'token_resumen': {}
                            }
        
    def AnnotateRecord(self):
        
        for chunk in self.record['body_text']:
            # get annotation for chunk
            ann = get_annotations(chunk['text'],ontologies)
            print(chunk['text']+'\n')
            # store annotation
            section = chunk['section'].split(':::')[-1]
            if section not in self.pmc_art_ann['annotations'].keys():
                self.pmc_art_ann['annotations'][section] = ann
            else:
                self.pmc_art_ann['annotations'][section].append(ann)
            # count annotations
            ann_count=count_annotations(ann)
            #store annotation count
            self.pmc_art_ann['annotations_count'].append(ann_count)
            
    def Resumen(self):
        word_resume = []
        for onto in ontologies.split(','):
            self.pmc_art_ann['annotations_count_resume'][onto]=0
            self.pmc_art_ann['annotations_count_resume'][onto+'_Matched_Term'] = []

        for onto in ontologies.split(','): 
            for i in self.pmc_art_ann['annotations_count']:
                if onto in i.keys():
                    self.pmc_art_ann['annotations_count_resume'][onto] += i[onto]
                    self.pmc_art_ann['annotations_count_resume'][onto + '_Matched_Term'].extend(i[onto+'_Matched_Term'])
                    word_resume.extend(i[onto+'_Matched_Term'])
        
        word_count = {}
        for i in word_resume:
            if i not in word_count.keys():
                word_count[i] = 1
            else:
                word_count[i]+=1

        stops = stopwords.words('english')
        tokens = ''

        for i in ins.record['body_text']:
            tokens=tokens+''+i['text']
            token = [t for t in tokens.split(' ') if t not in stops]

        self.pmc_art_ann['token_resumen']['article_tokens'] = list(set(token))
        self.pmc_art_ann['token_resumen']['count_matches'] = word_count
        self.pmc_art_ann['token_resumen']['all_matches'] = word_resume
        self.pmc_art_ann['token_resumen']['unique_matches'] =  list(set(word_resume))
        self.pmc_art_ann['token_resumen']['porcentage_matched'] = len(self.pmc_art_ann['token_resumen']['unique_matches'])/len(self.pmc_art_ann['token_resumen']['article_tokens'])
        

In [337]:
ins = articleProcess(pmc_art)

In [338]:
ins.pmc_art_ann.keys()

dict_keys(['id', 'annotations', 'annotations_count', 'annotations_count_resume', 'token_resumen'])

In [339]:
ins.AnnotateRecord()

There are hundreds of viruses that infect different human organs and cause diseases. Some fatal emerging viral infections have become serious public health issues worldwide. Early diagnosis and subsequent treatment are therefore essential for fighting viral infections. Current diagnostic techniques frequently employ polymerase chain reaction (PCR)-based methods to quickly detect the pathogenic viruses and establish the etiology of the disease or illness. However, the fast PCR method suffers from many drawbacks such as a high false-positive rate and the ability to detect only one or a few gene targets at a time. Microarray technology solves the problems of the PCR limitations and can be effectively applied to all fields of molecular medicine. Recently, a report in Retrovirology described a multi-virus DNA array that contains more than 250 open reading frames from eight human viruses including human immunodeficiency virus type 1. This array can be used to detect multiple viral co-infecti

In [340]:
ins.Resumen()

In [343]:
ins.pmc_art_ann.keys()

dict_keys(['id', 'annotations', 'annotations_count', 'annotations_count_resume', 'token_resumen'])

In [352]:
ins.pmc_art_ann['token_resumen']['porcentage_matched']

0.4411764705882353

############################ 

In [236]:
text = 'There are hundreds of viruses that infect different human organs and cause diseases. Some fatal emerging viral infections have become serious public health issues worldwide. Early diagnosis and subsequent treatment are therefore essential for fighting viral infections. Current diagnostic techniques frequently employ polymerase chain reaction (PCR)-based methods to quickly detect the pathogenic viruses and establish the etiology of the disease or illness. However, the fast PCR method suffers from many drawbacks such as a high false-positive rate and the ability to detect only one or a few gene targets at a time. Microarray technology solves the problems of the PCR limitations and can be effectively applied to all fields of molecular medicine. Recently, a report in Retrovirology described a multi-virus DNA array that contains more than 250 open reading frames from eight human viruses including human immunodeficiency virus type 1. This array can be used to detect multiple viral co-infections in cells and in vivo. Another benefit of this kind of multi-virus array is in studying promoter activity and viral gene expression and correlating such readouts with the progression of disease and reactivation of latent infections. Thus, the virus DNA-chip development reported in Retrovirology is an important advance in diagnostic application which could be a potent clinical tool for characterizing viral co-infections in AIDS as well as other patients.'

In [321]:
con = ins.pmc_art_ann['annotations_count']

In [322]:
count_resume = {}
word_resume = []
for onto in ontologies.split(','):
    count_resume[onto]=0
    count_resume[onto+'_Matched_Term'] = []


for i in con:
    for onto in ontologies.split(','):
        if onto in i.keys():
            count_resume[onto] += i[onto]
            count_resume[onto + '_Matched_Term'].extend(i[onto+'_Matched_Term'])
            word_resume.extend(i[onto+'_Matched_Term'])

In [323]:
count_resume.keys()

dict_keys(['PREMEDONTO', 'PREMEDONTO_Matched_Term', 'MESH', 'MESH_Matched_Term', 'COVID19', 'COVID19_Matched_Term', 'SNOMEDCT', 'SNOMEDCT_Matched_Term'])

In [324]:
len(word_resume)

445

In [325]:
len(count_resume['MESH_Matched_Term'])

103

In [326]:
stops = stopwords.words('english')

tokens = ''

for i in ins.record['body_text']:
    tokens=tokens+''+i['text']

token = [t for t in tokens.split(' ') if t not in stops]

In [327]:
word_count = {}
for i in word_resume:
    if i not in word_count.keys():
        word_count[i] = 1
    else:
        word_count[i]+=1

In [328]:
word_count

{'Difference': 4,
 'Human': 6,
 'Have': 3,
 'Treat': 1,
 'Therapeutic Procedure': 1,
 'Essential': 2,
 'Current': 4,
 'Diagnosis': 4,
 'Diagnostic': 2,
 'Technique': 11,
 'Frequently': 3,
 'Use': 5,
 'Basis': 2,
 'Rapidly': 3,
 'Method': 2,
 'High': 8,
 'Positive': 2,
 'Rate': 1,
 'Ability': 1,
 'Gene': 7,
 'Time': 5,
 'Technology': 6,
 'Report': 4,
 'Multiple': 4,
 'DNA': 10,
 'Cell': 9,
 'Benefit': 1,
 'Type': 1,
 'Activity': 9,
 'Active': 4,
 'Gene Expression': 11,
 'Development': 4,
 'Important': 1,
 'Application': 6,
 'Clinical': 8,
 'Patient': 2,
 'virology': 2,
 'Viruses': 2,
 'Disease': 4,
 'Infections': 4,
 'Public Health': 1,
 'Early Diagnosis': 1,
 'therapy': 1,
 'Polymerase Chain Reaction': 1,
 'Methods': 1,
 'methods': 1,
 'etiology': 1,
 'Molecular Medicine': 1,
 'Virus': 6,
 'Open Reading Frames': 1,
 'Cells': 5,
 'Patients': 2,
 'Fatal': 1,
 'Fighting': 1,
 'Polymerase chain reaction': 1,
 'Etiology': 1,
 'Illness': 1,
 'Fast': 1,
 'False': 1,
 'Recent': 1,
 '250': 2,
 