In [1]:
import pickle
import json
import warnings

import utils

from ontology_processing import OntologyProc
from rdf_processing import RDFProc
from report_processing import ReportProc
from bionlp import BioNLP

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# filter warnings produced by spacy on similarity between empty vectors
warnings.filterwarnings('ignore')

In [3]:
# available models
fasttext = '/home/ims/Desktop/Marchesin/EXAMODE_COLON/embeddings/BioWordVec_PubMed_MIMICIII_d200.bin' 
bert = 'emilyalsentzer/Bio_ClinicalBERT'

In [4]:
## set instances - this part needs to be uploaded as a request server (I guess?)

# ontology processing
exa_proc = OntologyProc(ontology_path='./ontology/examode.owl', hiearchies_path='./hierarchy_relations.txt')
# rdf processing
rdf_proc = RDFProc()
# report processing
report_proc = ReportProc()
# biomedical nlp processing
bio_proc = BioNLP(biospacy="en_core_sci_lg", biofast=fasttext, bert=None, 
                 rules='./rules.txt', dysplasia_mappings='./dysplasia_mappings.txt', cin_mappings='./cin_mappings.txt')




In [11]:
## load translatec datasets 

# load AOEC translated reports
with open('./trans_reports/colon/aoec/aoec_translated_reports_1stDS.pkl', 'rb') as out:
    reports = pickle.load(out)

In [12]:
## user-defined parameters

# define disease use case
use_case = 'colon'

In [13]:
## restrict base on ontology use case

# restrict hand-crafted rules and dysplasia mappings based on use-case
bio_proc.restrict2use_case(use_case=use_case)
# restrict ontology to given use case
exa_use_case = exa_proc.restrict2use_case(use_case=use_case)
# get concept labels for the considered use case
exa_labels = bio_proc.process_ontology_concepts(labels=[label.lower() for label in exa_use_case['label'].tolist()])

In [15]:
## perform linking and merge concepts from 'structured' and 'nlp' parts

# process AOEC reports before linking
reports = report_proc.process_aoec_reports(reports)

# perform linking over AOEC reports
concepts = bio_proc.aoec_entity_linking(reports, exa_proc, exa_labels, use_case, exa_use_case)

100%|██████████| 50/50 [00:14<00:00,  3.55it/s]


In [16]:
## convert report concepts into pre-defined labels used for classification 

# get report labels 
labels = utils.aoec_colon_concepts2labels(concepts)
# convert report labels to binary
binary = utils.aoec_colon_labels2binary(labels)

In [17]:
## store concepts and labels

utils.store_concepts(concepts, './data/concepts/colon/aoec/concepts_1stDS_no_mention_match_rules')
utils.store_labels(labels, './data/labels/colon/aoec/labels_1stDS_no_mention_match_rules')
utils.store_labels(binary, './data/labels/colon/aoec/binary_1stDS_no_mention_match_rules')

True

In [18]:
## convert report concepts into an rdf graph and serialize into n3, trig, and turtle formats

graphs = list()
# convert report concepts into list of (s, p, o) triples
for rid in reports.keys():
    graphs.append(rdf_proc.aoec_create_graph(rid, reports[rid], concepts[rid], exa_proc, use_case))
# serialize graphs into rdf using specified format
rdf_proc.searialize_report_graphs(graphs, output='./data/graphs/cervix/aoec/graph_2ndDS.ttl', rdf_format='turtle')
rdf_proc.searialize_report_graphs(graphs, output='./data/graphs/cervix/aoec/graph_2ndDS.n3', rdf_format='n3')
rdf_proc.searialize_report_graphs(graphs, output='./data/graphs/cervix/aoec/graph_2ndDS.trig', rdf_format='trig')

rdf graph serialized to ./data/graphs/cervix/aoec/graph_2ndDS.ttl with turtle format
rdf graph serialized to ./data/graphs/cervix/aoec/graph_2ndDS.n3 with n3 format
rdf graph serialized to ./data/graphs/cervix/aoec/graph_2ndDS.trig with trig format


True