In [1]:
# Import libraries
import os
import re
import random
import pickle
import subprocess
import numpy as np
import pandas as pd
import datetime as dt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

# 1. Setup concept extractors

Some options were [MetaMap](https://metamap.nlm.nih.gov/) and [spaCy](https://spacy.io/). 

[MetaMap](https://metamap.nlm.nih.gov/) is specific to recognizing UMLS concepts. There is a [Python wrapper](https://github.com/AnthonyMRios/pymetamap), but it is known to be slow and to work poorly.

[spaCy](https://spacy.io/) is a popular NLP Python package with an extensive library for named entity recognition. It has a wide variety of [extensions](https://spacy.io/universe) and models to choose from. We're going with the following.

* [scispaCy](https://spacy.io/universe/project/scispacy) contains spaCy models for processing biomedical, scientific or clinical text. It seems easy to use and has a wide variety of concepts it can recognize, including UMLS, RxNorm, etc.

* [negspaCy](https://spacy.io/universe/project/negspacy) identifies negations using some extension of regEx. Probably useful for things like, "this pt is diabetic" v. "this pt is not diabetic." [todo: negation identification of medspacy might be better, https://github.com/medspacy/medspacy]

* [Med7](https://github.com/kormilitzin/med7) is a model trained for recognizing entities in prescription text, e.g. identifies drug name, dosage, duration, etc., which could be useful stuff to check for conflicts. 

We're going with spaCy for this.. and coming up with a coherent way to integrate entities picked up by these three extensions/models.

## i) Installations

In [2]:
import sys; sys.executable

'/opt/conda/bin/python'

In [3]:
import spacy
import scispacy

from pprint import pprint
from collections import OrderedDict

from spacy import displacy
# from scispacy.abbreviation import AbbreviationDetector # UMLS already contains abbrev. detect
from scispacy.umls_linking import UmlsEntityLinker

# should be 2.3.5 and >=0.3.0
spacy.__version__, scispacy.__version__

('2.3.5', '0.3.0')

## ii) Setting up the model

The model is used to form word/sentence embeddings for the NER task. Thus, it's important to choose a model that has been tuned for our specific use case (e.g. clinical text, prescription information) so the embeddings are useful for naming the entity.

[Note to self:] one potential idea to look into if we have time remaining, something about using custom model for spacy pipeline (could we do smth with the romanov models since they've been trained specifically for conflict detection?) -- https://spacy.io/usage/v3

### a) scispaCy

For scispaCy, we set up one of their models that has been trained on biomedical data. Other models can be found [here](https://allenai.github.io/scispacy/). 

We load two copies of the model since we will attach a different entity linker (a knowledge base that links text to named entities) to each one later.

In [97]:
## uncomment to install model if not already installed
# !/opt/conda/envs/opennotes/bin/python -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
!/opt/conda/envs/opennotes/bin/python -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz (502.0 MB)
[K     |████████████████████████████████| 502.0 MB 20 kB/s s eta 0:00:01     |██████████████████████████████▌ | 478.1 MB 5.4 MB/s eta 0:00:05
Building wheels for collected packages: en-core-sci-lg
  Building wheel for en-core-sci-lg (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-sci-lg: filename=en_core_sci_lg-0.2.5-py3-none-any.whl size=502195831 sha256=8c5b0e3d7804073d2c54a765fed3e4a99011985e7652ca45a76ce60862019c76
  Stored in directory: /home/jiangsharon9/.cache/pip/wheels/a5/83/74/457fe79f6f41213efa07e8b02b70387046dab5119af3e5195b
Successfully built en-core-sci-lg
Installing collected packages: en-core-sci-lg
Successfully installed en-core-sci-lg-0.2.5
You should consider upgrading via the '/opt/conda/envs/opennotes/bin/python -m pip install --u

In [102]:
# for umls (general biomedical concepts)
umls_nlp   = spacy.load("en_core_sci_sm")
#umls_nlp   = spacy.load("en_core_sci_lg")


# for rxnorm (prescriptions)
rxnorm_nlp = spacy.load("en_core_sci_sm")

### b) Med7

For Med7, we set up their model that has been trained specifically for NER of medication-related concepts: dosage, drug names, duration, form, frequency, route of administration, and strength. The model is trained on MIMIC-III, so it should work well for us.

In [6]:
# # installs Med7 model
!pip install https://www.dropbox.com/s/xbgsy6tyctvrqz3/en_core_med7_lg.tar.gz?dl=1

Collecting https://www.dropbox.com/s/xbgsy6tyctvrqz3/en_core_med7_lg.tar.gz?dl=1
  Downloading https://www.dropbox.com/s/xbgsy6tyctvrqz3/en_core_med7_lg.tar.gz?dl=1 (892.8 MB)
[K     |█████████████████               | 474.9 MB 99.3 MB/s eta 0:00:056

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |███████████████████████████▊    | 772.1 MB 85.9 MB/s eta 0:00:022

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 892.8 MB 8.0 kB/s  eta 0:00:01     |███████████████████████████████▋| 881.6 MB 86.9 MB/s eta 0:00:01


In [8]:
med7_nlp = spacy.load("en_core_med7_lg")

## iii) Adding an entity linker

The EntityLinker is a spaCy component that links to a knowledge base. The linker compares words with the concepts in the specified knowledge base (e.g. scispaCy's UMLS does some form of character overlap-based nearest neighbor search, has option to resolve abbreviations first).

[Note: Entities generally get resolved to a list of different entities. This [blog post](http://sujitpal.blogspot.com/2020/08/disambiguating-scispacy-umls-entities.html) describes one potential way to disambiguate this by figuring out "most likely" set of entities. Gonna start off with just resolving to the 1st entity tho... hopefully that's sufficient.]

### a) scispaCy

#### UMLS Linker

UMLS linker maps entities to the UMLS concept. Main parts we'll be interested in are: semantic type and concept (mainly the common name, maybe the CUI might become important later).

* _Semantic type_ is the broader category that the entity falls under, e.g. disease, pharmacologic substance, etc. See [this](https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt) for a full list.

* _Concepts_ refer to the more fundamental entity itself, e.g. pneumothorax, ventilator, etc. Many concepts can fall under a semantic type.

More info on `UmlsEntityLinker` ([source code](https://github.com/allenai/scispacy/blob/4ade4ec897fa48c2ecf3187caa08a949920d126d/scispacy/linking.py#L9))

See source code for `.jsonl` file with the knowledge base.

In [9]:
from scispacy.umls_linking import UmlsEntityLinker

# abbreviation_pipe = AbbreviationDetector(nlp) # automatically included with UMLS linker
# nlp.add_pipe(abbreviation_pipe)
umls_linker = UmlsEntityLinker(k=10,                          # number of nearest neighbors to look up from
                               threshold=0.7,                 # confidence threshold to be added as candidate
                               max_entities_per_mention=1,    # number of entities returned per concept (todo: tune)
                               filter_for_definitions=False,  # no definition is OK
                               resolve_abbreviations=True)    # resolve abbreviations before linking
umls_nlp.add_pipe(umls_linker)

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpjdeem7d2
Finished download, copying /tmp/tmpjdeem7d2 to cache at /home/jiangsharon9/.scispacy/datasets/e9f7327283e43f0482f7c0c71b71dec278a58ccb3ffdd03c2c2350159e7ef146.f2a350ad19015b2591545f7feeed6a6d6d2fffcd635d868a5d7fc0dfc3cadfd8.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmpfpuwfipd
Finished download, copying /tmp/tmpfpuwfipd to cache at /home/jiangsharon9/.scispacy/datasets/f48455d6c79262057cce66b4619123c2b558b21092d42fac97f47bb99a5b8f9f.dd70d3dffe7d90d7ac8914460e16a48375dab32485fb6313a34e6fbcaf53218b.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmpnyaf_1vw
Finished download, copying /tmp/tmpnyaf_1vw to c



https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/concept_aliases.json not found in cache, downloading to /tmp/tmpmsfa3bxz
Finished download, copying /tmp/tmpmsfa3bxz to cache at /home/jiangsharon9/.scispacy/datasets/1428ec15d3b1061731ea273c03699130b3d6b90948993e74bda66af605ff8e2a.aeb7a686c654df6bccb6c2c23d3eda3eb381daaefda4592b58158d0bee53b352.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2020-10-09/umls_2020_aa_cat0129.jsonl not found in cache, downloading to /tmp/tmp7jj57sql
Finished download, copying /tmp/tmp7jj57sql to cache at /home/jiangsharon9/.scispacy/datasets/4d7fb8fcae1035d1e0a47d9072b43d5a628057d35497fbfb2499b4b7b2dd4dd7.05ec7eef12f336d4666da85b7fa69b9401883a7dd4244473f7b88b413ccbba03.umls_2020_aa_cat0129.jsonl


#### RxNorm Linker

RxNorm linker maps entities to RxNorm, an ontology for clinical drug names. It contains about 100k concepts for normalized names for clinical drugs. It is comprised of several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database.

More info on `RxNorm` ([NIH page](https://www.nlm.nih.gov/research/umls/rxnorm/index.html), [source code](https://github.com/allenai/scispacy/blob/2290a80cfe0948e48d8ecfbd60064019d57a6874/scispacy/linking_utils.py#L120))

See source code for `.jsonl` file with the knowledge base.

In [10]:
from scispacy.linking import EntityLinker

# rxnorm_linker = EntityLinker(resolve_abbreviations=True, name="rxnorm")
rxnorm_linker = EntityLinker(k=10,                          # number of nearest neighbors to look up from
                             threshold=0.7,                 # confidence threshold to be added as candidate
                             max_entities_per_mention=1,    # number of entities returned per concept (todo: tune)
                             filter_for_definitions=False,  # no definition is OK
                             resolve_abbreviations=True,    # resolve abbreviations before linking
                             name="rxnorm")                 # RxNorm ontology

rxnorm_nlp.add_pipe(rxnorm_linker)

https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpe3vr75v3
Finished download, copying /tmp/tmpe3vr75v3 to cache at /home/jiangsharon9/.scispacy/datasets/bda8d228cdcd3b9014f0be51cd154f7aab200b49c9edb93ffc9f3d7d6f8f7287.6a172afbb5b503f0847eded07665c53e18b0e59a4037239fd82f8d3833fc3cd5.tfidf_vectors_sparse.npz
https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/nmslib_index.bin not found in cache, downloading to /tmp/tmp9xqtaio4
Finished download, copying /tmp/tmp9xqtaio4 to cache at /home/jiangsharon9/.scispacy/datasets/aa5b2402cea8e729db0d13f67180731987f414e88662254657c6e7d1047a68c1.35fcd4b3ca0df29ece260a995bb2dcffc4580d10e8eb2f6a7d80b67c0d8bed99.nmslib_index.bin
https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmp9eazqqzk
Finished download, copying /tmp/tmp9eazqqz



https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/concept_aliases.json not found in cache, downloading to /tmp/tmp0z0of3js
Finished download, copying /tmp/tmp0z0of3js to cache at /home/jiangsharon9/.scispacy/datasets/a65018bff2c6c9ef7e02f3658b2b5253fc4d52c823d985d58fcc2614ae9c5bf5.a74273b8c58718a2cd4635a4b3db50dfd129410fbbfd23fcc97c3f39314e5753.concept_aliases.json
https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_rxnorm.jsonl not found in cache, downloading to /tmp/tmp1phr45rn
Finished download, copying /tmp/tmp1phr45rn to cache at /home/jiangsharon9/.scispacy/datasets/b82f1e42068c00f53c786f44bfc56353d65f7e9aec08b6b46d9c6d2c36538a76.ea8986981f7bafd0fcc8b5dc575df9adfb54145107af0e88c2ef5472b578f2b6.umls_2020_rxnorm.jsonl


### b) Med7 

No need for entity linker

# 2. Setup data structures

## Categorizing type of conflict

The first larger task is to categorize by the type of conflict to check for since our method will likely be different (at least for the rule based). We wrote up a short list [here](https://docs.google.com/document/d/1fEBk0JHeyQWshYWW5w_VTkaYyRfm9MBxJ9DAGoVa8Yw/edit?usp=sharing). 

To do this, we're using the semantic type that is identified by the UMLS linker. Here's a table of the semantic types we're filtering for, and which conflict they'll be used for.

Here's a [full list](https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt) of semantic types. You can look up definitions of semantic types [here](http://linkedlifedata.com/resource/umls-semnetwork/T033).

| Conflict | Semantic Type |
| --- | ----------- |
| Diagnoses-related errors | Disease or Syndrome (T047), Diagnostic Procedure (T060) |
| Inaccurate description of medical history (symptoms) | Sign or Symptom (T184) |
| Inaccurate description of medical history (operations) | Therapeutic or Preventive Procedure (T061) |
| Inaccurate description of medical history (other) | [all of the above and below] |
| Medication or allergies | Clinical Drug (T200), Pharmacologic Substance (T121) |
| Test procedures or results | Laboratory Procedure (T059), Laboratory or Test Result (T034) | 


For clarity, the concepts we'll keep from the UMLS linker are anything falling into these semantic types (which we will then categorize by type of conflict using the table above):

* T047 - Disease or Syndrome
* T121 - Pharmacologic Substance
* T023 - Body Part, Organ, or Organ Component
* T061 - Therapeutic or Preventive Procedure 
* T060 - Diagnostic Procedure
* T059 - Laboratory Procedure
* T034 - Laboratory or Test Result 
* T184 - Sign or Symptom 
* T200 - Clinical Drug

We'll store this info into a dictionary now.

<!-- Some useful def's 
Finding - 
That which is discovered by direct observation or measurement of an organism attribute or condition, including the clinical history of the patient. The history of the presence of a disease is a 'Finding' and is distinguished from the disease itself.  -->

In [11]:
# UMLS semantic type identifier -> human-readable name, for every semantic
# type we keep from the UMLS linker. Insertion order is the canonical order
# of the two derived lists below.
SEMANTIC_TYPE_TO_NAME = {
    'T047': 'Disease or Syndrome',
    'T121': 'Pharmacologic Substance',
    'T023': 'Body Part, Organ, or Organ Component',
    'T061': 'Therapeutic or Preventive Procedure',
    'T060': 'Diagnostic Procedure',
    'T059': 'Laboratory Procedure',
    'T034': 'Laboratory or Test Result',
    'T184': 'Sign or Symptom',
    'T200': 'Clinical Drug',
}

# Parallel lists derived from the mapping (same order as the dict).
SEMANTIC_TYPES = list(SEMANTIC_TYPE_TO_NAME.keys())
SEMANTIC_NAMES = list(SEMANTIC_TYPE_TO_NAME.values())

SEMANTIC_TYPE_TO_NAME

{'T047': 'Disease or Syndrome',
 'T121': 'Pharmacologic Substance',
 'T023': 'Body Part, Organ, or Organ Component',
 'T061': 'Therapeutic or Preventive Procedure',
 'T060': 'Diagnostic Procedure',
 'T059': 'Laboratory Procedure',
 'T034': 'Laboratory or Test Result',
 'T184': 'Sign or Symptom',
 'T200': 'Clinical Drug'}

In [12]:
# Conflict category -> set of UMLS semantic type identifiers used to detect
# that category. "med_history_other" accepts every semantic type we keep.
CONFLICT_TO_SEMANTIC_TYPE = dict(
    diagnosis={'T047', 'T060'},
    med_history_symptom={'T184'},
    med_history_operation={'T061'},
    med_history_other=set(SEMANTIC_TYPES),
    med_allergy={'T200', 'T121'},
    test_results={'T059', 'T034'},
)

CONFLICT_TO_SEMANTIC_TYPE

{'diagnosis': {'T047', 'T060'},
 'med_history_symptom': {'T184'},
 'med_history_operation': {'T061'},
 'med_history_other': {'T023',
  'T034',
  'T047',
  'T059',
  'T060',
  'T061',
  'T121',
  'T184',
  'T200'},
 'med_allergy': {'T121', 'T200'},
 'test_results': {'T034', 'T059'}}

# 3. Load and process MedNLI data

In [13]:
from torch.utils.data import Dataset
from data_structures import Sentence

In [14]:
# Set path to csv's
train_file = "mednli_labeled/train.csv"
test_file  = "mednli_labeled/test.csv"
dev_file   = "mednli_labeled/dev.csv"

In [15]:
class DailyDataNull:
    """Placeholder for DailyData (e.g. Note, PrescriptionOrder, LabResults).

    Simply holds the three NLP pipelines and the two entity linkers handed
    to it, plus a ``time`` attribute that is always ``None``.

    Args:
        umls: pipeline used for UMLS concepts.
        rxnorm: pipeline used for RxNorm concepts.
        med7: pipeline used for Med7 entities.
        umls_linker: entity linker paired with ``umls``.
        rxnorm_linker: entity linker paired with ``rxnorm``.
    """

    def __init__(self, umls, rxnorm, med7, umls_linker, rxnorm_linker):
        # Pipelines.
        self.umls, self.rxnorm, self.med7 = umls, rxnorm, med7
        # Linkers that go with the first two pipelines.
        self.umls_linker, self.rxnorm_linker = umls_linker, rxnorm_linker
        # A placeholder carries no timestamp.
        self.time = None
        
class MedNLI(Dataset):
    """MedNLI dataset of labeled sentence pairs read from a csv file.

    Indexing returns ``((sentence1, sentence2), label)`` where both sentences
    are freshly built ``Sentence`` objects, so the NLP work happens on every
    access rather than once up front.
    """

    def __init__(self, data_filepath):
        """
        Args:
            data_filepath (string): Path to the csv file with labeled data.
        """
        self.df = pd.read_csv(data_filepath)
        # One shared placeholder "note" carrying the pipelines and linkers
        # defined earlier in the notebook; every Sentence is attached to it.
        self.nullnote = DailyDataNull(
            umls_nlp, rxnorm_nlp, med7_nlp, umls_linker, rxnorm_linker
        )

    def __len__(self):
        """Number of sentence pairs (rows of the csv)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the pair of Sentence objects for row ``idx``.

        Returns:
            tuple: ``((sentence1, sentence2), label)`` where ``label`` is the
            row's ``'label'`` value.
        """
        row = self.df.iloc[idx]
        label = row['label']

        # Both sentences are constructed with identical settings; only the
        # source column differs. The generator is consumed in order, so
        # 'sentence 1' is processed before 'sentence 2'.
        # NOTE(review): the meaning of the second positional argument (None)
        # is defined by data_structures.Sentence - not visible here.
        sentence1, sentence2 = (
            Sentence(self.nullnote, None,
                     filter_map=SEMANTIC_TYPE_TO_NAME,
                     conflict_map=CONFLICT_TO_SEMANTIC_TYPE,
                     sentence=row[column])
            for column in ('sentence 1', 'sentence 2')
        )

        return (sentence1, sentence2), label

## Example: loading the training dataset

In [17]:
train_dataset = MedNLI(train_file)

In [25]:
# Get 1st pair
(s1, s2), label = train_dataset[7484]

print(f"Sentence 1: {s1.txt}\nSentence 2: {s2.txt}\nContradiction (0=no, 1=yes)? {label}")

Sentence 1: He received 2U PRBC, underwent tagged RBC scan, transferred to [**Hospital1 22**].
Sentence 2:  The patient is anemic. 
Contradiction (0=no, 1=yes)? 0


You can also process all the sentences upfront and save.

In [22]:
from tqdm import tqdm


In [30]:
all_sentence1 = []
all_sentence2 = []
all_labels    = []
for i in tqdm(range(len(train_dataset))):
    (s1, s2), label = train_dataset[i]
    
    all_sentence1.append(s1)
    all_sentence2.append(s2)
    all_labels.append(label)

100%|██████████| 7488/7488 [36:31<00:00,  3.42it/s]


In [33]:
all_sentence1[0].txt

'Labs were notable for Cr 1.7 (baseline 0.5 per old records) and lactate 2.4.'

In [37]:
# save all_sentence_1, all_sentence2, all_labels

import pickle

def load_data():
    """Load the cached sentence lists from ``sentences.pickle``.

    Returns:
        tuple: ``(all_sentence1, all_sentence2, all_labels)`` as stored by
        ``save_data``. If the file is missing, unreadable, truncated or not a
        valid pickle, three empty lists are returned instead.
    """
    # SECURITY: pickle.load can execute arbitrary code - only use this on a
    # cache file that this notebook wrote itself.
    try:
        with open("sentences.pickle", "rb") as f:
            all_sentence1, all_sentence2, all_labels = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError):
        # The original fallback unpacked two lists into three names, which
        # raised ValueError; it also used a bare ``except`` that would hide
        # KeyboardInterrupt. Return one empty list per expected value.
        return [], [], []
    return all_sentence1, all_sentence2, all_labels

def save_data(all_sentence1, all_sentence2, all_labels):
    """Pickle the three lists, as one tuple, into ``sentences.pickle``.

    The file is written in the current working directory and overwritten if
    it already exists.
    """
    payload = (all_sentence1, all_sentence2, all_labels)
    with open("sentences.pickle", "wb") as handle:
        pickle.dump(payload, handle)
   
        
save_data(all_sentence1, all_sentence2, all_labels)

TypeError: can't pickle nmslib.dist.FloatIndex objects

In [None]:
load_data()

In [38]:
import pickle

# write a file
# Context managers close the handle even when a dump/load raises part-way
# through; the plain open()/close() pairs left the file open on failure.
with open("example.dat", "wb") as f:
    pickle.dump(all_sentence1, f)
    pickle.dump(all_sentence2, f)
    pickle.dump(all_labels, f)

# Read the three objects back in the same order they were written.
with open("example.dat", "rb") as f:
    all_sentence1 = pickle.load(f)
    all_sentence2 = pickle.load(f)
    all_labels = pickle.load(f)

TypeError: can't pickle nmslib.dist.FloatIndex objects

In [41]:
!pip install dill

Collecting dill
  Downloading dill-0.3.3-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 4.2 MB/s eta 0:00:011
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.3


In [42]:
import dill
x = all_sentence1
d = {'x':x}
# we check for unpicklable items in d (i.e. the iterator x)
dill.detect.baditems(d)
#[<listiterator object at 0x10b0e48d0>]
# note that nothing inside of the iterator is unpicklable!
dill.detect.baditems(x)

KeyboardInterrupt: 

What can we access from sentence?

In [26]:
# UMLS and RxNorm concepts 
s1.features

{'Scanning'}

In [27]:
# Med7 (prescription) entities
s1.med7_entities

[('2U', 'DOSAGE'), ('PRBC', 'DRUG')]

In [45]:
# "Doc" outputs from spacy are also saved, which can be useful (I think you have some exploration on this already)
s1.umls_doc    # "Doc" output for UMLS 
s1.rxnorm_doc  # "Doc" output for RxNorm 
s1.med7_doc    # "Doc" output for Med7

#displacy.render(s1.umls_doc)
#displacy.render(s1.rxnorm_doc)
#print(s1.rxnorm_doc)
#print(s1.med7_doc)
#displacy.render(s1.med7_doc)

Pregnancy was complicated by spotting at 18 weeks and 26 weeks.


In [29]:
# How can we tell which word the concept came from?
# This is a slightly modified version of what we do in 
# Data.get_umls_info() and Data.get_rxnorm_info(). 
# Check these functions in data_structures.py 
# for more info on how to access other info.

sentence_entities = s1.umls_doc.ents
umls_cui_map = umls_linker.umls.cui_to_entity # maps CUI to UMLS knowledge base
for ent in sentence_entities: # extract info (umls) for each entity
    try:
        cui, _ = ent._.umls_ents[0] # assuming `max_entites_per_mention=1` for now
    except IndexError:
        continue
    cui_info = umls_cui_map[cui]
    
    print(f"Original word: {ent}\nCUI: {cui}\nUMLS Info: {cui_info}")
    print("*********************")

Original word: RBC
CUI: C0014792
UMLS Info: CUI: C0014792, Name: Erythrocytes
Definition: Red blood cells. Mature erythrocytes are non-nucleated, biconcave disks containing HEMOGLOBIN whose function is to transport OXYGEN.
TUI(s): T025
Aliases (abbreviated, total: 48): 
	 Blood Cells, Red, Red Blood Cells, Blood erythrocytic cell, Blood Erythrocyte, Red Cell, Erythrocytic Cells, rbcs, Marrow erythrocyte, red blood cell, Blood red cell
*********************
Original word: scan
CUI: C0441633
UMLS Info: CUI: C0441633, Name: Scanning
Definition: A picture of structures inside the body. Scans often used in diagnosing, staging, and monitoring disease include liver scans, bone scans, and computed tomography (CT) or computerized axial tomography (CAT) scans and magnetic resonance imaging (MRI) scans. In liver scanning and bone scanning, radioactive substances that are injected into the bloodstream collect in these organs. A scanner that detects the radiation is used to create pictures. In CT s

## Feature generation

In [46]:
# if number of neg tokens is equal, return 0; otherwise, return 1
def check_if_number_neg_equal_umls(s1, s2):
    """Flag sentence pairs whose negation counts differ.

    Counts, in each sentence's ``umls_doc``, the tokens whose dependency
    label ``dep_`` is ``'neg'``.

    Returns:
        int: 0 when both sentences contain the same number of negation
        tokens, 1 otherwise.
    """
    neg_counts = [
        sum(1 for tok in sentence.umls_doc if tok.dep_ == 'neg')
        for sentence in (s1, s2)
    ]
    return int(neg_counts[0] != neg_counts[1])

'\ndef check_if_number_neg_equal_rxnorm(s1, s2):\n    sent_doc_1_rxnorm = s1.rxnorm_doc    # "Doc" output for rxnorm \n    sent_doc_2_rxnorm = s2.rxnorm_doc    # "Doc" output for rxnorm\n\n    negation_tokens_1 = [tok for tok in sent_doc_1_rxnorm if tok.dep_ == \'neg\']\n    negation_tokens_2 = [tok for tok in sent_doc_2_rxnorm if tok.dep_ == \'neg\']\n\n    if len(negation_tokens_1) != len(negation_tokens_2):\n        return 1\n    else:\n        return 0\n    \n    \ndef check_if_number_neg_equal_med7(s1, s2):\n    sent_doc_1_umls = s1.med7_doc    # "Doc" output for Med7\n    sent_doc_2_umls = s2.med7_doc    # "Doc" output for Med7\n\n    negation_tokens_1 = [tok for tok in sent_doc_1_umls if tok.dep_ == \'neg\']\n    negation_tokens_2 = [tok for tok in sent_doc_2_umls if tok.dep_ == \'neg\']\n\n    if len(negation_tokens_1) != len(negation_tokens_2):\n        return 1\n    else:\n        return 0\n'

In [47]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# could do word vector for spacy? https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/ 
def create_sentence_encoding(sentence_1, sentence_2):
    """Bag-of-words encoding of a sentence pair.

    A ``CountVectorizer`` (unigrams, English stop words removed) is fitted on
    just these two sentences, so the vocabulary - and therefore the columns -
    is specific to the pair.

    Args:
        sentence_1 (str): text of the first sentence.
        sentence_2 (str): text of the second sentence.

    Returns:
        pd.DataFrame: two rows (sentence 1, then sentence 2), one column per
        vocabulary term, holding term counts.

    NOTE(review): CountVectorizer is expected to raise ValueError when both
    sentences consist only of stop words (empty vocabulary) - confirm whether
    callers need to guard against that.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english')  # to use bigrams, ngram_range=(2,2)

    # transform
    counts = vectorizer.fit_transform([sentence_1, sentence_2])

    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); prefer the new name and fall back to the old
    # one so the notebook runs on either.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # create dataframe
    return pd.DataFrame(counts.toarray(), columns=feature_names)


In [89]:
"""
Find dependent children using UMLS ontology
"""
def create_dep_encoding(s1, s2):
    """Similarity between the "root phrases" of two sentences.

    For each sentence's ``umls_doc``, every token with dependency label
    ``"ROOT"`` contributes its own text followed by the text of its direct
    children (punctuation children excluded). The collected words are joined
    with ``list_to_string``, re-parsed with ``umls_nlp``, and the two
    resulting docs are compared with ``.similarity``.

    Returns:
        The value returned by ``.similarity`` on the two re-parsed docs.
    """
    def root_words(doc):
        # ROOT token first, then its non-punctuation children, for every
        # ROOT found in the doc (in document order).
        words = []
        for tok in doc:
            if tok.dep_ != "ROOT":
                continue
            head = doc[tok.i]
            children = [child.text for child in head.children if child.dep_ != "punct"]
            words.append(head.text)
            words.extend(children)
        return words

    phrase_1 = list_to_string(root_words(s1.umls_doc))
    phrase_2 = list_to_string(root_words(s2.umls_doc))

    parsed_1 = umls_nlp(phrase_1)
    parsed_2 = umls_nlp(phrase_2)
    return parsed_1.similarity(parsed_2)

def list_to_string(list1):
    """Concatenate string elements, each preceded by a single space.

    The result therefore starts with a space when the list is non-empty and
    is the empty string for an empty list.
    """
    return "".join(" " + element for element in list1)

In [69]:
"""
Given Sentence instances s1, s2
check if they share a shared concept feature (UMLS and RxNorm concepts) 
if they share at least one concept, return 1; otherwise, return 0
"""
def check_shared_feature_umls(s1,s2):
    """Report whether two sentences share a concept feature.

    ``features`` holds the UMLS and RxNorm concepts of a sentence,
    e.g. ``{'Scanning'}``.

    Args:
        s1, s2: Sentence instances exposing a ``features`` collection.

    Returns:
        int: 1 if the two sentences have at least one concept in common,
        0 if they share none.

    NOTE(review): an earlier description of this function stated the
    opposite polarity (1 = no shared concept). The return values documented
    here follow the implementation, which is unchanged - confirm which
    polarity downstream consumers of ``check_umls`` expect.
    """
    shared_concepts = set(s1.features) & set(s2.features)
    return 1 if shared_concepts else 0
    

In [70]:
# Med7 (prescription) entities
# if not talking about same DRUG -> return 0
# if same DRUG but different other info -> return 1
def check_shared_feature_med7(s1,s2):
    """Flag pairs that mention a common drug but differ in their Med7 entities.

    ``med7_entities`` is a list of ``(text, label)`` tuples such as
    ``[('2U', 'DOSAGE'), ('PRBC', 'DRUG')]``.

    Returns:
        int: 1 when at least one ``DRUG`` text appears in both sentences and
        the two entity lists are not equal; 0 otherwise (no drug in common,
        or identical entity lists).
    """
    entities_1, entities_2 = s1.med7_entities, s2.med7_entities

    def drug_names(entities):
        # Texts of the entities labelled as a drug.
        return {text for (text, label) in entities if label == "DRUG"}

    common_drugs = drug_names(entities_1) & drug_names(entities_2)

    # Same drug mentioned, but the surrounding prescription details differ.
    if common_drugs and entities_1 != entities_2:
        return 1
    return 0
    

In [92]:
# given s1, s2 Sentence instances
def get_feature_df(s1, s2, label, pair_id, type_data):
    """Assemble the feature frame for one sentence pair.

    Starts from the bag-of-words frame of the two sentence texts and appends
    pair-level columns. Each appended value is a scalar, so it is repeated on
    every row of the frame.

    Args:
        s1, s2: Sentence instances (``txt`` plus the attributes read by the
            individual feature functions).
        label: contradiction label, stored in column ``"contradiction?"``.
        pair_id: identifier of the pair, stored in ``"pair_id"``.
        type_data: name of the split, stored in ``"type_data"``.

    Returns:
        pd.DataFrame: the word-count columns followed by ``neg_check_umls``,
        ``check_umls``, ``check_med7``, ``dep_sim``, ``contradiction?``,
        ``pair_id`` and ``type_data``.
    """
    features = create_sentence_encoding(s1.txt, s2.txt)

    # Evaluated top to bottom, then attached in the same order.
    pair_columns = {
        # negation-count mismatch between the two docs
        "neg_check_umls": check_if_number_neg_equal_umls(s1, s2),
        # shared features in UMLS and Med7
        "check_umls": check_shared_feature_umls(s1, s2),
        "check_med7": check_shared_feature_med7(s1, s2),
        # similarity of the ROOT-and-children phrases
        "dep_sim": create_dep_encoding(s1, s2),
        # contradiction label
        "contradiction?": label,
        # bookkeeping for later reference
        "pair_id": pair_id,
        "type_data": type_data,
    }
    for column, value in pair_columns.items():
        features[column] = value

    return features

In [93]:
get_feature_df(s1, s2, 0, 0, "train")

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Unnamed: 0,baseline,cr,elevated,labs,lactate,notable,old,patient,records,neg_check_umls,check_umls,check_med7,dep_sim,contradiction?,pair_id,type_data
0,1,1,0,1,1,1,1,0,1,0,0,0,0.611129,0,0,train
1,0,1,1,0,0,0,0,1,0,0,0,0,0.611129,0,0,train


## Scale up features to train

In [73]:
len(train_dataset)

7488

In [105]:
# Collect one feature frame per training pair and concatenate once at the
# end; calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration (quadratic in the number of pairs).
train_feature_frames = []

for pair_id in tqdm(range(len(train_dataset))):

    (s1, s2), label = train_dataset[pair_id]
    type_data = "train"
    feature_df = get_feature_df(s1, s2, label, pair_id, type_data)
    train_feature_frames.append(feature_df)

# pd.concat rejects an empty list, so keep the empty-frame result for an
# empty dataset.
total_features_df = (
    pd.concat(train_feature_frames, axis=0) if train_feature_frames else pd.DataFrame()
)
    

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]
100%|██████████| 7488/7488 [1:52:17<00:00,  1.11it/s]


In [106]:
total_features_df.to_csv("train_features.csv")

In [107]:
#test_file  = "mednli_labeled/test.csv"

test_dataset = MedNLI(test_file)

# Collect one feature frame per test pair and concatenate once at the end;
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of pairs).
test_feature_frames = []

for pair_id in tqdm(range(len(test_dataset))):
    (s1, s2), label = test_dataset[pair_id]
    type_data = "test"
    feature_df = get_feature_df(s1, s2, label, pair_id, type_data)
    test_feature_frames.append(feature_df)

# pd.concat rejects an empty list, so keep the empty-frame result for an
# empty dataset.
test_total_features_df = (
    pd.concat(test_feature_frames, axis=0) if test_feature_frames else pd.DataFrame()
)

test_total_features_df.to_csv("test_features.csv")

100%|██████████| 948/948 [06:33<00:00,  2.41it/s]


In [108]:
#dev_file   = "mednli_labeled/dev.csv"

dev_dataset = MedNLI(dev_file)

# Collect one feature frame per dev pair and concatenate once at the end.
dev_feature_frames = []

for pair_id in tqdm(range(len(dev_dataset))):
    (s1, s2), label = dev_dataset[pair_id]
    type_data = "dev"
    feature_df = get_feature_df(s1, s2, label, pair_id, type_data)
    dev_feature_frames.append(feature_df)

# Bug fix: the loop used to concatenate each pair onto
# test_total_features_df, so dev_features.csv ended up holding the whole
# test set plus only the last dev pair. Build the frame from the dev pairs
# alone. pd.concat rejects an empty list, hence the guard.
dev_total_features_df = (
    pd.concat(dev_feature_frames, axis=0) if dev_feature_frames else pd.DataFrame()
)

dev_total_features_df.to_csv("dev_features.csv")

100%|██████████| 930/930 [06:20<00:00,  2.44it/s]
