In [None]:
# Import libraries
import os
import re
import random
import pickle
import subprocess
import numpy as np
import pandas as pd
import datetime as dt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

# 1. Setup concept extractors

Some options were [MetaMap](https://metamap.nlm.nih.gov/) and [spaCy](https://spacy.io/). 

[MetaMap](https://metamap.nlm.nih.gov/) is specific to recognizing UMLS concepts. There is a [Python wrapper](https://github.com/AnthonyMRios/pymetamap), but known to be slow and bad.

[spaCy](https://spacy.io/) is a popular NLP Python package with an extensive library for named entity recognition. It has a wide variety of [extensions](https://spacy.io/universe) and models to choose from. We're going with the following.

* [scispaCy](https://spacy.io/universe/project/scispacy) contains spaCy models for processing biomedical, scientific or clinical text. It seems easy to use and has a wide variety of concepts it can recognize, including UMLS, RxNorm, etc.

* [negspaCy](https://spacy.io/universe/project/negspacy) identifies negations using some extension of regEx. Probably useful for things like, "this pt is diabetic" v. "this pt is not diabetic." [todo: negation identification of medspacy might be better, https://github.com/medspacy/medspacy]

* [Med7](https://github.com/kormilitzin/med7) is a model trained for recognizing entities in prescription text, e.g. identifies drug name, dosage, duration, etc., which could be useful stuff to check for conflicts. 

We're going with spaCy for this.. and coming up with a coherent way to integrate entities picked up by these three extensions/models.

## i) Installations

In [None]:
import sys; sys.executable

In [None]:
import spacy
import scispacy

from pprint import pprint
from collections import OrderedDict

from spacy import displacy
# from scispacy.abbreviation import AbbreviationDetector # UMLS already contains abbrev. detect
from scispacy.umls_linking import UmlsEntityLinker

# should be 2.3.5 and >=0.3.0
spacy.__version__, scispacy.__version__

## ii) Setting up the model

The model is used to form word/sentence embeddings for the NER task. Thus, it's important to choose a model that has been tuned for our specific use case (e.g. clinical text, prescription information) so the embeddings are useful for naming the entity.

[Note to self:] one potential idea to look into if we have time remaining, something about using custom model for spacy pipeline (could we do smth with the romanov models since they've been trained specifically for conflict detection?) -- https://spacy.io/usage/v3

### a) scispaCy

For scispaCy, we set up one of their models that has been trained on biomedical data. Other models can be found [here](https://allenai.github.io/scispacy/). 

We load the same model twice since we will be attaching a different entity linker (a knowledge base that links text to named entities) to each copy later.

In [None]:
## uncomment to install model if not already installed
# !/opt/conda/envs/opennotes/bin/python -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz

In [None]:
# Two independent pipelines built from the same scispaCy model, so that each one can
# later receive its own entity linker:
#   umls_nlp   -> UMLS (general biomedical concepts)
#   rxnorm_nlp -> RxNorm (prescriptions)
umls_nlp, rxnorm_nlp = (spacy.load("en_core_sci_sm") for _ in range(2))

### b) Med7

For Med7, we set up their model that has been trained specifically for NER of medication-related concepts: dosage, drug names, duration, form, frequency, route of administration, and strength. The model is trained on MIMIC-III, so it should work well for us.

In [None]:
# # installs Med7 model
# !pip install https://www.dropbox.com/s/xbgsy6tyctvrqz3/en_core_med7_lg.tar.gz?dl=1

In [None]:
# Load the Med7 model (NER for medication-related concepts); per the notes in this
# notebook, no entity linker is added to this pipeline.
med7_nlp = spacy.load("en_core_med7_lg")

## iii) Adding an entity linker

The EntityLinker is a spaCy component that links to a knowledge base. The linker compares words with the concepts in the specified knowledge base (e.g. scispaCy's UMLS does some form of character overlap-based nearest neighbor search, has option to resolve abbreviations first).

[Note: Entities generally get resolved to a list of different entities. This [blog post](http://sujitpal.blogspot.com/2020/08/disambiguating-scispacy-umls-entities.html) describes one potential way to disambiguate this by figuring out "most likely" set of entities. Gonna start off with just resolving to the 1st entity tho... hopefully that's sufficient.]

### a) scispaCy

#### UMLS Linker

UMLS linker maps entities to the UMLS concept. Main parts we'll be interested in are: semantic type and concept (mainly the common name, maybe the CUI might become important later).

* _Semantic type_ is the broader category that the entity falls under, e.g. disease, pharmacologic substance, etc. See [this](https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt) for a full list.

* _Concepts_ refer to the more fundamental entity itself, e.g. pneumothorax, ventilator, etc. Many concepts can fall under a semantic type.

More info on `UmlsEntityLinker` ([source code](https://github.com/allenai/scispacy/blob/4ade4ec897fa48c2ecf3187caa08a949920d126d/scispacy/linking.py#L9))

See source code for `.jsonl` file with the knowledge base.

In [None]:
from scispacy.umls_linking import UmlsEntityLinker

# Build the UMLS entity linker and append it to the `umls_nlp` pipeline.
#
# NOTE(review): the abbreviation detector below is commented out. The scispaCy docs
# reportedly state that `resolve_abbreviations` has no effect unless an
# AbbreviationDetector is present in the pipeline -- confirm whether abbreviations are
# actually being resolved, and re-enable these two lines if they are not.
# abbreviation_pipe = AbbreviationDetector(nlp) # automatically included with UMLS linker
# nlp.add_pipe(abbreviation_pipe)
umls_linker = UmlsEntityLinker(k=10,                          # number of nearest-neighbour candidates looked up per mention
                               threshold=0.7,                 # minimum score for a candidate to be kept
                               max_entities_per_mention=1,    # number of entities returned per mention (todo: tune)
                               filter_for_definitions=False,  # keep entities even if the KB has no definition for them
                               resolve_abbreviations=True)    # resolve abbreviations before linking (see note above)
umls_nlp.add_pipe(umls_linker)

#### RxNorm Linker

RxNorm linker maps entities to RxNorm, an ontology for clinical drug names. It contains about 100k concepts for normalized names for clinical drugs. It is comprised of several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database.

More info on `RxNorm` ([NIH page](https://www.nlm.nih.gov/research/umls/rxnorm/index.html), [source code](https://github.com/allenai/scispacy/blob/2290a80cfe0948e48d8ecfbd60064019d57a6874/scispacy/linking_utils.py#L120))

See source code for `.jsonl` file with the knowledge base.

In [None]:
from scispacy.linking import EntityLinker

# Build the RxNorm entity linker (same settings as the UMLS linker, different knowledge
# base) and append it to the `rxnorm_nlp` pipeline.
#
# NOTE(review): no AbbreviationDetector is added to `rxnorm_nlp` in this notebook; the
# scispaCy docs reportedly state that `resolve_abbreviations` has no effect without one --
# confirm.
# rxnorm_linker = EntityLinker(resolve_abbreviations=True, name="rxnorm")
rxnorm_linker = EntityLinker(k=10,                          # number of nearest-neighbour candidates looked up per mention
                             threshold=0.7,                 # minimum score for a candidate to be kept
                             max_entities_per_mention=1,    # number of entities returned per mention (todo: tune)
                             filter_for_definitions=False,  # keep entities even if the KB has no definition for them
                             resolve_abbreviations=True,    # resolve abbreviations before linking (see note above)
                             name="rxnorm")                 # RxNorm ontology

rxnorm_nlp.add_pipe(rxnorm_linker)

### b) Med7 

No need for entity linker

### c) Negspacy [TODO]

# 2. Setup data structures

## Categorizing type of conflict

The first larger task is to categorize by the type of conflict to check for since our method will likely be different (at least for the rule based). We wrote up a short list [here](https://docs.google.com/document/d/1fEBk0JHeyQWshYWW5w_VTkaYyRfm9MBxJ9DAGoVa8Yw/edit?usp=sharing). 

To do this, we're using the semantic type that is identified by the UMLS linker. Here's a table of the semantic types we're filtering for, and which conflict they'll be used for.

Here's a [full list](https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt) of semantic types. You can look up definitions of semantic types [here](http://linkedlifedata.com/resource/umls-semnetwork/T033).

| Conflict | Semantic Type |
| --- | ----------- |
| Diagnoses-related errors | Disease or Syndrome (T047), Diagnostic Procedure(T060) |
| Inaccurate description of medical history (symptoms) | Sign or Symptom (T184) |
| Inaccurate description of medical history (operations) | Therapeutic or Preventive Procedure (T061) |
| Inaccurate description of medical history (other) | [all of the above and below] |
| Medication or allergies | Clinical Drug (T200), Pharmacologic Substance (T121) |
| Test procedures or results | Laboratory Procedure (T059), Laboratory or Test Result (T034) | 


For clarity, the concepts we'll keep from the UMLS linker are anything falling into these semantic types (which we will then categorize by type of conflict using the table above):

* T047 - Disease or Syndrome
* T121 - Pharmacologic Substance
* T023 - Body Part, Organ, or Organ Component
* T061 - Therapeutic or Preventive Procedure 
* T060 - Diagnostic Procedure
* T059 - Laboratory Procedure
* T034 - Laboratory or Test Result 
* T184 - Sign or Symptom 
* T200 - Clinical Drug

We'll store this info into a dictionary now.

<!-- Some useful def's 
Finding - 
That which is discovered by direct observation or measurement of an organism attribute or condition, including the clinical history of the patient. The history of the presence of a disease is a 'Finding' and is distinguished from the disease itself.  -->

In [None]:
# UMLS semantic types (TUI -> human-readable name) that are kept from the UMLS linker.
# The mapping is the single source of truth; the two lists are derived from it and share
# its (insertion) order.
SEMANTIC_TYPE_TO_NAME = {
    'T047': 'Disease or Syndrome',
    'T121': 'Pharmacologic Substance',
    'T023': 'Body Part, Organ, or Organ Component',
    'T061': 'Therapeutic or Preventive Procedure',
    'T060': 'Diagnostic Procedure',
    'T059': 'Laboratory Procedure',
    'T034': 'Laboratory or Test Result',
    'T184': 'Sign or Symptom',
    'T200': 'Clinical Drug',
}
SEMANTIC_TYPES = list(SEMANTIC_TYPE_TO_NAME.keys())
SEMANTIC_NAMES = list(SEMANTIC_TYPE_TO_NAME.values())

SEMANTIC_TYPE_TO_NAME

In [None]:
# Conflict category -> set of UMLS semantic type ids used to detect it.
# "med_history_other" deliberately covers every semantic type that is kept.
CONFLICT_TO_SEMANTIC_TYPE = dict(
    diagnosis={'T047', 'T060'},
    med_history_symptom={'T184'},
    med_history_operation={'T061'},
    med_history_other=set(SEMANTIC_TYPES),
    med_allergy={'T200', 'T121'},
    test_results={'T059', 'T034'},
)

CONFLICT_TO_SEMANTIC_TYPE

In [None]:
from data_structures import Patient,\
                            Note, PrescriptionOrders, LabResults,\
                            Sentence, Prescription, Lab

In [None]:
# from importlib import reload # python 2.7 does not require this
# import data_structures
# reload(data_structures)
# from data_structures import Patient,\
#                             Note, PrescriptionOrders, LabResults,\
#                             Sentence, Prescription, Lab

# 3. Load and process data

In [None]:
# Load MIMIC tables
# Malformed rows are skipped (with a warning) instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines="warn"` is its replacement. Pick whichever keyword this pandas supports.
_pandas_major_minor = tuple(int(part) for part in pd.__version__.split(".")[:2])
if _pandas_major_minor >= (1, 3):
    _bad_line_kwargs = {"on_bad_lines": "warn"}
else:
    _bad_line_kwargs = {"error_bad_lines": False}

notes_df  = pd.read_csv('NOTEEVENTS.csv.gz',    compression='gzip', **_bad_line_kwargs)
drug_df   = pd.read_csv('PRESCRIPTIONS.csv.gz', compression='gzip', **_bad_line_kwargs)
lab_df    = pd.read_csv('LABEVENTS.csv.gz',     compression='gzip', **_bad_line_kwargs)
d_lab_df  = pd.read_csv('D_LABITEMS.csv.gz',    compression='gzip', **_bad_line_kwargs)

#### Updated script for processing HADM ID's with consecutive physician notes (does not count the autosaves)

In [None]:
# Load HADM ID's with consecutive physician notes
# An admission qualifies when its physician notes span more than one distinct CHARTTIME
# (autosaves of the same note share a CHARTTIME and therefore do not count).
# NOTE: unpickling runs code embedded in the file -- only load a cache you created yourself.
if os.path.exists("hadm_ids.pkl"):
    with open("hadm_ids.pkl", "rb") as f:
        hadm_ids = pickle.load(f)
else:
    # Count distinct CHARTTIMEs per admission in a single pass instead of re-filtering
    # the whole notes table once per HADM_ID. `dropna=False` keeps a missing CHARTTIME
    # counted as a distinct value, exactly like `len(Series.unique())` does.
    physician_notes = notes_df.loc[notes_df.CATEGORY == "Physician "]
    charttimes_per_hadm = (physician_notes
                           .groupby("HADM_ID")
                           .CHARTTIME
                           .nunique(dropna=False)
                           .to_dict())

    # Iterate in order of first appearance in `notes_df` so the resulting list order
    # (and hence `hadm_ids[0]`, `hadm_ids[1]`, ...) stays stable.
    hadm_ids = [hadm_id for hadm_id in notes_df.HADM_ID.unique()
                if charttimes_per_hadm.get(hadm_id, 0) > 1]

    with open("hadm_ids.pkl", "wb") as f:
        pickle.dump(hadm_ids, f)

print(f"There are {len(hadm_ids)} patients with consecutive physician notes.")

# 4. Generating Contradictions

Generate 25-50 examples of positive and negative contradictions, each.

For lab values: 

* Find 50-100 total data pairs (about 2-4 per patient) and insert contradiction, or label as not a contradiction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Print full cell text instead of truncating it. `None` is the supported "no limit" value;
# the previous `-1` has been deprecated since pandas 1.0.
pd.set_option("display.max_colwidth", None)

In [None]:
from importlib import reload # python 2.7 does not require this
import data_structures
reload(data_structures)
from data_structures import Patient,\
                            Note, PrescriptionOrders, LabResults,\
                            Sentence, Prescription, Lab

In [None]:
def is_comparable_type(data_i, data_j):
    """Tell whether two Data instances form a pair worth comparing.

    Only note-to-note and note-to-structured-data pairs are compared, i.e. at least
    one member must be a sentence and the other must be a sentence, a prescription
    or a lab:

    - sentence v. sentence      -> comparable
    - sentence v. prescription  -> comparable (either order)
    - sentence v. lab           -> comparable (either order)
    - lab v. lab, lab v. prescription, prescription v. prescription -> not comparable

    Both arguments only need a `type` attribute. Returns a bool.
    """
    kinds = (data_i.type, data_j.type)

    # Structured-only pairs (no sentence on either side) are never compared.
    if "sentence" not in kinds:
        return False

    # One side is a sentence; the pair is comparable when the other side is a known type.
    partner = kinds[1] if kinds[0] == "sentence" else kinds[0]
    return partner in ("sentence", "prescription", "lab")

In [None]:
def _print_pair_member(label, data):
    """Print the time, type, extracted concepts and text of one member of a candidate pair."""
    print(f"----- Data {label} -----")
    print(f">> Time: {data.time}\n" +\
          f">> Type: {data.type}\n" +\
          f">> Concepts: {data.features}\n" +\
          f">> {data.txt}")


def generate_data_pairs(pat, similarity_threshold=0.5, verbose=True):
    """Pair up same-day Data instances that mention similar concepts.

    For every day in `pat.dailydata` and every semantic type in `SEMANTIC_TYPES`, the
    Data instances tagged with that semantic type are turned into bag-of-words vectors
    over their concept features and compared with cosine similarity. Every unordered
    pair whose similarity is strictly greater than `similarity_threshold` and whose
    types are comparable (see `is_comparable_type`) is recorded.

    Because the search is repeated per semantic type, the same two Data instances can
    be recorded more than once (once for each semantic type they share).

    Parameters
    ----------
    pat : Patient
        Must expose `dailydata`, a mapping {day: [DailyData, ...]}. Each DailyData must
        expose the index-aligned lists `datas`, `datas_features` (iterables of concept
        strings) and `datas_semantic_types` (containers of semantic type ids).
    similarity_threshold : float, default 0.5
        Minimum (exclusive) cosine similarity for a pair to be kept.
    verbose : bool, default True
        Print progress and every recorded pair.

    Returns
    -------
    df : pandas.DataFrame
        One row per recorded pair; the row index equals the pair index printed in
        verbose mode and the position of the pair in `data_inst_pairs`.
    data_inst_pairs : list
        `((data_i, data_j), None)` tuples; the second element is a label placeholder.
    """
    processed_pairs = []  # rows for the dataframe / csv
    data_inst_pairs = []  # for the pipeline: ((Data 1, Data 2), label)
    pair_idx = 0

    # pat.dailydata = {[date]: [DailyData instances from that date], ...}
    for day, pat_dailydatas in pat.dailydata.items():
        if verbose:
            print(f"********** Processing data for {day} **********")

        # Flatten everything recorded on this day (notes, prescription orders, lab results).
        # Plain lists are used on purpose: wrapping ragged per-item lists in np.array is
        # fragile (dtype coercion, or an error on recent numpy versions).
        current_dds = []
        current_dds_features = []
        current_dds_sem_types = []
        for dd in pat_dailydatas:
            current_dds.extend(dd.datas)
            current_dds_features.extend(dd.datas_features)
            current_dds_sem_types.extend(dd.datas_semantic_types)

        # Look for similar items separately within each semantic type
        for sem_type in SEMANTIC_TYPES:
            sem_type_indices = [idx for idx, item_sem_types in enumerate(current_dds_sem_types)
                                if sem_type in item_sem_types]
            if not sem_type_indices:  # no candidates for this semantic type
                continue

            sem_type_current_dds = [current_dds[idx] for idx in sem_type_indices]

            # Bag-of-words over the extracted concepts (umls + rxnorm) of each candidate
            corpus = [' '.join(current_dds_features[idx]) for idx in sem_type_indices]
            X = CountVectorizer().fit_transform(corpus).toarray()

            # Pairwise cosine similarity of the concept vectors (larger = more similar)
            similarity = cosine_similarity(X)

            # Keep the strict lower triangle only (i > j): this drops self-pairs and the
            # mirrored duplicate of every pair. Row-major order of the hits is preserved,
            # so pair indices are assigned in the same order as a full scan would give.
            above_threshold = np.tril(similarity > similarity_threshold, k=-1)
            for i, j in zip(*np.nonzero(above_threshold)):
                data_i = sem_type_current_dds[i]
                data_j = sem_type_current_dds[j]
                if not is_comparable_type(data_i, data_j):
                    continue

                if verbose:
                    print(f"***** PAIR INDEX {pair_idx} *****")
                    print(f"Cosine similarity: {similarity[i, j]}")
                    _print_pair_member("i", data_i)
                    _print_pair_member("j", data_j)
                    print("**********************************")

                # save
                processed_pairs.append([data_i.txt,      data_j.txt,
                                        data_i.time,     data_j.time,
                                        data_i.type,     data_j.type,
                                        data_i.features, data_j.features,
                                        similarity[i, j], SEMANTIC_TYPE_TO_NAME[sem_type]])
                data_inst_pairs.append(((data_i, data_j), None))
                pair_idx += 1

    # Build the frame straight from the row lists. Going through np.array first can coerce
    # the mixed-type rows to a single dtype, and it raises when no pair was found at all
    # (an empty array cannot be matched to ten columns).
    pair_columns = ["sentence 1", "sentence 2",
                    "time 1", "time 2",
                    "type 1", "type 2",
                    "concepts 1", "concepts 2",
                    "cosine similarity", "semantic type"]
    df = pd.DataFrame(processed_pairs, columns=pair_columns)

    return df, data_inst_pairs

## README: Store generated data here

In [None]:
# Per-admission record of the generated examples, keyed by int(HADM id). Each value is
#   {"contradiction": {pair_idx: (is_sentence2, contradicting_txt)},  # inserted contradictions
#    "none": [pair_idx, ...]}                                         # pairs kept as non-contradictions
generated_data_dict = {}

## Patient 1

In [None]:
#### Build the Patient for one admission and enumerate its candidate Data pairs
# Step 1: Select a patient -- constructing `Patient` processes all of its data
hadm_id = hadm_ids[0]  # `hadm_ids`: every HADM id with consecutive physician notes
print(f"Patient {int(hadm_id)}")

# Fresh bookkeeping entry for this admission (inserted contradictions / negative examples)
generated_data_dict[int(hadm_id)] = dict(contradiction={}, none=[])

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Output location for this admission's pair table (the directory is created even though
# the CSV export at the bottom of this cell is currently disabled)
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: Generate pairs for this patient
df, data_inst_pairs = generate_data_pairs(pat)

# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Narrow the candidate pairs down to the lab-related ones
# IMPORTANT: contradictions may only be inserted into a sentence from a note
# ("type" must be sentence, not lab or prescription)!

# Step 3: Get all the pairs about lab values
semantic_type_ids   = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[type_id] for type_id in semantic_type_ids]

# A pair qualifies when either member is a structured lab record, or when the pair was
# matched under one of the lab-related semantic types.
is_lab = df['semantic type'].isin(semantic_type_names)
lab_pairs_df = df[df['type 1'].eq("lab") | df['type 2'].eq("lab") | is_lab]

lab_pairs_df.head(2)

In [None]:
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
pair_idx = 43         # row index of the chosen pair in the pairs table
is_sentence2 = True   # True -> rewrite the second member of the pair, False -> the first

# Each entry of `data_inst_pairs` is ((Data 1, Data 2), label)
data_1, data_2 = data_inst_pairs[pair_idx][0]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

sentence_to_modify = data_2 if is_sentence2 else data_1

# `contradicting_txt` is the new, contradicting sentence.
# For now only the text of the chosen Data instance is replaced.
contradicting_txt = "Potassium stable and normal range."
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Record which pair/side was rewritten and with what text
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Pair indices hand-picked as examples that do NOT contradict each other
no_contradiction_pair_idx = [115, 172]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    data_1, data_2 = data_inst_pairs[pair_idx][0]
    print(f"{data_1.type} 1:\t{data_1.txt}",
          f"{data_2.type} 2:\t{data_2.txt}",
          "*****************************",
          sep="\n")

# Store negative examples
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

## Patient 2

In [None]:
#### Build the Patient for one admission and enumerate its candidate Data pairs
# Step 1: Select a patient -- constructing `Patient` processes all of its data
hadm_id = hadm_ids[1]  # `hadm_ids`: every HADM id with consecutive physician notes
print(f"Patient {int(hadm_id)}")

# Fresh bookkeeping entry for this admission (inserted contradictions / negative examples)
generated_data_dict[int(hadm_id)] = dict(contradiction={}, none=[])

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Output location for this admission's pair table (the directory is created even though
# the CSV export at the bottom of this cell is currently disabled)
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: Generate pairs for this patient
df, data_inst_pairs = generate_data_pairs(pat)

# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Narrow the candidate pairs down to the lab-related ones
# IMPORTANT: contradictions may only be inserted into a sentence from a note
# ("type" must be sentence, not lab or prescription)!

# Step 3: Get all the pairs about lab values
semantic_type_ids   = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[type_id] for type_id in semantic_type_ids]

# A pair qualifies when either member is a structured lab record, or when the pair was
# matched under one of the lab-related semantic types.
is_lab = df['semantic type'].isin(semantic_type_names)
lab_pairs_df = df[df['type 1'].eq("lab") | df['type 2'].eq("lab") | is_lab]

lab_pairs_df.head(2)

In [None]:
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
pair_idx = 12         # row index of the chosen pair in the pairs table
is_sentence2 = True   # True -> rewrite the second member of the pair, False -> the first

# Each entry of `data_inst_pairs` is ((Data 1, Data 2), label)
data_1, data_2 = data_inst_pairs[pair_idx][0]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

sentence_to_modify = data_2 if is_sentence2 else data_1

# `contradicting_txt` is the new, contradicting sentence.
# For now only the text of the chosen Data instance is replaced.
contradicting_txt = "Consistently low potassium levels."
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Record which pair/side was rewritten and with what text
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Pair indices hand-picked as examples that do NOT contradict each other
no_contradiction_pair_idx = [15, 36]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    data_1, data_2 = data_inst_pairs[pair_idx][0]
    print(f"{data_1.type} 1:\t{data_1.txt}",
          f"{data_2.type} 2:\t{data_2.txt}",
          "*****************************",
          sep="\n")

# Store negative examples
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

In [None]:
"""
Todo: ask Dr. Saenz
"""
# Pairs that might already be contradictions; they are only displayed here, nothing is stored.
potential_contradiction_pair_indices = [21]

print("Potential examples of contradictions")
print("*****************************")
for pair_idx in potential_contradiction_pair_indices:
    data_1, data_2 = data_inst_pairs[pair_idx][0]
    print(f"{data_1.type} 1:\t{data_1.txt}",
          f"{data_2.type} 2:\t{data_2.txt}",
          "*****************************",
          sep="\n")

## Patient 3

In [None]:
#### Build the Patient for one admission and enumerate its candidate Data pairs
# Step 1: Select a patient -- constructing `Patient` processes all of its data
hadm_id = hadm_ids[2]  # `hadm_ids`: every HADM id with consecutive physician notes
print(f"Patient {int(hadm_id)}")

# Fresh bookkeeping entry for this admission (inserted contradictions / negative examples)
generated_data_dict[int(hadm_id)] = dict(contradiction={}, none=[])

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Output location for this admission's pair table (the directory is created even though
# the CSV export at the bottom of this cell is currently disabled)
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: Generate pairs for this patient
df, data_inst_pairs = generate_data_pairs(pat)

# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Narrow the candidate pairs down to the lab-related ones
# IMPORTANT: contradictions may only be inserted into a sentence from a note
# ("type" must be sentence, not lab or prescription)!

# Step 3: Get all the pairs about lab values
semantic_type_ids   = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[type_id] for type_id in semantic_type_ids]

# A pair qualifies when either member is a structured lab record, or when the pair was
# matched under one of the lab-related semantic types.
is_lab = df['semantic type'].isin(semantic_type_names)
lab_pairs_df = df[df['type 1'].eq("lab") | df['type 2'].eq("lab") | is_lab]

lab_pairs_df.head(2)

In [None]:
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
pair_idx = 46         # row index of the chosen pair in the pairs table
is_sentence2 = True   # True -> rewrite the second member of the pair, False -> the first

# Each entry of `data_inst_pairs` is ((Data 1, Data 2), label)
data_1, data_2 = data_inst_pairs[pair_idx][0]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

sentence_to_modify = data_2 if is_sentence2 else data_1

# `contradicting_txt` is the new, contradicting sentence.
# For now only the text of the chosen Data instance is replaced.
contradicting_txt = "Pt lactate stable and normal range."
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Record which pair/side was rewritten and with what text
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Pair indices hand-picked as examples that do NOT contradict each other
no_contradiction_pair_idx = [48, 76]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    data_1, data_2 = data_inst_pairs[pair_idx][0]
    print(f"{data_1.type} 1:\t{data_1.txt}",
          f"{data_2.type} 2:\t{data_2.txt}",
          "*****************************",
          sep="\n")

# Store negative examples
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

## Patient 4

In [None]:
#### Build the Patient for one admission and enumerate its candidate Data pairs
# Step 1: Select a patient -- constructing `Patient` processes all of its data
hadm_id = hadm_ids[3]  # `hadm_ids`: every HADM id with consecutive physician notes
print(f"Patient {int(hadm_id)}")

# Fresh bookkeeping entry for this admission (inserted contradictions / negative examples)
generated_data_dict[int(hadm_id)] = dict(contradiction={}, none=[])

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Output location for this admission's pair table (the directory is created even though
# the CSV export at the bottom of this cell is currently disabled)
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: Generate pairs for this patient
df, data_inst_pairs = generate_data_pairs(pat)

# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Narrow the candidate pairs down to the lab-related ones
# IMPORTANT: contradictions may only be inserted into a sentence from a note
# ("type" must be sentence, not lab or prescription)!

# Step 3: Get all the pairs about lab values
semantic_type_ids   = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[type_id] for type_id in semantic_type_ids]

# A pair qualifies when either member is a structured lab record, or when the pair was
# matched under one of the lab-related semantic types.
is_lab = df['semantic type'].isin(semantic_type_names)
lab_pairs_df = df[df['type 1'].eq("lab") | df['type 2'].eq("lab") | is_lab]

lab_pairs_df.head(2)

In [None]:
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Pair indices hand-picked as examples that do NOT contradict each other
no_contradiction_pair_idx = [39, 42, 72]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    data_1, data_2 = data_inst_pairs[pair_idx][0]
    print(f"{data_1.type} 1:\t{data_1.txt}",
          f"{data_2.type} 2:\t{data_2.txt}",
          "*****************************",
          sep="\n")

# Store negative examples
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

## Patient 5

In [None]:
#### Load one admission and enumerate its candidate Data-instance pairs
# Step 1: pick the admission. `hadm_ids` lists every HADM id with consecutive
# physician notes; constructing the Patient processes all of its data.
hadm_id = hadm_ids[4]

# Start the admission with an empty record. Re-running this cell therefore
# discards anything already stored for it.
generated_data_dict[int(hadm_id)] = {
    "contradiction": {},
    "none": [],
}

print(f"Patient {int(hadm_id)}")

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Target location for the optional per-admission CSV export (disabled below).
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: build the pair table (`df`) and the matching Data-instance pairs.
df, data_inst_pairs = generate_data_pairs(pat)

# Export currently switched off:
# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Select the lab-related pairs (candidates for hand-written contradictions)
# IMPORTANT: only rewrite text that comes from a note, i.e. rows whose "type" is
# sentence -- never a lab or prescription row!

# Step 3: keep every pair that involves a lab value
semantic_type_ids = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[st_id] for st_id in semantic_type_ids]

# A pair qualifies when its semantic type is one of the test-result types ...
is_lab = df['semantic type'].apply(lambda sem_type: sem_type in semantic_type_names)
# ... or when either side of the pair is itself a lab record.
either_side_is_lab = (df['type 1'] == "lab") | (df['type 2'] == "lab")
lab_pairs_df = df.loc[either_side_is_lab | is_lab]

lab_pairs_df.head(2)

In [None]:
# Show every lab-related pair so a row index can be picked by hand for Step 4.
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Hand-picked pair (row index in the lab-pairs table) and which side to rewrite.
pair_idx = 110
is_sentence2 = True

data_1 = data_inst_pairs[pair_idx][0][0]
data_2 = data_inst_pairs[pair_idx][0][1]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

# The boolean flag doubles as the index: False -> sentence 1, True -> sentence 2.
sentence_to_modify = data_inst_pairs[pair_idx][0][is_sentence2]

# Set `contradicting_txt` to the new contradicting sentence.
# This will just update the text for now.
#
# The first two occurrences of the value 7.3 are overwritten with 1.2 and 2.4.
# The pattern is escaped so that "." only matches a literal dot (an unescaped
# "7.3" would also match e.g. "713"), and a missing occurrence is reported
# explicitly instead of failing with an AttributeError on `None.span()`.
# NOTE(review): `update_text` presumably changes `data_2.txt`, so re-running this
# cell without rebuilding the pairs will no longer find "7.3" -- confirm.
contradicting_txt = data_2.txt
value_pattern = re.escape("7.3")
for replacement_value in ("1.2", "2.4"):
    value_match = re.search(value_pattern, contradicting_txt)
    if value_match is None:
        raise ValueError(
            f"Expected another '7.3' to replace with '{replacement_value}' "
            f"in pair {pair_idx}, found none: {contradicting_txt!r}"
        )
    start, end = value_match.span()
    contradicting_txt = contradicting_txt[:start] + replacement_value + contradicting_txt[end:]
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Store conflict
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Pair indices reviewed by hand and kept as non-contradicting (negative) examples.
no_contradiction_pair_idx = [89]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    # Slot 0 of each entry holds the two paired Data instances.
    data_1, data_2 = data_inst_pairs[pair_idx][0][0], data_inst_pairs[pair_idx][0][1]

    print(f"{data_1.type} 1:\t{data_1.txt}")
    print(f"{data_2.type} 2:\t{data_2.txt}")
    print("*****************************")

# Record the negatives for this admission (replaces any list stored earlier).
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

## Patient 6

In [None]:
#### Load one admission and enumerate its candidate Data-instance pairs
# Step 1: pick the admission. `hadm_ids` lists every HADM id with consecutive
# physician notes; constructing the Patient processes all of its data.
hadm_id = hadm_ids[5]

# Start the admission with an empty record. Re-running this cell therefore
# discards anything already stored for it.
generated_data_dict[int(hadm_id)] = {
    "contradiction": {},
    "none": [],
}

print(f"Patient {int(hadm_id)}")

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Target location for the optional per-admission CSV export (disabled below).
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: build the pair table (`df`) and the matching Data-instance pairs.
df, data_inst_pairs = generate_data_pairs(pat)

# Export currently switched off:
# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Select the lab-related pairs (candidates for hand-written contradictions)
# IMPORTANT: only rewrite text that comes from a note, i.e. rows whose "type" is
# sentence -- never a lab or prescription row!

# Step 3: keep every pair that involves a lab value
semantic_type_ids = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[st_id] for st_id in semantic_type_ids]

# A pair qualifies when its semantic type is one of the test-result types ...
is_lab = df['semantic type'].apply(lambda sem_type: sem_type in semantic_type_names)
# ... or when either side of the pair is itself a lab record.
either_side_is_lab = (df['type 1'] == "lab") | (df['type 2'] == "lab")
lab_pairs_df = df.loc[either_side_is_lab | is_lab]

lab_pairs_df.head(2)

In [None]:
# Show every lab-related pair so a row index can be picked by hand for Step 4.
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Hand-picked pair (row index in the lab-pairs table) and which side to rewrite.
pair_idx = 21
is_sentence2 = True

selected_pair = data_inst_pairs[pair_idx][0]
data_1, data_2 = selected_pair[0], selected_pair[1]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

# The boolean flag doubles as the index: False -> sentence 1, True -> sentence 2.
sentence_to_modify = selected_pair[is_sentence2]

# Replacement wording; at this stage only the text is swapped in.
contradicting_txt = (
    "Attending    Evaluated pt who had normal resp "
    "with ABG no evidence of fatigue."
)
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Remember the edit: pair index -> (which sentence, new text).
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

## Patient 7

In [None]:
#### Load one admission and enumerate its candidate Data-instance pairs
# Step 1: pick the admission. `hadm_ids` lists every HADM id with consecutive
# physician notes; constructing the Patient processes all of its data.
# NOTE(review): this section is headed "Patient 7" but reads hadm_ids[3], whereas
# the neighbouring sections use index = patient number - 1 (e.g. Patient 8 ->
# hadm_ids[7]). Confirm that index 3 is intended: if another section also uses
# hadm_ids[3], the reset below wipes the entries stored there.
hadm_id = hadm_ids[3]

# Start the admission with an empty record. Re-running this cell therefore
# discards anything already stored for it.
generated_data_dict[int(hadm_id)] = {
    "contradiction": {},
    "none": [],
}

print(f"Patient {int(hadm_id)}")

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Target location for the optional per-admission CSV export (disabled below).
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: build the pair table (`df`) and the matching Data-instance pairs.
df, data_inst_pairs = generate_data_pairs(pat)

# Export currently switched off:
# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Select the lab-related pairs (candidates for hand-written contradictions)
# IMPORTANT: only rewrite text that comes from a note, i.e. rows whose "type" is
# sentence -- never a lab or prescription row!

# Step 3: keep every pair that involves a lab value
semantic_type_ids = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[st_id] for st_id in semantic_type_ids]

# A pair qualifies when its semantic type is one of the test-result types ...
is_lab = df['semantic type'].apply(lambda sem_type: sem_type in semantic_type_names)
# ... or when either side of the pair is itself a lab record.
either_side_is_lab = (df['type 1'] == "lab") | (df['type 2'] == "lab")
lab_pairs_df = df.loc[either_side_is_lab | is_lab]

lab_pairs_df.head(2)

In [None]:
# Show every lab-related pair so a row index can be picked by hand for Step 4.
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Hand-picked pair (row index in the lab-pairs table) and which side to rewrite.
pair_idx = 79
is_sentence2 = True

selected_pair = data_inst_pairs[pair_idx][0]
data_1, data_2 = selected_pair[0], selected_pair[1]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

# The boolean flag doubles as the index: False -> sentence 1, True -> sentence 2.
sentence_to_modify = selected_pair[is_sentence2]

# Replacement wording; at this stage only the text is swapped in.
contradicting_txt = "Labs normal range    for WBC as above, HCT 40, K+ 3.2, Cr 1.1, lactate 4.3."
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Remember the edit: pair index -> (which sentence, new text).
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Pair indices reviewed by hand and kept as non-contradicting (negative) examples.
no_contradiction_pair_idx = [39, 42, 72, 78, 80]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    # Slot 0 of each entry holds the two paired Data instances.
    data_1, data_2 = data_inst_pairs[pair_idx][0][0], data_inst_pairs[pair_idx][0][1]

    print(f"{data_1.type} 1:\t{data_1.txt}")
    print(f"{data_2.type} 2:\t{data_2.txt}")
    print("*****************************")

# Record the negatives for this admission (replaces any list stored earlier).
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

## Patient 8

In [None]:
#### Load one admission and enumerate its candidate Data-instance pairs
# Step 1: pick the admission. `hadm_ids` lists every HADM id with consecutive
# physician notes; constructing the Patient processes all of its data.
hadm_id = hadm_ids[7]

# Start the admission with an empty record. Re-running this cell therefore
# discards anything already stored for it.
generated_data_dict[int(hadm_id)] = {
    "contradiction": {},
    "none": [],
}

print(f"Patient {int(hadm_id)}")

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Target location for the optional per-admission CSV export (disabled below).
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: build the pair table (`df`) and the matching Data-instance pairs.
df, data_inst_pairs = generate_data_pairs(pat)

# Export currently switched off:
# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Select the lab-related pairs (candidates for hand-written contradictions)
# IMPORTANT: only rewrite text that comes from a note, i.e. rows whose "type" is
# sentence -- never a lab or prescription row!

# Step 3: keep every pair that involves a lab value
semantic_type_ids = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[st_id] for st_id in semantic_type_ids]

# A pair qualifies when its semantic type is one of the test-result types ...
is_lab = df['semantic type'].apply(lambda sem_type: sem_type in semantic_type_names)
# ... or when either side of the pair is itself a lab record.
either_side_is_lab = (df['type 1'] == "lab") | (df['type 2'] == "lab")
lab_pairs_df = df.loc[either_side_is_lab | is_lab]

lab_pairs_df.head(2)

In [None]:
# Show every lab-related pair so a row index can be picked by hand for Step 4.
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Hand-picked pair (row index in the lab-pairs table) and which side to rewrite.
pair_idx = 117
is_sentence2 = True

selected_pair = data_inst_pairs[pair_idx][0]
data_1, data_2 = selected_pair[0], selected_pair[1]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

# The boolean flag doubles as the index: False -> sentence 1, True -> sentence 2.
sentence_to_modify = selected_pair[is_sentence2]

# Replacement wording; at this stage only the text is swapped in.
contradicting_txt = "Standard WBC levels         no sign of CLL."
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Remember the edit: pair index -> (which sentence, new text).
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Second hand-picked pair for this admission and which side to rewrite.
pair_idx = 212
is_sentence2 = True

selected_pair = data_inst_pairs[pair_idx][0]
data_1, data_2 = selected_pair[0], selected_pair[1]

print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

# The boolean flag doubles as the index: False -> sentence 1, True -> sentence 2.
sentence_to_modify = selected_pair[is_sentence2]

# Replacement wording; at this stage only the text is swapped in.
contradicting_txt = "WBC persistently    low sign of infection."
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Remember the edit: pair index -> (which sentence, new text).
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

## Patient 9

In [None]:
#### Load one admission and enumerate its candidate Data-instance pairs
# Step 1: pick the admission. `hadm_ids` lists every HADM id with consecutive
# physician notes; constructing the Patient processes all of its data.
hadm_id = hadm_ids[8]

# Start the admission with an empty record. Re-running this cell therefore
# discards anything already stored for it.
generated_data_dict[int(hadm_id)] = {
    "contradiction": {},
    "none": [],
}

print(f"Patient {int(hadm_id)}")

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Target location for the optional per-admission CSV export (disabled below).
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: build the pair table (`df`) and the matching Data-instance pairs.
df, data_inst_pairs = generate_data_pairs(pat)

# Export currently switched off:
# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Select the lab-related pairs (candidates for hand-written contradictions)
# IMPORTANT: only rewrite text that comes from a note, i.e. rows whose "type" is
# sentence -- never a lab or prescription row!

# Step 3: keep every pair that involves a lab value
semantic_type_ids = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[st_id] for st_id in semantic_type_ids]

# A pair qualifies when its semantic type is one of the test-result types ...
is_lab = df['semantic type'].apply(lambda sem_type: sem_type in semantic_type_names)
# ... or when either side of the pair is itself a lab record.
either_side_is_lab = (df['type 1'] == "lab") | (df['type 2'] == "lab")
lab_pairs_df = df.loc[either_side_is_lab | is_lab]

lab_pairs_df.head(2)

In [None]:
# Show every lab-related pair so a row index can be picked by hand for Step 4.
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Pair indices reviewed by hand and kept as non-contradicting (negative) examples.
no_contradiction_pair_idx = [16]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    # Slot 0 of each entry holds the two paired Data instances.
    data_1, data_2 = data_inst_pairs[pair_idx][0][0], data_inst_pairs[pair_idx][0][1]

    print(f"{data_1.type} 1:\t{data_1.txt}")
    print(f"{data_2.type} 2:\t{data_2.txt}")
    print("*****************************")

# Record the negatives for this admission (replaces any list stored earlier).
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

## Patient 10

In [None]:
#### Load one admission and enumerate its candidate Data-instance pairs
# Step 1: pick the admission. `hadm_ids` lists every HADM id with consecutive
# physician notes; constructing the Patient processes all of its data.
hadm_id = hadm_ids[9]

# Start the admission with an empty record. Re-running this cell therefore
# discards anything already stored for it.
generated_data_dict[int(hadm_id)] = {
    "contradiction": {},
    "none": [],
}

print(f"Patient {int(hadm_id)}")

pat = Patient(
    hadm_id, notes_df, drug_df, lab_df, d_lab_df,
    med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
    physician_only=True,
)

# Target location for the optional per-admission CSV export (disabled below).
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)
pt_csv = os.path.join(processed_dir, f"{int(hadm_id)}.csv")

# Step 2: build the pair table (`df`) and the matching Data-instance pairs.
df, data_inst_pairs = generate_data_pairs(pat)

# Export currently switched off:
# df.to_csv(pt_csv)
# print("Data has been saved!")

In [None]:
#### Select the lab-related pairs (candidates for hand-written contradictions)
# IMPORTANT: only rewrite text that comes from a note, i.e. rows whose "type" is
# sentence -- never a lab or prescription row!

# Step 3: keep every pair that involves a lab value
semantic_type_ids = CONFLICT_TO_SEMANTIC_TYPE['test_results']
semantic_type_names = [SEMANTIC_TYPE_TO_NAME[st_id] for st_id in semantic_type_ids]

# A pair qualifies when its semantic type is one of the test-result types ...
is_lab = df['semantic type'].apply(lambda sem_type: sem_type in semantic_type_names)
# ... or when either side of the pair is itself a lab record.
either_side_is_lab = (df['type 1'] == "lab") | (df['type 2'] == "lab")
lab_pairs_df = df.loc[either_side_is_lab | is_lab]

lab_pairs_df.head(2)

In [None]:
# Show every lab-related pair so a row index can be picked by hand for Step 4.
lab_pairs_df 

In [None]:
# Step 4: Insert contradictions

# We should probably aim for 1-2 contradictions per patient. 
# So basically, copy/paste code for Steps 1-4 for each patient, and push to Github.
# Small heads up -- for a given patient, try not to insert contradictions 
# into two sentences that look really really similar. 
# There's a chance this might refer to the same underlying Sentence instance, 
# which could overwrite a contradiction you previously inserted. 

# Look through the sentence pairs by going through `lab_pairs_df`.
# If you find a good one you want to insert a contradiction for, 
# make note of the row index (i.e. the number at the left), 
# and set this to `pair_idx` below. 
# Also make note of which sentence (i.e. sentence 1 or sentence 2)
# you want to modify, and set the `is_sentence2` flag appropriately.

In [None]:
# Hand-picked pair (row index in the lab-pairs table) and which side to rewrite.
# NOTE(review): pair 48 is also listed in `no_contradiction_pair_idx` in the next
# cell, so it ends up stored as both a contradiction and a negative -- confirm
# which label is intended.
pair_idx = 48
is_sentence2 = True

data_1 = data_inst_pairs[pair_idx][0][0]
data_2 = data_inst_pairs[pair_idx][0][1]

# Print both sides of the pair before anything is changed.
print(f"{data_1.type} 1:\t{data_1.txt}")
print(f"{data_2.type} 2:\t{data_2.txt}")

# The boolean flag doubles as the index: False -> sentence 1, True -> sentence 2.
sentence_to_modify = data_inst_pairs[pair_idx][0][is_sentence2]

# Set `contradicting_txt` to the new contradicting sentence.
# This will just update the text for now.
# NOTE(review): the literal below ends with `mg/dL'` -- the trailing apostrophe
# looks like a copy/paste leftover from a printed repr; confirm before reuse.

contradicting_txt = "36.5 %    12.2 g/dL    122 mg/dL    1.5 mg/dL    44 mg/dL    29 mEq/L    " +\
                    "84 mEq/L    4.3 mEq/L    120 mEq/L    12.5 K/uL         [image002.jpg]                               " +\
                    "[**2143-4-21**]   12:58 AM                               [**2143-4-21**]   04:50 AM    " +\
                    "WBC                                     12.5    Hct                                     36.5    " +\
                    "Plt                                      163    Cr                                      " +\
                    "1.5                                      1.5    TropT                                     0.03    " +\
                    "Glucose                                      130                                      122    " +\
                    "Other labs: PT / PTT / INR:14.7/290.0/1.3, CK / CKMB /    Troponin-T:30/2/0.03, ALT / AST:28/55, Alk Phos " +\
                    "/ T Bili:161/0.4,    Amylase / Lipase:27/13, Albumin:2.9 g/dL, LDH:347 IU/L, Ca++:8.2 mg/dL,    " +\
                    "Mg++:3.1 mg/dL, PO4:5.1 mg/dL'"
sentence_to_modify.update_text(contradicting_txt)

print(f"\nNew contradicting sentence: {contradicting_txt}")

# Store conflict
generated_data_dict[int(hadm_id)]['contradiction'][pair_idx] = (is_sentence2, contradicting_txt)

In [None]:
# Pair indices reviewed by hand and kept as non-contradicting (negative) examples.
no_contradiction_pair_idx = [48, 49, 104]

# A pair that already received a hand-written contradiction for this admission
# cannot also be a negative: the pipeline applies the contradiction to the
# sentence in place *before* collecting negatives, so the pair would be emitted
# a second time with the rewritten text and label 0. Drop such indices (this
# list contained pair 48, which the previous cell stores as a contradiction).
already_contradicted = generated_data_dict[int(hadm_id)]['contradiction']
conflicting_pair_idx = [idx for idx in no_contradiction_pair_idx if idx in already_contradicted]
if conflicting_pair_idx:
    print(f"Not storing as negatives (already stored as contradictions): {conflicting_pair_idx}")
    no_contradiction_pair_idx = [idx for idx in no_contradiction_pair_idx
                                 if idx not in already_contradicted]

print("Examples of non-contradictions")
print("*****************************")
for pair_idx in no_contradiction_pair_idx:
    data_1 = data_inst_pairs[pair_idx][0][0]
    data_2 = data_inst_pairs[pair_idx][0][1]
    
    print(f"{data_1.type} 1:\t{data_1.txt}")
    print(f"{data_2.type} 2:\t{data_2.txt}")
    print("*****************************")
    
# Store negative examples
generated_data_dict[int(hadm_id)]['none'] = no_contradiction_pair_idx

In [None]:
import pickle

# Persist the hand-labelled pair indices and rewritten sentences of every
# admission, so the dataset can be rebuilt later without redoing the labelling.
data_dict_file = "generated_data_dict_lab.pkl"
with open(data_dict_file, "wb") as f:
    f.write(pickle.dumps(generated_data_dict))

# 5. Loading contradictions data for pipeline [skip 4 if pickle file already created]

If `generated_data_dict_lab.pkl` has already been created, skip part 4. You should still run the initial cells above "README" in that section, though.

About 2 min per HADM_ID, 20 min total

In [None]:
# 9 - positive examples
# 16 - negative examples

In [None]:
import pickle

# Pickle file expected for each conflict type.
data_dict_files = {"lab": "generated_data_dict_lab.pkl",
                   "prescription": "generated_data_dict_prescription.pkl",
                   "diagnosis": "generated_data_dict_diagnosis.pkl"}

# conflict type -> unpickled dictionary; only types whose file exists are loaded.
# NOTE: unpickling runs code embedded in the file -- only load pickles that were
# produced by this project.
data_dict_unpickled = {}
for ctype, cfile in data_dict_files.items():
    try:
        with open(cfile, "rb") as f:
            generated_unpickled = pickle.load(f)
    except FileNotFoundError:
        # A missing file is fine (that conflict type has not been labelled yet),
        # but say so instead of silently producing a smaller dataset.
        print(f"Skipping conflict type '{ctype}': {cfile} not found.")
        continue

    data_dict_unpickled[ctype] = generated_unpickled

data_dict_unpickled.keys()

In [None]:
def insert_contradictions(hadm_generated_dict, generated_dataset, conflict_type=None, pairs=None):
    """Apply one admission's stored contradictions and add them as positive examples.

    For every entry of ``hadm_generated_dict['contradiction']`` (pair index ->
    ``(is_sentence2, contradicting_txt)``) the selected sentence of that pair is
    rewritten in place via ``update_text(contradicting_txt, True)`` and the pair
    is appended to ``generated_dataset`` with label 1.

    Args:
        hadm_generated_dict: Stored labels of one admission; must contain the
            key ``'contradiction'``.
        generated_dataset: List that receives the examples; it is extended in
            place and also returned.
        conflict_type: If given, it is appended as a third tuple element.
        pairs: Pair list to index into (each entry holds the two Data instances
            in slot 0). Defaults to the notebook-level ``data_inst_pairs``, i.e.
            the pairs of whichever patient was processed last.

    Returns:
        ``generated_dataset`` with one ``((data_1, data_2), 1[, conflict_type])``
        tuple added per stored contradiction.
    """
    if pairs is None:
        # Backward-compatible fallback to the notebook global.
        pairs = data_inst_pairs

    print("+++++ Inserting contradictions +++++")
    for pair_idx, (is_sentence2, contradicting_txt) in hadm_generated_dict['contradiction'].items():
        data_1 = pairs[pair_idx][0][0]
        data_2 = pairs[pair_idx][0][1]

        # Show the pair as it was before the rewrite.
        print(f"{data_1.type} 1:\t{data_1.txt}")
        print(f"{data_2.type} 2:\t{data_2.txt}")

        sentence_to_modify = data_2 if is_sentence2 else data_1

        # Update text and reprocess features. The instance is modified in place,
        # so every other reference to it sees the new text as well.
        sentence_to_modify.update_text(contradicting_txt, True)

        print(f"\nNew contradicting sentence: {contradicting_txt}")
        print("+++++++++++++++++++++++++++++++++++")

        # The rewritten instance is one of the two members, so the pair itself
        # is the example.
        sentences = (data_1, data_2)

        if conflict_type is None:
            generated_dataset.append((sentences, 1))  # these are all contradictions
        else:  # also add the conflict type if it is given
            generated_dataset.append((sentences, 1, conflict_type))

    return generated_dataset

def insert_negative_ex(hadm_generated_dict, generated_dataset, conflict_type=None, pairs=None):
    """Add one admission's stored non-contradicting pairs as negative examples.

    Args:
        hadm_generated_dict: Stored labels of one admission; its ``'none'`` entry
            lists the pair indices to add.
        generated_dataset: List that receives the examples; it is extended in
            place and also returned.
        conflict_type: If given, it is appended as a third tuple element.
        pairs: Pair list to index into (each entry holds the two Data instances
            in slot 0). Defaults to the notebook-level ``data_inst_pairs``, i.e.
            the pairs of whichever patient was processed last.

    Returns:
        ``generated_dataset`` with one ``((data_1, data_2), 0[, conflict_type])``
        tuple added per stored index.
    """
    if pairs is None:
        # Backward-compatible fallback to the notebook global.
        pairs = data_inst_pairs

    print("+++++ Inserting negative examples +++++")
    for pair_idx in hadm_generated_dict['none']:
        data_1 = pairs[pair_idx][0][0]
        data_2 = pairs[pair_idx][0][1]

        print(f"{data_1.type} 1:\t{data_1.txt}")
        print(f"{data_2.type} 2:\t{data_2.txt}")

        if conflict_type is None:
            generated_dataset.append(((data_1, data_2), 0))  # these are all negatives
        else:  # also add the conflict type if it is given
            generated_dataset.append(((data_1, data_2), 0, conflict_type))
        print("+++++++++++++++++++++++++++++++++++")

    return generated_dataset

In [None]:
# One tuple per example: ((data 1, data 2), label, conflict type)
generated_dataset = []

for hadm_id in hadm_ids[:10]:
    print("***********************************")
    print(f"Patient {int(hadm_id)}")
    # Step 1: build the patient, which processes all of its data
    pat = Patient(
        hadm_id, notes_df, drug_df, lab_df, d_lab_df,
        med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker,
        physician_only=True,
    )

    # Step 2: enumerate this patient's pairs. The insert helpers look pairs up
    # in the notebook-level `data_inst_pairs`, so it has to be rebound here.
    df, data_inst_pairs = generate_data_pairs(pat)

    # Step 3: replay the stored labels of every conflict type for this patient
    for ctype, c_generated_data_dict in data_dict_unpickled.items():
        try:
            c_hadm_generated_dict = c_generated_data_dict[int(hadm_id)]
        except KeyError:
            print("This patient does not exist in contradiction set.")
        else:
            # Step 3A: contradictions (positives), then 3B: negatives
            generated_dataset = insert_contradictions(c_hadm_generated_dict, generated_dataset, ctype)
            generated_dataset = insert_negative_ex(c_hadm_generated_dict, generated_dataset, ctype)

In [None]:
# The label sits in slot 1 of every example: 0 = negative, 1 = contradiction.
example_labels = [example[1] for example in generated_dataset]

n = len(generated_dataset)
n_negatives = example_labels.count(0)
n_positives = example_labels.count(1)

print(f"We have {n} total examples\n\t- {n_negatives} negative examples\n\t- {n_positives} positive examples")

Processing dataset for Romanov baseline

In [None]:
generated_dataset_file_unlabeled = "processed/generated_dataset_unlabeled.txt"
generated_dataset_file_labeled   = "processed/generated_dataset_labeled.txt"
generated_dataset_file_all       = "processed/generated_dataset.txt"

# Make sure the output directory exists even when the labelling cells were skipped.
os.makedirs(os.path.dirname(generated_dataset_file_all), exist_ok=True)

# Write three tab-separated views of the dataset, one example per line:
#   unlabeled: sentence 1, sentence 2
#   labeled:   sentence 1, sentence 2, label
#   all:       sentence 1, sentence 2, label, conflict type
# Each file is opened once in "w" mode, so re-running the cell rewrites the
# files instead of appending a second copy of every row.
with open(generated_dataset_file_unlabeled, "w") as f_unlabeled, \
     open(generated_dataset_file_labeled, "w") as f_labeled, \
     open(generated_dataset_file_all, "w") as f_all:
    for ((data_1, data_2), label, ctype) in generated_dataset:
        f_unlabeled.write(f"{data_1.txt}\t{data_2.txt}\n")
        f_labeled.write(f"{data_1.txt}\t{data_2.txt}\t{label}\n")
        f_all.write(f"{data_1.txt}\t{data_2.txt}\t{label}\t{ctype}\n")

## Feature Generation

In [None]:
def check_if_number_neg_equal_umls(s1, s2):
    """Compare how many negation tokens the two sentences contain.

    A token counts as a negation when its dependency label (``dep_``) is
    ``'neg'`` in the sentence's ``umls_doc``.

    Returns:
        0 if both sentences contain the same number of negation tokens,
        1 otherwise.
    """
    def count_negations(doc):
        return sum(1 for tok in doc if tok.dep_ == 'neg')

    return int(count_negations(s1.umls_doc) != count_negations(s2.umls_doc))

In [None]:
def create_dep_encoding(s1, s2):
    """Similarity of the two sentences' root words plus their direct dependents.

    For each sentence, every token labelled ``ROOT`` in its ``umls_doc`` is
    collected together with its children (punctuation children excluded). The
    collected words are joined into a string, run through ``umls_nlp`` again,
    and the two resulting documents are compared with ``.similarity``.

    Returns:
        Whatever ``.similarity`` returns for the two reduced documents.
    """
    def root_clause_text(doc):
        words = []
        for token in doc:
            if token.dep_ != "ROOT":
                continue
            root = doc[token.i]
            words.append(root.text)
            words.extend(child.text for child in root.children if child.dep_ != "punct")
        return list_to_string(words)

    first_reduced = umls_nlp(root_clause_text(s1.umls_doc))
    second_reduced = umls_nlp(root_clause_text(s2.umls_doc))
    return first_reduced.similarity(second_reduced)

def list_to_string(list1):
    """Concatenate the strings in ``list1``, putting a space in front of each one.

    The result therefore starts with a space unless ``list1`` is empty, in
    which case the empty string is returned.
    """
    return "".join(" " + element for element in list1)

In [None]:
def check_shared_feature_umls(s1,s2):
    """Tell whether two Sentence instances have at least one feature in common.

    The ``features`` of both sentences (e.g. ``{'Scanning'}``) are intersected.

    Returns:
        1 if the sentences share at least one feature, 0 if they share none.

    NOTE(review): the previous description of this function stated the inverse
    (1 when nothing is shared). The code has always returned 1 for a shared
    feature; that behaviour is kept here -- confirm it is the intended one.
    """
    common_features = set(s1.features).intersection(s2.features)
    return 1 if common_features else 0

In [None]:
def check_shared_feature_med7(s1,s2):
    """Flag pairs that mention the same drug but differ in their Med7 entities.

    ``med7_entities`` is a list of ``(text, label)`` tuples such as
    ``[('2U', 'DOSAGE'), ('PRBC', 'DRUG')]``.

    Returns:
        1 if the sentences have at least one ``DRUG`` text in common and their
        entity lists are not equal; 0 otherwise (no common drug, or identical
        entity lists).
    """
    entities_1, entities_2 = s1.med7_entities, s2.med7_entities

    drugs_1 = {text for (text, label) in entities_1 if label == "DRUG"}
    drugs_2 = {text for (text, label) in entities_2 if label == "DRUG"}

    # Without a common drug the pair is not about the same prescription.
    if drugs_1.isdisjoint(drugs_2):
        return 0

    # Same drug: it only counts when some other detail differs.
    return 1 if entities_1 != entities_2 else 0

In [None]:
def get_feature_df(s1, s2, label, conflict_type, pair_id):
    """Compute the feature row for one pair of Sentence instances.

    Despite the name, the result is a plain list (one row), not a DataFrame.

    Returns:
        ``[neg_check_umls, check_umls, check_med7, sentence_1, sentence_2,
        dep_similarity, label, conflict_type, pair_id]`` where the first three
        entries and ``dep_similarity`` come from the feature helpers defined
        above, and the sentence entries are the raw ``txt`` of ``s1`` / ``s2``.
    """
    first_txt, second_txt = s1.txt, s2.txt

    # Negation count mismatch between the two sentences
    negation_mismatch = check_if_number_neg_equal_umls(s1, s2)

    # Shared UMLS features / shared Med7 drug with differing details
    shares_umls = check_shared_feature_umls(s1,s2)
    shares_med7 = check_shared_feature_med7(s1,s2)

    # Similarity of the root words and their direct dependents
    root_similarity = create_dep_encoding(s1, s2)

    return [negation_mismatch, shares_umls, shares_med7,
            first_txt, second_txt, root_similarity,
            label, conflict_type, pair_id]

### Scaling up features for generated dataset

In [None]:
# One feature row per generated example; the position in `generated_dataset`
# serves as the pair id.
total_features = []

for pair_id, ((s1, s2), label, conflict_type) in enumerate(tqdm(generated_dataset)):
    total_features.append(get_feature_df(s1, s2, label, conflict_type, pair_id))

In [None]:
# Column names, in the order in which `get_feature_df` emits its values.
cols = [
    "neg_check_umls",
    "check_umls",
    "check_med7",
    "sentence_1",
    "sentence_2",
    "dep_sim",
    "contradiction?",
    "conflict_type",
    "pair_id",
]

total_features_df = pd.DataFrame(total_features, columns=cols)

In [None]:
# Save the feature table; with pandas' default settings the row index is written too.
total_features_df.to_csv("generated_features.csv")

# 6. Load and process hand-labeled MIMIC data

In [None]:
class DailyDataNull:
    """Stand-in for a DailyData object (e.g. Note, PrescriptionOrder, LabResults).

    It only stores the pipelines and linkers it is given and carries no
    timestamp.
    """

    def __init__(self, umls, rxnorm, med7, umls_linker, rxnorm_linker):
        # the three pipelines
        self.umls, self.rxnorm, self.med7 = umls, rxnorm, med7

        # the two entity linkers
        self.umls_linker, self.rxnorm_linker = umls_linker, rxnorm_linker

        # a placeholder has no associated time
        self.time = None

In [None]:
# Show which Python interpreter backs this kernel (environment sanity check).
sys.executable

In [None]:
# Load the per-patient hand-labeled workbooks (set A).
# The part of each file name before the first '.' is taken as the hadm_id and
# is also used as the sheet name to read; files whose stem is not an integer
# are skipped.
hand_labeled_dir = "hand_labeled_data"
raw_data_list = []

# os.listdir returns entries in arbitrary order; sort them so the row order
# of the concatenated frame (and the pair ids later derived from it) is
# the same on every run.
for xlsx_file in sorted(os.listdir(hand_labeled_dir)):
    hadm_id = xlsx_file.split('.')[0]

    try:  # skip non pt id data
        hadm_id_int = int(hadm_id)
    except ValueError:
        continue

    data_xls = pd.read_excel(os.path.join(hand_labeled_dir, xlsx_file),
                             sheet_name=hadm_id)
    data_xls['hadm_id'] = hadm_id_int

    raw_data_list.append(data_xls)

labeled_setA_df = pd.concat(raw_data_list)

In [None]:
# Load the pairs that came from the Google doc (set B); the workbook lives
# in the same directory as the per-patient files.
gdoc_data_file = 'from_google_doc_samedayonly.xlsx'
gdoc_data_path = os.path.join(hand_labeled_dir, gdoc_data_file)

# '129414' is just a random sheet name
labeled_setB_df = pd.read_excel(gdoc_data_path, sheet_name='129414')

In [None]:
# NOTE(review): this looks like a leftover debugging expression. As far as
# this section shows, `row` is only assigned by the iterrows() loops in the
# cells further down -- confirm this cell survives Restart & Run All.
row['label'] == -1

## Process the dataset

In [None]:
# Placeholder data object holding the shared pipelines and linkers; it is
# passed as the first argument to every Sentence built from the hand-labeled
# pairs in the next cell.
nullnote = DailyDataNull(umls_nlp, rxnorm_nlp, med7_nlp,
                         umls_linker, rxnorm_linker)

In [None]:
def _build_labeled_pairs(labeled_df, s1_col, s2_col, hadm_col, label_col='label'):
    """Turn the rows of one hand-labeled sheet into Sentence pairs.

    Args:
        labeled_df: DataFrame with one sentence pair per row.
        s1_col, s2_col: column names holding the two sentence texts.
        hadm_col: column name holding the admission id.
        label_col: column name holding the label; rows labeled -1 are skipped.

    Returns:
        list of ((sentence1, sentence2), label, hadm_id) tuples in row order.
    """
    pairs = []
    # iterrows() has no length, so give tqdm the total explicitly
    for _, row in tqdm(labeled_df.iterrows(), total=len(labeled_df)):
        s1_txt = row[s1_col]
        s2_txt = row[s2_col]
        hadm_id = row[hadm_col]
        label = row[label_col]

        if label == -1:
            continue

        # create the sentences
        sentence1 = Sentence(nullnote, None,
                             filter_map=SEMANTIC_TYPE_TO_NAME,
                             conflict_map=CONFLICT_TO_SEMANTIC_TYPE,
                             sentence=s1_txt)
        sentence2 = Sentence(nullnote, None,
                             filter_map=SEMANTIC_TYPE_TO_NAME,
                             conflict_map=CONFLICT_TO_SEMANTIC_TYPE,
                             sentence=s2_txt)

        pairs.append(((sentence1, sentence2), label, hadm_id))
    return pairs


hand_labeled_datas = []  # list of tuples ((s1, s2), label, hadm_id)

# Set A and set B hold the same fields under differently-cased column names.
# Process set A
hand_labeled_datas.extend(
    _build_labeled_pairs(labeled_setA_df, 'sentence 1', 'sentence 2', 'hadm_id'))

# Process set B
hand_labeled_datas.extend(
    _build_labeled_pairs(labeled_setB_df, 'Sentence 1', 'Sentence 2', 'Hadm_id'))

## Feature Generation

In [None]:
def is_diagnosis(stype):
    """True when `stype` is listed under CONFLICT_TO_SEMANTIC_TYPE['diagnosis']."""
    return stype in CONFLICT_TO_SEMANTIC_TYPE['diagnosis']


def is_med(stype):
    """True when `stype` is listed under CONFLICT_TO_SEMANTIC_TYPE['med_allergy']."""
    return stype in CONFLICT_TO_SEMANTIC_TYPE['med_allergy']


def is_test(stype):
    """True when `stype` is listed under CONFLICT_TO_SEMANTIC_TYPE['test_results']."""
    return stype in CONFLICT_TO_SEMANTIC_TYPE['test_results']


def get_conflict_type(s1, s2):
    """List the conflict categories that both sentences could take part in.

    A category is reported when each of `s1.semantic_types` and
    `s2.semantic_types` contains at least one semantic type of that category.

    Returns:
        list drawn from "diagnosis", "med", "test", always in that order;
        empty when the sentences share no category.
    """
    category_checks = (
        ("diagnosis", is_diagnosis),
        ("med", is_med),
        ("test", is_test),
    )

    conflict_types = []
    for category, belongs in category_checks:
        found_in_s1 = any(belongs(stype) for stype in s1.semantic_types)
        found_in_s2 = any(belongs(stype) for stype in s2.semantic_types)
        if found_in_s1 and found_in_s2:
            conflict_types.append(category)

    return conflict_types

In [None]:
"""
Use above methods to get features for data
s1, s2 are Sentence instances
"""
def get_feature_df_mimic(s1, s2, hadm_id, pair_id, label):
    """Build the feature row for one hand-labeled MIMIC sentence pair.

    Despite the name, the result is a plain list rather than a DataFrame.

    Args:
        s1, s2: Sentence instances.
        hadm_id: admission id the pair belongs to (passed through).
        pair_id: identifier of the pair (passed through).
        label: hand-assigned label of the pair (passed through).

    Returns:
        list ordered as [neg_check_umls, check_umls, check_med7,
        sentence_1, sentence_2, dep_sim, hadm_id, conflict_type, pair_id,
        label], where conflict_type is the list from get_conflict_type.
    """
    first_text, second_text = s1.txt, s2.txt

    # negation check on the two sentences
    negation_feature = check_if_number_neg_equal_umls(s1, s2)

    # shared-feature checks for UMLS and Med7
    umls_feature = check_shared_feature_umls(s1, s2)
    med7_feature = check_shared_feature_med7(s1, s2)

    # dependent-children similarity
    dep_feature = create_dep_encoding(s1, s2)

    # conflict categories shared by the pair (a list)
    pair_conflicts = get_conflict_type(s1, s2)

    return [
        negation_feature, umls_feature, med7_feature,
        first_text, second_text, dep_feature,
        hadm_id, pair_conflicts, pair_id, label,
    ]

In [None]:
# total_features_df = pd.DataFrame()
hand_labeled_features = []

for pair_id in tqdm(range(len(hand_labeled_datas))):
    (s1, s2), label, hadm_id = hand_labeled_datas[pair_id]
    features = get_feature_df_mimic(s1, s2, hadm_id, pair_id, label)
    hand_labeled_features.append(features)

In [None]:
# Column names, in the same order as the list returned by
# get_feature_df_mimic for labeled pairs (label comes last).
cols = [
    "neg_check_umls", "check_umls", "check_med7",
    "sentence_1", "sentence_2", "dep_sim",
    "hadm_id", "conflict_type", "pair_id", "contradiction?",
]

hand_labeled_features_df = pd.DataFrame(data=hand_labeled_features, columns=cols)

In [None]:
# Persist the hand-labeled features. The DataFrame index is written too
# (pandas default), so it appears as an unnamed first column in the CSV.
hand_labeled_features_df.to_csv("hand_labeled_mimic_features.csv")

In [None]:
# Sanity check: how many distinct hadm_id values the hand-labeled pairs cover.
hand_labeled_features_df.hadm_id.unique().shape

In [None]:
# Sanity check: (number of hand-labeled pairs kept, number of feature columns).
hand_labeled_features_df.shape

# 7. Generating evaluation data (unlabeled) from MIMIC

We'll avoid the first 10 patients since they were used for generated contradictions

In [None]:
# Output directory for the evaluation data; exist_ok=True keeps this cell
# safe to re-run when the directory is already there.
processed_dir = "processed"
os.makedirs(processed_dir, exist_ok=True)

In [None]:
# Build the unlabeled evaluation pairs for patients 10-19 of hadm_ids (the
# first 10 are skipped, per the note above, because they were used for the
# generated contradictions).
# NOTE: `pat` and `df` keep the LAST patient's values once the loop ends, and
# later cells read them -- keep that in mind when re-running cells out of order.
per_pat_dataset_dict = {} # maps HADMID to patient's dataset in the form [((data 1, data 2), label), ...]
df_list = []  # one pair table per patient, concatenated in the next cell
for hadm_id in hadm_ids[10:20]:
    print("***********************************")
    print(f"Patient {int(hadm_id)}")
        
    # Step 1: Select a patient -- process all data
    pat = Patient(hadm_id, notes_df, drug_df, lab_df, d_lab_df, \
                  med7_nlp, umls_nlp, rxnorm_nlp, umls_linker, rxnorm_linker, \
                  physician_only=True)

    # Step 2: Generate pairs for this patient
    df, data_inst_pairs = generate_data_pairs(pat)
    df['HADM_ID'] = hadm_id  # tag every row so the tables can be stacked
    per_pat_dataset_dict[hadm_id] = data_inst_pairs
    df_list.append(df)
    
#     df.to_csv(f"{processed_dir}/{int(hadm_id)}.csv")

In [None]:
# Stack the per-patient pair tables into one frame.
full_df = pd.concat(df_list)

# Save the combined frame. `df` is only the last patient's table left over
# from the loop, so writing it would drop every other patient from the file.
full_df.to_csv(f"{processed_dir}/mimic_qual_eval.csv")

## Feature Generation

In [None]:
# NOTE(review): this cell repeats the definitions from the hand-labeled
# "Feature Generation" section; running it simply rebinds the same names.
def is_diagnosis(stype):
    """True when `stype` is listed under CONFLICT_TO_SEMANTIC_TYPE['diagnosis']."""
    return stype in CONFLICT_TO_SEMANTIC_TYPE['diagnosis']


def is_med(stype):
    """True when `stype` is listed under CONFLICT_TO_SEMANTIC_TYPE['med_allergy']."""
    return stype in CONFLICT_TO_SEMANTIC_TYPE['med_allergy']


def is_test(stype):
    """True when `stype` is listed under CONFLICT_TO_SEMANTIC_TYPE['test_results']."""
    return stype in CONFLICT_TO_SEMANTIC_TYPE['test_results']


def get_conflict_type(s1, s2):
    """List the conflict categories that both sentences could take part in.

    A category is reported when each of `s1.semantic_types` and
    `s2.semantic_types` contains at least one semantic type of that category.

    Returns:
        list drawn from "diagnosis", "med", "test", always in that order;
        empty when the sentences share no category.
    """
    category_checks = (
        ("diagnosis", is_diagnosis),
        ("med", is_med),
        ("test", is_test),
    )

    conflict_types = []
    for category, belongs in category_checks:
        found_in_s1 = any(belongs(stype) for stype in s1.semantic_types)
        found_in_s2 = any(belongs(stype) for stype in s2.semantic_types)
        if found_in_s1 and found_in_s2:
            conflict_types.append(category)

    return conflict_types

In [None]:
"""
Use above methods to get features for data
s1, s2 are Sentence instances
"""
def get_feature_df_mimic(s1, s2, hadm_id, pair_id, label=None):
    """Build the feature row for one MIMIC sentence pair.

    This definition replaces the earlier five-argument one of the same name.
    `label` is accepted as an optional trailing argument so that the
    hand-labeled cell can still be re-run after this cell has executed
    (it would otherwise fail with a TypeError).

    Despite the name, the result is a plain list rather than a DataFrame.

    Args:
        s1, s2: Sentence instances.
        hadm_id: admission id the pair belongs to (passed through).
        pair_id: identifier of the pair (passed through).
        label: optional label of the pair. When omitted (None) the row has
            no label entry, which is the unlabeled-evaluation layout.

    Returns:
        list ordered as [neg_check_umls, check_umls, check_med7,
        sentence_1, sentence_2, dep_sim, hadm_id, conflict_type, pair_id],
        with `label` appended as a tenth entry only when it was supplied.
    """
    sentence_1 = s1.txt
    sentence_2 = s2.txt

    # check negation in docs
    neg_check_umls = check_if_number_neg_equal_umls(s1, s2)

    # check shared features in UMLS and Med7
    check_umls = check_shared_feature_umls(s1, s2)
    check_med7 = check_shared_feature_med7(s1, s2)

    # find dependent children similarity
    dep_similarity = create_dep_encoding(s1, s2)

    # get conflict type -- returns list
    conflict_type = get_conflict_type(s1, s2)

    sentence_info = [neg_check_umls, check_umls, check_med7,
                     sentence_1, sentence_2, dep_similarity,
                     hadm_id, conflict_type, pair_id]

    # Labeled callers get the same 10-entry layout as the earlier definition.
    if label is not None:
        sentence_info.append(label)

    return sentence_info

### Scaling up features for the MIMIC evaluation dataset

In [None]:
# total_features_df = pd.DataFrame()
total_features = []

for hadm_id, hadm_mimic_dataset in per_pat_dataset_dict.items():
    print(f"Processing patient {int(hadm_id)}")
    for pair_id in tqdm(range(len(hadm_mimic_dataset))):
        (s1, s2), _ = hadm_mimic_dataset[pair_id]
        features = get_feature_df_mimic(s1, s2, hadm_id, pair_id)
        total_features.append(features)

In [None]:
# Column names, in the same order as the list returned by
# get_feature_df_mimic for unlabeled pairs.
cols = [
    "neg_check_umls", "check_umls", "check_med7",
    "sentence_1", "sentence_2", "dep_sim",
    "hadm_id", "conflict_type", "pair_id",
]

mimic_total_features_df = pd.DataFrame(data=total_features, columns=cols)

In [None]:
# Persist the unlabeled MIMIC features. The DataFrame index is written too
# (pandas default), so it appears as an unnamed first column in the CSV.
mimic_total_features_df.to_csv("mimic_features.csv")

### Getting History + Allergy Information - @Sharon, you can ignore everything below

In [None]:
# todo: 
# - DONE function to re-process all data from Patient instance -- pat.process_notes(); pat.process_by_date()
# - function to update Note -- should update dataframe of patient directly
#   - can go back to dataframe, but can't map tokenized sentence to original note in df -- todo
#   - function to update tokenized sentence
# - later: function to update original dataframe from patient dataframe

import re

def get_section(regex_dict, txt):
    """Locate a section of `txt` delimited by a start and an end regex.

    Args:
        regex_dict: mapping with a "start" and an "end" regex pattern.
        txt: the text to search.

    Returns:
        (start, end): the index where the start pattern begins and the index
        where the first end pattern found *after* it begins, so that
        txt[start:end] is the section including its header.
        (None, None) when the start pattern is absent or no end pattern
        follows it, i.e. the section does not exist.
    """
    start_match = re.search(regex_dict["start"], txt)
    if start_match is None:
        return None, None

    # Only look for the end marker past the section header. Searching from
    # the beginning of the text could pick up an earlier occurrence of the
    # end pattern and return end < start, i.e. an empty slice.
    end_match = re.compile(regex_dict["end"]).search(txt, start_match.end())
    if end_match is None:
        return None, None

    return start_match.start(), end_match.start()

# Try the section extraction on a single note. `pat` is whatever patient the
# evaluation loop above processed last.
note = pat.notes[4]

# Sections to store
# note: most of these sections have already been removed,
#       but if they haven't might have to remove then
#       reprocess everything
allergy_regex = dict(start="Allergies:", end="Last dose of Antibiotics:")
history_regex = dict(start="Past medical history:", end="Other:")

allergy_start, allergy_end = get_section(allergy_regex, note.txt)
history_start, history_end = get_section(history_regex, note.txt)

# A missing section is represented by an empty string.
if allergy_start is None:
    pt_allergies = ""
else:
    pt_allergies = note.txt[allergy_start:allergy_end]

if history_start is None:
    pt_histories = ""
else:
    pt_histories = note.txt[history_start:history_end]

# Preview the first 100 characters of each section.
print("******** Allergies ********")
print(pt_allergies[:100])
print("******** Histories ********")
print(pt_histories[:100])