In [1]:
import json
import os
import re

import pandas as pd
import spacy

In [2]:
data_dir = os.path.abspath("data/training_20180910")
data_dir

'/home/rakshit/Documents/learn/git/clinical-notes/data/training_20180910'

## Try to read annotations

In [3]:
tpat = re.compile("(T\d+)\s+([a-zA-Z\-]+)\s+(\d+\s+\d+(\s*;\s*\d+\s+\d+)*)\s+(.*)")
rpat = re.compile("(R\d+)\s+([a-zA-Z\-]+)\s+Arg1:(T\d+)\s+Arg2:(T\d+)")

In [None]:
with open("data/training_20180910/107047.ann") as f:
    for line in f:
        if line.startswith("T"):
            match = re.match(tpat, line.strip())
            groups = match.groups()
            pos = groups[2].split(";")
            se = pos[0].strip().split()
            start = int(se[0].strip())
            if len(pos) > 1:
                se = pos[-1].strip().split()
            end = int(se[1].strip())
            positions = [{"start": int(se[0].strip()), "end": int(se[1].strip())}]
#             for p in pos:
#                 se = p.strip().split()
#                 positions.append({"start": int(se[0].strip()), "end": int(se[1].strip())})
            if groups[4] == ""
            print({
                "tag": groups[0],
                "entity": groups[1],
                "positions": positions,
                "text": groups[4]
            })
            break
        else:
            match = re.match(rpat, line.strip())
            groups = match.groups()
            print({
                "tag": groups[0],
                "entity": groups[1],
                "arg1": groups[2],
                "arg2": groups[3]
            })
            break

## Try to read patient file

In [None]:
patient_keys = {
    re.compile("admission date:\s*\[\*+(\d+\-\d+\-\d+)\*+\]"): "Admission Date:",
    re.compile("discharge date:\s*\[\*+(\d+\-\d+\-\d+)\*+\]"): "Discharge Date:",
    re.compile("date of birth:\s*\[\*+(\d+\-\d+\-\d+)\*+\]"): "Date of Birth:",
    re.compile("sex:\s*([mf])"): "Sex:",
    re.compile("service:\s*(.*)"): "Service:",
    re.compile("attending:\s*\[\*+(.*)\*+\]"): "Attending:",
    re.compile("present illness:\s*(.*)"): "History of Present Illness:",
    re.compile("allergies:\s*(.*)"): "Allergies:",
    re.compile("chief complaint:\s*(.*)"): "Chief Complaint:",
    re.compile("major surgical or invasive procedure:\s*(.*)"): "Major Surgical or Invasive Procedure:",
    re.compile("past medical history:\s*(.*)"): "Past Medical History:",
    re.compile("social history:\s*(.*)"): "Social History:",
    re.compile("family history:\s*(.*)"): "Family History:",
    re.compile("physical exam:\s*(.*)"): "Physical Exam:",
    re.compile("pertinent results:\s*(.*)"): "Pertinent Results:",
    re.compile("clinical information:\s*(.*)"): "CLINICAL INFORMATION:",
    re.compile("findings:\s*(.*)"): "FINDINGS:",
    re.compile("brief hospital course:\s*(.*)"): "Brief Hospital Course:",
    re.compile("medications on admission:\s*(.*)"): "Medications on Admission:",
    re.compile("discharge medications:\s*(.*)"): "Discharge Medications:",
    re.compile("discharge disposition:\s*(.*)"): "Discharge Disposition:",
    re.compile("facility:\s*(.*)"): "Facility:",
    re.compile("discharge diagnosis:\s*(.*)"): "Discharge Diagnosis:",
    re.compile("discharge condition:\s*(.*)"): "Discharge Condition:",
    re.compile("discharge instructions:\s*(.*)"): "Discharge Instructions:",
    re.compile("followup instructions:\s*(.*)"): "Followup Instructions:"
}
multi_line_patient_keys = {
    "Allergies:", "Chief Complaint:", "Major Surgical or Invasive Procedure:",
    "History of Present Illness:", "Past Medical History:", "Social History:",
    "Family History:", "Physical Exam:", "Pertinent Results:", "CLINICAL INFORMATION:",
    "FINDINGS:", "Brief Hospital Course:", "Medications on Admission:",
    "Discharge Medications:", "Discharge Disposition:", "Facility:", "Discharge Diagnosis:",
    "Discharge Condition:", "Discharge Instructions:", "Followup Instructions:"
}

In [None]:
patient_file = {}
key = None
with open("data/training_20180910/134445.txt") as f:
    for line in f:
        sl = line.strip()
        single_line_data = {}
        for pat, val in patient_keys.items():
            match = re.search(pat, sl.lower())
            if match:
                single_line_data[val] = match.groups()[0] + " "
        if single_line_data:
            patient_file.update(single_line_data)
            multi = multi_line_patient_keys.intersection(set(single_line_data.keys()))
            if multi:
                key = multi.pop()
                continue
            else:
                key = None
        if key:
            patient_file[key] += line
print(json.dumps(patient_file))

In [None]:
data = {}
annotator_comments = {}
for f in os.listdir(data_dir):
    fp = os.path.join(data_dir, f)
    if os.path.isfile(fp):
        key = f[:-4]
        if key not in data:
            data[key] = {"patient_id": key}
        if fp.endswith(".ann"):
            if key not in annotator_comments:
                annotator_comments[key] = []
            with open(fp) as fl:
                anns = {"T": [], "R": []}
                for line in fl:
                    if line.startswith("#"):
                        annotator_comments[key].append(line)
                    elif line.startswith("T"):
                        match = re.match(tpat, line.strip())
                        groups = match.groups()
                        pos = groups[2].split(";")
                        positions = []
                        for p in pos:
                            se = p.strip().split()
                            positions.append({"start": se[0], "end": se[1]})
                        anns["T"].append({
                            "tag_type": "T",
                            "tag": groups[0],
                            "entity": groups[1],
                            "positions": positions,
                            "text": groups[4]
                        })
                    else:
                        match = re.match(rpat, line.strip())
                        if not match:
                            print(f)
                            print(line)
                            print("*" * 80)
                        groups = match.groups()
                        anns["R"].append({
                            "tag_type": "R",
                            "tag": groups[0],
                            "entity": groups[1],
                            "arg1": groups[2],
                            "arg2": groups[3]
                        })
                data[key]["annotations"] = anns
                data[key]["annotations_dump"] = json.dumps(anns)
        elif fp.endswith(".txt"):
            file_key = None
            patient_file = {}
            with open(fp) as fl:
                text = fl.read()
            data[key]["full_text"] = text
            match = re.search(re.compile("present illness:"), text.lower())
            if match:
                data[key]["present_history_start"] = match.span()[1]
#             try:
#                 data[key]["present_history_start"] = text.index("History of Present Illness:") + 28
#             except ValueError:
#                 print("*" * 80)
#                 print(key)
#                 print("*" * 80)
            for line in text.split("\n"):
                sl = line.strip()
                single_line_data = {}
                for pat, val in patient_keys.items():
                    match = re.search(pat, sl.lower())
                    if match:
                        single_line_data[val] = match.groups()[0] + " "
                if single_line_data:
                    patient_file.update(single_line_data)
                    multi = multi_line_patient_keys.intersection(set(single_line_data.keys()))
                    if multi:
                        file_key = multi.pop()
                        continue
                    else:
                        file_key = None
                if file_key:
                    patient_file[file_key] += line
            data[key].update(patient_file)
        else:
            raise IOError(f"Error Reading file {fp}")
        if "present_history_start" in data[key] and "annotations" in data[key]:
            data[key]["present_history_end"] = data[key]["present_history_start"] + len(data[key]["History of Present Illness:"])
            atomic = pd.DataFrame(data[key]["annotations"]["T"])
            relations = pd.DataFrame(data[key]["annotations"]["R"])
            anf = atomic[atomic.positions.map(lambda x: any(int(xi["start"]) < data[key]["present_history_end"] and int(xi["start"]) >= data[key]["present_history_start"] for xi in x))]
            rnf = relations[relations.arg1.isin(anf.tag)&relations.arg2.isin(anf.tag)]
            hist_anns = {"T": anf.to_dict(orient="records"), "R": rnf.to_dict(orient="records")}
            data[key]["present_history_annotations"] = hist_anns
            data[key]["present_history_annotations_dump"] = json.dumps(hist_anns)
data.keys()

In [None]:
[(key, val) for key, val in annotator_comments.items() if val]

In [None]:
df = pd.DataFrame(list(data.values()))
df = df.drop(["present_history_annotations", "annotations"], axis=1)
df.head()

In [None]:
df2 = df[["patient_id", "Chief Complaint:", "History of Present Illness:", "present_history_annotations_dump"]]
df2.head()

In [None]:
df.to_csv("data/complete_patient_data.csv", index=False)
df2.to_csv("data/present_history_data.csv", index=False)

In [None]:
with open("data/training_20180910/134445.txt") as f:
    text = f.read()
text[:2000]

## Try keyword extraction with scispacy models

In [None]:
nlp_lg = spacy.load("en_core_sci_lg")
# nlp_sm = spacy.load("en_core_sci_sm")
# nlp_md = spacy.load("en_core_sci_md")
nlp = spacy.load("en_core_sci_scibert")

In [None]:
text = data["100035"]["History of Present Illness:"]
nlp(text).ents

In [None]:
text = data["100035"]["History of Present Illness:"]
nlp_lg(text).ents

## Try keyword extraction on "Chief Complaint:" sections to see different kinds of complaints

In [None]:
complaint = data["106621"]["Chief Complaint:"]
doc2 = nlp(complaint)
doc2.ents

In [None]:
complaints = {}
for pid, patient_file in data.items():
    complaint = patient_file.get("Chief Complaint:", "")
    cdoc = nlp(complaint)
    complaints[pid] = cdoc.ents
complaints

In [None]:
complaint_docs = {}
for pid, patient_file in data.items():
    complaint = patient_file.get("Chief Complaint:", "")
    cdoc = nlp(complaint.lower())
    text = patient_file.get("History of Present Illness:", "")
    doc = nlp(text.lower())
    for ent in cdoc.ents:
        key = ent.text.strip()
        if key not in complaint_docs:
            complaint_docs[key] = []
        complaint_docs[key].append(doc)
complaint_docs.keys()

## Check similarity scores of keywords extracted from "History of Present Illness:"

In [None]:
set1 = set(ent.text for ent in complaint_docs["hypotension"][0].ents)
set1

In [None]:
pos_tag = ['PROPN', 'ADJ', 'NOUN']
set2 = set(token.text for token in complaint_docs["hypotension"][0] if token.text not in nlp.Defaults.stop_words and token.pos_ in pos_tag)
set2

In [None]:
set3 = set1.intersection(set2)
set3

In [None]:
token_1=nlp_lg("hypotension")
for tok in set3:
    token_2=nlp_lg(tok)
    similarity_score=token_1.similarity(token_2)
    print(tok, "---", similarity_score)

## Above scores show words like "lightheadedness", "myalgias", "pain", etc from "History  of Present Illness" section having higher scores with the word "hypotension" from "Chief Complaint". This could be good news.

## ========================================================================

## Try to get entities from "History of Present Illness" sorted based on the above similarity scores and limit to top 20 entities.

In [None]:
top_complaint_entities = {}
for complaint, docs in complaint_docs.items():
    if not complaint:
        continue
    token_1=nlp_lg(complaint)
    ents = []
    doc_ents = list(set(ent.text for doc in docs for ent in doc.ents))
    for ent in doc_ents:
        token_2=nlp_lg(ent)
        similarity_score=token_1.similarity(token_2)
        ents.append((-similarity_score, ent))
    top_complaint_entities[complaint] = list(zip(*(sorted(ents)[:20])))[1]
top_complaint_entities

In [None]:
token_1=nlp_lg("sob")
token_2=nlp_lg("vancomycin 1 gm")
token_1.similarity(token_2)

## Next, we extract top keywords on document level. We do it in following steps:
### - Extract entities from "Chief Complaint:" section.
### - Extract top 10 keywords for each entity from "History of Present Illness:" section
### - Sort all top keywords based on combined similarity score and keep top 10

In [None]:
def extract_keywords(row):
    complaint = row["Chief Complaint:"].lower()
    text = row["History of Present Illness:"].lower()
    # If we were not able to extract complaint or history, skip
    if not (complaint and text):
        row["top_keywords"] = []
        return row
    
    cdoc = nlp_lg(complaint)
    doc = nlp_lg(text)
    ents = {}
    combined = {}
    # Calculate scores for each entity in chief complaint with each entity
    # in history of present illness.
    for cent in cdoc.ents:
        token_1=nlp_lg(cent.text)
        ents[cent.text] = []
        for ent in doc.ents:
            token_2=nlp_lg(ent.text)
            similarity_score=token_1.similarity(token_2)
            if ent.text not in combined:
                combined[ent.text] = 1
            # We will take combined score as the multiplication of all the scores for now
            combined[ent.text] *= similarity_score
            ents[cent.text].append({"score": similarity_score, "entity": ent.text})
    # Add combined score with all complaint entities to each history entity
    result_ents = {}
    for cent, history_ents in ents.items():
        result_ents[cent] = []
        for ent in history_ents:
            ent["combined_score"] = combined[ent["entity"]]
            result_ents[cent].append(ent)
    # Sort and get top 10 history entities for each complaint entity
    ents = []
    for cent in result_ents:
        top10 = sorted(result_ents[cent], key=lambda x: -x["score"])[:10]
        ents += top10
    # Finally sort and take top 10 entities based on combined score
    row["top_keywords"] = json.dumps([ent["entity"] for ent in sorted(ents, key=lambda x: -x["combined_score"])[:10]])
    return row

df3 = df2.fillna("").apply(extract_keywords, axis=1)

In [None]:
df3.head()

In [None]:
df3.to_csv("data/output.csv", index=False)

## The above method captures some important keywords.
## The problem with the method is - because of similarity score, the method is only able to pickup words pertaining to the disease/problem and not the medicines if any were given.
## This could be because spacy's model is a statistical model and the model might have not seen the medicines in context of the issues.

## So we will try to train the entity model to give better entities.
## We find the clinical notes data to train our model from Harvard DBMI portal - https://portal.dbmi.hms.harvard.edu/

## =======================================================================

## We will do the task in 3 steps:
### - First we preprocess the data to convert it into spacy binary format.
### - Train the NER model using spacy train
### - Evaluate the NER model using spacy evaluate

# Preprocessing:

## NOTE: My request to data access is pending, so, for now, let's assume that the data is in same format as this assignment. If there are any changes, corresponding changes can be made to the preprocess script.

In [50]:
import random
import shutil

from typing import Any, Dict, List

from spacy.util import get_words_and_spaces, compile_infix_regex
from spacy.tokens import Doc, DocBin
from spacy.tokenizer import Tokenizer

In [5]:
new_dir = os.path.abspath(os.path.join("data", "data"))
corpus_dir = os.path.abspath(os.path.join("data", "corpus"))
if not os.path.isdir(corpus_dir):
    os.makedirs(corpus_dir)

In [72]:
def separate_train_eval(old_dir: str, new_dir: str, split_ratio: float=0.8):
    """
    Function to separate data in training and validation.
    
    Args:
        old_dir: str, Current data directory
        new_dir: str, New data directory with train and validation separated
        split_ratio: float, Fraction of data to keep in training.
                     Defaults to 80-20 train and validation split.
    
    Returns: None
    """
    train_dir = os.path.join(new_dir, "train")
    eval_dir = os.path.join(new_dir, "eval")
    if not os.path.isdir(train_dir):
        os.makedirs(train_dir)
    if not os.path.isdir(eval_dir):
        os.makedirs(eval_dir)
    for f in os.listdir(old_dir):
        ofp = os.path.join(old_dir, f)
        if os.path.isfile(ofp) and ofp.endswith(".txt"):
            if random.random() <= split_ratio:
                nfp = os.path.join(train_dir, f)
            else:
                nfp = os.path.join(eval_dir, f)
            shutil.copy(ofp, nfp)
            shutil.copy(ofp.replace(".txt", ".ann"), nfp.replace(".txt", ".ann"))
            
def get_file_paths(data_dir: str) -> Dict[str, Dict[str, str]]:
    """
    Function to return all file paths from a data directory in a json format.
    
    Args:
        data_dir: str, Data directory containing annotation and patient files.
    
    Return: dict
    """
    files = {}
    for f in os.listdir(data_dir):
        fp = os.path.join(data_dir, f)
        if os.path.isfile(fp):
            patient_id = f[:-4]
            if patient_id not in files:
                files[patient_id] = {}
            if f.endswith(".ann"):
                files[patient_id]["annotation"] = fp
            elif f.endswith(".txt"):
                files[patient_id]["data"] = fp
            else:
                raise IOError(f"File type for {fp} not supported.")
#     print(files)
    return files

def read_annotations(fp: str) -> List[Dict[str, Any]]:
    """
    Function to read annotations from a given annotation file.
    Args:
        fp: str, File path of an annotation file
    
    Returns: dict
        Annotations in a dictionary format.
        
    Sample: [
        {"start":10, "end":15, "label":"DRUG"}
        ...
        {"start":200, "end":208, "label":"REASON"}
    ]
    """
    annotations = []
    with open(fp) as f:
        for line in f:
            if line.startswith("T"):
                match = re.match(tpat, line.strip())
                groups = match.groups()
                pos = groups[2].split(";")
                se = pos[0].strip().split()
                start = int(se[0].strip())
                if len(pos) > 1:
                    se = pos[-1].strip().split()
                end = int(se[1].strip())
                ann = {
                    "start": start,
                    "end": end,
                    "label": groups[1],
                    "text": groups[4]
                }
                annotations.append(ann)
#                 positions = []
#                 for p in pos:
#                     se = p.strip().split()
#                     positions.append({"start": int(se[0].strip()), "end": int(se[1].strip())})
#                 for pos in positions:
#                     ann = { "label": groups[1], "text": groups[4] }
#                     ann.update(pos)
#                     if not (ann["start"] and ann["end"] and ann["label"].strip()):
#                         print(ann)
#                         print(fp)
#                         print("=" * 80)
#                     annotations.append(ann)
    return annotations

def preprocess(ip_path: str, op_path: str):
    """
    Function to convert the given data in spacy binary format.
    
    Args:
        ip_path: str, Directory containing both the patient files and annotation files.
        op_path: str, Path where to save the spacy binary data.
    
    Returns: None
    """
    files = get_file_paths(ip_path)
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    w = 0
    for patient_id, val in files.items():
        if "annotation" not in val:
            print("Annotation file not given for", patient_id)
        annotations = read_annotations(val["annotation"])
        if "data" not in val:
            print("Data file not given for", patient_id)
        with open(val["data"]) as f:
            text = f.read()
        doc = nlp.make_doc(text)
        toks = [token.text for token in doc]
        dents = []
        nanns = []
        for ann in annotations:
            if text[ann["start"]:ann["end"]] in toks:
                dents.append(doc.char_span(ann["start"], ann["end"], label=ann["label"]))
                nanns.append(ann)
        doc.ents = dents
#         try:
#             doc.ents = dents
# #             doc.ents = [
# #                 doc.char_span(ann["start"], ann["end"], label=ann["label"])
# #                 for ann in annotations
# #             ]
#         except TypeError:
#             for ann in nanns:
# #                 toks = [token.text for token in doc]
#                 if text[ann["start"]:ann["end"]] not in toks:
#                     print(patient_id, ann)
#                     break
# #             print(toks)
#             w += 1
# #             print(annotations)
#             return
        doc_bin.add(doc)
    print(w, len(files))
    doc_bin.to_disk(op_path)

In [73]:
class MyTokenizer(Tokenizer):
    def __call__(self, text):
        pat = r"(\d+)\s*(m?g)"
        text = re.sub(pat, lambda m: "{}_{}".format(m.group(1), m.group(2)), text)
        doc = super().__call__(text)
        pat = r"(\d+)_(m?g)"
        new_toks = [re.sub(pat, lambda m: "{} {}".format(m.group(1), m.group(2)), tok.text) for tok in doc]

        return Doc(doc.vocab, words=new_toks)

In [76]:
nlp = spacy.blank("en")
nlp.tokenizer = MyTokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=nlp.tokenizer.infix_finditer,
        token_match=nlp.tokenizer.token_match,
)
# suffixes = nlp.Defaults.suffixes + [r"\d+\s*m?g"]
# suffix_regex = spacy.util.compile_suffix_regex(suffixes)
# nlp.tokenizer.suffix_search = suffix_regex.search
# infixes = ([r"\d+\s*m?g"])
# infix_re = compile_infix_regex(infixes)
# nlp.tokenizer.infix_finditer = infix_re.finditer
# tokenizer = spacy.load("en_core_sci_lg")

In [77]:
# separate_train_eval(data_dir, new_dir)
train_dir = os.path.join(new_dir, "train")
eval_dir = os.path.join(new_dir, "eval")
preprocess(train_dir, os.path.join(corpus_dir, "train.spacy"))
# preprocess(eval_dir, os.path.join(corpus_dir, "eval.spacy"))

TypeError: object of type 'NoneType' has no len()

In [27]:
with open("data/training_20180910/107047.txt") as f:
    text = f.read()
text[14083:14093]

'citalopram'

In [59]:
[t.text for t in nlp("20 mg")]

['20 mg']

In [54]:
s = "20  dfdf"
pat = r"(\d+)\s*(m?g)"
re.sub(pat, lambda m: "{}_{}".format(m.group(1), m.group(2)), s)

'20  dfdf'