In [1]:
import json
import os
import re

import pandas as pd
import spacy

In [2]:
data_dir = os.path.abspath("data/training_20180910")
data_dir

'/home/rakshit/Documents/learn/git/clinical-notes/data/training_20180910'

## Try to read annotations

In [3]:
tpat = re.compile("(T\d+)\s+([a-zA-Z\-]+)\s+(\d+\s+\d+(\s*;\s*\d+\s+\d+)*)\s+(.*)")
rpat = re.compile("(R\d+)\s+([a-zA-Z\-]+)\s+Arg1:(T\d+)\s+Arg2:(T\d+)")

In [4]:
with open("data/training_20180910/100035.ann") as f:
    for line in f:
        if line.startswith("T"):
            match = re.match(tpat, line.strip())
            groups = match.groups()
            pos = groups[2].split(";")
            positions = []
            for p in pos:
                se = p.strip().split()
                positions.append({"start": int(se[0].strip()), "end": int(se[1].strip())})
            print({
                "tag": groups[0],
                "entity": groups[1],
                "positions": positions,
                "text": groups[4]
            })
            break
        else:
            match = re.match(rpat, line.strip())
            groups = match.groups()
            print({
                "tag": groups[0],
                "entity": groups[1],
                "arg1": groups[2],
                "arg2": groups[3]
            })
            break

{'tag': 'T1', 'entity': 'Reason', 'positions': [{'start': 10179, 'end': 10197}], 'text': 'recurrent seizures'}


## Try to read patient file

In [5]:
patient_keys = {
    re.compile("admission date:\s*\[\*+(\d+\-\d+\-\d+)\*+\]"): "Admission Date:",
    re.compile("discharge date:\s*\[\*+(\d+\-\d+\-\d+)\*+\]"): "Discharge Date:",
    re.compile("date of birth:\s*\[\*+(\d+\-\d+\-\d+)\*+\]"): "Date of Birth:",
    re.compile("sex:\s*([mf])"): "Sex:",
    re.compile("service:\s*(.*)"): "Service:",
    re.compile("attending:\s*\[\*+(.*)\*+\]"): "Attending:",
    re.compile("present illness:\s*(.*)"): "History of Present Illness:",
    re.compile("allergies:\s*(.*)"): "Allergies:",
    re.compile("chief complaint:\s*(.*)"): "Chief Complaint:",
    re.compile("major surgical or invasive procedure:\s*(.*)"): "Major Surgical or Invasive Procedure:",
    re.compile("past medical history:\s*(.*)"): "Past Medical History:",
    re.compile("social history:\s*(.*)"): "Social History:",
    re.compile("family history:\s*(.*)"): "Family History:",
    re.compile("physical exam:\s*(.*)"): "Physical Exam:",
    re.compile("pertinent results:\s*(.*)"): "Pertinent Results:",
    re.compile("clinical information:\s*(.*)"): "CLINICAL INFORMATION:",
    re.compile("findings:\s*(.*)"): "FINDINGS:",
    re.compile("brief hospital course:\s*(.*)"): "Brief Hospital Course:",
    re.compile("medications on admission:\s*(.*)"): "Medications on Admission:",
    re.compile("discharge medications:\s*(.*)"): "Discharge Medications:",
    re.compile("discharge disposition:\s*(.*)"): "Discharge Disposition:",
    re.compile("facility:\s*(.*)"): "Facility:",
    re.compile("discharge diagnosis:\s*(.*)"): "Discharge Diagnosis:",
    re.compile("discharge condition:\s*(.*)"): "Discharge Condition:",
    re.compile("discharge instructions:\s*(.*)"): "Discharge Instructions:",
    re.compile("followup instructions:\s*(.*)"): "Followup Instructions:"
}
multi_line_patient_keys = {
    "Allergies:", "Chief Complaint:", "Major Surgical or Invasive Procedure:",
    "History of Present Illness:", "Past Medical History:", "Social History:",
    "Family History:", "Physical Exam:", "Pertinent Results:", "CLINICAL INFORMATION:",
    "FINDINGS:", "Brief Hospital Course:", "Medications on Admission:",
    "Discharge Medications:", "Discharge Disposition:", "Facility:", "Discharge Diagnosis:",
    "Discharge Condition:", "Discharge Instructions:", "Followup Instructions:"
}

In [6]:
patient_file = {}
key = None
with open("data/training_20180910/134445.txt") as f:
    for line in f:
        sl = line.strip()
        single_line_data = {}
        for pat, val in patient_keys.items():
            match = re.search(pat, sl.lower())
            if match:
                single_line_data[val] = match.groups()[0] + " "
        if single_line_data:
            patient_file.update(single_line_data)
            multi = multi_line_patient_keys.intersection(set(single_line_data.keys()))
            if multi:
                key = multi.pop()
                continue
            else:
                key = None
        if key:
            patient_file[key] += line
print(json.dumps(patient_file))

{"Admission Date:": "2186-7-14 ", "Discharge Date:": "2186-7-27 ", "Date of Birth:": "2152-11-6 ", "Sex:": "f ", "Service:": " ", "History of Present Illness:": "the patient is a 33-year-old right-handed woman transferred from [**Hospital1 1474**] [**Hospital3 417**]\nHospital with having throbbing headaches and intermittent\nphotophobia and dizziness, worse in the morning and with\nbending forwards, [**8-16**] in severity, starts in the neck and\nwraps around to the frontal region and lasts all day,\nprogressively worse over the last four weeks.\n\n", "Past Medical History:": " 1.  GERD.\n2.  Cholecystectomy.\n3.  Bipolar disease.\n4.  Herniated disk at the L5-S1 level.\n\nADMISSION MEDICATIONS:\n1.  Wellbutrin 100 b.i.d.\n2.  Ibuprofen p.r.n.\n3.  Depo Provera.\n4.  Ativan 0.5 b.i.d. p.r.n.\n5.  Promex 40 mg q.d.\n\n", "Allergies:": "codeine (rash). \nPHYSICAL EXAMINATION ON ADMISSION:  Vital signs:  Temperature\n98.8, BP 104/60, heart rate 56, respiratory rate 16,\nsaturations 98% o

In [7]:
data = {}
annotator_comments = {}
for f in os.listdir(data_dir):
    fp = os.path.join(data_dir, f)
    if os.path.isfile(fp):
        key = f[:-4]
        if key not in data:
            data[key] = {"patient_id": key}
        if fp.endswith(".ann"):
            if key not in annotator_comments:
                annotator_comments[key] = []
            with open(fp) as fl:
                anns = {"T": [], "R": []}
                for line in fl:
                    if line.startswith("#"):
                        annotator_comments[key].append(line)
                    elif line.startswith("T"):
                        match = re.match(tpat, line.strip())
                        groups = match.groups()
                        pos = groups[2].split(";")
                        positions = []
                        for p in pos:
                            se = p.strip().split()
                            positions.append({"start": se[0], "end": se[1]})
                        anns["T"].append({
                            "tag_type": "T",
                            "tag": groups[0],
                            "entity": groups[1],
                            "positions": positions,
                            "text": groups[4]
                        })
                    else:
                        match = re.match(rpat, line.strip())
                        if not match:
                            print(f)
                            print(line)
                            print("*" * 80)
                        groups = match.groups()
                        anns["R"].append({
                            "tag_type": "R",
                            "tag": groups[0],
                            "entity": groups[1],
                            "arg1": groups[2],
                            "arg2": groups[3]
                        })
                data[key]["annotations"] = anns
                data[key]["annotations_dump"] = json.dumps(anns)
        elif fp.endswith(".txt"):
            file_key = None
            patient_file = {}
            with open(fp) as fl:
                text = fl.read()
            data[key]["full_text"] = text
            match = re.search(re.compile("present illness:"), text.lower())
            if match:
                data[key]["present_history_start"] = match.span()[1]
#             try:
#                 data[key]["present_history_start"] = text.index("History of Present Illness:") + 28
#             except ValueError:
#                 print("*" * 80)
#                 print(key)
#                 print("*" * 80)
            for line in text.split("\n"):
                sl = line.strip()
                single_line_data = {}
                for pat, val in patient_keys.items():
                    match = re.search(pat, sl.lower())
                    if match:
                        single_line_data[val] = match.groups()[0] + " "
                if single_line_data:
                    patient_file.update(single_line_data)
                    multi = multi_line_patient_keys.intersection(set(single_line_data.keys()))
                    if multi:
                        file_key = multi.pop()
                        continue
                    else:
                        file_key = None
                if file_key:
                    patient_file[file_key] += line
            data[key].update(patient_file)
        else:
            raise IOError(f"Error Reading file {fp}")
        if "present_history_start" in data[key] and "annotations" in data[key]:
            data[key]["present_history_end"] = data[key]["present_history_start"] + len(data[key]["History of Present Illness:"])
            atomic = pd.DataFrame(data[key]["annotations"]["T"])
            relations = pd.DataFrame(data[key]["annotations"]["R"])
            anf = atomic[atomic.positions.map(lambda x: any(int(xi["start"]) < data[key]["present_history_end"] and int(xi["start"]) >= data[key]["present_history_start"] for xi in x))]
            rnf = relations[relations.arg1.isin(anf.tag)&relations.arg2.isin(anf.tag)]
            hist_anns = {"T": anf.to_dict(orient="records"), "R": rnf.to_dict(orient="records")}
            data[key]["present_history_annotations"] = hist_anns
            data[key]["present_history_annotations_dump"] = json.dumps(hist_anns)
data.keys()

dict_keys(['107047', '106621', '105050', '143562', '134445', '127741', '185759', '174150', '105254', '100847', '114004', '102087', '112628', '115432', '108658', '110037', '123475', '123324', '189471', '182160', '100883', '107869', '149687', '121514', '160574', '102053', '114220', '113824', '102136', '125206', '109176', '130076', '107139', '103377', '102173', '112615', '111298', '195784', '168331', '108754', '151688', '110342', '103293', '102557', '115191', '112832', '111542', '114452', '115143', '108032', '122093', '164366', '110326', '111923', '106026', '172474', '105014', '107872', '116451', '149614', '188551', '114144', '101276', '112140', '106415', '104979', '115244', '115232', '187782', '111882', '106384', '114680', '198406', '186788', '187736', '103926', '102527', '118418', '108809', '101331', '115347', '104799', '161477', '177370', '105547', '166834', '102283', '106038', '112030', '101427', '117609', '161384', '123771', '177331', '106423', '105747', '130440', '117156', '103317',

In [8]:
[(key, val) for key, val in annotator_comments.items() if val]

[('115157', ['#1\tAnnotatorNotes T178\tdrug name changed out in error\n']),
 ('110445',
  ['#1\tAnnotatorNotes T35\tlikely a med that got mistook for a name\n'])]

In [9]:
df = pd.DataFrame(list(data.values()))
df = df.drop(["present_history_annotations", "annotations"], axis=1)
df.head()

Unnamed: 0,patient_id,annotations_dump,full_text,present_history_start,Admission Date:,Discharge Date:,Date of Birth:,Sex:,Service:,Allergies:,...,Discharge Disposition:,Facility:,Discharge Diagnosis:,Discharge Condition:,Discharge Instructions:,Followup Instructions:,present_history_end,present_history_annotations_dump,FINDINGS:,CLINICAL INFORMATION:
0,107047,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...",Admission Date: [**2134-10-9**] ...,355.0,2134-10-9,2134-11-19,2071-4-28,f,medicine,Bactrim / Dicloxacillin / Levofloxacin,...,Home With Service,[**Hospital3 **] VNA,Burkitt's lymphoma,Mental Status: Clear and coherent.Level of Co...,"Dear Ms. [**Known lastname 52**],You were adm...",The following appointment has been scheduled ...,1815.0,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T43"", ""entity...",,
1,106621,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...",Admission Date: [**2167-8-29**] ...,382.0,2167-8-29,2167-9-9,2096-2-18,m,medicine,Bactrim / Penicillins,...,Extended Care,[**Hospital1 **] Senior Living,"Primary diagnosis: change in mental status, m...",stable,You were admitted to the hospital after an ac...,Provider: [**First Name8 (NamePattern2) 539**...,2386.0,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T37"", ""entity...",,
2,105050,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...",Admission Date: [**2125-2-5**] D...,403.0,2125-2-5,2125-2-26,2079-7-20,m,medicine,Patient recorded as having No Known Allergies...,...,Extended Care,[**Hospital3 2857**] - [**Location (un) 1121*...,"primary: Hypertensive emergency, Headache NOS...",Mental Status:Clear and coherentLevel of Cons...,You were admitted to [**Hospital1 18**] for h...,Please continue your [**Last Name (NamePatter...,2096.0,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T31"", ""entity...",there are scattered periventricular and t2 hyp...,
3,143562,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...",Admission Date: [**2150-6-25**] ...,442.0,2150-6-25,2150-7-29,2096-5-5,m,medicine,Patient recorded as having No Known Allergies...,...,Home With Service,[**Hospital6 486**] [**Location (un) 69471**],End Stage liver diseaseEnd stage renal diseas...,"Fair, ambulating, tolerating PO diet, requiri...","Call your doctor if you develop chest pain, s...",Please arrange for hemodialysis three times w...,1480.0,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...",,
4,134445,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...",Admission Date: [**2186-7-14**] Dischar...,162.0,2186-7-14,2186-7-27,2152-11-6,f,,codeine (rash). PHYSICAL EXAMINATION ON ADMISS...,...,,,,,,,542.0,"{""T"": [], ""R"": []}",,


In [10]:
df2 = df[["patient_id", "Chief Complaint:", "History of Present Illness:", "present_history_annotations_dump"]]
df2.head()

Unnamed: 0,patient_id,Chief Complaint:,History of Present Illness:,present_history_annotations_dump
0,107047,Hypotension,Mrs. [**Known lastname 52**] is a 63-year-old...,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T43"", ""entity..."
1,106621,altered mental status; transfer from MICU,71M with multiple myeloma currently on Velcad...,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T37"", ""entity..."
2,105050,"headache, hypertensive urgency","45m with dmi, esrd on hd, and multiple admissi...","{""T"": [{""tag_type"": ""T"", ""tag"": ""T31"", ""entity..."
3,143562,direct admit from OSH for transplant work-up,"Pt is 54 yo male with alcoholic cirrhosis, h/...","{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""..."
4,134445,,the patient is a 33-year-old right-handed woma...,"{""T"": [], ""R"": []}"


In [11]:
df.to_csv("data/complete_patient_data.csv", index=False)
df2.to_csv("data/present_history_data.csv", index=False)

In [12]:
with open("data/training_20180910/134445.txt") as f:
    text = f.read()
text[:2000]

'Admission Date:  [**2186-7-14**]       Discharge Date:  [**2186-7-27**]\n\nDate of Birth:   [**2152-11-6**]       Sex:  F\n\nService:\n\nHISTORY OF THE PRESENT ILLNESS:  The patient is a 33-year-old\nright-handed woman transferred from [**Hospital1 1474**] [**Hospital3 417**]\nHospital with having throbbing headaches and intermittent\nphotophobia and dizziness, worse in the morning and with\nbending forwards, [**8-16**] in severity, starts in the neck and\nwraps around to the frontal region and lasts all day,\nprogressively worse over the last four weeks.\n\nPAST MEDICAL HISTORY:\n1.  GERD.\n2.  Cholecystectomy.\n3.  Bipolar disease.\n4.  Herniated disk at the L5-S1 level.\n\nADMISSION MEDICATIONS:\n1.  Wellbutrin 100 b.i.d.\n2.  Ibuprofen p.r.n.\n3.  Depo Provera.\n4.  Ativan 0.5 b.i.d. p.r.n.\n5.  Promex 40 mg q.d.\n\nALLERGIES:  Codeine (rash).\n\nPHYSICAL EXAMINATION ON ADMISSION:  Vital signs:  Temperature\n98.8, BP 104/60, heart rate 56, respiratory rate 16,\nsaturations 98% on 

## Try keyword extraction with scispacy models

In [13]:
nlp_lg = spacy.load("en_core_sci_lg")
# nlp_sm = spacy.load("en_core_sci_sm")
# nlp_md = spacy.load("en_core_sci_md")
nlp = spacy.load("en_core_sci_scibert")



In [14]:
text = data["100035"]["History of Present Illness:"]
nlp(text).ents

(year,
 gentleman,
 PMH,
 signifciantwith,
 dilated cardiomyopathy,
 AICD,
 asthma,
 HTN,
 OSH,
 dyspnea,
 MICU,
 PEA,
 patient,
 initially,
 LGH,
 ED,
 hypoxemicrespiratory distress,
 OSH,
 CTX,
 azithromycin,
 SC,
 epinephrine,
 solumedrol,
 OSH,
 confused,
 PEA,
 intubated,
 epinephrine,
 atropine,
 magnesium,
 bicarb,
 bilateral needle,
 report of,
 left,
 hadbilateral chest tubes,
 minutesof rescucitation,
 ROSC,
 vecuronium,
 wasstarted,
 asthma,
 cooling,
 evaluation,
 thepatient,
 LGH,
 [**1-4**],
 dyspnea,
 diagnosed,
 CAP,
 asthma,
 treated with,
 CTXand azithromycin,
 Per his family,
 multipleadmissions,
 winter,
 asthma,
 exacerbations,
 ED,
 anABG,
 CTH,
 CTA,
 chest,
 PEA arrest,
 Rescucitation,
 minutes,
 multiplerounds,
 epi,
 bicarb,
 ROSC,
 management,
 patient,
 intubated,
 sedated,
 parlyzed)

In [15]:
text = data["100035"]["History of Present Illness:"]
nlp_lg(text).ents

(3234**],
 year,
 gentleman,
 PMH,
 dilated cardiomyopathy s/p AICD,
 asthma,
 HTN,
 admittedto,
 OSH,
 dyspnea,
 admitted,
 MICU,
 PEA,
 patient,
 LGH,
 ED,
 hypoxemicrespiratory distress,
 OSH,
 CTX,
 azithromycin,
 SC,
 epinephrine,
 solumedrol,
 OSH,
 episode,
 PEA,
 arrestand,
 intubated,
 epinephrine,
 atropine,
 magnesium,
 bicarb,
 bilateral,
 report,
 air,
 left,
 hadbilateral chest tubes,
 minutesof,
 rescucitation,
 ROSC,
 vecuronium,
 wasstarted,
 epi gtt,
 asthma,
 cooling,
 wasthen,
 [**Hospital1 18**],
 evaluation,
 thepatient,
 admitted,
 LGH,
 [**1-4**],
 dyspnea,
 diagnosed,
 CAP,
 asthma,
 treated with,
 CTXand,
 azithromycin,
 family,
 multipleadmissions,
 winter,
 asthma exacerbations,
 [**Hospital1 18**],
 ED,
 anABG,
 CTH,
 CTA,
 chest,
 PEA,
 arrest,
 Rescucitation,
 minutes,
 multiplerounds,
 epi,
 bicarb,
 ROSC,
 admitted,
 management,
 patient,
 intubated,
 sedated,
 parlyzed)

## Try keyword extraction on "Chief Complaint:" sections to see different kinds of complaints

In [18]:
complaint = data["106621"]["Chief Complaint:"]
doc2 = nlp(complaint)
doc2.ents

(altered, mental status, transfer, MICU)

In [19]:
complaints = {}
for pid, patient_file in data.items():
    complaint = patient_file.get("Chief Complaint:", "")
    cdoc = nlp(complaint)
    complaints[pid] = cdoc.ents
complaints

{'107047': (),
 '106621': (altered, mental status, transfer, MICU),
 '105050': (hypertensive, urgency),
 '143562': (OSH, transplant, work-up),
 '134445': (),
 '127741': (SOB,),
 '185759': (Abnormal, outpatient, labs),
 '174150': (Shortness of breath,),
 '105254': (),
 '100847': (Left Foot Infection, DKA),
 '114004': (hematuria,),
 '102087': (OSH,),
 '112628': (transfer,),
 '115432': (),
 '108658': (atrial mass,),
 '110037': (unwittnessed, fall),
 '123475': (Chest pain, back, jaw),
 '123324': (Respiratory failure,),
 '189471': (bradycardia, hypotension),
 '182160': (Abdominal aortic aneurysm,),
 '100883': (respiratory distress,),
 '107869': (Biliary obstuction,),
 '149687': (),
 '121514': (Lithium, Toxicity),
 '160574': (right knee pain,),
 '102053': (),
 '114220': (dizziness, palpitations),
 '113824': (elevated, WBC),
 '102136': (),
 '125206': (fall, inability, walk),
 '109176': (neck swelling,),
 '130076': (Shortness of Breath,),
 '107139': (aortoenteric fistula,),
 '103377': (nauesa/

In [20]:
complaint_docs = {}
for pid, patient_file in data.items():
    complaint = patient_file.get("Chief Complaint:", "")
    cdoc = nlp(complaint.lower())
    text = patient_file.get("History of Present Illness:", "")
    doc = nlp(text.lower())
    for ent in cdoc.ents:
        key = ent.text.strip()
        if key not in complaint_docs:
            complaint_docs[key] = []
        complaint_docs[key].append(doc)
complaint_docs.keys()

dict_keys(['hypotension', 'altered', 'mental status', 'transfer', 'micu', 'hypertensive', 'urgency', 'transplant', 'work-up', 'sob', 'abnormal', 'outpatient', 'labs', 'shortness of breath', 'left foot infection', 'dka', 'brbrp hematuria', 'atrial mass', 'unwittnessed', 'fall', 'chest pain', 'back', 'jaw', 'respiratory failure', 'bradycardia', 'abdominal aortic aneurysm', 'respiratory distress', 'biliary obstuction', 'lithium', 'toxicity', 'right knee pain', 'lightheadedness', 'dizziness', 'palpitations', 'elevated', 'inability', 'walk', 'neck swelling', 'aortoenteric fistula', 'nauesa/vomiting', 'vertigo', 'cxr', 'tachycardia', 'headaches', 'increased', 'worsening', 'dysphagia', 'metastatic', 'brain lesions', 'hemoptysis', 'white blood cells', 'low', 'platelets', 'routinecbc', 'abdominal pain', 'transferred', 'renal failure', 'nausea', 'vomitting', 'stab', 'wound', 'liver transplant', 'vomiting', 'fever', 'fatigue', 'seizures', 'concern', 'status epilepticus', 'severe', 'creatinine', '

## Check similarity scores of keywords extracted from "History of Present Illness:"

In [21]:
set1 = set(ent.text for ent in complaint_docs["hypotension"][0].ents)
set1

{'[**2134-10-9**].she',
 'abdominal pain',
 'acute process',
 'afebrile',
 'appointment',
 'arrival',
 'asymptomatic',
 'bed',
 'blood pressure',
 'cabinets',
 'cefepime',
 'chest pain',
 'chest x-ray and head ct',
 'chills',
 'cough',
 'denies',
 'diarrhea',
 'done.both',
 'dry',
 'dysuria',
 'fever',
 'floor',
 'fluidresuscitation',
 'frequency',
 'givena dose',
 "history of recurrentburkitt's",
 'home',
 'hospital',
 'hr99',
 'infection',
 'ivf ontransport',
 'kitchen',
 'lightheadedness',
 'lying',
 'lymphoma',
 'morning',
 'mouth',
 'myalgias',
 'negative',
 'neutropenia',
 'osh)and hypotension',
 'outpatient',
 'outside',
 'pain',
 'pmh).she',
 'poolof feces',
 'ra.',
 'recurrence',
 'shortness of breath',
 'standing',
 'theentire',
 'transfer',
 'transport',
 'treatment',
 'urgency',
 'vancomycin',
 'vitals',
 'woman'}

In [22]:
pos_tag = ['PROPN', 'ADJ', 'NOUN']
set2 = set(token.text for token in complaint_docs["hypotension"][0] if token.text not in nlp.Defaults.stop_words and token.pos_ in pos_tag)
set2

{'%',
 '*',
 '+',
 '63-year-old',
 '7[**hospital',
 '90s/40s',
 'abdominal',
 'acute',
 'afebrile',
 'appointment',
 'arrival',
 'asymptomatic',
 'bed',
 'blood',
 'bp',
 'breath',
 'cabinets',
 'cefepime',
 'chest',
 'chills',
 'clinic',
 'cough',
 'ct',
 'd',
 'diarrhea',
 'dose',
 'dry',
 'dysuria',
 'early',
 'ed',
 'feces',
 'fever',
 'floor',
 'fluidresuscitation',
 'frequency',
 'head',
 'history',
 'home',
 'hospital',
 'hospital1',
 'hospital3',
 'hr99',
 'hypotension',
 'infection',
 'ivf',
 'kitchen',
 'lastname',
 'lightheadedness',
 'lymphoma',
 'morning',
 'mouth',
 'mrs',
 'myalgias',
 'negative',
 'neutropenia',
 'o2',
 'ofconcern',
 'ontransport',
 'osh',
 'osh)and',
 'outpatient',
 'outside',
 'pain',
 'pmh).she',
 'point',
 'poolof',
 'pressure',
 'process',
 'ra',
 'recurrence',
 'recurrentburkitt',
 'rr',
 'shortness',
 'sonfound',
 't',
 'theentire',
 'time',
 'transfer',
 'transport',
 'treatment',
 'urgency',
 'vancomycin',
 'verylightheaded',
 'vitals',
 'wbc',

In [23]:
set3 = set1.intersection(set2)
set3

{'afebrile',
 'appointment',
 'arrival',
 'asymptomatic',
 'bed',
 'cabinets',
 'cefepime',
 'chills',
 'cough',
 'diarrhea',
 'dry',
 'dysuria',
 'fever',
 'floor',
 'fluidresuscitation',
 'frequency',
 'home',
 'hospital',
 'hr99',
 'infection',
 'kitchen',
 'lightheadedness',
 'lymphoma',
 'morning',
 'mouth',
 'myalgias',
 'negative',
 'neutropenia',
 'outpatient',
 'outside',
 'pain',
 'pmh).she',
 'recurrence',
 'theentire',
 'transfer',
 'transport',
 'treatment',
 'urgency',
 'vancomycin',
 'vitals',
 'woman'}

In [24]:
token_1=nlp_lg("hypotension")
for tok in set3:
    token_2=nlp_lg(tok)
    similarity_score=token_1.similarity(token_2)
    print(tok, "---", similarity_score)

  similarity_score=token_1.similarity(token_2)


pmh).she --- 0.0
hospital --- 0.13174769665926925
diarrhea --- 0.4382135209314492
morning --- 0.2286663437246812
chills --- 0.4968325198874529
appointment --- 0.04172161220111544
cough --- 0.47507853563030206
cefepime --- 0.0792249761217259
mouth --- 0.17383768351057774
myalgias --- 0.5129479504338302
urgency --- 0.26600438959005007
arrival --- 0.09403126649001205
theentire --- 0.0
bed --- 0.15686499456447944
fever --- 0.4669544082581854
dry --- 0.08692703993351812
home --- 0.08540644395459075
transport --- -0.02803162490466038
cabinets --- -0.014013862149845022
dysuria --- 0.38180535468721405
transfer --- -0.06378935520318023
vitals --- 0.30164597091488515
kitchen --- 0.06335161910683405
floor --- 0.09318753208867926
outside --- -0.06070708322436919
woman --- 0.22151927986958256
pain --- 0.4048266521410172
afebrile --- 0.37860389709270365
vancomycin --- 0.08182768453243007
lightheadedness --- 0.5965108275684237
recurrence --- 0.2142103934529524
neutropenia --- 0.4791622106586267
negat

## Above scores show words like "lightheadedness", "myalgias", "pain", etc from "History  of Present Illness" section having higher scores with the word "hypotension" from "Chief Complaint". This could be good news.

## ========================================================================

## Try to get entities from "History of Present Illness" sorted based on the above similarity scores and limit to top 20 entities.

In [25]:
top_complaint_entities = {}
for complaint, docs in complaint_docs.items():
    if not complaint:
        continue
    token_1=nlp_lg(complaint)
    ents = []
    doc_ents = list(set(ent.text for doc in docs for ent in doc.ents))
    for ent in doc_ents:
        token_2=nlp_lg(ent)
        similarity_score=token_1.similarity(token_2)
        ents.append((-similarity_score, ent))
    top_complaint_entities[complaint] = list(zip(*(sorted(ents)[:20])))[1]
top_complaint_entities

  similarity_score=token_1.similarity(token_2)


{'hypotension': ('osh)and hypotension',
  'hypotension',
  'bradycardia',
  'syncope',
  'presyncope',
  'syncopeor presyncope',
  'paroxysmal nocturnal dyspnea',
  'hypotensive',
  'palpitations',
  'lightheadedness',
  'didreport lethargy',
  'orthopnea',
  'vomiting',
  'near syncope',
  'hypoxemic',
  'hypertension',
  'nausea',
  'nausea andfeeling',
  'dyspnea',
  'headache'),
 'altered': ('altered',
  'altered mentalstatus',
  'impaired',
  'increased',
  'elevated',
  'improved',
  'decreased appetite',
  'unchanged',
  'beminimally responsive',
  'responsive',
  'mimic',
  'metabolic derangements',
  'absence',
  'decline',
  'lost',
  'respond tosternal',
  'inappropriate',
  'liver function',
  'assessed',
  'attenuation'),
 'mental status': ('mental status',
  'mental statusrequiring',
  'hismental status',
  'status',
  'health',
  'service',
  'ofnormal mood',
  'medical care',
  'psychiatric seclusion',
  'anxiety',
  'critical illness',
  'further care',
  'sedated stat

In [26]:
token_1=nlp_lg("sob")
token_2=nlp_lg("vancomycin 1 gm")
token_1.similarity(token_2)

0.13724286433483865

## Next, we extract top keywords on document level. We do it in following steps:
### - Extract entities from "Chief Complaint:" section.
### - Extract top 10 keywords for each entity from "History of Present Illness:" section
### - Sort all top keywords based on combined similarity score and keep top 10

In [27]:
def extract_keywords(row):
    complaint = row["Chief Complaint:"].lower()
    text = row["History of Present Illness:"].lower()
    # If we were not able to extract complaint or history, skip
    if not (complaint and text):
        row["top_keywords"] = []
        return row
    
    cdoc = nlp_lg(complaint)
    doc = nlp_lg(text)
    ents = {}
    combined = {}
    # Calculate scores for each entity in chief complaint with each entity
    # in history of present illness.
    for cent in cdoc.ents:
        token_1=nlp_lg(cent.text)
        ents[cent.text] = []
        for ent in doc.ents:
            token_2=nlp_lg(ent.text)
            similarity_score=token_1.similarity(token_2)
            if ent.text not in combined:
                combined[ent.text] = 1
            # We will take combined score as the multiplication of all the scores for now
            combined[ent.text] *= similarity_score
            ents[cent.text].append({"score": similarity_score, "entity": ent.text})
    # Add combined score with all complaint entities to each history entity
    result_ents = {}
    for cent, history_ents in ents.items():
        result_ents[cent] = []
        for ent in history_ents:
            ent["combined_score"] = combined[ent["entity"]]
            result_ents[cent].append(ent)
    # Sort and get top 10 history entities for each complaint entity
    ents = []
    for cent in result_ents:
        top10 = sorted(result_ents[cent], key=lambda x: -x["score"])[:10]
        ents += top10
    # Finally sort and take top 10 entities based on combined score
    row["top_keywords"] = json.dumps([ent["entity"] for ent in sorted(ents, key=lambda x: -x["combined_score"])[:10]])
    return row

df3 = df2.fillna("").apply(extract_keywords, axis=1)

  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_

  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_

  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_score=token_1.similarity(token_2)
  similarity_

In [28]:
df3.head()

Unnamed: 0,patient_id,Chief Complaint:,History of Present Illness:,present_history_annotations_dump,top_keywords
0,107047,Hypotension,Mrs. [**Known lastname 52**] is a 63-year-old...,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T43"", ""entity...",[]
1,106621,altered mental status; transfer from MICU,71M with multiple myeloma currently on Velcad...,"{""T"": [{""tag_type"": ""T"", ""tag"": ""T37"", ""entity...","[""mental status"", ""health"", ""ofnormal mood"", ""..."
2,105050,"headache, hypertensive urgency","45m with dmi, esrd on hd, and multiple admissi...","{""T"": [{""tag_type"": ""T"", ""tag"": ""T31"", ""entity...","[""hypertensive urgency"", ""hypertensive"", ""hype..."
3,143562,direct admit from OSH for transplant work-up,"Pt is 54 yo male with alcoholic cirrhosis, h/...","{""T"": [{""tag_type"": ""T"", ""tag"": ""T1"", ""entity""...","[""evaluation"", ""therapetuic paracentesis"", ""de..."
4,134445,,the patient is a 33-year-old right-handed woma...,"{""T"": [], ""R"": []}",[]


In [29]:
df3.to_csv("data/output.csv", index=False)

## The above method captures some important keywords.
## The problem with the method is - because of similarity score, the method is only able to pickup words pertaining to the disease/problem and not the medicines if any were given.
## This could be because spacy's model is a statistical model and the model might have not seen the medicines in context of the issues.

## So we will try to train the entity model to give better entities.
## We find the clinical notes data to train our model from Harvard DBMI portal - https://portal.dbmi.hms.harvard.edu/

## =======================================================================

## We will do the task in 3 steps:
### - First we preprocess the data to convert it into spacy binary format.
### - Train the NER model using spacy train
### - Evaluate the NER model using spacy evaluate

# Preprocessing:

## NOTE: My request to data access is pending, so, for now, let's assume that the data is in same format as this assignment. If there are any changes, corresponding changes can be made to the preprocess script.

In [32]:
import random
import shutil

from typing import Dict

In [31]:
def separate_train_eval(old_dir: str, new_dir: str, split_ratio: float=0.8):
    """
    Function to separate data in training and validation.
    
    Args:
        old_dir: str, Current data directory
        new_dir: str, New data directory with train and validation separated
        split_ratio: float, Fraction of data to keep in training.
                     Defaults to 80-20 train and validation split.
    
    Returns: None
    """
    train_dir = os.path.join(new_dir, "train")
    eval_dir = os.path.join(new_dir, "eval")
    if not os.path.isdir(train_dir):
        os.makedirs(train_dir)
    if not os.path.isdir(eval_dir):
        os.makedirs(eval_dir)
    for f in os.listdir(old_dir):
        ofp = os.path.join(old_dir, f)
        if os.path.isfile(ofp):
            if random.random() <= split_ratio:
                nfp = os.path.join(train_dir, f)
            else:
                nfp = os.path.join(eval_dir, f)
            shutil.copy(ofp, nfp)
            
def get_file_paths(data_dir: str) -> Dict[str, Dict[str, str]]:
    """
    Function to 
    """
    files = {}
    for f in os.listdir(data_dir):
        fp = os.path.join(data_dir, f)
        if os.path.isfile(fp):
            patient_id = f[:-4]
            if patient_id not in files:
                files[patient_id] = {}
            if f.endswith(".ann"):
                files[patient_id]["annotation"] = fp
            elif f.endswith(".txt"):
                files[patient_id]["data"] = fp
            else:
                raise IOError(f"File type for {fp} not supported.")
    return files