In [2]:
%%capture
# Unpack the project data archive; the %%capture cell magic above keeps the
# unzip output out of the notebook.
!unzip Project3Data.zip

In [70]:
import pandas as pd

# Load a single clinical note to see what the raw free text looks like
sample_note_path = 'Project3Data/Project3_data/140-05.xml'
data = pd.read_xml(sample_note_path)

# Show the TEXT field of the first record (row label 0)
data.loc[0, 'TEXT']

"Record date: 2088-02-02\n\n Renal Transplant Admission Note\n\nDate this visit: 02/02/88\n\nCause of ESRD: DM T1\nDate of this kidney transplant: 9/83\n\nCC: SOB/DOE\n\nHPI:  48 y/o F h/o ESRD 2/2 DM T1 s/p renal tx 09/83.  Pt. was seen in clinic today with worsening peripheral edema to waist.  She also reports worsening DOE, and some paroxysmal nocturnal dyspnea requiring at least 2 pillows to sleep. She reports worsening of her symptoms over the past week.  She denies n/v/d.   No CP/Abd pain/f/c/s.\n\nCurrent immunosuppression:  Tacrolimus 1mg qd, Imuran 50mg qd,  prednisone 5 mg QOD\nOther meds: Toprol XL 200mg qam, Lasix 20-40mg qd, Felodipine 10mg qd,  valtrex 1,000 mg qod, CaCarb 1250 mg bid, Vit D 800 units qd, Plavix 75 mg qd, Humalog SS, Lantus 10 units qpm, ASA 81 mg qd, Lipitor 20 mg qd, Avapro 75 mg qd, Epo 6000 qweek, Zantac 150mg bid, ferrous gluconate 325mg qd.\n\nOther medical problems: \nESRD 2/2 DM type I s/p renal tx c/b both cellular and humoral rejection.\nHyperte

The goal is to provide a frequency distribution of the vital signs/physical exam readings observed across the patient clinical notes.

What are the vital signs/physical exams?
*   Blood pressure
*   Pulse
*   Temperature
*   Respiratory rate
*   Height
*   Weight
*   Oxygen saturation

Since we are analyzing free-text data that is only semi-structured, I read through a bunch of clinical notes and manually extracted aliases for each vital sign.
*   Blood pressure: ```[Blood pressure, BP]```
*   Pulse: ```[Pulse, P, HR, heart rate]```
*   Temperature: ```[Temperature, T, Temp]```
*   Respiratory rate: ```[Respiratory rate, Respirations, RR, Fs]```
*   Height: ```[Height, Ht]```
*   Weight: ```[Wt, none (only a number followed by 'pounds' or 'lbs'), weighing, W.]```
*   Oxygen saturation: ```[Oxygen saturation, SaO2, O2 saturation, saturating, Sat, O2 sat, O2sat]```

In [12]:
# Extract the name of all the files to be analyzed.
# pl.txt is read as one file name per line.
with open('Project3Data/Project3_data/MIscellaneous/pl.txt', 'r') as f:
  # strip() removes the trailing newline (and a stray '\r' from CRLF files);
  # blank lines are skipped so no empty file name ends up in the list.
  # The `with` block closes the file on exit, so no explicit close() is needed.
  file_names = [line.strip() for line in f if line.strip()]

In [72]:
# Define the aliases and regex patterns for each vital sign
#
# Every pattern has the same three parts:
#   1. a label  - any alias of the vital sign, matched as a whole word;
#   2. a filler - the shortest run of characters (on the same line, since '.'
#                 does not match a newline) that does not run into the label
#                 of ANY vital sign, so a reading is never attributed to a
#                 label that appears before a different label;
#   3. a value  - the single capture group, so re.findall() returns the
#                 readings as a flat list of strings.
#
# The aliases are lower-case regex fragments; apply the patterns with
# re.IGNORECASE.
#
# NOTE(review): single-letter aliases ('p', 't') are only protected by word
# boundaries, so with re.IGNORECASE the 'p' in an abbreviation such as 's/p'
# also counts as a pulse label. Sanity-check the extracted values.

# Aliases observed in the clinical notes for each vital sign.
VITAL_SIGN_ALIASES = {
    'Blood pressure': ['blood pressure', 'bp'],
    'Pulse': ['pulse', 'p', 'hr', 'heart rate'],
    'Temperature': ['temperature', 't', 'temp'],
    'Respiratory rate': ['respiratory rate', 'respirations', 'rr', 'fs'],
    'Height': ['height', 'ht'],
    'Weight': ['wt', 'weight', 'weighing', r'w\.'],
    'Oxygen saturation': ['oxygen saturation', 'sao2', 'o2 saturation',
                          'saturating', 'sat', 'o2 sat', 'o2sat'],
}

# Shape of the reading captured for each vital sign.
VITAL_SIGN_VALUES = {
    'Blood pressure': r'\d+/\d+',
    'Pulse': r'\d+',
    'Temperature': r'\d+(?:\.\d+)?',
    'Respiratory rate': r'\d+',
    'Height': r'\d+',
    'Weight': r'\d+',
    'Oxygen saturation': r'\d+%',
}


def _bounded(alias):
  """Wrap one alias fragment in the word boundaries that make sense for it.

  A trailing \\b is only added when the alias ends in a letter or digit.
  After an alias that ends in punctuation (the 'w\\.' alias), \\b would
  demand a word character right after the dot, so 'W. 138' could never match.
  """
  trailing = r'\b' if alias[-1].isalnum() else ''
  return rf'\b{alias}{trailing}'


def _any_of(aliases):
  """Return a non-capturing alternation matching any of the given aliases."""
  return '(?:' + '|'.join(_bounded(alias) for alias in aliases) + ')'


# The label of any vital sign; the filler is not allowed to cross one of these.
_ANY_LABEL = _any_of(
    alias for aliases in VITAL_SIGN_ALIASES.values() for alias in aliases
)

patterns = {
    sign: rf'{_any_of(aliases)}(?:(?!{_ANY_LABEL}).)*?({VITAL_SIGN_VALUES[sign]})'
    for sign, aliases in VITAL_SIGN_ALIASES.items()
}

In [73]:
import re

# Try every vital-sign pattern on the sample note and print what it captures
sample_text = data['TEXT'][0]
for sign, pattern in patterns.items():
  matches = re.compile(pattern, re.IGNORECASE).findall(sample_text)
  print(f'{sign}: {matches}')

Blood pressure: []
Pulse: ['09', '48']
Temperature: ['10', '96.7']
Respiratory rate: []
Height: []
Weight: ['138']
Oxygen saturation: ['93%']
