In [2]:
%%capture
# Unpack the project archive into the working directory. The %%capture cell
# magic above swallows the cell's stdout/stderr so unzip's per-file listing
# does not flood the notebook. Later cells read from Project3Data/Project3_data/,
# which is presumably created by this step.
!unzip Project3Data.zip

In [90]:
import pandas as pd

# Peek at one clinical note to get a feel for the free-text layout.
# read_xml yields a DataFrame; the note body sits in the TEXT column of row 0.
data = pd.read_xml('Project3Data/Project3_data/400-01.xml')

data.loc[0, 'TEXT']

'Record date: 2062-06-10\n\nMedical Walk In Unit\n\nHartsville General Hospital\n\n21 Brook St \n\nPataskala, RI 33689\n\nPhone: 929-462-2794   Fax: 648-875-5821 \n\n\n\nPCP: \n\n\n\n\n\nNursing Assessment:  \n\nWT:    Temp:    P: 100     BP: 190/116\n\n\n\nReason for visit: Pt just left the EW.  Noted HTN has been off meds for a while.  Was nor given any meds\n\n\n\nLab Data :\n\n\n\nAllergies:      \n\n\n\n\n\nMedication: \n\noff meds for more than 3 months\n\n\n\nHistory of the Present Illness:                                \n\n59 y/o female here for high blood pressure.  Pt was seen in EW for medications and was given Seroquel, Neurontin, and amytriptyline.  Pt has been out of meds for more than 3 months.\n\n\n\nPt remembers being atenolol and HCTZ.  Pt plans to come to the medical walk-in tomorrow with all her medications.\n\n\n\nPt denies HA, visual problems, chest pain, problems with urination.\n\n\n\nFather had CAD and died at age 55.  Mother died when she was 89 (10 months ag

The goal is to provide a frequency distribution of the vital signs/physical exam readings observed across the patient clinical notes.

What are the vital signs/physical exams?
*   Blood pressure
*   Pulse
*   Temperature
*   Respiratory rate
*   Height
*   Weight
*   Oxygen saturation

Since we are analyzing free-text data that is only semi-structured, I read through a bunch of clinical notes and manually extracted aliases for each vital sign.
*   Blood pressure: ```[Blood pressure, BP]```
*   Pulse: ```[Pulse, P, HR, heart rate]```
*   Temperature: ```[Temperature, T, Temp]```
*   Respiratory rate: ```[Respiratory rate, Respirations, RR, Fs]```
*   Height: ```[Height, Ht]```
*   Weight: ```[Weight, Wt, wgt, weighing, W., or no label at all (just a number followed by 'pounds' or 'lbs')]```
*   Oxygen saturation: ```[Oxygen saturation, SaO2, O2 saturation, saturating, Sat, O2 sat, O2sat]```

In [93]:
# Extract the names of all the files to be analyzed.
# The listing file is read line by line, one file name per line.
with open('Project3Data/Project3_data/MIscellaneous/pl.txt', 'r') as f:
  # strip() instead of rstrip('\n') also removes a stray '\r' left by CRLF
  # line endings, and blank lines are skipped so no empty file name is
  # handed to pd.read_xml later. The `with` block closes the file on exit,
  # so no explicit f.close() is needed.
  file_names = [line.strip() for line in f if line.strip()]
file_names[0]

'100-01.xml'

In [100]:
# Define the aliases and regex patterns for each vital sign.
#
# Every pattern has the same three-part shape:
#   1. a label   - one of the aliases for the vital sign being extracted,
#   2. a gap     - the shortest run of characters (on the same line, since '.'
#                  does not match a newline) that does NOT step onto the label
#                  of any vital sign, so a reading belonging to the next vital
#                  sign is never attributed to this one,
#   3. a value   - the single capture group, so re.findall returns just the
#                  reading as a string.
# The patterns are written in lower case and are meant to be used with
# re.IGNORECASE.

# Aliases made only of word characters/spaces; each is delimited with \b on
# both sides. Includes 'wgt' and 'o2sat' so that every alias that can start a
# pattern also stops the gap of the other patterns.
_WORD_ALIASES = (
    'blood pressure|bp|pulse|p|hr|heart rate|temperature|t|temp|'
    'respiratory rate|respirations|rr|fs|height|ht|wt|wgt|weight|weighing|'
    'oxygen saturation|sao2|o2 saturation|saturating|sat|o2 sat|o2sat'
)

# 'w.' ends in a period, so it must NOT be followed by \b: in "W. 180" there
# is no word boundary between '.' and ' ', and a trailing \b would make the
# alias unmatchable there. It is therefore kept outside the \b-delimited group.
_ANY_ALIAS = rf'\b(?:(?:{_WORD_ALIASES})\b|w\.)'

# Lazily consume characters as long as no vital-sign label starts at the
# current position.
_GAP = rf'(?:(?!{_ANY_ALIAS}).)*?'


def _vital_pattern(label, value):
  """Return the regex `label`, then the alias-free gap, then `value` captured."""
  return rf'{label}{_GAP}({value})'


patterns = {
    'Blood pressure': _vital_pattern(r'\b(?:blood pressure|bp)\b', r'\d+/\d+'),
    'Pulse': _vital_pattern(r'\b(?:pulse|p|hr|heart rate)\b', r'\d+'),
    'Temperature': _vital_pattern(r'\b(?:temperature|t|temp)\b', r'\d+(?:\.\d+)?'),
    'Respiratory rate': _vital_pattern(r'\b(?:respiratory rate|respirations|rr|fs)\b', r'\d+'),
    'Height': _vital_pattern(r'\b(?:height|ht)\b', r'\d+'),
    'Weight': _vital_pattern(r'\b(?:(?:wt|wgt|weight|weighing)\b|w\.)', r'\d+'),
    'Oxygen saturation': _vital_pattern(
        r'\b(?:oxygen saturation|sao2|o2 saturation|saturating|sat|o2 sat|o2sat)\b',
        r'\d+%',
    ),
}

In [101]:
import re

# Preview: run every vital-sign regex over the first ten notes and print
# whatever each one captures, one block of lines per file.
for xml_file in file_names[:10]:
  note_frame = pd.read_xml(f'Project3Data/Project3_data/{xml_file}')
  clinical_note = note_frame['TEXT'][0]
  print(f'For {xml_file}:')
  for sign, pattern in patterns.items():
    # Case-insensitive search; findall returns the captured reading(s) only.
    matches = re.compile(pattern, re.IGNORECASE).findall(clinical_note)
    print(f'{sign}: {matches}')
  print()

For 100-01.xml:
Blood pressure: []
Pulse: []
Temperature: []
Respiratory rate: []
Height: []
Weight: []
Oxygen saturation: []

For 100-02.xml:
Blood pressure: ['119/90']
Pulse: ['82']
Temperature: ['97.9', '03']
Respiratory rate: []
Height: []
Weight: []
Oxygen saturation: []

For 100-03.xml:
Blood pressure: ['146/88']
Pulse: ['11', '11']
Temperature: []
Respiratory rate: ['16']
Height: []
Weight: ['195']
Oxygen saturation: []

For 100-04.xml:
Blood pressure: []
Pulse: ['70']
Temperature: ['2']
Respiratory rate: []
Height: []
Weight: []
Oxygen saturation: []

For 100-05.xml:
Blood pressure: []
Pulse: ['12', '10']
Temperature: []
Respiratory rate: []
Height: []
Weight: []
Oxygen saturation: []

For 101-01.xml:
Blood pressure: ['130/80']
Pulse: ['68', '108']
Temperature: []
Respiratory rate: []
Height: []
Weight: []
Oxygen saturation: []

For 101-02.xml:
Blood pressure: ['146/86']
Pulse: ['64']
Temperature: []
Respiratory rate: ['20']
Height: []
Weight: []
Oxygen saturation: []

For 101-