In [1]:
%%capture
# Extract the project data. -o overwrites existing files without asking, so
# re-running this cell (e.g. Restart & Run All) does not wait on unzip's
# interactive "replace?" prompt, which %%capture would hide.
!unzip -o Project3Data.zip

In [2]:
import pandas as pd

# Load one clinical note and show its free text (row 0 of the TEXT column)
data = pd.read_xml('Project3Data/Project3_data/400-01.xml')

data.loc[0, 'TEXT']

'Record date: 2062-06-10\n\nMedical Walk In Unit\n\nHartsville General Hospital\n\n21 Brook St \n\nPataskala, RI 33689\n\nPhone: 929-462-2794   Fax: 648-875-5821 \n\n\n\nPCP: \n\n\n\n\n\nNursing Assessment:  \n\nWT:    Temp:    P: 100     BP: 190/116\n\n\n\nReason for visit: Pt just left the EW.  Noted HTN has been off meds for a while.  Was nor given any meds\n\n\n\nLab Data :\n\n\n\nAllergies:      \n\n\n\n\n\nMedication: \n\noff meds for more than 3 months\n\n\n\nHistory of the Present Illness:                                \n\n59 y/o female here for high blood pressure.  Pt was seen in EW for medications and was given Seroquel, Neurontin, and amytriptyline.  Pt has been out of meds for more than 3 months.\n\n\n\nPt remembers being atenolol and HCTZ.  Pt plans to come to the medical walk-in tomorrow with all her medications.\n\n\n\nPt denies HA, visual problems, chest pain, problems with urination.\n\n\n\nFather had CAD and died at age 55.  Mother died when she was 89 (10 months ag

The goal is to provide a frequency distribution of the vital signs/physical exam readings observed across the patient clinical notes.

What are the vital signs/physical exams?
*   Blood pressure
*   Pulse
*   Temperature
*   Respiratory rate
*   Height
*   Weight
*   Oxygen saturation

Since we are analyzing free-text data that is only semi-structured, I read through a bunch of clinical notes and manually extracted aliases for each vital sign.
*   Blood pressure: ```[Blood pressure, BP]```
*   Pulse: ```[Pulse, P, HR, heart rate]```
*   Temperature: ```[Temperature, T, Temp]```
*   Respiratory rate: ```[Respiratory rate, Respirations, RR, Fs]```
*   Height: ```[Height, Ht]```
*   Weight: ```[Wt, wgt, none (only a number followed by 'pounds' or 'lbs'), weighing, W.]```
*   Oxygen saturation: ```[Oxygen saturation, SaO2, O2 saturation, saturating, Sat, O2 sat, O2sat]```

In [3]:
# Extract the name of all the files to be analyzed (one file name per line)
with open('Project3Data/Project3_data/MIscellaneous/pl.txt', 'r') as f:
    # Strip surrounding whitespace and skip blank lines so that an empty line
    # in the listing never turns into an empty file name
    file_names = [name for name in (line.strip() for line in f) if name]
# No explicit f.close() needed: the with-block has already closed the file
file_names[0]

'100-01.xml'

In [28]:
# Define the aliases and regex patterns for each vital sign.
# Everything is written in lower case; the patterns are applied with re.IGNORECASE.

# Aliases per vital sign, written as regex fragments. Height has no extraction
# pattern of its own, but its labels still act as stop words for the others.
_vital_aliases = {
    'Blood pressure': ('blood pressure', 'bp'),
    'Pulse': ('pulse', 'p', 'hr', 'heart rate'),
    'Temperature': ('temperature', 't', 'temp'),
    'Respiratory rate': ('respiratory rate', 'respirations', 'rr', 'fs'),
    'Height': ('height', 'ht'),
    'Weight': ('wt', 'wgt', 'weight', 'weighing', r'w\.'),
    'Oxygen saturation': ('oxygen saturation', 'sao2', 'o2 saturation',
                          'saturating', 'sat', 'o2 sat', 'o2sat'),
}


def _alias_group(aliases):
    """Return a non-capturing alternation matching any of `aliases` as a whole token."""
    alternatives = []
    for alias in aliases:
        # A trailing \b only makes sense after a word character. 'w\.' ends in a
        # literal dot, and '\bw\.\b' can never match "W. 170" because there is
        # no word boundary between '.' and the following space.
        closing = r'\b' if alias[-1].isalnum() else ''
        alternatives.append(rf'\b{alias}{closing}')
    return '(?:' + '|'.join(alternatives) + ')'


# Any vital-sign label. A value search must not run past one of these,
# otherwise a reading would be attributed to the wrong sign.
_stop_words = _alias_group(
    [alias for aliases in _vital_aliases.values() for alias in aliases]
)


def _label_then_value(sign, value_regex):
    """Build '<alias of sign> ... <value>' where the gap contains no other label.

    The gap is matched lazily, so the first value after the label is captured,
    and '.' does not cross a newline. The value is the pattern's only capturing
    group, so re.findall returns a flat list of strings.
    """
    label = _alias_group(_vital_aliases[sign])
    return rf'{label}(?:(?!{_stop_words}).)*?({value_regex})'


patterns = {
    'Blood pressure': _label_then_value('Blood pressure', r'\d+/\d+'),
    # Pulse is stricter: the number must directly follow the label
    # (optionally separated by ':' or '-').
    'Pulse': _alias_group(_vital_aliases['Pulse']) + r'\s*[:\-]?\s*(\d+)',
    'Temperature': _label_then_value('Temperature', r'\d+(?:\.\d+)?'),
    'Respiratory rate': _label_then_value('Respiratory rate', r'\d+'),
    'Weight': _label_then_value('Weight', r'\d+'),
    'Oxygen saturation': _label_then_value('Oxygen saturation', r'\d+%'),
}

In [29]:
# Plausibility filters: each entry receives the captured text of a reading and
# returns True when the value lies inside the accepted range (bounds inclusive).
def _within(low, high, parse=int):
    """Build a check that parses the captured text and tests low <= value <= high."""
    def check(x):
        return low <= parse(x) <= high
    return check


def _blood_pressure_ok(x):
    """Accept 'SYS/DIA' text when SYS is in 80-180 and DIA is in 40-110."""
    # The diastolic part is only parsed once the systolic part has passed
    if not 80 <= int(x.split('/')[0]) <= 180:
        return False
    return 40 <= int(x.split('/')[1]) <= 110


def _percent(x):
    """Parse text such as '98%' into the integer 98."""
    return int(x.rstrip('%'))


valid_ranges = {
    'Blood pressure': _blood_pressure_ok,
    'Pulse': _within(40, 120),
    'Temperature': _within(90, 110, parse=float),
    'Respiratory rate': _within(8, 30),
    'Weight': _within(100, 400),
    'Oxygen saturation': _within(85, 100, parse=_percent),
}

In [37]:
# Function to calculate the median
def calculate_median(values):
    """Return the median of a list of numbers, or None for an empty list.

    Args:
        values: list of numbers. It is left unmodified; a sorted copy is used.

    Returns:
        The middle element for an odd number of values, the mean of the two
        middle elements (a float) for an even number, or None when `values`
        is empty.
    """
    if not values:  # If the list is empty
        return None
    # sorted() instead of values.sort(), so the caller's list keeps its order
    ordered = sorted(values)
    n = len(ordered)
    middle = n // 2
    if n % 2 == 1:
        # Odd number of values (including a single value): the middle one
        return ordered[middle]
    # Even number of values: average of the two middle values
    return (ordered[middle - 1] + ordered[middle]) / 2

In [40]:
import re

# NOTE(review): this frame is created here but never filled in this cell, and two
# column names ('Blood pressue', 'Respiraoty rate') do not match the keys of
# `patterns`. Left unchanged in case another cell refers to these exact names.
vital_signs_data = pd.DataFrame(columns=['filename', 'Blood pressue', 'Pulse', 'Temperature', 'Respiraoty rate', 'Weight', 'Oxygen saturation'])

# Print, for every 100th clinical note, the median of the valid readings of each vital sign
for xml_file in file_names[::100]:
    print(f'For {xml_file}:')
    clinical_note = pd.read_xml(f'Project3Data/Project3_data/{xml_file}')['TEXT'][0]
    for sign, pattern in patterns.items():
        matches = re.findall(pattern, clinical_note, re.IGNORECASE)
        # Keep only readings that pass the plausibility filter for this sign
        valid_matches = [match for match in matches if valid_ranges[sign](match)]
        if not valid_matches:
            # Same message for every sign, blood pressure included
            # (instead of 'None/None' when no reading is valid)
            print(f'  {sign}: No valid data')
            continue

        if sign == 'Blood pressure':
            # Systolic and diastolic medians are computed independently
            systolic = [int(bp.split('/')[0]) for bp in valid_matches]
            diastolic = [int(bp.split('/')[1]) for bp in valid_matches]
            median_value = f'{calculate_median(systolic)}/{calculate_median(diastolic)}'
        elif sign == 'Oxygen saturation':
            # Drop the trailing '%' captured by the pattern before converting
            median_value = calculate_median([int(match.rstrip('%')) for match in valid_matches])
        elif sign == 'Temperature':
            median_value = calculate_median([float(match) for match in valid_matches])
        else:
            median_value = calculate_median([int(match) for match in valid_matches])
        print(f'  {sign}: {median_value}')
    print()

For 100-01.xml:
  Blood pressure: None/None
  Pulse: No valid data
  Temperature: No valid data
  Respiratory rate: No valid data
  Weight: No valid data
  Oxygen saturation: No valid data

For 123-01.xml:
  Blood pressure: 130/90
  Pulse: 64
  Temperature: No valid data
  Respiratory rate: No valid data
  Weight: 170
  Oxygen saturation: No valid data

For 146-03.xml:
  Blood pressure: None/None
  Pulse: 65
  Temperature: No valid data
  Respiratory rate: 14
  Weight: No valid data
  Oxygen saturation: No valid data

For 168-03.xml:
  Blood pressure: 135/80
  Pulse: 78
  Temperature: 97.4
  Respiratory rate: 18
  Weight: No valid data
  Oxygen saturation: 99

For 190-01.xml:
  Blood pressure: 160/80
  Pulse: No valid data
  Temperature: No valid data
  Respiratory rate: 16
  Weight: No valid data
  Oxygen saturation: No valid data

For 214-01.xml:
  Blood pressure: 139.5/66.5
  Pulse: 64
  Temperature: 95.7
  Respiratory rate: No valid data
  Weight: 144
  Oxygen saturation: No valid 