In [None]:
# mount to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
# import the libraries necessary for reading files, creating dataframes, and utilizing regex formulas/equations
import os
import pandas as pd
import re

# create a dictionary for synonyms
synonyms = {
    "afib": "atrial fibrillation",
    "copd": "chronic obstructive pulmonary disease",
    "pna": "pneumonia",
    "cva": "cerebrovascular disease",
    "uti": "urinary tract infection",
    "htn": "hypertension",
    "chf": "chronic systolic heart failure",
    "cri": "chronic kidney disease",
    "ugib": "gastrointestinal hemorrhage",
    "gib": "gastrointestinal hemorrhage",
    "gi bleed": "gastrointestinal hemorrhage",
    "dm2": "type 2 diabetes mellitus",
    "dm": "type 1 diabetes mellitus",
    "diabetes type 2": "type 2 diabetes mellitus",
    "diabetes mellitus type ii": "type 2 diabetes mellitus",
    "cad": "atherosclerotic heart disease",
    "ards": "acute respiratory distress syndrome",
    "siadh": "syndrome of inappropriate secretion of antidiuretic hormone",
    "esrd on pd": "end stage renal disease",
    "esrd on hd": "end stage renal disease",
    "diabetes type": "type 1 diabetes mellitus",
    "nstemi": "non-st elevation myocardial infarcation",
    "shoulder oa": "shoulder osteoarthritis",
    "ruptured aaa": "abdominal aortic aneurysm, ruptured",
    "gram negative rod bacteremia": "bacteremia",
    "c difficile colitis": "clostridium difficile colitis",
    "hcv cirrhosis": "hepititus c virus cirrhosis"
    # additional synonyms found will go here
}

diagnosis_count = {} # dictionary for tracking diseases with their count of appearances in the EHR
notes_directory = "/content/drive/My Drive/Colab Notebooks/DischargeNotes-05" # directory of patient notes
note_list = os.listdir(notes_directory) # create a list containing strings of the name of each EHR text file
patients_with_diagnosis = 0 # initialize the amount of patients with a diagnosis as 0 before iterating through files

# read each file from the list of health records
for filename in note_list:
  with open(os.path.join(notes_directory, filename), 'r', encoding='utf-8') as f:
    patient_diagnoses = [] # form our list of diagnoses for the folder of EHR txt files
    reading_diagnoses = False # variable to flag whether we are reading diagnoses

    # iterarte through each line of the patient's health record
    for line in f:
      # check if we are starting to read the diagnoses
      if "Discharge Diagnosis:" in line or "Primary Diagnosis:" in line:
        patients_with_diagnosis += 1 # we add to the total number of patients with a diagnosis
        reading_diagnoses = True # we now know we are checking out a diagnosis summary so we must operate on it
        continue

      # check if we should stop reading diagnoses (reached an empty line)
      if reading_diagnoses and not line.strip():
        reading_diagnoses = False
        continue

      # operate if a discharge diagnosis is detected
      if reading_diagnoses:
        diagnosis = re.sub(r'^[1-9][0-9]*\)|[1-9][0-9]*\s*', '', line) # gets rid of all numbers and parantheses from diagnosis
        diagnosis = re.sub(r'\[.*?\]', '', diagnosis) # remove any portions of text including and in between square brackets
        diagnosis = diagnosis.replace(".", "") # replace periods with white space
        diagnosis = diagnosis.strip() # remove leading/trailing spaces
        diagnosis = diagnosis.lower() # changes diagnosis to lowercase

        # look through the synonym dictionary and replace any detected shorthand
        for synonym, term in synonyms.items():
          if synonym in diagnosis:
            diagnosis = term

        # add diagnosis to the patient diagnosis list
        if diagnosis:
          patient_diagnoses.append(diagnosis)

    # goes through the diagnosis list and adds to the count or creates a new count
    if patient_diagnoses:
      for diagnosis in patient_diagnoses:
        if diagnosis in diagnosis_count:
          diagnosis_count[diagnosis] += 1
        else:
          diagnosis_count[diagnosis] = 1

df = pd.DataFrame(diagnosis_count.items(), columns=['Diagnosis', 'Count']) # create pandas df with diagnosis and count columns
df = df.sort_values(by='Count', ascending=False) # ordering the count values by decending order
to_remove = ["secondary:", "primary:", "secondary diagnosis:", "primary diagnosis:", "primary diagnoses:"] # note the list false diagnoses, aka diagnosis headings
df = df[~df['Diagnosis'].str.lower().str.strip().str.contains('|'.join(to_remove))] # actual removal: takes out diagnosis headings, whitespaces, and makes diseases all lowercase
total_counts = df['Count'].sum() # count the total number of diseases detected through cleaning process

# Adding " patients" based on the count value
for index, row in df.iterrows():
    count = row['Count']
    if count == 1:
        df.at[index, 'Count'] = str(count) + " patient"
    elif count >= 2:
        df.at[index, 'Count'] = str(count) + " patients"

# printing out the top ten table (head) of the most prevalent diseases
top_10_diseases = df.head(10)
print(top_10_diseases.to_string(index=False))

# printing out of additional summary including the average number of diseases of patients with at least 1 disease
print()
print("Number of patients with at least 1 diagnosis:", patients_with_diagnosis)
print("Number of diseases found through record cleaning:", total_counts)
avg_diagnoses = (total_counts / patients_with_diagnosis)
print("Average number of diagnoses per patient with at least 1 diagnosis:", avg_diagnoses)

                            Diagnosis       Count
                         hypertension 11 patients
                  atrial fibrillation  8 patients
                  acute renal failure  7 patients
       chronic systolic heart failure  6 patients
          gastrointestinal hemorrhage  5 patients
             type 1 diabetes mellitus  5 patients
chronic obstructive pulmonary disease  5 patients
             congestive heart failure  5 patients
        atherosclerotic heart disease  4 patients
              urinary tract infection  4 patients

Number of patients with at least 1 diagnosis: 55
Number of diseases found through record cleaning: 260
Average number of diagnoses per patient with at least 1 diagnosis: 4.7272727272727275
