In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import contractions
import numpy as np

## Reading the Patient notes

In [2]:
#reading the patient notes data

file_path = "data/patient_notes.csv"
notes = pd.read_csv(file_path)
notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


### Missing value report

In [3]:
#missing value report
notes.isna().sum()

pn_num        0
case_num      0
pn_history    0
dtype: int64

### Preprocessing

In [4]:
print(f"There are {len(notes['case_num'].unique())} patients and {notes.shape[0]} patient notes in total.")
print(f"\nNumber of patient notes per case:\n{notes['case_num'].value_counts()}")

There are 10 patients and 42146 patient notes in total.

Number of patient notes per case:
case_num
3    9753
5    6909
4    5405
9    5151
8    4196
7    4101
0    2268
2    1958
6    1597
1     808
Name: count, dtype: int64


In [5]:
#setting up few additional contractions to preprocess the data

contractions.add('yo', 'year old')
contractions.add('y o', 'year old')
contractions.add('y.o.', 'year old')
contractions.add('y.o', 'year old')
contractions.add('mos', 'months')
contractions.add('mo', 'months')
contractions.add('min', 'minutes')
contractions.add('mins', 'minutes')
contractions.add('y/o', 'year old') 
contractions.add('yr,', 'year')
contractions.add('m', 'male')
contractions.add('f','female')
contractions.add('yr', 'year')

In [6]:
#taken from here, slightly edited: https://www.dukemychart.org/home/en-us/docs/commonmedicalabbreviations.pdf
# adding more medical abbrevations to contractions list

test = '''A/P: Assessment and Plan
BMI: Body Mass Index
BMP: Basic Metabolic Profile 
BP: Blood Pressure
C&S: Culture and Sensitivity
C/O: complains of
CBC: Complete Blood Count
CC: Chief Complaint
CCE: clubbing, cyanosis or edema
Chemistry: a blood test looking at levels of electrolytes and kidney or liver function
Chem Panel: a blood test looking at levels of electrolytes and kidney or liver function
CKD: Chronic Kidney Disease
CMP: a blood test looking at levels of electrolytes, kidney and liver function
D/Dx: Differential Diagnosis
DOE: Dyspnea on exertion
DM: Diabetes Mellitus
DMII: Diabetes Mellitus Type II
ECG/EKG: Electrocardiogram
EOMI: Extra-ocular eye movements intact
ESRD: End Stage Renal Disease
ETOH: Alcohol
ETT: Endotracheal tube 
EXT: Extremities
F/U: Follow-up
GI: Gastrointestinal
GU: Genito-urinary (referring to the Urinary Tract)
H&H: Hemoglobin and Hematocrit
H&P: History and Physical
HCT: Hematocrit
HGB: Hemoglobin
HgBA1C: A blood test that measures your average blood glucose control over the last 3 months
HPI: History of the Present Illness
HEENT: Head, Ears, Eyes, Nose and Throat
HTN: Hypertension (High Blood Pressure)
I&D: Incision and Drainage
IM: intra-muscular
IMP: Impression
IV: Intra-venous
LBP: low back pain
LMP: last menstrual period
ND: naso-duodenal 
Neuro: Neurologic 
NG: naso-gastric
NJ: naso-jejunal
N/V: nausea and vomiting
OT: Occupational Therapy 
P: pulse
PCP: Primary Care Provider
PERRLA: Pupils equal, round and reactive to light and accommodation
PLT: Platelets
PMHx:Past Medical History
PO: to be taken by mouth
PR: to be taken by rectum
PRN: As needed
PSHx: Past Surgical History
Pt: patient
Renal Function Panel: a blood test looking at levels of electrolytes and kidney function
R/O: Rule Out
RR: Respiratory Rate
SocHx or SH:Social History
SOB: Shortness of breath
SQ: Sub-cutaneous
ST: Speech Therapy
STI: Sexually transmitted infection
T: Temperature
TM: Tympanic membrane
UA: Urinalysis
URI: Upper Respiratory Infection
UTI: Urinary Tract Infection
VSS: Vital Signs Stable
WBC: White blood cell 
WCC: Well Child Check
WT: Weight
PMH: Past Medical History'''

for x in test.split('\n'):
    contractions.add(x.split(':')[0], x.split(':')[1])

In [7]:
def preprocess_text(text, flag):
    '''
    preprocessing the required text column to convert case, remove number, remove contractions and stopwords
    '''
    # Convert to lower case
    text = text.lower()
    
    ## add space inbetween numbers and letters (e.g. 5mg to 5 mg, 17yo to 17 yo)
    text = re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', text)
    
    #remove numbers
    text = re.sub(r'\d+', '', text)

    # Expand contractions (e.g., "can't" to "can not")
    text = contractions.fix(text)
    
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    #removing stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Using Lemmatizer or Stemmer
    if flag:
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    else:
        stemmer = PorterStemmer()
        text = ' '.join([stemmer.stem(word) for word in text.split()])

    return text

In [8]:
# Apply the preprocessing to the 'pn_history' column - stemming the text
notes['stemmed_pn_history'] = notes['pn_history'].apply(lambda x: preprocess_text(x, False))
for i in notes['stemmed_pn_history'].head(3):
    print(i)
    print()


year old male come student health clinic complain heart pound mr cleveland mother given verbal consent histori physic examin treatment began month ago sudden intermitt day last minut worsen non allev aggrav associ dispnea exers rest stress school report fe feel like heart jump chest ro deni chest pain dyaphoresi weight loss chill fever nausea vomit pedal edeam past medic histori non med aderol friend nkda fh father mi recent mother thyroid dz sh non smoker mariguana month ago beer weekend basketbal school sh std

year old male recurr palpit past month last minut happen time sinc begin one time durign baskebal game two day ago light headed pressur chest catch breath faint teh episod sweat diarrhea heat intoler weight loss tri aterol abl better concentr receiv roommat

dillon cleveland year old male patient signific past medic histori present complaint heart pound go month happen twice month cannot think trigger occur activ rest occasion accompani chest pressur pain locat center chest on

In [9]:
# Apply the preprocessing to the 'pn_history' column - lemmatizing the text
notes['lemmatize_pn_history'] = notes['pn_history'].apply(lambda x: preprocess_text(x, True))
for i in notes['lemmatize_pn_history'].head(3):
    print(i)
    print()


year old male come student health clinic complaining heart pounding mr cleveland mother given verbal consent history physical examination treatment began month ago sudden intermittent day lasting minute worsening non allev aggrav associated dispnea exersion rest stressed school report fe feel like heart jumping chest ro denies chest pain dyaphoresis weight loss chill fever nausea vomiting pedal edeam past medical history non med aderol friend nkda fh father mi recently mother thyroid dz sh non smoker mariguana month ago beer weekend basketball school sh std

year old male recurrent palpitation past month lasting minute happened time since beginning one time durign baskeball game two day ago light headedness pressure chest catching breath fainting teh episode sweating diarrhea heat intolerance weight loss tried aterol able better concentrate received roommate

dillon cleveland year old male patient significant past medical history present complaint heart pounding going month happens twi

<br>
<br>

### Notes:

- Comparing the processed text from both lemmatized and stemmed text, using lemmatizer makes more sense.

- Therefore, I am using Lemmatization.

In [10]:
#creating initial note length column
notes['initial_note_length'] = notes['pn_history'].apply(lambda x: len(x))


In [11]:

#dropping other unnecessary columns along with "stemmed_pn_history"
notes.drop(['pn_num', 'pn_history', 'stemmed_pn_history'], axis=1, inplace = True)

#renaming lemmatized column name
notes.rename({'lemmatize_pn_history' : 'processed_notes'}, inplace=True, axis = 1)

#creating processed note length column
notes['processed_note_length'] = notes['processed_notes'].apply(lambda x: len(x))
notes

Unnamed: 0,case_num,processed_notes,initial_note_length,processed_note_length
0,0,year old male come student health clinic compl...,695,563
1,0,year old male recurrent palpitation past month...,424,305
2,0,dillon cleveland year old male patient signifi...,835,571
3,0,year old male complains palpitation started mo...,700,540
4,0,year old male past medical history evaluation ...,948,673
...,...,...,...,...
42141,9,m madden year old female presenting w worst ha...,765,545
42142,9,year old female came complain dull headache as...,634,504
42143,9,m madden year old female present headache day ...,928,661
42144,9,stephanie madden year old woman complaining he...,697,558


In [12]:
notes.to_csv('data/processed_notes.csv', index = False)