""" 
IDEA Lab Project: PILOT NLP PROJECT on MIMICIII DATASET WITH AWS SageMaker/Jupyterlab
Prepared for : Atahabasca University IDEA Lab Reasercah Center
Prepared by: Russell Rupok, Graduate Research Student and MSc in IS student
Date: January 15,2022 

"""
    

In [1]:
#!/usr/bin/env python3

In [2]:
""" 
Project: PREVALENCE, PREDICTORS, AND CONSEQUENCES OF STIGMATIZING LANGUAGE IN NURSING NOTES

Data Source: MIMIC III AWS dataset from https://registry.opendata.aws/mimiciii/

Project goal and problem: 

Goal of our project is to determine prevalence, predictors, and consequences of stigmatizing language in nursing transfer/progress notes.

a.	Determine if stigmatizing language is present in nursing documentation from critical care admissions.
b.	Examine possible antecedents (e.g., admitting diagnosis, age, race/ethnicity)
c.	Examine possible consequences (e.g., pain medication practices, family involvement, morbidity, mortality, length of stay)
d.	Possible other language indicators related to above factors (i.e., Linguistic Inquiry and Word Count)
 
This program will “pull” patient’s health record from 7703 unique patient's id along with patient’s Subject_ID, hospital admission, chartdate, chart time , nurtses text. 

  
"""

"\u2003\nProject: PREVALENCE, PREDICTORS, AND CONSEQUENCES OF STIGMATIZING LANGUAGE IN NURSING NOTES\n\nData Source: MIMIC III AWS dataset from https://registry.opendata.aws/mimiciii/\n\nProject goal and problem: \n\nGoal of our project is to determine prevalence, predictors, and consequences of stigmatizing language in nursing transfer/progress notes.\n\na.\tDetermine if stigmatizing language is present in nursing documentation from critical care admissions.\nb.\tExamine possible antecedents (e.g., admitting diagnosis, age, race/ethnicity)\nc.\tExamine possible consequences (e.g., pain medication practices, family involvement, morbidity, mortality, length of stay)\nd.\tPossible other language indicators related to above factors (i.e., Linguistic Inquiry and Word Count)\n \nThis program will “pull” patient’s health record from 7703 unique patient's id along with patient’s Subject_ID, hospital admission, chartdate, chart time , nurtses text. \n\n  \n"

In [3]:
#importing python module to complete the analysis and craete a model

In [4]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
import numpy as np
import matplotlib 
import pandas as pd
import string
import statistics
import re
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
#nltk.download()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RRupok\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RRupok\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [5]:
#check current local directory of this notebook

In [6]:
pwd

'C:\\Users\\RRupok\\MIMICiii'

In [7]:
#import mimiciii dataset after query using AWS Athena 

In [8]:
df=pd.read_csv("nursestransferprogressNotesV2.csv")
print(df)

       Unnamed: 0  subject_id   hadm_id        chartdate         charttime  \
0               0     67053.0  138302.0  2164-01-08 0:00   2164-01-08 7:36   
1               1     82482.0  113214.0  2194-12-15 0:00   2194-12-15 3:51   
2               2     68221.0  180926.0  2167-12-07 0:00  2167-12-07 21:49   
3               3     54461.0  110002.0  2117-11-06 0:00  2117-11-06 16:55   
4               4     62143.0  167116.0  2102-12-11 0:00   2102-12-11 0:53   
...           ...         ...       ...              ...               ...   
82633      197137     26991.0       NaN  2166-02-06 0:00   2166-02-06 5:58   
82634      197158     14131.0  138678.0  2118-02-23 0:00   2118-02-23 4:58   
82635      197172     43284.0  190269.0  2118-03-26 0:00   2118-03-26 6:42   
82636      197181     32414.0  189094.0  2106-05-15 0:00   2106-05-15 6:03   
82637      197193     79501.0  119347.0  2165-10-22 0:00   2165-10-22 6:03   

                 description  \
0      Nursing Progress Note   

In [9]:
#Examnine data and identify any inconsistancy

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82638 entries, 0 to 82637
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   82638 non-null  int64  
 1   subject_id   82638 non-null  float64
 2   hadm_id      82073 non-null  float64
 3   chartdate    82638 non-null  object 
 4   charttime    82637 non-null  object 
 5   description  82638 non-null  object 
 6   text         82637 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 4.4+ MB


In [11]:
#df['duplicate']=df['charttime'].shift(1)
#df['charttime']=df.apply(lambda x:np.nan if x['charttime'] == x['duplicate']
                    #else x['charttime'], axis =1)
   
#df1=df.drop('duplicate',axis=1)

In [12]:
# fill empty entry in hospital admission , there are around 600 empty hospital admission which has a valid subject id and other data in place

In [17]:
df['hadm_id']=df['hadm_id'].fillna(0)
df['charttime']=df['charttime'].fillna(0)
df['text']=df['text'].fillna('there was no notes here')

In [18]:
#making sure no duplicates left in charttime after doing ETL in power BI

In [19]:
df1=df.drop_duplicates(subset=['charttime'])

In [20]:
# renaming column name "text" to nurses notes for better viewer understanding

In [21]:
df1=df.rename(columns={'text':'nursesnotes'})
print(df1)

       Unnamed: 0  subject_id   hadm_id        chartdate         charttime  \
0               0     67053.0  138302.0  2164-01-08 0:00   2164-01-08 7:36   
1               1     82482.0  113214.0  2194-12-15 0:00   2194-12-15 3:51   
2               2     68221.0  180926.0  2167-12-07 0:00  2167-12-07 21:49   
3               3     54461.0  110002.0  2117-11-06 0:00  2117-11-06 16:55   
4               4     62143.0  167116.0  2102-12-11 0:00   2102-12-11 0:53   
...           ...         ...       ...              ...               ...   
82633      197137     26991.0       0.0  2166-02-06 0:00   2166-02-06 5:58   
82634      197158     14131.0  138678.0  2118-02-23 0:00   2118-02-23 4:58   
82635      197172     43284.0  190269.0  2118-03-26 0:00   2118-03-26 6:42   
82636      197181     32414.0  189094.0  2106-05-15 0:00   2106-05-15 6:03   
82637      197193     79501.0  119347.0  2165-10-22 0:00   2165-10-22 6:03   

                 description  \
0      Nursing Progress Note   

In [22]:
df1.describe()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id
count,82638.0,82638.0,82638.0
mean,77732.829364,59087.623309,148670.324923
std,55611.501248,25606.253697,30782.918532
min,0.0,23.0,0.0
25%,26085.25,42110.0,125351.25
50%,74266.0,60754.5,148413.0
75%,124205.75,79804.0,173929.0
max,197193.0,99999.0,199972.0


In [23]:
#df1['subject_id'].value_counts()

In [24]:
"""data_after_duplicate=df1.drop_duplicates()
print(data_after_duplicate)"""

'data_after_duplicate=df1.drop_duplicates()\nprint(data_after_duplicate)'

In [25]:
# used pandas fillna method to fill out hadm_id with 1 and charttime and nurses notes with text and intager value
# keeping entry empty will give AttributeError: 'float' object has no attribute 'i.e.lower'

In [26]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82638 entries, 0 to 82637
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   82638 non-null  int64  
 1   subject_id   82638 non-null  float64
 2   hadm_id      82638 non-null  float64
 3   chartdate    82638 non-null  object 
 4   charttime    82638 non-null  object 
 5   description  82638 non-null  object 
 6   nursesnotes  82638 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 4.4+ MB


In [27]:
df1.isnull()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
82633,False,False,False,False,False,False,False
82634,False,False,False,False,False,False,False
82635,False,False,False,False,False,False,False
82636,False,False,False,False,False,False,False


In [28]:
print(f"Dataset has total {df1['hadm_id'].nunique()} Unique Hospital Admission")
print(f"Dataset has total {df1['subject_id'].nunique()} subject_id")

Dataset has total 9069 Unique Hospital Admission
Dataset has total 7703 subject_id


In [29]:
#df1['subject_id'].value_counts()

#df1['hadm_id'].value_counts()


In [30]:
#df1['hadm_id'].value_counts()

In [31]:
#df1['hadm_id'].nunique()

In [32]:
def lowertext(text):
    lower_case=(text.lower())
    return lower_case

In [33]:
df1["lowercasenursestext"]= df1['nursesnotes'].apply(lambda x:lowertext(x))
df1.head(10)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor..."
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses..."
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ..."
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s..."
5,5,49671.0,109832.0,2174-01-15 0:00,2174-01-15 1:35,Nursing Progress Note,"CVA (Stroke, Cerebral infarction), Other\n A...","cva (stroke, cerebral infarction), other\n a..."
6,6,79511.0,109613.0,2176-11-21 0:00,2176-11-21 13:58,Nursing Progress Note,75y.o female transferred from [**Hospital 1294...,75y.o female transferred from [**hospital 1294...
7,7,54461.0,110002.0,2117-11-04 0:00,2117-11-04 17:04,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ..."
8,8,65124.0,122850.0,2179-11-22 0:00,2179-11-22 15:04,Nursing Progress Note,57F with obesity and advanced [**Hospital 936*...,57f with obesity and advanced [**hospital 936*...
9,9,32195.0,188413.0,2160-01-14 0:00,2160-01-14 17:07,Nursing Progress Note,"30yo female w/ DM type I, HTN, ESRD on daily P...","30yo female w/ dm type i, htn, esrd on daily p..."


In [34]:
def word_count(text):
    word_list=text.split()
    return(len(word_list))

In [35]:
df1["totalnotesword"]= df1['nursesnotes'].apply(lambda x:word_count(x))
df1.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext,totalnotesword
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor...",80
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...,89
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses...",83
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",502
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s...",541


In [36]:
df1["totalhadms"]= df1.groupby(['subject_id'])['hadm_id'].transform('count')
df1.head(10)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext,totalnotesword,totalhadms
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor...",80,42
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...,89,8
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses...",83,32
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",502,43
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s...",541,73
5,5,49671.0,109832.0,2174-01-15 0:00,2174-01-15 1:35,Nursing Progress Note,"CVA (Stroke, Cerebral infarction), Other\n A...","cva (stroke, cerebral infarction), other\n a...",97,8
6,6,79511.0,109613.0,2176-11-21 0:00,2176-11-21 13:58,Nursing Progress Note,75y.o female transferred from [**Hospital 1294...,75y.o female transferred from [**hospital 1294...,790,10
7,7,54461.0,110002.0,2117-11-04 0:00,2117-11-04 17:04,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",537,43
8,8,65124.0,122850.0,2179-11-22 0:00,2179-11-22 15:04,Nursing Progress Note,57F with obesity and advanced [**Hospital 936*...,57f with obesity and advanced [**hospital 936*...,422,17
9,9,32195.0,188413.0,2160-01-14 0:00,2160-01-14 17:07,Nursing Progress Note,"30yo female w/ DM type I, HTN, ESRD on daily P...","30yo female w/ dm type i, htn, esrd on daily p...",155,13


In [37]:
#df1["totalhadms_avg"]= df1.groupby(['subject_id'])['hadm_id'].transform('mean')
#df1.head(10)

In [38]:
df1.describe()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,totalnotesword,totalhadms
count,82638.0,82638.0,82638.0,82638.0,82638.0
mean,77732.829364,59087.623309,148670.324923,251.344938,32.19819
std,55611.501248,25606.253697,30782.918532,161.987493,36.047893
min,0.0,23.0,0.0,1.0,1.0
25%,26085.25,42110.0,125351.25,134.0,8.0
50%,74266.0,60754.5,148413.0,219.0,19.0
75%,124205.75,79804.0,173929.0,337.0,43.0
max,197193.0,99999.0,199972.0,3458.0,268.0


In [39]:
#selected_words=['hospice','euthanasia','PEOLC','comfortonly','terminal','donotresuscitate','dnr','goal of care','level of care','comfort care','palliative','end of life']

#df1.loc[df1["nursesnotes"].isin(selected_words),"nursesnotes"].value_counts()

In [63]:
def average_value (text):
    avg=round(sum(df1.totalhadms)/len(df1.totalhadms))
    return avg

In [64]:
average_value(df1.totalhadms)

32

In [65]:
df1["average_totalHADMnotes"]= df1['totalhadms'].apply(lambda x:average_value(x))
df1.head(10)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext,totalnotesword,totalhadms,average_totalHADMnotes,remove_punctuation,notes_nostopword,notes_lemmatize
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor...",80,42,32,respiratory failure acute not ardsdoctor last ...,"[r, e, p, r, r, , f, l, u, r, e, , c, u, e, ...","[r, e, s, p, i, r, a, t, o, r, y, , f, a, i, ..."
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...,89,8,32,subdural hemorrhage sdh\n assessment\n ...,"[u, b, u, r, l, , h, e, r, r, h, g, e, , h, ...","[s, u, b, d, u, r, a, l, , h, e, m, o, r, r, ..."
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses...",83,32,32,comfort care cmo comfort measures\n assessme...,"[c, f, r, , c, r, e, , c, , c, f, r, , e, ...","[c, o, m, f, o, r, t, , c, a, r, e, , c, m, ..."
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",502,43,32,the patient is a 37yo male with a pmh of dm et...,"[h, e, , p, e, n, , , , 3, 7, , l, e, , ...","[t, h, e, , p, a, t, i, e, n, t, , i, s, , ..."
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s...",541,73,32,67 yo m with history of chf copd afib cad sp c...,"[6, 7, , , , w, h, , h, r, , f, , c, h, ...","[6, 7, , y, o, , m, , w, i, t, h, , h, i, ..."
5,5,49671.0,109832.0,2174-01-15 0:00,2174-01-15 1:35,Nursing Progress Note,"CVA (Stroke, Cerebral infarction), Other\n A...","cva (stroke, cerebral infarction), other\n a...",97,8,32,cva stroke cerebral infarction other\n asses...,"[c, v, , r, k, e, , c, e, r, e, b, r, l, , ...","[c, v, a, , s, t, r, o, k, e, , c, e, r, e, ..."
6,6,79511.0,109613.0,2176-11-21 0:00,2176-11-21 13:58,Nursing Progress Note,75y.o female transferred from [**Hospital 1294...,75y.o female transferred from [**hospital 1294...,790,10,32,75yo female transferred from hospital 1294 hos...,"[7, 5, , f, e, l, e, , r, n, f, e, r, r, e, ...","[7, 5, y, o, , f, e, m, a, l, e, , t, r, a, ..."
7,7,54461.0,110002.0,2117-11-04 0:00,2117-11-04 17:04,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",537,43,32,the patient is a 37yo male with a pmh of dm et...,"[h, e, , p, e, n, , , , 3, 7, , l, e, , ...","[t, h, e, , p, a, t, i, e, n, t, , i, s, , ..."
8,8,65124.0,122850.0,2179-11-22 0:00,2179-11-22 15:04,Nursing Progress Note,57F with obesity and advanced [**Hospital 936*...,57f with obesity and advanced [**hospital 936*...,422,17,32,57f with obesity and advanced hospital 936 tra...,"[5, 7, f, , w, h, , b, e, , n, , v, n, c, ...","[5, 7, f, , w, i, t, h, , o, b, e, s, i, t, ..."
9,9,32195.0,188413.0,2160-01-14 0:00,2160-01-14 17:07,Nursing Progress Note,"30yo female w/ DM type I, HTN, ESRD on daily P...","30yo female w/ dm type i, htn, esrd on daily p...",155,13,32,30yo female w dm type i htn esrd on daily pd w...,"[3, 0, , f, e, l, e, , w, , , p, e, , , ...","[3, 0, y, o, , f, e, m, a, l, e, , w, , d, ..."


In [66]:
#sns.histplot(data=data_after_duplicate, x="totalhadms",color='red',binwidth=4)
#plt.title('Ranges of Patients Note', fontsize=18)
#plt.xlabel('Range of Hospital Admissions', fontsize=16)
#plt.ylabel('Subject_IDs Per HADM', fontsize=16)

In [67]:
#data_after_duplicate["average_totalnotes"]= data_after_duplicate.groupby(['subject_id'])["totalhadms"].transform('mean')
#data_after_duplicate.head(100)

In [68]:
print(f"Dataset has total {len(df1)} row")
print(f"Dataset has total {len(df1.columns)} columns")
print(f"Dataset has total {df1['hadm_id'].nunique()} Unique Hospital Admission")
print(f"Dataset has total {df1['subject_id'].nunique()} subject_id")
print(f"Number of Empty subject_id: {df1['subject_id'].isnull().sum()}")
print(f"Number of Empty nursesnotes: {df1['nursesnotes'].isnull().sum()}")
print(f"Number of Empty hadm_id: {df1['hadm_id'].isnull().sum()}")

Dataset has total 82638 row
Dataset has total 14 columns
Dataset has total 9069 Unique Hospital Admission
Dataset has total 7703 subject_id
Number of Empty subject_id: 0
Number of Empty nursesnotes: 0
Number of Empty hadm_id: 0


In [69]:
dir(string)

['Formatter',
 'Template',
 '_ChainMap',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_re',
 '_sentinel_dict',
 '_string',
 'ascii_letters',
 'ascii_lowercase',
 'ascii_uppercase',
 'capwords',
 'digits',
 'hexdigits',
 'octdigits',
 'printable',
 'punctuation',
 'whitespace']

In [70]:
def remove_punctuation(txt):
    txt_nopunt="".join([c for c in txt if c not in string.punctuation])
    return  txt_nopunt

In [71]:
df1["remove_punctuation"]= df1['lowercasenursestext'].apply(lambda x:remove_punctuation(x))
df1.head(5)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext,totalnotesword,totalhadms,average_totalHADMnotes,remove_punctuation,notes_nostopword,notes_lemmatize
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor...",80,42,32,respiratory failure acute not ardsdoctor last ...,"[r, e, p, r, r, , f, l, u, r, e, , c, u, e, ...","[r, e, s, p, i, r, a, t, o, r, y, , f, a, i, ..."
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...,89,8,32,subdural hemorrhage sdh\n assessment\n ...,"[u, b, u, r, l, , h, e, r, r, h, g, e, , h, ...","[s, u, b, d, u, r, a, l, , h, e, m, o, r, r, ..."
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses...",83,32,32,comfort care cmo comfort measures\n assessme...,"[c, f, r, , c, r, e, , c, , c, f, r, , e, ...","[c, o, m, f, o, r, t, , c, a, r, e, , c, m, ..."
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",502,43,32,the patient is a 37yo male with a pmh of dm et...,"[h, e, , p, e, n, , , , 3, 7, , l, e, , ...","[t, h, e, , p, a, t, i, e, n, t, , i, s, , ..."
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s...",541,73,32,67 yo m with history of chf copd afib cad sp c...,"[6, 7, , , , w, h, , h, r, , f, , c, h, ...","[6, 7, , y, o, , m, , w, i, t, h, , h, i, ..."


In [72]:
#Removing StopWords from NOTES_CLEAN_Tokenized. Stop Words are words which does not have much meaning in NLP i.e: am, is, the
# Used NLTK English corpus 

In [73]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [74]:
stopwords=nltk.corpus.stopwords.words('english')
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [75]:
def remove_stopwords(txt_tokenized):
    txt_Nostopword = [word for word in txt_tokenized if word not in stopwords]
    return txt_Nostopword

df1['notes_nostopword']= df1['remove_punctuation'].apply(lambda x:remove_stopwords(x))
df1.head(5)
    

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext,totalnotesword,totalhadms,average_totalHADMnotes,remove_punctuation,notes_nostopword,notes_lemmatize
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor...",80,42,32,respiratory failure acute not ardsdoctor last ...,"[r, e, p, r, r, , f, l, u, r, e, , c, u, e, ...","[r, e, s, p, i, r, a, t, o, r, y, , f, a, i, ..."
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...,89,8,32,subdural hemorrhage sdh\n assessment\n ...,"[u, b, u, r, l, , h, e, r, r, h, g, e, , h, ...","[s, u, b, d, u, r, a, l, , h, e, m, o, r, r, ..."
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses...",83,32,32,comfort care cmo comfort measures\n assessme...,"[c, f, r, , c, r, e, , c, , c, f, r, , e, ...","[c, o, m, f, o, r, t, , c, a, r, e, , c, m, ..."
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",502,43,32,the patient is a 37yo male with a pmh of dm et...,"[h, e, , p, e, n, , , , 3, 7, , l, e, , ...","[t, h, e, , p, a, t, i, e, n, t, , i, s, , ..."
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s...",541,73,32,67 yo m with history of chf copd afib cad sp c...,"[6, 7, , , , w, h, , h, r, , f, , c, h, ...","[6, 7, , y, o, , m, , w, i, t, h, , h, i, ..."


In [76]:
selected_words=['hospice','euthanasia','PEOLC','comfortonly','terminal','donotresuscitate','dnr','goal of care','level of care','comfort care','palliative','end of life']

df1.loc[df1["remove_punctuation"].isin(selected_words),"remove_punctuation"].value_counts()

Series([], Name: remove_punctuation, dtype: int64)

In [77]:
#WordNet Lemmmatizer to gave the word to thier root dictonary word 

In [78]:
WordNetLemma=nltk.WordNetLemmatizer()
#PorterStemmer=nltk.PorterStemmer()
dir(WordNetLemma)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [79]:
#creating the function to lemmatize text

In [80]:
def lemmatization(token_txt):
    txt=[WordNetLemma.lemmatize (word) for word in token_txt]
    return txt
# dataset['NOTES_Lemmatize']= dataset['NOTES_NoStopWord'].apply(lambda x:lemmatization(x))


In [81]:
df1['notes_lemmatize']= df1['remove_punctuation'].apply(lambda x:lemmatization(x))
df1.head(10)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chartdate,charttime,description,nursesnotes,lowercasenursestext,totalnotesword,totalhadms,average_totalHADMnotes,remove_punctuation,notes_nostopword,notes_lemmatize
0,0,67053.0,138302.0,2164-01-08 0:00,2164-01-08 7:36,Nursing Progress Note,"Respiratory failure, acute (not ARDS/[**Doctor...","respiratory failure, acute (not ards/[**doctor...",80,42,32,respiratory failure acute not ardsdoctor last ...,"[r, e, p, r, r, , f, l, u, r, e, , c, u, e, ...","[r, e, s, p, i, r, a, t, o, r, y, , f, a, i, ..."
1,1,82482.0,113214.0,2194-12-15 0:00,2194-12-15 3:51,Nursing Progress Note,Subdural hemorrhage (SDH)\n Assessment:\n ...,subdural hemorrhage (sdh)\n assessment:\n ...,89,8,32,subdural hemorrhage sdh\n assessment\n ...,"[u, b, u, r, l, , h, e, r, r, h, g, e, , h, ...","[s, u, b, d, u, r, a, l, , h, e, m, o, r, r, ..."
2,2,68221.0,180926.0,2167-12-07 0:00,2167-12-07 21:49,Nursing Progress Note,"Comfort care (CMO, Comfort Measures)\n Asses...","comfort care (cmo, comfort measures)\n asses...",83,32,32,comfort care cmo comfort measures\n assessme...,"[c, f, r, , c, r, e, , c, , c, f, r, , e, ...","[c, o, m, f, o, r, t, , c, a, r, e, , c, m, ..."
3,3,54461.0,110002.0,2117-11-06 0:00,2117-11-06 16:55,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",502,43,32,the patient is a 37yo male with a pmh of dm et...,"[h, e, , p, e, n, , , , 3, 7, , l, e, , ...","[t, h, e, , p, a, t, i, e, n, t, , i, s, , ..."
4,4,62143.0,167116.0,2102-12-11 0:00,2102-12-11 0:53,Nursing Progress Note,"67 yo M with history of CHF, COPD, Afib, CAD s...","67 yo m with history of chf, copd, afib, cad s...",541,73,32,67 yo m with history of chf copd afib cad sp c...,"[6, 7, , , , w, h, , h, r, , f, , c, h, ...","[6, 7, , y, o, , m, , w, i, t, h, , h, i, ..."
5,5,49671.0,109832.0,2174-01-15 0:00,2174-01-15 1:35,Nursing Progress Note,"CVA (Stroke, Cerebral infarction), Other\n A...","cva (stroke, cerebral infarction), other\n a...",97,8,32,cva stroke cerebral infarction other\n asses...,"[c, v, , r, k, e, , c, e, r, e, b, r, l, , ...","[c, v, a, , s, t, r, o, k, e, , c, e, r, e, ..."
6,6,79511.0,109613.0,2176-11-21 0:00,2176-11-21 13:58,Nursing Progress Note,75y.o female transferred from [**Hospital 1294...,75y.o female transferred from [**hospital 1294...,790,10,32,75yo female transferred from hospital 1294 hos...,"[7, 5, , f, e, l, e, , r, n, f, e, r, r, e, ...","[7, 5, y, o, , f, e, m, a, l, e, , t, r, a, ..."
7,7,54461.0,110002.0,2117-11-04 0:00,2117-11-04 17:04,Nursing Progress Note,"The patient is a 37y/o male with a PMH of DM, ...","the patient is a 37y/o male with a pmh of dm, ...",537,43,32,the patient is a 37yo male with a pmh of dm et...,"[h, e, , p, e, n, , , , 3, 7, , l, e, , ...","[t, h, e, , p, a, t, i, e, n, t, , i, s, , ..."
8,8,65124.0,122850.0,2179-11-22 0:00,2179-11-22 15:04,Nursing Progress Note,57F with obesity and advanced [**Hospital 936*...,57f with obesity and advanced [**hospital 936*...,422,17,32,57f with obesity and advanced hospital 936 tra...,"[5, 7, f, , w, h, , b, e, , n, , v, n, c, ...","[5, 7, f, , w, i, t, h, , o, b, e, s, i, t, ..."
9,9,32195.0,188413.0,2160-01-14 0:00,2160-01-14 17:07,Nursing Progress Note,"30yo female w/ DM type I, HTN, ESRD on daily P...","30yo female w/ dm type i, htn, esrd on daily p...",155,13,32,30yo female w dm type i htn esrd on daily pd w...,"[3, 0, , f, e, l, e, , w, , , p, e, , , ...","[3, 0, y, o, , f, e, m, a, l, e, , w, , d, ..."
