## Import Data

In [1]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import pickle
from sklearn.utils import resample
import matplotlib as plt
from matplotlib.pyplot import hist

In [2]:
# Load the EKG notes extract from CSV into the working DataFrame.
notes_database = pd.read_csv(
    filepath_or_buffer="/home/sanjaycollege15/ekg_notes_v2.csv"
)

In [3]:
# Show the first five rows of the freshly loaded notes.
notes_database.head(n=5)

Unnamed: 0,SUBJECT_ID,DIAGNOSIS_CODE,TEXT
0,22043,4,Sinus tachycardia. Occasional interpolated ven...
1,96284,4,Sinus rhythm. Compared to the previous tracin...
2,11877,4,Atrial fibrillation with ventricular premature...
3,21312,4,Baseline artifact makes accurate interpretatio...
4,22285,4,Sinus rhythm with atrial sensed ventricular pa...


---

## Pre-process Text

### Change all labels of 4 to a 3 (typo in the data extraction)

In [4]:
# Label 4 is an extraction typo for label 3, so fold it into 3.
notes_database['DIAGNOSIS_CODE'] = notes_database['DIAGNOSIS_CODE'].replace({4: 3})

### Drop Duplicates

In [7]:
# Remove fully duplicated rows in place and report how many were dropped.
before = notes_database.shape[0]
notes_database.drop_duplicates(keep='first', inplace=True)
print(f"{before-len(notes_database)} rows removed. {len(notes_database)} rows remain.")


0 rows removed. 34843 rows remain.


### Lowercase text

In [8]:
# Store a lower-cased copy of each note in a new column.
notes_database['lower_text'] = notes_database['TEXT'].str.lower()

In [9]:
# The original-case text is no longer needed once lower_text exists.
notes_database.drop(columns=['TEXT'], inplace=True)

### Remove Identifiers

Privacy identifiers are in the form [\*\*2157-1-7\*\*], as shown in the example below. We can remove them with a simple regex.

In [8]:
# Display the note stored under index label 0 to show what an identifier tag looks like.
notes_database['lower_text'][0]

'sinus tachycardia. occasional interpolated ventricular premature beats.\nprobable incomplete right bundle-branch block and left anterior fascicular\nblock. possible prior inferior wall myocardial infarction. compared to the\nprevious tracing of [**2135-5-4**] the frequent interpolated ventricular premature\nbeats are new.\ntracing #1\n\n'

In [9]:
# Preview identifier removal on the note at index label 0.
# The pattern is a raw string (no invalid "\[" / "\*" escapes) and uses a non-greedy
# `.*?` so each [** ... **] tag is removed on its own; a greedy `.*` would also delete
# any clinical text sitting between two tags on the same line.
notes_database.lower_text.replace(r'\[\*\*.*?\*\*\]', '', regex=True)[0]

'sinus tachycardia. occasional interpolated ventricular premature beats.\nprobable incomplete right bundle-branch block and left anterior fascicular\nblock. possible prior inferior wall myocardial infarction. compared to the\nprevious tracing of  the frequent interpolated ventricular premature\nbeats are new.\ntracing #1\n\n'

In [10]:
# Strip de-identification tags of the form [** ... **] from every note.
# The pattern is a raw string (no invalid "\[" / "\*" escapes) and uses a non-greedy
# `.*?` so each tag is removed on its own; a greedy `.*` would also delete any clinical
# text sitting between two tags on the same line.
notes_database['removedIdentifiers'] = notes_database.lower_text.replace(
    r'\[\*\*.*?\*\*\]', '', regex=True
)

In [11]:
# lower_text has been superseded by removedIdentifiers.
notes_database.drop(columns=["lower_text"], inplace=True)

### Dropping stop words

I did not end up dropping stop words because they seemed important for interpreting the notes. Unlike in sentiment analysis, negations like "not" or "nor" could be important in clinical notes. The models I'm going to be using will need to leverage the context from stop words.

In [13]:
# nltk.download('stopwords')
# stop_words = stopwords.words('english')

In [14]:
#notes_database['notes_without_stopwords'] = notes_database['removedIdentifiers'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [15]:
#notes_database.notes_without_stopwords[0]

### Noise removal, sentence splitting

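Removed most punctuation, but left in '-' and ':' since they're commonly used in medical terms. Sentence punctuation ('.', ',', ';', '?') and apostrophes are also left in, because the tokenizer's filter list below does not include them.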

In [12]:
# Display the note at index label 0 after identifier removal.
notes_database.removedIdentifiers[0]

'sinus tachycardia. occasional interpolated ventricular premature beats.\nprobable incomplete right bundle-branch block and left anterior fascicular\nblock. possible prior inferior wall myocardial infarction. compared to the\nprevious tracing of  the frequent interpolated ventricular premature\nbeats are new.\ntracing #1\n\n'

In [12]:
# Word-level tokenizer that splits on single spaces.
# The filter list removes most punctuation but does not contain , - . : ; ? or ',
# so those characters stay attached to their tokens.
# num_words is left unset (a cap of 150 was tried and commented out).
tokenizer = Tokenizer(
    filters='!"#$%&()*+/<=>@[\\]^_`{|}~\t\n',
    split=' ',
    char_level=False,
)

In [13]:
# Build the tokenizer's vocabulary from the identifier-free notes.
tokenizer.fit_on_texts(notes_database.removedIdentifiers)

In [14]:
# Encode every note as a list of integer token ids.
sequences = tokenizer.texts_to_sequences(notes_database.removedIdentifiers)


In [15]:
# Turn each id sequence back into text by looking every id up in the tokenizer's
# index -> word table and joining the words with single spaces.
denoised_sentences = [
    ' '.join(tokenizer.index_word[token_id] for token_id in token_ids)
    for token_ids in sequences
]

In [16]:
# Attach the rebuilt text and discard the intermediate column it was derived from.
notes_database['DENOISED_TEXT'] = denoised_sentences
notes_database.drop(columns=['removedIdentifiers'], inplace=True)

In [17]:
# Show the first five rows after denoising.
notes_database.head(n=5)

Unnamed: 0,SUBJECT_ID,DIAGNOSIS_CODE,DENOISED_TEXT
178417,19326,2,sinus rhythm marked left axis deviation - left...
178418,18179,2,sinus rhythm. since earlier this date the rhyt...
178419,21734,2,sinus bradycardia first degree a-v block poor ...
178420,12694,2,baseline artifact probable ectopic atrial rhyt...
178421,10928,2,sinus rhythm left axis deviation old inferior ...


### Removing Length 0 Notes

Now that we've denoised the data, some of the notes will be 0 characters long. Let's remove them from the dataset. 

In [18]:
# Character count of each denoised note, used to find empty notes.
notes_database['TEXT_LENGTHS'] = notes_database.DENOISED_TEXT.str.len()

In [19]:
# Keep only notes that still contain text after denoising, and report the number dropped.
before = notes_database.shape[0]
notes_database = notes_database.loc[notes_database['TEXT_LENGTHS'].gt(0)]
print(f"Removed {before-len(notes_database)} rows.")

Removed 8 rows.


### Balance the Dataset

Now that we've dropped duplicate rows and empty notes, the dataset is imbalanced across diagnosis codes. We'll downsample the more common codes in order to get a more balanced dataset.

In [20]:
# Number of notes per diagnosis code before balancing.
notes_database.DIAGNOSIS_CODE.value_counts()

1    17790
2    17045
Name: DIAGNOSIS_CODE, dtype: int64

In [22]:
# Downsample every diagnosis code to the size of the rarest code.
# The target size and the set of codes are read from the data instead of being
# hard-coded: resample(replace=False) raises ValueError when asked for more rows than a
# class contains, which includes a class that has no rows at all.
class_counts = notes_database['DIAGNOSIS_CODE'].value_counts()
n_per_class = int(class_counts.min())

balanced_parts = []
# value_counts() is ordered by descending count, so the rarest code is appended last.
for code, count in class_counts.items():
    class_rows = notes_database[notes_database.DIAGNOSIS_CODE == code]
    if count > n_per_class:
        # Sample without replacement; the fixed seed keeps the subset reproducible.
        class_rows = resample(class_rows,
                              replace=False,
                              n_samples=n_per_class,
                              random_state=123)
    # Codes already at the target size are kept untouched.
    balanced_parts.append(class_rows)

df_downsampled = pd.concat(balanced_parts, ignore_index=True)

# Display the per-code counts to confirm the classes are now balanced.
df_downsampled['DIAGNOSIS_CODE'].value_counts()

In [24]:
# From here on, notes_database refers to the downsampled frame.
notes_database = df_downsampled

## Save off dataframe

In [21]:
# Drop the helper columns, then rename the remaining two to the names used in the saved file.
notes_database.drop(['SUBJECT_ID', 'TEXT_LENGTHS'], axis=1, inplace=True)
notes_database.rename(
    columns={'DIAGNOSIS_CODE': 'ICD9_CODE', 'DENOISED_TEXT': 'TEXT'},
    inplace=True,
)

In [22]:
# Show the first five rows of the frame that is about to be saved.
notes_database.head(n=5)

Unnamed: 0,ICD9_CODE,TEXT
178417,2,sinus rhythm marked left axis deviation - left...
178418,2,sinus rhythm. since earlier this date the rhyt...
178419,2,sinus bradycardia first degree a-v block poor ...
178420,2,baseline artifact probable ectopic atrial rhyt...
178421,2,sinus rhythm left axis deviation old inferior ...


In [23]:
# Persist the processed notes as a pickle file.
notes_database.to_pickle(
    path='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_v3.pkl'
)

---