## Import Data

In [2]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import pickle
from sklearn.utils import resample

In [2]:
# Load the raw EKG notes table from disk into a DataFrame.
ekg_notes_csv = "/home/sanjaycollege15/ekg_notes.csv"
notes_database = pd.read_csv(ekg_notes_csv)

In [3]:
# Preview the first five rows of the raw table.
notes_database.head(n=5)

Unnamed: 0,SUBJECT_ID,ICD9_CODE,TEXT
0,87424,4019,Sinus rhythm with 2:1 A-V block. Right bundle...
1,14211,4019,Sinus tachycardia\nConsider old septal myocard...
2,88174,4019,Sinus rhythm. Low limb lead voltage. Compare...
3,30927,4019,Sinus rhythm. Early precordial QRS transition...
4,30927,4019,Sinus rhythm. Early precordial QRS transition...


---

## Pre-process Text

### Drop Duplicates

In [4]:
# Remove rows that are identical in every column, keeping the first occurrence,
# and report how many rows were discarded.
before = len(notes_database)
notes_database = notes_database.drop_duplicates(keep="first")
after = len(notes_database)
print(f"{before-after} rows removed. {after} rows remain.")


293583 rows removed. 363460 rows remain.


### Drop duplicate subjects

The same subject can appear under several diagnosis codes with very similar notes. To mitigate this effect, we keep only one row per subject (the first occurrence) and drop the rest.

In [5]:
# Keep a single row per SUBJECT_ID (the first one encountered) and report
# how many rows were discarded.
before = len(notes_database)
notes_database = notes_database.drop_duplicates(subset=["SUBJECT_ID"], keep="first")
after = len(notes_database)
print(f"{before-after} rows removed. {after} rows remain.")

337966 rows removed. 25494 rows remain.


In [6]:
# SUBJECT_ID was only needed for de-duplication; remove the column.
notes_database = notes_database.drop(columns=["SUBJECT_ID"])

### Balance the Dataset

Now that we've dropped duplicate subjects, we have a very imbalanced dataset. We'll downsample the more common values in order to get a more balanced dataset. 

In [7]:
# Number of remaining rows per diagnosis code.
notes_database.ICD9_CODE.value_counts()

4019     16518
4280      5183
41401     2159
42731     1634
Name: ICD9_CODE, dtype: int64

In [8]:
# Balance the dataset by downsampling every diagnosis code to the size of the
# rarest one. The target size is derived from the data instead of being
# hard-coded, so the cell keeps working if the upstream filtering changes.
class_counts = notes_database['ICD9_CODE'].value_counts()
n_per_class = int(class_counts.min())

balanced_parts = []
# value_counts() is ordered from most to least frequent, so the parts are
# concatenated in that order.
for code in class_counts.index:
    class_rows = notes_database[notes_database.ICD9_CODE == code]
    # Classes already at the target size are kept untouched (not reshuffled).
    if len(class_rows) > n_per_class:
        class_rows = resample(class_rows,
                              replace=False,          # sample without replacement
                              n_samples=n_per_class,
                              random_state=123)       # fixed seed for reproducibility
    balanced_parts.append(class_rows)

df_downsampled = pd.concat(balanced_parts, ignore_index=True)


In [9]:
# Check the class sizes of the balanced frame.
df_downsampled.ICD9_CODE.value_counts()

41401    1634
4280     1634
4019     1634
42731    1634
Name: ICD9_CODE, dtype: int64

In [10]:
# Continue the pipeline on the balanced frame. This is a plain rebinding, not a
# copy: `notes_database` and `df_downsampled` now refer to the same DataFrame
# object, so in-place changes made through one name are visible through the other.
notes_database = df_downsampled

### Lowercase text

In [11]:
# Add a lower-cased copy of the note text as a new column.
notes_database['lower_text'] = notes_database['TEXT'].str.lower()

In [12]:
# The original-case text is no longer needed; remove the column in place.
del notes_database['TEXT']

### Remove Identifiers

Privacy identifiers are in the form [\*\*2157-1-7\*\*]. We can remove them with a simple regex. Note that the example below (row 0) happens to contain no identifier, so the substitution leaves it unchanged.

In [13]:
# Show the lower-cased text of the row labelled 0.
notes_database.loc[0, 'lower_text']

'sinus rhythm.  compared to tracing #1 no diagnostic interim change.\ntracing #2\n\n'

In [14]:
# Preview the identifier removal on row 0.
# The pattern matches "[**" ... "**]". The lazy `.*?` stops at the first closing
# "**]", so two identifiers on the same line are removed separately instead of
# also deleting the text between them. A raw string avoids invalid-escape warnings.
notes_database.lower_text.replace(r'\[\*\*.*?\*\*\]', '', regex=True)[0]

'sinus rhythm.  compared to tracing #1 no diagnostic interim change.\ntracing #2\n\n'

In [15]:
# Strip privacy identifiers of the form "[** ... **]" from every note.
# The lazy `.*?` stops at the first closing "**]", so two identifiers on the same
# line are removed separately instead of also deleting the text between them.
# A raw string avoids invalid-escape warnings.
notes_database['removedIdentifiers'] = notes_database.lower_text.replace(
    r'\[\*\*.*?\*\*\]', '', regex=True)

In [16]:
# The intermediate lower-cased column has been superseded; remove it in place.
del notes_database['lower_text']

In [17]:
# Checkpoint the cleaned frame to a pickle file in the working directory.
notes_database.to_pickle(path='intermediate_ekg.pkl')

### Dropping stop words

We did not end up dropping stop words because they seemed important for interpreting the notes. Unlike in sentiment analysis, negations like "not" or "nor" could be important in clinical notes. The models I'm going to be using will need to leverage the context from stop words. The cells below are left commented out for reference.

In [13]:
# nltk.download('stopwords')
# stop_words = stopwords.words('english')

In [14]:
#notes_database['notes_without_stopwords'] = notes_database['removedIdentifiers'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [15]:
#notes_database.notes_without_stopwords[0]

### Noise removal, sentence splitting

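Removed most punctuation, but left in '-' and ':' since they're commonly used in medical notation (e.g. "2:1 a-v block"). Sentence punctuation such as '.', ',', ';' and '?' is also kept so that the notes can still be split into sentences afterwards.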

In [18]:
# Show the de-identified text of the row labelled 0.
notes_database.loc[0, 'removedIdentifiers']

'sinus rhythm.  compared to tracing #1 no diagnostic interim change.\ntracing #2\n\n'

In [19]:
# Characters the tokenizer strips from the text. The set deliberately omits
# '-', ':', '.', ',', ';' and '?', so those characters survive tokenization.
noise_characters = '!"#$%&()*+/<=>@[\\]^_`{|}~\t\n'

# Word-level tokenizer splitting on single spaces.
# (A vocabulary cap of num_words = 150 was tried and left disabled.)
tokenizer = Tokenizer(filters=noise_characters, split=' ', char_level=False)

In [20]:
# Fit the tokenizer on the de-identified notes.
note_texts = notes_database['removedIdentifiers']
tokenizer.fit_on_texts(note_texts)

In [21]:
# Encode every note as a sequence of token indices (keys into tokenizer.index_word).
sequences = tokenizer.texts_to_sequences(notes_database.removedIdentifiers)


In [22]:
# Rebuild each note as a space-joined string by mapping its token indices
# back to words through the tokenizer's index -> word table.
denoised_sentences = [
    ' '.join(tokenizer.index_word[token_id] for token_id in note_ids)
    for note_ids in sequences
]

In [7]:
# Persist the denoised notes. The file holds pickle bytes despite its .txt extension.
denoised_path = "/home/sanjaycollege15/PredictingDiagnoses/Data/denoised_sentences.txt"
with open(denoised_path, mode="wb") as out_file:
    pickle.dump(denoised_sentences, out_file)

Switch to Preprocess Text - Notebook 2 in order to continue constructing the dataframe. Ran out of memory in this notebook.

---

### Tokenize into Sentences

In [24]:
# Split every denoised note into a list of sentences with NLTK's sent_tokenize.
# NOTE(review): sent_tokenize relies on NLTK's punkt data, which this notebook
# never downloads — confirm it is installed in the environment.
tokenized_sentences = [sent_tokenize(note) for note in denoised_sentences]

In [25]:
# Persist the per-note sentence lists. The file holds pickle bytes despite its .txt extension.
tokenized_path = "/home/sanjaycollege15/PredictingDiagnoses/Data/tokenized_sentences.txt"
with open(tokenized_path, mode="wb") as out_file:
    pickle.dump(tokenized_sentences, out_file)

Switch to Preprocess Text - Notebook 2 in order to continue constructing the dataframe. Ran out of memory in this notebook.

### Stemming