In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client
import seaborn as sns
import pickle
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import normalize
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [None]:
# script to make analysis datasets. Code cells containing PHI have been removed.

In [2]:
# specify data folder (would need to be changed to run in another context)
# File paths in later cells are built as `datafolder + filename` (plain string
# concatenation), so the trailing slash here is required.
datafolder = '/data/clin_notes_outcomes/prissmm_notes/'

In [3]:
# pull in the EHR text data for all patients in the cohort (82268 is OncDRS request ID number)
# The whole parquet file is loaded into memory as a single pandas DataFrame.
raw_notes = pd.read_parquet(datafolder+'parsed_82268.parquet')



In [4]:
# Summarise the raw notes table: column dtypes / memory use, then the number of notes per report type.
# NOTE: DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() is what produces the stray "None" line in the output below.
print(raw_notes.info())
print(raw_notes.report_type.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6526673 entries, 0 to 6526672
Data columns (total 7 columns):
dfci_mrn         object
date             datetime64[ns]
text             object
source           object
department       object
provider_name    object
report_type      object
dtypes: datetime64[ns](1), object(6)
memory usage: 348.6+ MB
None
Progress Note             1150740
Imaging                    964671
Lab                        499946
Telephone Encounter        311429
Pathology and Cytology     290885
                           ...   
QUESTIONNAIRE                   3
AUTOPSY NOTE                    3
EDUCATION                       2
Group Note                      1
REVIEW OF SYSTEMS               1
Name: report_type, Length: 92, dtype: int64


In [5]:
# Working handle for the notes table. This binds a second name to the same DataFrame
# object as raw_notes; no copy is made here.
all_notes = raw_notes

In [6]:
# keep notes corresponding to progress note categories that also have a provider recorded
progress_note_types = ["Progress Note", "PROGRESS NOTE", "CONSULT", "CONSULTATION NOTE", "VISIT NOTE", "COMPREHENSIVE H&P"]
is_progress_note = all_notes['report_type'].isin(progress_note_types)
has_provider = all_notes['provider_name'].notnull()

# Take an explicit copy of the filtered rows: the column assignment below then
# operates on an independent frame instead of a slice of the raw table, which
# avoids SettingWithCopyWarning and any ambiguity about what gets modified.
all_notes = all_notes.loc[is_progress_note & has_provider].copy()

# patient IDs arrive as strings (object dtype); convert them to numbers so they can be
# matched against the integer IDs in the cohort split files
all_notes['dfci_mrn'] = pd.to_numeric(all_notes['dfci_mrn'])

all_notes.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1354217 entries, 0 to 6526665
Data columns (total 7 columns):
dfci_mrn         1354217 non-null int64
date             1354217 non-null datetime64[ns]
text             1354217 non-null object
source           1354217 non-null object
department       1288191 non-null object
provider_name    1354217 non-null object
report_type      1354217 non-null object
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 82.7+ MB


In [7]:
# define function to split notes into pre-assessment and plan and post-assessment and plan sections (using rules)

# Section headers that mark the start of an assessment/plan section. The
# alternatives are all lowercase, so the text must be lowercased before matching.
_AP_SPLIT_PATTERN = re.compile(
    "a/p|assessment/plan|assessment:|assessment and plan|impression and plan|in summary|plan:"
)


def find_ap(string):
    """Split a note into its pre-assessment/plan and assessment/plan sections.

    The note is cut at every occurrence of one of the section-header phrases
    in ``_AP_SPLIT_PATTERN``. Everything before the first header is the "pre"
    section; the pieces after it are joined with single spaces to form the
    assessment/plan section. The matched header phrases themselves are dropped.

    Matching is case-sensitive against lowercase phrases, so callers should
    lowercase the note first.

    Parameters
    ----------
    string : str or None
        Note text. ``None`` and float NaN (how pandas represents missing text)
        are treated as an empty note instead of raising ``TypeError``.

    Returns
    -------
    list of str
        Always two elements: ``[pre_ap_text, ap_text]``. ``ap_text`` is ``''``
        when no header phrase is found.
    """
    # A missing note has no sections; `x != x` is only true for NaN.
    if string is None or (isinstance(string, float) and string != string):
        return ['', '']

    pieces = _AP_SPLIT_PATTERN.split(string)
    # split() always returns at least one piece; joining an empty tail yields '',
    # which covers the "no header found" case without a separate branch.
    return [pieces[0], " ".join(pieces[1:])]


In [8]:
# test rules function on a short example containing two section headers
# (`re` is already imported in the first cell, so it is not re-imported here)

find_ap("hello world. assessment and plan: progression of disease. assessment: progression")

['hello world. ', ': progression of disease.   progression']

In [11]:
# replace newlines and carriage returns with spaces and make all text lowercase
# regex=True is passed explicitly: '\n|\r' is an alternation pattern, and recent pandas
# versions treat the pattern as a literal string by default, in which case nothing
# would be replaced and the line breaks would silently stay in the text.
all_notes['text'] = all_notes['text'].str.replace('\n|\r', ' ', regex=True).str.lower()
all_notes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1354217 entries, 0 to 6526665
Data columns (total 7 columns):
dfci_mrn         1354217 non-null int64
date             1354217 non-null datetime64[ns]
text             1354217 non-null object
source           1354217 non-null object
department       1288191 non-null object
provider_name    1354217 non-null object
report_type      1354217 non-null object
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 82.7+ MB


In [12]:
# sanity check: count notes with / without a patient ID (only a False row should appear)
all_notes['dfci_mrn'].isna().value_counts()

False    1354217
Name: dfci_mrn, dtype: int64

In [13]:
# load the patients previously assigned to the training set and collect the text of their notes

# number of words the tokenizer keeps (used when the tokenizer is built below)
vocab_size = 10000

training_mrns = pd.read_csv(datafolder + 'training_mrns_76525_82268.csv')
training_mrns.info()

# boolean mask over notes: True where the note's patient ID is in the training list
in_training_set = all_notes['dfci_mrn'].isin(training_mrns['dfci_mrn'])
train_text = all_notes.loc[in_training_set, 'text']
train_text.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27274 entries, 0 to 27273
Data columns (total 2 columns):
Unnamed: 0    27274 non-null int64
dfci_mrn      27274 non-null int64
dtypes: int64(2)
memory usage: 426.3 KB


(1083879,)

In [14]:
# train TF/Keras tokenizer vocabulary using training set notes only

# set to False to skip refitting and reuse the pickled tokenizer loaded in the next cell
train_tokenizer = True

if train_tokenizer:
    tokenizer = Tokenizer(num_words=vocab_size)
    # every note is cast to str before fitting
    tokenizer.fit_on_texts(list(map(str, train_text)))
    # persist the fitted tokenizer so it can be reloaded without refitting
    with open('/homes10/klkehl/prissmm_notes_v2/notes_tokenizer_ap_find.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=3)

In [15]:
# Reload the fitted tokenizer from the pickle file written in the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted location.
with open('/homes10/klkehl/prissmm_notes_v2/notes_tokenizer_ap_find.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [16]:
# apply the rule-based splitter to every note; each result is a [pre_ap, ap] pair
split_text = list(map(find_ap, all_notes.text))

In [17]:
# keep only the note metadata; the full text is dropped here and replaced by the
# split sections added in the next cell
metadata_columns = ['dfci_mrn', 'date', 'source', 'department', 'provider_name', 'report_type']
# explicit copy so that adding columns afterwards modifies an independent frame
# rather than a selection of the previous one (avoids SettingWithCopyWarning)
all_notes = all_notes[metadata_columns].copy()

In [18]:
# store the two halves of each split note as columns:
# element 0 is the text before the assessment/plan header, element 1 the text after it
for section_index, section_column in enumerate(['pre_ap', 'ap']):
    all_notes[section_column] = [sections[section_index] for sections in split_text]

In [20]:
def _read_mrn_list(filename):
    """Read one cohort-split CSV from the data folder into a DataFrame."""
    return pd.read_csv(datafolder + filename)


# patient ID lists for the three cohort splits
training_mrns = _read_mrn_list('training_mrns_76525_82268.csv')
validation_mrns = _read_mrn_list('validation_mrns_76525_82268.csv')
true_test_mrns = _read_mrn_list('truetest_mrns_76525_82268.csv')

In [22]:
# assign notes to cohort splits by joining on patient ID (inner join: notes whose
# patient is not in a split's list are dropped from that split), then report how
# many notes there are in the training set
training = all_notes.merge(training_mrns, on='dfci_mrn')
validation = all_notes.merge(validation_mrns, on='dfci_mrn')
test = all_notes.merge(true_test_mrns, on='dfci_mrn')
print(training.shape[0])

1083879


In [23]:
# write one parquet file per cohort split into the data folder
split_outputs = [
    (training, 'training_notes_find_ap.parquet'),
    (validation, 'validation_notes_find_ap.parquet'),
    (test, 'test_notes_find_ap.parquet'),
]
for split_frame, output_name in split_outputs:
    split_frame.to_parquet(datafolder+output_name)