# CS598 Deep Learning for Healthcare Final Project
## Reproduction of Deepr: A Convolutional Net for Medical Records
### Juan Alvarez Martinez, Shane Sepac

In [None]:
### TODO: Include summary and report of findings here [200 words]

## Load MIMIC-III Dataset. 
Several csv files are needed from the MIMIC-III dataset: ADMISSIONS, PATIENTS, DIAGNOSES_ICD, and PROCEDURES_ICD. These files can be loaded automatically out of S3, or you can place them in `<project_root>/mimic3`. 
- If loading out of S3, ensure you have all environment variables from .env.sample copied and instantiated in a .env file!

In [None]:
# install the required dependencies
%pip install boto3 python-dotenv pandas pyhealth matplotlib

### Get MIMIC-3 Data
Attempt to load MIMIC-3 data out of S3 if the relevant CSV files are not already in the mimic3 folder at the project root.

In [None]:
import os
from utils import copy_file_from_s3

# Folder that holds the MIMIC-III extracts, and the CSV files this notebook expects to find there.
data_folder = "mimic3"
required_files = [
    "ADMISSIONS.csv",
    "PATIENTS.csv",
    "DIAGNOSES_ICD.csv",
    "PROCEDURES_ICD.csv",
    "TRANSFERS.csv",
]

# Files already on disk are kept; every missing one is requested through copy_file_from_s3.
for csv_name in required_files:
  if os.path.exists(f"{data_folder}/{csv_name}"):
    print(f"Found {csv_name}...")
    continue

  print(f"Cannot find {csv_name} in {data_folder}, trying to download from S3...")
  copy_file_from_s3(csv_name, data_folder)

In [None]:
from pyhealth.datasets import MIMIC3Dataset

# Build the pyhealth dataset from the CSV files in ./mimic3/, loading the diagnosis and
# procedure tables. The resulting `mimic3_ds` is what every later cell iterates over.
mimic3_ds = MIMIC3Dataset("./mimic3/", ["DIAGNOSES_ICD", "PROCEDURES_ICD"]) #pyhealth does not support mapping ICD-9 to ICD-10 codes.

# Call pyhealth's info/stat helpers on the loaded dataset (their output format is defined by pyhealth).
mimic3_ds.info()
mimic3_ds.stat()

## Sequencing EMR: Creating Sentences representing patient episodes
Per Deepr, an EMR must be translated into a sentence before it can be used downstream in the model. An EMR is a sequence of time-stamped visit episodes. Each episode involves a series of diagnoses and treatments, called a phrase. Consecutive phrases are separated by a word encoding the time interval between them in months — `(0–1], (1–3], (3–6], (6–12], and 12+` — or by `TRANSFER`, with the latter indicating a transfer between care providers (separate departments within the same hospital, or between hospitals). Infrequent words are coded as `RAREWORD`, which indicates the word has appeared fewer than 100 times. Per the Deepr paper, an example sentence looks as follows:

```
1910 Z83 911 1008 D12 K31 1-3m R94 RAREWORD H53 Y83 M62 Y92 E87 T81 RAREWORD RAREWORD 1893 D12 S14 738 1910 1916 Z83 0-1m T91 RAREWORD Y83 Y92 K91 M10 E86 6-12m K31 1008 1910 Z13 Z83.
```

Note: In the sentence above, diagnoses are in ICD-10 format (a character followed by digits) and procedures are in digits. 

The MIMIC-3 dataset provides ICD-9 codes rather than ICD-10 codes. These ICD-9 codes will be used, in their level-3 form, for consistency with the original paper. Note also that the encounter and discharge datetimes of visits fall between the years 2100 and 2200 in order to de-identify patients; the time intervals between visits are nevertheless preserved.


In [None]:
# Tally how often every diagnosis and procedure code occurs across all patients and visits.
# Codes whose count ends up below 100 are the "rare words" replaced when sentences are built.
word_cnts = {}

for patient in mimic3_ds.patients.values():
  for visit in patient.visits.values():
    # Diagnoses are counted before procedures within each visit
    for table in ('DIAGNOSES_ICD', 'PROCEDURES_ICD'):
      for event in visit.get_event_list(table):
        # Unseen codes start from zero; every occurrence adds one
        word_cnts[event.code] = word_cnts.get(event.code, 0) + 1

In [None]:
'''
Append to pyhealth's record of visits so that ADMISSION_TYPE and ADMISSION_LOCATION data are available. The former is used later in the model training
to target against readmission (by looking at non-elective admits) and the latter is used to discern intra/inter hospital transfers, which is needed to build 
the sentences already described (i.e. to help build sentences using the TRANSFER keyword.)
'''
import pandas as pd

# Index the admissions table by HADM_ID so a visit can be looked up through its visit_id.
admissions_df = pd.read_csv("./mimic3/ADMISSIONS.csv").set_index("HADM_ID")

# Copy the two admission columns onto the attr_dict of every visit of every patient.
for patient in mimic3_ds.patients.values():
  for visit in patient.visits.values():
    admission_row = admissions_df.loc[int(visit.visit_id)]

    for column in ("ADMISSION_TYPE", "ADMISSION_LOCATION"):
      visit.attr_dict[column] = admission_row[column]

In [None]:
from utils import timedelta_to_interval
import random
import json

# Translate EMRs into the sentences outlined by the paper. A sentence consists of phrases
# (one per visit): the visit's diagnosis and procedure codes in random order, followed by the
# time-interval word for the gap to the next visit, if there is a next visit.
#
# For each patient:
#   1. Sort visits by encounter_time.
#   2. Compute the gap between each discharge and the next encounter and turn the gaps into
#      interval words with timedelta_to_interval (defined in utils).
#   3. Collect each visit's diagnosis and procedure codes, replacing codes counted fewer than
#      RARE_WORD_THRESHOLD times with RAREWORD.
#   4. Shuffle the codes of the visit, append the interval word if available, and add the phrase
#      to the sentence until MAX_SENTENCE_WORDS words have been collected.
MAX_SENTENCE_WORDS = 100   # a sentence never holds more words than this
RARE_WORD_THRESHOLD = 100  # codes counted fewer times than this become RAREWORD


# Defined once, outside the patient loop, instead of being re-created for every patient.
def handle_event(event_list, word_cnts):
  """Return the codes of `event_list`, with rare codes replaced by the word RAREWORD.

  A code is rare when it is present in `word_cnts` with a count below RARE_WORD_THRESHOLD.
  """
  return [
      "RAREWORD" if e.code in word_cnts and word_cnts[e.code] < RARE_WORD_THRESHOLD else e.code
      for e in event_list
  ]


sentences = []
for patient in mimic3_ds.patients.values():

  # Sort visits by encounter time in order to gauge the interval between consecutive visits
  sorted_visits = sorted(patient.visits.values(), key=lambda visit: visit.encounter_time)

  # Gap between the discharge of one visit and the encounter of the next one
  discharge_times = [visit.discharge_time for visit in sorted_visits[:-1]]
  encounter_times = [visit.encounter_time for visit in sorted_visits[1:]]
  time_intervals = [t2 - t1 for t1, t2 in zip(discharge_times, encounter_times)]

  # Convert the gaps to the month-interval words specified in the paper
  time_interval_strs = timedelta_to_interval(time_intervals)

  # The sentence is collected as one flat word list. Phrases with no words (a visit without
  # codes, or a phrase cut down to nothing by the cap) therefore contribute nothing, instead of
  # leaving stray spaces in the joined sentence.
  sentence_words = []
  for visit_ix, visit in enumerate(sorted_visits):
    phrase = (
        handle_event(visit.get_event_list('DIAGNOSES_ICD'), word_cnts)
        + handle_event(visit.get_event_list('PROCEDURES_ICD'), word_cnts)
    )
    random.shuffle(phrase)
    if visit_ix < len(time_interval_strs):
      phrase.append(time_interval_strs[visit_ix])

    # Take only as many words as still fit under the cap
    remaining = MAX_SENTENCE_WORDS - len(sentence_words)
    sentence_words.extend(phrase[:remaining])
    if len(phrase) >= remaining:
      # The sentence is full; later visits cannot add anything
      break

  sentences.append(" ".join(sentence_words))

# output to json file
output_dir = "data"
output_filename = "sentences.json"

os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, output_filename), "w") as json_file:
  json.dump(sentences, json_file)

### Test that output sentences satisfy the following conditions:
- There is a sentence for each patient
- Each sentence is capped to max 100 words
- Multi visit patients have visits separated by a timestamp
- Codes used fewer than 100 times should not appear in their original ICD-9 form (they should be replaced with RAREWORD)


In [None]:
import re

# Every patient must have produced exactly one sentence
assert len(mimic3_ds.patients) == len(sentences)

# No sentence may hold more than 100 words
longest_sentence = max([len(s.split()) for s in sentences])
assert longest_sentence <= 100

# A code counted fewer than 100 times must never show up verbatim in a sentence
rarewords = [word for word, count in word_cnts.items() if count < 100]
rareword_set = set(rarewords)
for sentence in sentences:
  rareword_violations = [w for w in sentence.split() if w in rareword_set]
  assert not rareword_violations


# The sentence of a multi-visit patient must contain an interval word such as 1-3m or 12+m,
# recognised here by its "-" or "+" character. #TODO: Add TRANSFER to regex
pattern = re.compile(r"[-+]")
for patient_ix, patient in enumerate(mimic3_ds.patients.values()):
    if len(patient.visits) <= 1:
        continue
    if pattern.search(sentences[patient_ix]) is None:
        print(f"Failed assertion for sentences[{patient_ix}]: '{sentences[patient_ix]}'")
        assert False


## Training Word2Vec
#TODO: Write description

In [None]:
import nltk
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer
import re


nltk.download("stopwords")


def split_tokens(stemmer, stopwords, line):
    """Lower-case `line`, strip unwanted characters, and return its stemmed tokens.

    Only the characters a-z, '@', '#' and the space survive the clean-up. The cleaned text is
    split on runs of spaces; empty pieces and anything contained in `stopwords` are dropped, and
    each remaining piece is passed through `stemmer.stem`.
    """
    cleaned = re.sub(r"[^a-z@# ]", "", line.lower())

    tokens = []
    for piece in re.split(r" +", cleaned):
        if not piece or piece in stopwords:
            continue
        tokens.append(stemmer.stem(piece))
    return tokens


stemmer = SnowballStemmer("english")
# Held as a set: split_tokens only tests membership, which is constant-time on a set
stopwords = set(nltk.corpus.stopwords.words("english"))

# NOTE(review): this rebinds `sentences`, replacing the EMR sentences built earlier in the
# notebook with lines of data/shakespeare.txt. Later cells index `sentences` by patient
# position, so they will read this list instead — confirm that this is intended.
# The `with` block closes the corpus file once every line has been tokenised.
with open("data/shakespeare.txt", "r") as corpus_file:
    tokenized_lines = (split_tokens(stemmer, stopwords, line) for line in corpus_file)
    # Lines that end up without any token are left out
    sentences = [" ".join(tokens) for tokens in tokenized_lines if tokens]

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


class Word2VecDataset(Dataset):
    """Dataset of (target id, context id) pairs taken from a sliding window over sentences.

    `dataset` is an iterable of sentences, each a sequence of tokens; `tok2id` maps a token to
    its integer id; `wsize` is the number of neighbours considered on each side of a token.
    """

    def __init__(self, dataset, tok2id, wsize=3):
        self.wsize = wsize
        self.tok2id = tok2id

        # Flatten the pairs of all sentences into one list
        self.dataset = []
        for sentence in dataset:
            self.dataset.extend(self.get_contexts(sentence))

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def get_contexts(self, sentence):
        """Return the (target id, context id) pairs of one sentence, in reading order."""
        pairs = []
        last_ix = len(sentence) - 1

        for word_ix, word in enumerate(sentence):
            target = self.tok2id[word]

            # Clamp the window to the sentence boundaries
            first_ctx = max(0, word_ix - self.wsize)
            last_ctx = min(last_ix, word_ix + self.wsize)

            for ctx_ix in range(first_ctx, last_ctx + 1):
                if ctx_ix == word_ix:
                    # A word is not its own context
                    continue
                pairs.append((target, self.tok2id[sentence[ctx_ix]]))

        return pairs


class Word2Vec(nn.Module):
    """Embedding lookup followed by a bias-free linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embedding_size):
        super().__init__()

        # token id -> vector of size `embedding_size`
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        # vector -> one logit per vocabulary entry
        self.prediction = nn.Linear(in_features=embedding_size, out_features=vocab_size, bias=False)

    def forward(self, input):
        """Return the vocabulary logits for the token ids in `input`."""
        return self.prediction(self.embedding(input))


# Tokenise every sentence on single spaces and collect the distinct tokens
dataset = [sentence.split(" ") for sentence in sentences]
vocab = set([word for sentence in dataset for word in sentence])

# Ids are assigned over the *sorted* vocabulary. Iterating the set directly would give an order
# that can differ between interpreter sessions (string hashing is randomised by default), so the
# rows of a saved model could not be mapped back to their tokens after a kernel restart.
id2tok = dict(enumerate(sorted(vocab)))
tok2id = {token: id for id, token in id2tok.items()}

word2vec_dataloader = DataLoader(
    Word2VecDataset(dataset, tok2id),
    batch_size=64,
    shuffle=True
)

word2vec_model = Word2Vec(len(vocab), embedding_size=100)

In [None]:
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt


LR = 3e-4
EPOCHS = 90

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(word2vec_model.parameters(), lr=LR)

# Mean batch loss of every epoch, in training order
running_loss = []

# Used as a context manager so the bar is closed when training finishes or raises
with tqdm(total=EPOCHS * len(word2vec_dataloader)) as progress_bar:
    for epoch in range(EPOCHS):
        epoch_loss = 0
        for center, context in word2vec_dataloader:
            optimizer.zero_grad()

            # The second id of each pair is the model input, the first one the target class
            logits = word2vec_model(input=context)
            loss = loss_fn(logits, center)
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()
            progress_bar.update(1)

        epoch_loss /= len(word2vec_dataloader)
        running_loss.append(epoch_loss)

# Labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.plot(running_loss)
ax.set(title="Word2Vec training loss", xlabel="Epoch", ylabel="Mean cross-entropy loss");

In [None]:
from scipy.spatial import distance
import numpy as np


def get_distance_matrix(wordvecs, metric):
    """Return the square matrix of pairwise distances between the rows of `wordvecs`.

    `metric` is handed to scipy's `pdist` unchanged.
    """
    condensed = distance.pdist(wordvecs, metric)
    return distance.squareform(condensed)


def get_k_similar_words(word, dist_matrix, k=10):
    """Return up to `k` nearest neighbours of `word`, closest first.

    `word` is looked up in the module-level `tok2id`; row `tok2id[word]` of `dist_matrix` holds
    its distance to every token id. Each result is a tuple `(token id, token, distance)`, with
    the token taken from the module-level `id2tok`.

    Fewer than `k` tuples are returned when the vocabulary has fewer than `k` other tokens, and
    an empty list when `k` is not positive. Raises KeyError when `word` is not in `tok2id`.
    """
    idx = tok2id[word]
    dists = dist_matrix[idx]

    # Rank all token ids by distance; the stable sort keeps ties in id order.
    order = np.argsort(dists, kind="stable")

    # Remove the query token by id instead of assuming it is ranked first: another token at
    # distance 0 (an identical vector) may be ranked ahead of it. Slicing, unlike partitioning
    # at position k, also copes with k being larger than the vocabulary.
    neighbours = order[order != idx][:max(k, 0)]

    return [(i, id2tok[i], dists[i]) for i in neighbours]


# Pairwise cosine distances between the rows of the prediction layer's weight matrix
# (one row per vocabulary token).
dmat = get_distance_matrix(
    word2vec_model.prediction.weight.cpu().detach().numpy(),
    "cosine"
)

# The comma after "father" matters: without it Python concatenates the adjacent string literals
# into the single token "fatherschool", which is not in the vocabulary.
test_sentence = ["good", "father", "school", "hate"]

# Print each probe word followed by the tokens of its nearest neighbours
for word in test_sentence:
    print(word, [token[1] for token in get_k_similar_words(word, dmat)], "\n")

In [None]:
# Save the Word2Vec state_dict as word2vec.pt inside output_dir (the folder name set in the
# sentence-building cell).
weights = word2vec_model.state_dict()
torch.save(weights, os.path.join(output_dir, "word2vec.pt"))

## Convolutional Network
Now that Word2Vec has created embeddings for the sentences we generated in the first step, we are ready to pass the embeddings through a convolutional layer, followed by max pooling.

First, we will generate the train and val loaders. This first requires us to create the validation set. In Deepr, we try to predict the likelihood of unplanned re-admission within 3 months and within 6 months. Unplanned re-admission is coded in the database as an unplanned or emergency admission with a status not equal to "elective" or a transfer from another hospital.

Next, we will define a model containing a convolutional network, ReLU and max pooling layer.

Deepr reported the following parameters were determined to be optimal for the model:

`m = 100, d = 1, motif size = 3, 4, and 5, n = 100, number of epochs = 10, minibatch size = 64, and l2 regularization λ = 1.0.`

In [None]:
# Split the patient sentences into two groups: the risk group (patients with at least one
# EMERGENCY admission that starts within the threshold after the previous discharge) and the
# candidate group (all other patients).
# NOTE(review): `sentences` is also reassigned by the Word2Vec corpus cell earlier in the
# notebook; confirm it still holds one EMR sentence per patient when this cell runs.

val_readmission_time_days = 180 # The threshold for flagging unplanned readmission for validation set
candidate_group = []
risk_group = []
for patient_ix, patient in enumerate(mimic3_ds.patients.values()):

  # Visits in chronological order, so that consecutive visits can be compared
  sorted_visits = sorted(patient.visits.values(), key=lambda visit: visit.encounter_time)

  # Time from each discharge to the following encounter, and the type of that following admission
  gaps = [
      following.encounter_time - previous.discharge_time
      for previous, following in zip(sorted_visits, sorted_visits[1:])
  ]
  admission_types = [visit.attr_dict["ADMISSION_TYPE"] for visit in sorted_visits[1:]]

  has_unplanned_readmission = any(
      gap.days <= val_readmission_time_days and admission_type == "EMERGENCY"
      for gap, admission_type in zip(gaps, admission_types)
  )

  target_group = risk_group if has_unplanned_readmission else candidate_group
  target_group.append(sentences[patient_ix])



In [None]:
# Each patient is placed in exactly one of the two groups, so the group sizes must add up to the patient count
assert len(risk_group) + len(candidate_group) == len(mimic3_ds.patients)

In [None]:
import torch
from torch.utils.data import Dataset
class DeeprDataset(Dataset):
    """Dataset that returns sentences as tensors of word embeddings.

    Item `idx` is the pair `(embedding of x_data[idx], embedding of y_data[idx])`.

    Args:
        x_data: sequence of sentences (space-separated words).
        y_data: sequence of sentences paired position-wise with `x_data`.
        word_embeddings_dict: mapping from a word to its embedding vector.

    NOTE(review): the notebook passes candidate-group sentences as `x_data` and risk-group
    sentences as `y_data`; confirm that pairing them by position is the intended design.
    """

    def __init__(self, x_data, y_data, word_embeddings_dict):
        self.x_data = x_data
        self.y_data = y_data
        self.word_embeddings_dict = word_embeddings_dict

    def get_embedding(self, sentence):
        """Return the embeddings of the words of `sentence`, shape (num_words, embedding_dim).

        A sentence without words yields an empty (0, embedding_dim) tensor. Raises KeyError for
        a word that is missing from `word_embeddings_dict`.
        """
        word_tensors = [torch.as_tensor(self.word_embeddings_dict[word]) for word in sentence.split()]
        if not word_tensors:
            # torch.stack rejects an empty list, so build the empty result explicitly; the
            # embedding size is read off any stored vector (0 if the dictionary is empty).
            sample = next(iter(self.word_embeddings_dict.values()), None)
            embedding_dim = 0 if sample is None else torch.as_tensor(sample).shape[-1]
            return torch.empty((0, embedding_dim))
        return torch.stack(word_tensors)

    def __len__(self):
        # Every item reads both lists, so only positions present in both are valid indices
        return min(len(self.x_data), len(self.y_data))

    # returns the embeddings of each word in the x and y sentence, each of shape (words(sentence), embedding_dim)
    def __getitem__(self, idx):
        return self.get_embedding(self.x_data[idx]), self.get_embedding(self.y_data[idx])

In [None]:
import random
train_ratio = 0.8

# Shuffle both groups in place before slicing them
for group in (candidate_group, risk_group):
  random.shuffle(group)

# Embedding vector of every vocabulary token, stored as a NumPy array
word_embeddings_dict = {}
for word, token_id in tok2id.items():
  word_embeddings_dict[word] = word2vec_model.embedding(torch.tensor(token_id)).detach().numpy()

# NOTE(review): the cut-off is computed from risk_group alone and applied to both groups, so
# unless the groups have the same size the candidate slices are not an 80/20 split of
# candidate_group — confirm this is intended.
train_size = int(len(risk_group) * train_ratio)
x_train_data, x_val_data = candidate_group[:train_size], candidate_group[train_size:]
y_train_data, y_val_data = risk_group[:train_size], risk_group[train_size:]

train_dataset = DeeprDataset(x_train_data, y_train_data, word_embeddings_dict)
val_dataset = DeeprDataset(x_val_data, y_val_data, word_embeddings_dict)



In [None]:
class Deepr(nn.Module):
    """Convolutional classifier over a sentence of word embeddings.

    Pipeline, as described in this notebook: 1-D convolutions over the word axis, ReLU, max
    pooling over the whole sentence, then a linear layer that produces class logits.

    Args:
        embedding_dim: size of each word embedding.
        num_filters: number of convolution filters per kernel size.
        kernel_sizes: widths (in words) of the convolution windows.
        num_classes: number of output logits.

    The defaults mirror the values quoted in the notebook text (100, motif sizes 3/4/5, 100).
    NOTE(review): presumably `m` there is the embedding size and `n` the number of filters, and
    two classes (readmitted or not) are wanted — confirm against the paper before training.
    """

    def __init__(self, embedding_dim=100, num_filters=100, kernel_sizes=(3, 4, 5), num_classes=2):
        super().__init__()

        self.kernel_sizes = tuple(kernel_sizes)
        # One convolution per window width; Conv1d treats the embedding axis as channels
        self.convs = nn.ModuleList(
            nn.Conv1d(embedding_dim, num_filters, kernel_size=width) for width in self.kernel_sizes
        )
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(num_filters * len(self.kernel_sizes), num_classes)

    def forward(self, input):
        """Return logits of shape (batch, num_classes).

        `input` holds word embeddings shaped (batch, num_words, embedding_dim); a single
        sentence shaped (num_words, embedding_dim) is treated as a batch of one.
        """
        if input.dim() == 2:
            input = input.unsqueeze(0)

        # Conv1d expects (batch, channels, length): move the embedding axis to the channel slot
        features = input.transpose(1, 2)

        # Zero-pad sentences shorter than the widest window so every convolution can be applied
        shortfall = max(self.kernel_sizes) - features.size(-1)
        if shortfall > 0:
            features = nn.functional.pad(features, (0, shortfall))

        # Max over the word axis keeps the strongest response of every filter
        pooled = [self.relu(conv(features)).max(dim=-1).values for conv in self.convs]
        return self.classifier(torch.cat(pooled, dim=1))

In [None]:
# NOT IMPLEMENTED
# Train the model: use Cross Entropy Loss loss fxn and SGD optimizer

In [None]:
# NOT IMPLEMENTED
# Evaluate the model