## Imports 

In [None]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas

In [None]:
import pandas as pd
import numpy as np
import os
import random
import gzip
import shutil

Install komenti (note that Java is required for komenti to run).

In [None]:
%%bash
# Install komenti into ./komenti unless komenti/bin/komenti is already present.
# Stop at the first failing command so that a failed download or extraction is
# not silently followed by the remaining steps.
set -euo pipefail

KOMENTI_ZIP="komenti-0.2.0-SNAPSHOT.zip"
KOMENTI_URL="https://github.com/reality/komenti/releases/download/0.2.0-SNAPSHOT-1/${KOMENTI_ZIP}"

if ! test -f "komenti/bin/komenti"; then
  if ! test -f "${KOMENTI_ZIP}"; then
    echo 'downloading komenti'
    # Download under a temporary name and rename on success, so an interrupted
    # transfer is never mistaken for a complete archive on the next run.
    wget -O "${KOMENTI_ZIP}.part" "${KOMENTI_URL}"
    mv "${KOMENTI_ZIP}.part" "${KOMENTI_ZIP}"
  fi
  echo 'extracting komenti'
  # -o: overwrite leftovers from an earlier partial extraction without prompting
  # (the cell is non-interactive, so a prompt could not be answered).
  unzip -o "${KOMENTI_ZIP}"
  mv komenti-0.2.0-SNAPSHOT komenti
fi


Test komenti for errors:

In [None]:
%%bash
# Smoke test of the komenti install: run the `query` subcommand with `-q test`
# and check the cell output for errors before continuing.
komenti/bin/komenti query -q test

# Download 'NOTEEVENTS' and 'DIAGNOSES_ICD' from MIMIC-III database


In [None]:
# Directory that will contain the MIMIC-III files.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs('MIMIC_data', exist_ok=True)

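All MIMIC-III files should be downloaded from the [website](https://physionet.org/content/mimiciii/1.4/) and stored in the 'MIMIC_data' directory. The files are gzip-compressed CSVs; this notebook reads `DIAGNOSES_ICD.csv.gz` and `NOTEEVENTS.csv.gz`.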

In [None]:
# list the contents of the 'MIMIC_data' directory to confirm that the
# DIAGNOSES_ICD and NOTEEVENTS files are present before loading them
os.listdir('MIMIC_data')

In [None]:
# load the gzip-compressed diagnoses table; pandas decompresses it on the fly
D_df = pd.read_csv('MIMIC_data/DIAGNOSES_ICD.csv.gz', compression='gzip')

In [None]:
# load the gzip-compressed clinical notes table; pandas decompresses it on the fly
NE_df = pd.read_csv('MIMIC_data/NOTEEVENTS.csv.gz', compression='gzip')

## Tidy up diagnoses

In [None]:
# Keep only the primary diagnosis rows (SEQ_NUM == 1.0), so that there should
# be a single diagnosis entry per admission.

is_primary = D_df['SEQ_NUM'].eq(1.0)
D_df_tidy = D_df.loc[is_primary]

D_df_tidy.head()

## Sample of 1000 admissions 

#### Use pre-determined list of patients to reproduce results directly 

Using 'row_ids.txt' file to reproduce the sample used to generate these results.

In [None]:
# Reproduce our results without the slow sampling step: 'row_ids.txt' holds the
# row labels (index values of D_df_tidy) of the same sample of 1000.

# read in the predetermined sample row ids (first column becomes the index)
row_ids = pd.read_csv('row_ids.txt', sep='\t', header=None, index_col=0)
rows_list = row_ids.index.tolist()

# translate the stored row labels into admission IDs
sample_id = D_df_tidy['HADM_ID'].loc[rows_list].tolist()

In [None]:
# preview the first ten admission IDs of the sample
sample_id[:10]

#### Or run this and produce your own sample of 1000 patients

In [None]:
# Draw a reproducible random sample of unique admissions that have notes.
#
# The sample is drawn in a single seeded call. Re-sampling one row at a time
# with a fixed random_state returns the same row on every iteration, so a
# loop built that way can never collect SAMPLE_SIZE distinct admissions.
SAMPLE_SIZE = 1000
SEED = 30

# Seed Python's own RNG as well, in case other cells rely on `random`.
random.seed(SEED)

# Admission IDs that have at least one row in NOTEEVENTS
# (rows without an HADM_ID are ignored).
hadm_with_notes = NE_df['HADM_ID'].dropna().unique()

# Candidate admissions: primary-diagnosis rows whose HADM_ID (not row label)
# appears in NOTEEVENTS, de-duplicated so every admission can be drawn once.
eligible_hadm = (
    D_df_tidy.loc[D_df_tidy['HADM_ID'].isin(hadm_with_notes), 'HADM_ID']
    .drop_duplicates()
)

if len(eligible_hadm) < SAMPLE_SIZE:
    raise ValueError(
        "only {} admissions have notes; cannot sample {}".format(
            len(eligible_hadm), SAMPLE_SIZE
        )
    )

sample_id = eligible_hadm.sample(
    n=SAMPLE_SIZE, replace=False, random_state=SEED
).tolist()


## Extract and save text for sample patients 

Create a directory and, for each sampled admission, write its concatenated note text into a file named by admission ID.

In [None]:
# When the sampling is run more than once, the text files of the previous
# sample must be removed so the directory can be repopulated with the new one.

text_dir = 'TextFiles'
if os.path.exists(text_dir):
    shutil.rmtree(text_dir)

In [None]:
# Create the directory for storing text files.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs('TextFiles', exist_ok=True)

In [None]:
# For each admission in the sample, collect its note text and write it to
# TextFiles/text_<HADM_ID>.txt. The notes of one admission are concatenated
# in NOTEEVENTS row order, with no separator added between them.

path = os.getcwd()
text_dir = os.path.join(path, 'TextFiles')

# Restrict NOTEEVENTS to the sampled admissions once, instead of scanning the
# whole table again for every admission inside the loop.
sample_notes = NE_df.loc[NE_df['HADM_ID'].isin(sample_id), ['HADM_ID', 'TEXT']]

for hadm_id in sample_id:  # named hadm_id so the builtin `id` is not shadowed
    outfile = os.path.join(text_dir, "text_{}.txt".format(hadm_id))
    notes = sample_notes.loc[sample_notes['HADM_ID'] == hadm_id, 'TEXT']

    # "w" rather than "a+": re-running this cell overwrites the file instead
    # of appending a second copy of the notes. The `with` block closes the
    # file even if a write fails.
    with open(outfile, "w") as out:
        for note in notes:
            out.write(note)

## Apply Komenti annotation to the text

In [None]:
# Make the directory for storing the annotations.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs('AnnotationFiles', exist_ok=True)

Load in the HPO ontology terms to the working directory:

In [None]:
%%bash

# Fetch the HPO label file once. A plain `wget` on a re-run would save a new
# copy as unexpanded_all.txt.1 and leave the first copy in use, so download
# only when the file is missing.
set -euo pipefail

if ! test -f "unexpanded_all.txt"; then
  # Download under a temporary name and rename on success, so an interrupted
  # transfer is never mistaken for the complete file.
  wget -O unexpanded_all.txt.part https://raw.githubusercontent.com/reality/synonym_expansion_validation/master/hpo/unexpanded_all.txt
  mv unexpanded_all.txt.part unexpanded_all.txt
fi


Annotate the text files with komenti, using the HPO terms in the 'unexpanded_all.txt' file downloaded. This will produce a single file ('annotations.txt') comprising all patients in the sample. 

In [None]:
%%bash
# Annotate the files under TextFiles/ using the labels in unexpanded_all.txt
# and write the result to AnnotationFiles/annotations.txt.
# NOTE(review): --threads 40 is hard-coded; presumably it should be lowered on
# machines with fewer cores — confirm against the komenti documentation.
komenti/bin/komenti annotate -l unexpanded_all.txt -t TextFiles/ --out AnnotationFiles/annotations.txt  --threads 40 --verbose
