In [1]:
import os
import numpy as np
import random

class Reader(object):
    """ Base class for readers driven by a listfile.

    The listfile is a text file whose first line is a header and whose remaining
    lines each describe one example. Subclasses parse those lines further and
    implement `read_example`.
    """

    def __init__(self, dataset_dir, listfile=None):
        """ Load the listfile and split it into a header line and example lines.

        :param dataset_dir: Directory where timeseries files are stored.
        :param listfile:    Path to a listfile. If this parameter is left `None` then
                            `dataset_dir/listfile.csv` will be used.
        """
        self._dataset_dir = dataset_dir
        self._current_index = 0
        if listfile is None:
            listfile_path = os.path.join(dataset_dir, "listfile.csv")
        else:
            listfile_path = listfile
        with open(listfile_path, "r") as lfile:
            lines = lfile.readlines()
        # A completely empty listfile still fails here with IndexError, as before.
        self._listfile_header = lines[0]
        # Whitespace-only lines (typically a stray empty line at the end of the file)
        # are not examples; keeping them would inflate the example count and break
        # the per-line parsing done by subclasses.
        self._data = [line for line in lines[1:] if line.strip()]

    def get_number_of_examples(self):
        """ Return the number of examples listed in the listfile. """
        return len(self._data)

    def random_shuffle(self, seed=None):
        """ Shuffle the examples in place.

        :param seed: Optional seed. When given, the order is reproducible and identical
                     to what `random.seed(seed)` followed by `random.shuffle` produces,
                     but a private generator is used so the module-level `random` state
                     is left untouched. When `None`, the module-level generator is used.
        """
        if seed is None:
            random.shuffle(self._data)
        else:
            random.Random(seed).shuffle(self._data)

    def read_example(self, index):
        """ Read the example with the given index. Must be implemented by subclasses. """
        raise NotImplementedError()

    def read_next(self):
        """ Read the example under the internal cursor and advance the cursor.

        The cursor wraps around to 0 after the last example has been read.
        """
        to_read_index = self._current_index
        self._current_index += 1
        if self._current_index == self.get_number_of_examples():
            self._current_index = 0
        return self.read_example(to_read_index)

In [2]:
class DecompensationReader(Reader):
    def __init__(self, dataset_dir, listfile=None):
        """ Reader for decompensation prediction task.

        :param dataset_dir: Directory where timeseries files are stored.
        :param listfile:    Path to a listfile. If this parameter is left `None` then
                            `dataset_dir/listfile.csv` will be used.
        """
        Reader.__init__(self, dataset_dir, listfile)
        # Every listfile row has exactly four comma-separated fields:
        # timeseries file name, time bound, stay id, label.
        # Whitespace-only lines are skipped so that a stray empty line does not
        # break the four-way unpacking below.
        rows = [line.split(',') for line in self._data if line.strip()]
        self._data = [(x, float(t), int(stay_id), int(y)) for (x, t, stay_id, y) in rows]

    def _read_timeseries(self, ts_filename, time_bound):
        """ Read the rows of a timeseries file that fall within a time bound.

        :param ts_filename: Name of the timeseries file, relative to the dataset directory.
        :param time_bound:  Reading stops at the first row whose time (first column)
                            exceeds `time_bound` by more than 1e-6.
        :return: Tuple `(rows, header)` where `rows` is a 2D array of strings and
                 `header` is the list of column names.
        :raises ValueError: If no row lies within the time bound.
        """
        ret = []
        ts_path = os.path.join(self._dataset_dir, ts_filename)
        with open(ts_path, "r") as tsfile:
            header = tsfile.readline().strip().split(',')
            assert header[0] == "Hours"
            for line in tsfile:
                mas = line.strip().split(',')
                if mas == ['']:
                    # Blank line (e.g. at the end of the file): nothing to parse.
                    continue
                t = float(mas[0])
                if t > time_bound + 1e-6:
                    break
                ret.append(np.array(mas))
        if not ret:
            # np.stack would raise a ValueError here anyway; raise the same type
            # with a message that identifies the offending file.
            raise ValueError("No events within the first {} hours in {}.".format(time_bound, ts_path))
        return (np.stack(ret), header)

    def read_example(self, index):
        """ Read the example with given index.

        :param index: Index of the line of the listfile to read (counting starts from 0).
        :return: Dictionary with the following keys:
            X : np.array
                2D array containing all events. Each row corresponds to a moment.
                First column is the time and other columns correspond to different
                variables.
            t : float
                Length of the data in hours. Note, in general, it is not equal to the
                timestamp of last event.
            y : int (0 or 1)
                Mortality within next 24 hours.
            header : array of strings
                Names of the columns. The ordering of the columns is always the same.
            name: Name of the sample.
        :raises ValueError: If `index` is out of range.
        """
        if index < 0 or index >= len(self._data):
            raise ValueError("Index must be from 0 (inclusive) to number of examples (exclusive).")

        # The stay id (third field) is parsed but is not part of the returned dictionary.
        name, t, _stay_id, y = self._data[index]
        (X, header) = self._read_timeseries(name, t)

        return {"X": X,
                "t": t,
                "y": y,
                "header": header,
                "name": name}

In [3]:
class PhenotypingReader(Reader):
    def __init__(self, dataset_dir, listfile=None):
        """ Reader for phenotype classification task.

        :param dataset_dir: Directory where timeseries files are stored.
        :param listfile:    Path to a listfile. If this parameter is left `None` then
                            `dataset_dir/listfile.csv` will be used.
        """
        Reader.__init__(self, dataset_dir, listfile)
        # Every listfile row is: timeseries file name, length, stay id, then one
        # integer label per remaining column.
        # Whitespace-only lines are skipped so that a stray empty line does not
        # break the field parsing below.
        rows = [line.split(',') for line in self._data if line.strip()]
        self._data = [(mas[0], float(mas[1]), int(mas[2]), list(map(int, mas[3:]))) for mas in rows]

    def _read_timeseries(self, ts_filename):
        """ Read every row of a timeseries file.

        :param ts_filename: Name of the timeseries file, relative to the dataset directory.
        :return: Tuple `(rows, header)` where `rows` is a 2D array of strings and
                 `header` is the list of column names.
        :raises ValueError: If the file has no data rows.
        """
        ret = []
        ts_path = os.path.join(self._dataset_dir, ts_filename)
        with open(ts_path, "r") as tsfile:
            header = tsfile.readline().strip().split(',')
            assert header[0] == "Hours"
            for line in tsfile:
                mas = line.strip().split(',')
                if mas == ['']:
                    # Blank line (e.g. at the end of the file): not an event row.
                    continue
                ret.append(np.array(mas))
        if not ret:
            # np.stack would raise a ValueError here anyway; raise the same type
            # with a message that identifies the offending file.
            raise ValueError("No events found in {}.".format(ts_path))
        return (np.stack(ret), header)

    def read_example(self, index):
        """ Reads the example with given index.

        :param index: Index of the line of the listfile to read (counting starts from 0).
        :return: Dictionary with the following keys:
            X : np.array
                2D array containing all events. Each row corresponds to a moment.
                First column is the time and other columns correspond to different
                variables.
            t : float
                Length of the data in hours. Note, in general, it is not equal to the
                timestamp of last event.
            y : list of ints
                Phenotype labels.
            header : array of strings
                Names of the columns. The ordering of the columns is always the same.
            name: Name of the sample.
        :raises ValueError: If `index` is out of range.
        """
        if index < 0 or index >= len(self._data):
            raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")

        # The stay id (third field) is parsed but is not part of the returned dictionary.
        name, t, _stay_id, y = self._data[index]
        (X, header) = self._read_timeseries(name)

        return {"X": X,
                "t": t,
                "y": y,
                "header": header,
                "name": name}

In [4]:
# Open the decompensation train listfile and display the example at index 1.
train_decomp_reader = DecompensationReader(
    '/scratch/fs999/shamoutlab/data/mimic-iv-extracted/decompensation/train',
    '/scratch/fs999/shamoutlab/data/mimic-iv-extracted/decompensation/train_listfile.csv',
)
get_item = train_decomp_reader.read_example(1)
get_item

{'X': array([['0.05583333333333333', '', '', ..., '37.388888888888886', '', ''],
        ['0.0725', '', '', ..., '', '', ''],
        ['0.3225', '', '', ..., '', '', '7.2'],
        ...,
        ['532.0891666666666', '', '47.0', ..., '', '', ''],
        ['533.0558333333333', '', '', ..., '', '', ''],
        ['533.0891666666666', '', '46.0', ..., '', '', '']], dtype='<U19'),
 't': 534.0,
 'y': 0,
 'header': ['Hours',
  'Capillary refill rate',
  'Diastolic blood pressure',
  'Fraction inspired oxygen',
  'Glascow coma scale eye opening',
  'Glascow coma scale motor response',
  'Glascow coma scale total',
  'Glascow coma scale verbal response',
  'Glucose',
  'Heart Rate',
  'Height',
  'Mean blood pressure',
  'Oxygen saturation',
  'Respiratory rate',
  'Systolic blood pressure',
  'Temperature',
  'Weight',
  'pH'],
 'name': '15406736_episode1_timeseries.csv'}

In [5]:
# Open the phenotyping train listfile and display the example at index 1.
train_phenotype_reader = PhenotypingReader(
    '/scratch/fs999/shamoutlab/data/mimic-iv-extracted/phenotyping/train',
    '/scratch/fs999/shamoutlab/data/mimic-iv-extracted/phenotyping/train_listfile.csv',
)
get_item = train_phenotype_reader.read_example(1)
get_item

{'X': array([['0.2', '', '', '', '', '', '', '', '', '', '', '', '', '23', '',
         '', '', ''],
        ['0.21666666666666667', '', '77.0', '', '', '', '', '', '', '',
         '', '92', '', '', '150.0', '', '', ''],
        ['0.23333333333333334', '', '', '', '', '', '', '', '', '77', '',
         '', '100.0', '', '', '', '', ''],
        ['0.3', '', '127.0', '', '', '', '', '', '', '', '', '135',
         '100.0', '27', '158.0', '', '', ''],
        ['0.36666666666666664', '', '', '', '', '', '', '', '', '75', '',
         '', '', '', '', '', '', ''],
        ['0.4166666666666667', '', '', '', '', '', '', '', '', '', '', '',
         '', '', '', '36.666666666666664', '', ''],
        ['0.45', '', '', '', 'Spontaneously', 'Obeys Commands', '',
         'Oriented', '', '', '', '', '', '', '', '', '', ''],
        ['1.3', '', '85.0', '0.4', '', '', '', '', '110.0', '73', '',
         '100', '99.0', '14', '146.0', '', '', ''],
        ['2.3', '', '86.0', '', '', '', '', '', '', '74'