# MIMIC-III Numeric Data Creation

1. Clone the repository: git clone https://github.com/YerevaNN/mimic3-benchmarks
2. Follow the instructions in its README file to build the benchmark data.
3. Add the following file to .\mimic3-benchmarks\mimic3models\in_hospital_mortality\logistic:
4. Run the file.


In [64]:
from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models import common_utils
from mimic3models.metrics import print_metrics_binary
from mimic3models.in_hospital_mortality.utils import save_results
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
import os
import numpy as np
import argparse
import json


def read_and_extract_features(reader, period, features):
    """Read every example from *reader* and turn it into a feature matrix.

    The reader is drained in a single chunk of size
    ``reader.get_number_of_examples()`` through ``common_utils.read_chunk``;
    the raw series and their header are then handed to
    ``common_utils.extract_features_from_rawdata`` together with *period*
    and *features*.

    Args:
        reader: Object exposing ``get_number_of_examples()`` and accepted by
            ``common_utils.read_chunk``.
        period: Period selector forwarded unchanged to the feature extractor.
        features: Feature-set selector forwarded unchanged to the extractor.

    Returns:
        A 3-tuple ``(X, y, names)``: the extracted features followed by the
        ``'y'`` and ``'name'`` entries of the chunk that was read.
    """
    example_count = reader.get_number_of_examples()
    chunk = common_utils.read_chunk(reader, example_count)
    extracted = common_utils.extract_features_from_rawdata(
        chunk['X'],
        chunk['header'],
        period,
        features,
    )
    return extracted, chunk['y'], chunk['name']


# Command-line interface of the script.  Options are registered in the same
# order as before so the generated --help output is unchanged.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--C',
    type=float,
    default=1.0,
    help='inverse of L1 / L2 regularization',
)
# --l1 and --l2 write to the same boolean destination (args.l2);
# set_defaults makes L2 the choice when neither flag is given.
parser.add_argument('--l1', dest='l2', action='store_false')
parser.add_argument('--l2', dest='l2', action='store_true')
parser.set_defaults(l2=True)
parser.add_argument(
    '--period',
    type=str,
    default='all',
    choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'],
    help='specifies which period extract features from',
)
parser.add_argument(
    '--features',
    type=str,
    default='all',
    choices=['all', 'len', 'all_but_len'],
    help='specifies what features to extract',
)
# NOTE(review): the default below is a placeholder sentence, not a real path;
# pass --data explicitly (or edit the default) before running.
parser.add_argument(
    '--data',
    type=str,
    default='directory with data of in-hospital mortality task',
    help='Path to the data of in-hospital mortality task',
)
parser.add_argument(
    '--output_dir',
    type=str,
    default='.',
    help='Directory relative which all output files are stored',
)

# Parse sys.argv and echo the resulting namespace for the run log.
args = parser.parse_args()
print(args)

def _open_split_reader(split_dir, listfile_name):
    """Build an InHospitalMortalityReader for one split under ``args.data``."""
    return InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, split_dir),
        listfile=os.path.join(args.data, listfile_name),
        period_length=48.0,
    )


# One reader per split.  The validation reader points at the 'train'
# directory as well; only its list file (val_listfile.csv) differs.
train_reader = _open_split_reader('train', 'train_listfile.csv')
val_reader = _open_split_reader('train', 'val_listfile.csv')
test_reader = _open_split_reader('test', 'test_listfile.csv')

# Load each split through its reader and extract features using the period
# and feature set chosen on the command line.
print('Reading data and extracting features ...')
train_X, train_y, train_names = read_and_extract_features(
    reader=train_reader, period=args.period, features=args.features)
val_X, val_y, val_names = read_and_extract_features(
    reader=val_reader, period=args.period, features=args.features)
test_X, test_y, test_names = read_and_extract_features(
    reader=test_reader, period=args.period, features=args.features)

# Report the shape of every extracted feature matrix.
print('  train data shape = {}'.format(train_X.shape))
print('  validation data shape = {}'.format(val_X.shape))
print('  test data shape = {}'.format(test_X.shape))

# Replace NaNs with per-column means.  The imputer is fitted on the training
# split only and then applied to all three splits; results are cast to
# float32.
print('Imputing missing values ...')
imputer = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)
train_X = np.array(imputer.fit_transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

# Standardize features; the scaler is likewise fitted on the training split
# only and reused for validation and test.
print('Normalizing the data to have zero mean and unit variance ...')
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
val_X = scaler.transform(val_X)
test_X = scaler.transform(test_X)

In [None]:
# Make sure the output directory exists (a no-op when it is already there).
# This cell only creates the directory; it does not change the current
# working directory.
data_time_series_dir = 'input/data_time_series'
os.makedirs(data_time_series_dir, exist_ok=True)

In [None]:
# Persist the preprocessed splits as CSV files inside input/data_time_series.
# Previously the files were written to the current working directory, even
# though the notebook creates input/data_time_series for them and later asks
# the user to verify the files there.  The directory is created here as well
# so this cell works on its own.
csv_output_dir = 'input/data_time_series'
os.makedirs(csv_output_dir, exist_ok=True)

# Each split is rebound to a DataFrame (as before) and written without the
# index column.

# train features / labels
train_X = pd.DataFrame(train_X)
train_X.to_csv(os.path.join(csv_output_dir, 'train_X.csv'), index=False)
train_y = pd.DataFrame(train_y)
train_y.to_csv(os.path.join(csv_output_dir, 'train_y.csv'), index=False)

# validation features / labels
val_X = pd.DataFrame(val_X)
val_X.to_csv(os.path.join(csv_output_dir, 'val_X.csv'), index=False)
val_y = pd.DataFrame(val_y)
val_y.to_csv(os.path.join(csv_output_dir, 'val_y.csv'), index=False)

# test features / labels
test_X = pd.DataFrame(test_X)
test_X.to_csv(os.path.join(csv_output_dir, 'test_X.csv'), index=False)
test_y = pd.DataFrame(test_y)
test_y.to_csv(os.path.join(csv_output_dir, 'test_y.csv'), index=False)




In [None]:
# Verify that you have the following files in the input/data_time_series directory