# In [None]:
from __future__ import absolute_import
from __future__ import print_function

from utils.readers import read_ts
from utils.utils import save_results

from models import common_utils
from models.metrics import print_metrics_binary
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.linear_model import LogisticRegression

import os
import numpy as np
import argparse
import json

def read_and_extract_features(reader, period, features):
    """Load every example from ``reader`` and turn it into a feature matrix.

    The reader is asked for its total number of examples, all of them are
    pulled in a single chunk via ``common_utils.read_chunk``, and the raw
    series are passed to ``common_utils.extract_features_from_rawdata``.

    Args:
        reader: object exposing ``get_number_of_examples()`` and accepted by
            ``common_utils.read_chunk``.
        period: forwarded unchanged to the feature extractor (which part of
            the series to summarise).
        features: forwarded unchanged to the feature extractor (which
            summary statistics to compute).

    Returns:
        A 3-tuple ``(X, y, names)``: the extracted features plus the ``'y'``
        and ``'name'`` entries of the chunk that was read.
    """
    n_examples = reader.get_number_of_examples()
    chunk = common_utils.read_chunk(reader, n_examples)
    feature_matrix = common_utils.extract_features_from_rawdata(
        chunk['X'], chunk['header'], period, features)
    return feature_matrix, chunk['y'], chunk['name']

# The run configuration is filled in by hand rather than read from the command
# line. An explicit empty argument list is parsed so that the real sys.argv
# (e.g. the arguments a notebook kernel was started with) is neither consumed
# nor overwritten.
parser = argparse.ArgumentParser()
args = parser.parse_args([])

# Which period to extract features from. Options listed by the author:
# ['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all']
args.period = 'all'

# Which summary statistics to extract as features. Options listed by the
# author: [min, max, np.mean, np.std, skew, len, all]  (e.g. 'mean').
args.features = 'all'

# Which post ICU discharge timeframe to use for prediction; handed to the
# readers as their period_length.
args.period2 = 48.0

# Regularisation of the logistic regression: L2 penalty when True, L1
# otherwise; C is the value passed as LogisticRegression's C parameter.
args.l2 = True
args.C = 0.001  # optimized if all-all-48-l2

args.output_dir = 'models/output'
# NOTE(review): the reader calls visible in this file use hard-coded relative
# paths and do not read args.data -- confirm whether it is still needed.
args.data = 's3://aws-glue-scripts-271538242229-us-west-2/data/in-hospital-mortality/'

print(args)


# In [None]:

# ---------------------------------------------------------------------------
# Readers: one per split. The validation listfile points into the training
# directory (author's note: by default 20% of the train dataset is left for
# validation data).
# ---------------------------------------------------------------------------
train_reader = read_ts(
    dataset_dir='data/in-hospital-mortality/train/',
    listfile='data/in-hospital-mortality/train_listfile.csv',
    period_length=args.period2,
)
val_reader = read_ts(
    dataset_dir='data/in-hospital-mortality/train/',
    listfile='data/in-hospital-mortality/val_listfile.csv',
    period_length=args.period2,
)
test_reader = read_ts(
    dataset_dir='data/in-hospital-mortality/test/',
    listfile='data/in-hospital-mortality/test_listfile.csv',
    period_length=args.period2,
)

# To run against a local copy instead, set args.data to the local
# in-hospital-mortality directory and build the three readers from
# os.path.join(args.data, 'train' | 'test') and
# os.path.join(args.data, '<split>_listfile.csv') with period_length=48.0.

# ---------------------------------------------------------------------------
# Feature extraction for every split.
# ---------------------------------------------------------------------------
print('Reading data and extracting features ...')
train_X, train_y, train_names = read_and_extract_features(
    train_reader, args.period, args.features)
val_X, val_y, val_names = read_and_extract_features(
    val_reader, args.period, args.features)
test_X, test_y, test_names = read_and_extract_features(
    test_reader, args.period, args.features)
for shape_message, feature_matrix in (
        ('  train data shape = {}', train_X),
        ('  validation data shape = {}', val_X),
        ('  test data shape = {}', test_X)):
    print(shape_message.format(feature_matrix.shape))

# ---------------------------------------------------------------------------
# Preprocessing. Both transformers are fitted on the training split only and
# then applied to all three splits.
# ---------------------------------------------------------------------------
print('Imputing missing values ...')
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (replaced by sklearn.impute.SimpleImputer); this code
# needs an older scikit-learn unless the import is migrated.
imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True).fit(train_X)
# NaNs are replaced by the per-column training mean; results are stored as float32.
train_X = np.array(imputer.transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

print('Normalizing the data to have zero mean and unit variance ...')
scaler = StandardScaler().fit(train_X)
train_X = scaler.transform(train_X)
val_X = scaler.transform(val_X)
test_X = scaler.transform(test_X)

# ---------------------------------------------------------------------------
# Model fitting.
# ---------------------------------------------------------------------------
if args.l2:
    penalty = 'l2'
else:
    penalty = 'l1'
# Stem shared by every output file of this run, e.g. "all.all.l2.C0.001".
file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42).fit(train_X, train_y)

# ---------------------------------------------------------------------------
# Evaluation: metrics for each split are printed by print_metrics_binary and
# written as JSON into <output_dir>/logistic.
# ---------------------------------------------------------------------------
result_dir = os.path.join(args.output_dir, 'logistic')
common_utils.create_directory(result_dir)

# Train and validation metrics are computed from the full predict_proba
# output, exactly as handed back by the classifier.
for json_template, split_y, split_X in (
        ('train_{}.json', train_y, train_X),
        ('val_{}.json', val_y, val_X)):
    metrics_path = os.path.join(result_dir, json_template.format(file_name))
    with open(metrics_path, 'w') as res_file:
        ret = print_metrics_binary(split_y, logreg.predict_proba(split_X))
        # Cast every metric to a plain float so json can serialise it.
        ret = dict((metric, float(value)) for metric, value in ret.items())
        json.dump(ret, res_file)

# For the test split only the second predict_proba column is kept; the same
# vector is used for the metrics and for the saved per-example predictions.
prediction = logreg.predict_proba(test_X)[:, 1]

test_metrics_path = os.path.join(result_dir, 'test_{}.json'.format(file_name))
with open(test_metrics_path, 'w') as res_file:
    ret = print_metrics_binary(test_y, prediction)
    ret = dict((metric, float(value)) for metric, value in ret.items())
    json.dump(ret, res_file)

predictions_path = os.path.join(
    args.output_dir, 'logistic/test_predictions', file_name + '.csv')
save_results(test_names, prediction, test_y, predictions_path)


