In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import os

from sklearn.feature_extraction import DictVectorizer
from sklearn.externals import joblib

In [2]:
# Directory that holds the per-split CSV files, relative to the notebook's
# working directory.
data_path = '../data/'

# Load the three splits in one pass. Each file is named
# '<split>_state_action_reward_df.csv' inside data_path.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(data_path, split + '_state_action_reward_df.csv'))
    for split in ('train', 'val', 'test')
)

In [3]:
# Columns selected from each split as model inputs. The order is kept exactly
# as originally written.
# NOTE(review): both 'Glucose' and 'GLUCOSE' are listed; presumably they are two
# distinct source columns rather than a typo -- confirm against the CSV schema.
feature_names = [
    # presumably lab and vital-sign measurements -- TODO confirm
    'ALBUMIN', 'ANION GAP', 'BANDS', 'BICARBONATE', 'BILIRUBIN', 'BUN',
    'CHLORIDE', 'CREATININE', 'DiasBP', 'Glucose', 'GLUCOSE', 'HeartRate',
    'HEMATOCRIT', 'HEMOGLOBIN', 'INR', 'LACTATE', 'MeanBP', 'PaCO2',
    'PLATELET', 'POTASSIUM', 'PT', 'PTT', 'RespRate', 'SODIUM', 'SpO2',
    'SysBP', 'TempC', 'WBC',
    # presumably patient demographics -- TODO confirm
    'age', 'is_male',
    'race_white', 'race_black', 'race_hispanic', 'race_other',
    'height', 'weight',
    # presumably ventilation flag and severity / comorbidity scores -- TODO confirm
    'vent', 'sofa', 'lods', 'sirs',
    'qsofa', 'qsofa_sysbp_score', 'qsofa_gcs_score', 'qsofa_resprate_score',
    'elixhauser_hospital',
    'blood_culture_positive',
]

In [4]:
# Keep only the model-input columns of every split, in feature_names order.
# A column missing from any frame raises KeyError here.
feature_df_train, feature_df_val, feature_df_test = (
    split_df[feature_names] for split_df in (train_df, val_df, test_df)
)

In [5]:
# Turn every split into a list of {column name: value} dicts, one per row,
# which is the input format DictVectorizer consumes.
feature_dict_train, feature_dict_val, feature_dict_test = (
    frame.to_dict('records')
    for frame in (feature_df_train, feature_df_val, feature_df_test)
)

# sparse=False makes the vectorizer return dense arrays.
v = DictVectorizer(sparse=False)

# The feature mapping is learned from the training records only; validation
# and test records are encoded with that same fitted mapping.
X_train = v.fit_transform(feature_dict_train)
X_val = v.transform(feature_dict_val)
X_test = v.transform(feature_dict_test)

In [6]:
# Per-row reward for each split, taken from the 'reward' column.
reward_train = train_df['reward'].values
reward_val = val_df['reward'].values
reward_test = test_df['reward'].values

# Per-row action for each split, taken from the 'discrete_action' column.
action_train = train_df['discrete_action'].values
action_val = val_df['discrete_action'].values
action_test = test_df['discrete_action'].values

# Row identifiers of the current state ('row_id') and of the following state
# ('row_id_next'), converted element by element to plain Python ints.
# int() raises ValueError on NaN, so a missing identifier fails loudly here.
state_row_id_train = list(map(int, train_df['row_id'].values))
next_state_row_id_train = list(map(int, train_df['row_id_next'].values))

state_row_id_val = list(map(int, val_df['row_id'].values))
next_state_row_id_val = list(map(int, val_df['row_id_next'].values))

state_row_id_test = list(map(int, test_df['row_id'].values))
next_state_row_id_test = list(map(int, test_df['row_id_next'].values))

In [7]:
# Bundle everything downstream code needs into one dictionary:
#   output_dict[<split>] -> X (encoded features), action, reward,
#                           state_id, next_state_id
#   output_dict['v']     -> the fitted DictVectorizer
output_dict = {
    'train': dict(
        X=X_train,
        action=action_train,
        reward=reward_train,
        state_id=state_row_id_train,
        next_state_id=next_state_row_id_train,
    ),
    'val': dict(
        X=X_val,
        action=action_val,
        reward=reward_val,
        state_id=state_row_id_val,
        next_state_id=next_state_row_id_val,
    ),
    'test': dict(
        X=X_test,
        action=action_test,
        reward=reward_test,
        state_id=state_row_id_test,
        next_state_id=next_state_row_id_test,
    ),
}
output_dict['v'] = v

# NOTE(review): `joblib` is imported from `sklearn.externals` in the import
# cell; that alias was deprecated in scikit-learn 0.21 and removed in 0.23, so
# on newer versions the import needs to become `import joblib`.
# Left as the cell's last expression so the list of written file names is
# displayed as the cell output.
joblib.dump(output_dict, os.path.join(data_path, 'data_dict.pkl'))

['../data/data_dict.pkl']