In [1]:
"""
EXAMPLE SCRIPT TO SHOW HOW TO LOAD AND RUN A GPYTORCH MODEL

Assumed folder structure:
On my HPF, the TOP_LEVEL_FOLDER == see /hpf/largeprojects/agoldenb/edrysdale/ED/

TOP_LEVEL_FOLDER
---CensusFlow
------{all the python scripts, etc}
---output
------flow
---------test
------------{date}
---------------*.csv [result output]
---------------pt
------------------[saved model weights]

For example you can download the most recent output and .pt files here:
/hpf/largeprojects/agoldenb/edrysdale/ED/output/flow/test/2021_01_11

python padmanie/ex_run_mdl.py --lead 10 --mdl_date 2020-09-07 --groups mds arr CTAS
"""
"""
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lead', type=int, default=1, help='Which lead of the data to predict?')
parser.add_argument('--mdl_date', type=str, default='2020-10-01', help='Which model date to use?')
parser.add_argument('--groups', nargs='+',
                    help='Which kernel groups to include? (mds, health, demo, language, CTAS, arr, labs, DI)')
args = parser.parse_args()
print(args)
lead, mdl_date = args.lead, args.mdl_date
groups = None
if hasattr(args, 'groups'):
    groups = args.groups
    
"""

"\nimport argparse\n\nparser = argparse.ArgumentParser()\nparser.add_argument('--lead', type=int, default=1, help='Which lead of the data to predict?')\nparser.add_argument('--mdl_date', type=str, default='2020-10-01', help='Which model date to use?')\nparser.add_argument('--groups', nargs='+',\n                    help='Which kernel groups to include? (mds, health, demo, language, CTAS, arr, labs, DI)')\nargs = parser.parse_args()\nprint(args)\nlead, mdl_date = args.lead, args.mdl_date\ngroups = None\nif hasattr(args, 'groups'):\n    groups = args.groups\n    \n"

In [17]:
lead = 10
mdl_date = "2020-09-07"
groups = ['mds', 'arr', 'CTAS']

In [5]:
from funs_support import find_dir_olu
import numpy as np
import pandas as pd
import os
from time import time
from mdls.gpy import mdl
import torch
import gpytorch

use_cuda = torch.cuda.is_available()
sdev = "cuda" if use_cuda else "cpu"
print('Using device: %s' % sdev)
device = torch.device(sdev)

# Find the top level folder (modify this function to add yours)
dir_olu = find_dir_olu()
print(dir_olu)
dir_output = os.path.join(dir_olu, 'output')
dir_flow = os.path.join(dir_output, 'flow')
dir_test = os.path.join(dir_flow, 'test')
lst_dir = [dir_output, dir_flow, dir_test]
assert all([os.path.exists(path) for path in lst_dir])

Using device: cpu


In [18]:
# Find the most recent date
fn_test = pd.Series(os.listdir(dir_test))
fn_test = fn_test[fn_test.str.contains('^[0-9]{4}')].reset_index(None,True)
fn_test = fn_test[fn_test.str.contains('[0-9]{2}$')].reset_index(None, True)
fn_test = pd.to_datetime(fn_test.str.replace('\\_', '-'))
fn_test = fn_test[fn_test.idxmax()]
print('Most recent model rune date is: %s' % fn_test.strftime('%b %d, %Y'))
dir_mdl = os.path.join(dir_test, fn_test.strftime('%Y_%m_%d'))
dir_pt = os.path.join(dir_mdl, 'pt')

Most recent model rune date is: Jan 11, 2021


In [11]:
print('# --- STEP 1: LOAD DATA --- #')
idx = pd.IndexSlice
df_lead_lags = pd.read_csv(os.path.join(dir_flow, 'df_lead_lags.csv'), header=[0, 1], index_col=[0, 1, 2, 3])
# Create dates
dates = df_lead_lags.index.to_frame().astype(str).assign(
    date=lambda x: pd.to_datetime(x.year + '-' + x.month + '-' + x.day + ' ' + x.hour + ':00:00')).date
# Extract y
yval = df_lead_lags.loc[:, idx[:, 'lead_' + str(lead)]].values.flatten()
# Remove lags (GP handles them automatically in the kernel)
Xmat = df_lead_lags.loc[:, idx[:, 'lag_0']].droplevel(1, 1)
cn = list(Xmat.columns)
Xmat = Xmat.values
# Extract date features (remove year/month)
tmat = dates.index.droplevel([0, 1]).to_frame(False).reset_index().rename(columns={'index': 'trend'})
Xmat = np.hstack([tmat.values, Xmat])
cn = list('date_' + tmat.columns) + cn
assert len(cn) == Xmat.shape[1]

# --- STEP 1: LOAD DATA --- #


In [19]:
print('# --- STEP 2: LOAD MODEL --- #')
mdl_date = pd.to_datetime(pd.Series(mdl_date))[0]
fn_pt = pd.Series(os.listdir(dir_pt))
fn_pt = fn_pt[fn_pt.str.contains('lead_' + str(lead))].reset_index(None, True)
date_pt = pd.to_datetime(fn_pt.str.split('day_', 1, True).iloc[:, 1].str.replace('.pth', ''), format='%Y%m%d')
idx_pt = date_pt[date_pt == mdl_date].index[0]
assert idx_pt is not None  # Ensure model date exists in that folder
path_pt = os.path.join(dir_pt, fn_pt[idx_pt])

# --- STEP 2: LOAD MODEL --- #


In [20]:
# Initialize model. Valid groups: mds, health, demo, language, CTAS, arr, labs, DI
gp = mdl(model='gpy', lead=lead, cn=cn, device=device, groups=groups)
# Fit the model to the data (to create X,y data to condition on for inference time)
# I'm using the first 72 hours here
gp.fit(X=Xmat[:72], y=yval[:72], ntrain=2, nval=1)
gp.gp.load_state_dict(torch.load(path_pt, map_location=device), strict=True) #PMedit: added "strict=False" after model loading error

print('# --- STEP 3: MAKE PREDICTIONS --- #')

gp.gp.float()
gp.gp.eval()
gp.likelihood.eval()
gp.istrained = True
# Using the next 24 hours
print(gp.predict(X=Xmat[72:96], y=yval[72:96]).head(24))

# --- STEP 3: MAKE PREDICTIONS --- #
Test set R2: -2.365
     y         mu         se
0   27  16.584139   3.031407
1   34  21.184352   4.054373
2   43  23.281337   5.175459
3   46  25.097243   6.351348
4   45  25.811464   7.656909
5   42  25.209005   8.967882
6   40  20.263680  10.196408
7   45  12.208421  11.339334
8   48   6.931762  12.469951
9   52   4.834904  13.612908
10  56   5.693100  14.778818
11  61   9.403973  15.917349
12  62  17.004588  17.091307
13  59  23.848298  18.286676
14  52  25.428328  19.473678
15  46  23.120354  20.630048
16  37  23.255419  21.778945
17  27  26.151525  22.977863
18  26  26.175567  24.164627
19  21  21.862673  25.322817
20  17  17.032424  26.518695
21  17  24.891339  27.710641
22  14  12.964243  28.921411
23  19   6.030648  29.847370


In [22]:
import lime
import lime.lime_tabular
from sklearn.model_selection import train_test_split

explainer = lime.lime_tabular.LimeTabularExplainer(Xmat[:72],  
mode='regression',training_labels=yval[:72],feature_names=cn)

# asking for explanation for LIME model
i = 2
exp = explainer.explain_instance(Xmat[i,:], gp.predict_arr, num_features=5)

AttributeError: 'mdl' object has no attribute 'predict_arr'

In [15]:
groups

['mds', 'arr', 'CTAS']

In [None]:
exp.show_in_notebook()


In [3]:
# asking for explanation for LIME model
for i in range(0,23):
    print(f"Explaining prediction # {i}")
    exp = explainer.explain_instance(Xmat[i,:], gp.predict_arr, num_features=5)
    exp.show_in_notebook()

Explaining prediction # 0


NameError: name 'explainer' is not defined

In [21]:
Xmat.shape()

TypeError: 'tuple' object is not callable

In [22]:
len(Xmat)

18227

In [23]:
len(cn)

164

In [24]:
cn

['date_trend',
 'date_day',
 'date_hour',
 'census_max',
 'census_var',
 'tt_arrived',
 'tt_discharged',
 'avgmd_arrived',
 'avgmd_discharged',
 'u_mds10h',
 'age_arrived',
 'age_discharged',
 'diastolic_arrived',
 'diastolic_discharged',
 'num_meds_arrived',
 'num_meds_discharged',
 'pulse_arrived',
 'pulse_discharged',
 'resp_arrived',
 'resp_discharged',
 'ret72_arrived',
 'ret72_discharged',
 'systolic_arrived',
 'systolic_discharged',
 'temp_arrived',
 'temp_discharged',
 'weight_arrived',
 'weight_discharged',
 'sex_F_arrived',
 'sex_F_discharged',
 'sex_M_arrived',
 'sex_M_discharged',
 'language_Amharic_arrived',
 'language_Amharic_discharged',
 'language_Arabic_arrived',
 'language_Arabic_discharged',
 'language_Bengali_arrived',
 'language_Bengali_discharged',
 'language_Chinese - Cantonese_arrived',
 'language_Chinese - Cantonese_discharged',
 'language_Chinese - Mandarin_arrived',
 'language_Chinese - Mandarin_discharged',
 'language_Dari_arrived',
 'language_Dari_discharge