## Reading in the data

In [6]:
#!pip install boxsdk

In [1]:
# Import two classes from the boxsdk module - Client and OAuth2
from boxsdk import Client, OAuth2

import pandas as pd
import io
import numpy as np

# Read app info from text file
with open('app.cfg', 'r') as app_cfg:
    CLIENT_ID = app_cfg.readline()
    CLIENT_SECRET = app_cfg.readline()
    ACCESS_TOKEN = app_cfg.readline()

In [2]:
class BOXFile:
 def __init__(self, CLIENT_ID, CLIENT_SECRET, ACCESS_TOKEN):
     self._oauth2 = OAuth2(CLIENT_ID, CLIENT_SECRET, access_token=ACCESS_TOKEN)
     self._client = Client(self._oauth2)
     self._file = None
 
 def GetUserInfo(self):
     my = self._client.user(user_id='me').get()

 def GetFile(self, fileID):
     self._file = self._client.file(fileID)
     return(self._file.content())

In [3]:
boxfile = BOXFile(CLIENT_ID, CLIENT_SECRET, ACCESS_TOKEN)
csvFile = boxfile.GetFile('936748103348').decode("utf-8")
ioFile = io.StringIO(csvFile)

In [4]:
df = pd.read_csv(filepath_or_buffer = ioFile, delimiter = ",")
df.head()

Unnamed: 0,patient_mrn,ctd,age,height,weight,ascites_yn,he,race,sex,smoker,...,sa_thu00,biliarycomplication,location_dc,dayson_ventpost_lt,htn,dm,ckd,state,time,prev_state
0,3156866,0,57,162.6,80.2,2,2,1,2,1,...,-85.5392,1,2,0,4,2,2,3,1,0
1,3156866,0,57,162.6,80.2,2,2,1,2,1,...,-85.5392,1,2,0,4,2,2,3,2,3
2,3156866,0,57,162.6,80.2,2,2,1,2,1,...,-85.5392,1,2,0,4,2,2,3,3,3
3,3156866,0,57,162.6,80.2,2,2,1,2,1,...,-85.5392,1,2,0,4,2,2,3,4,3
4,3156866,0,57,162.6,80.2,2,2,1,2,1,...,-85.5392,1,2,0,4,2,2,3,5,3


In [5]:
df.shape

(27430, 36)

In [6]:
df.isna().sum()

patient_mrn                          0
ctd                                  0
age                                  0
height                               0
weight                               0
ascites_yn                           0
he                                   0
race                                 0
sex                                  0
smoker                               0
cit                                  0
dri                                  0
etiology                             0
meld                                 0
acr                                  0
newmalignancy                        0
surgeryduration                      0
readmissionwithin90d                 0
infection_90                         0
need_repeat_surgery                  0
vascularcomplicationswithin90days    0
sma00                                0
vat00                                0
sat00                                0
s_mhu00                              0
v_fhu00                  

### Data Splitting

Data cleaning, feature selection, imputing missing values and data processing have all been done in R before reading the data into this notebook. The data however, is not split into training and testing or scaled for modeling.

I need to split the data into separate training and testing data sets. Because the data originally was one observation per patient and I want to make sure all observations for each patient are in the same dat set, I will split the data based on patient_mrn.

In [7]:
from sklearn.model_selection import train_test_split

patients = df['patient_mrn'].unique()

train_patients, test_patients = train_test_split(patients, test_size=0.2)

In [8]:
print(train_patients.shape)
print(test_patients.shape)

(337,)
(85,)


In [9]:
train = df[df["patient_mrn"].isin(train_patients)]
test = df[df["patient_mrn"].isin(test_patients)]

print(train.shape)
print(test.shape)

(21905, 36)
(5525, 36)


In [10]:
X_train = train.drop(['state'], axis = 1)
X_test = test.drop(['state'], axis = 1)
y_train = train[['state']]
y_test = test[['state']]

y_test = y_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(21905, 35)
(5525, 35)
(21905, 1)
(5525, 1)


### Data Scaling

Now that I have training and testing data sets, I need to scale the features to be between 0 and 1 in the training set and then use these scaling parameters to scale the test set. 

In [12]:
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
scaler = MinMaxScaler()

# transform features (X)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
y_train.state.value_counts()

1    17282
2     2998
3     1406
4      219
Name: state, dtype: int64

To use the coral ordinal model, the output states must start at 0 and they currently start at a value of 1. I need to subtract 1 from each state.

In [14]:
y_train = (y_train.state - 1)
y_test = (y_test.state - 1)

print(y_train.value_counts())
print(y_test.value_counts())

0    17282
1     2998
2     1406
3      219
Name: state, dtype: int64
0    4453
1     581
2     446
3      45
Name: state, dtype: int64


In [15]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(21905, 35)
(5525, 35)
(21905,)
(5525,)


## Modeling

In [15]:
#!pip install coral-ordinal

In [16]:
import tensorflow as tf
print("Tensorflow version", tf.__version__)

import coral_ordinal as coral
print("CORAL Ordinal version:", coral.__version__)

Tensorflow version 2.7.0
CORAL Ordinal version: 0.1.8


In [17]:
def create_model(num_classes):

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(32, activation = "relu"))
    model.add(coral.CoralOrdinal(num_classes = num_classes))    

    return model

In [18]:
model = create_model(4)
#model.summary()

In [19]:
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.05),
              loss = coral.OrdinalCrossEntropy(num_classes = 4),
              metrics = [coral.MeanAbsoluteErrorLabels()])

In [20]:
history = model.fit(X_train, y_train, epochs = 5,
                    callbacks = [tf.keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Model Evaluation

In [21]:
model.evaluate(X_test, y_test)



[0.3635483384132385, 0.12283237278461456]

In [22]:
# Note that these are ordinal (cumulative) logits, not probabilities or regular logits.
ordinal_logits = model.predict(X_test)

# Convert from logits to label probabilities. This is initially a tensorflow tensor.
tensor_probs = coral.ordinal_softmax(ordinal_logits)

# Convert the tensor into a pandas dataframe.
probs_df = pd.DataFrame(tensor_probs.numpy())

probs_df.shape

(5525, 4)

In [23]:
probs_df.head(10)

Unnamed: 0,0,1,2,3
0,0.868707,0.123182,0.007576,0.000534
1,0.033962,0.359901,0.514737,0.0914
2,0.036111,0.373028,0.504606,0.086255
3,0.03839,0.386201,0.494034,0.081375
4,0.040807,0.399386,0.48306,0.076747
5,0.04337,0.412545,0.471724,0.072362
6,0.046085,0.425638,0.460068,0.068209
7,0.048962,0.438628,0.448133,0.064277
8,0.243075,0.612741,0.133292,0.010893
9,0.254962,0.608522,0.126287,0.010228


In [24]:
y_test.head(10)

0    2
1    2
2    2
3    2
4    2
5    2
6    2
7    1
8    1
9    1
Name: state, dtype: int64

In [25]:
# Evaluate accuracy and mean absolute error
labels_v1 = probs_df.idxmax(axis = 1)
print("Accuracy of label version 1:", np.mean(labels_v1 == y_test))

Accuracy of label version 1: 0.8971945701357467


In [26]:
from scipy import special

# Compare to logit-based cumulative probs
cum_probs = pd.DataFrame(ordinal_logits).apply(special.expit)
# Calculate the labels using the style of Cao et al.
labels_v2 = cum_probs.apply(lambda x: x > 0.5).sum(axis = 1)
print("Accuracy of label version 2:", np.mean(labels_v2 == y_test))

Accuracy of label version 2: 0.9008144796380091


In [27]:
# These do not correspond with what we get from the model evaluation. Something must be off in one of these.
print("Mean absolute label error version 1:", np.mean(np.abs(labels_v1 - y_test)))
print("Mean absolute label error version 2:", np.mean(np.abs(labels_v2 - y_test)))

print("Root mean squared label error version 1:", np.sqrt(np.mean(np.square(labels_v1 - y_test))))
print("Root mean squared label error version 2:", np.sqrt(np.mean(np.square(labels_v2 - y_test))))

Mean absolute label error version 1: 0.12669683257918551
Mean absolute label error version 2: 0.12307692307692308
Root mean squared label error version 1: 0.41814068082318423
Root mean squared label error version 2: 0.41378946271867967


In [28]:
# Review how absolute error is calculated for ordinal labels:
pd.DataFrame({"true": y_test, "pred_v2": labels_v1, "abs": labels_v2 - y_test}).head()

Unnamed: 0,true,pred_v2,abs
0,2,0,-2
1,2,2,0
2,2,2,0
3,2,2,0
4,2,2,0
