In [1]:
import sys
import imp
sys.path.append('/Users/pschulam/Git/mypy')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nips15

from copy import deepcopy

from scipy.optimize import minimize

from sklearn import preprocessing, linear_model, cross_validation
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

from mypy.models import softmax
_ = imp.reload(softmax)

In [3]:
np.set_printoptions(precision=3)
%matplotlib inline

In [4]:
def PatientData(tbl, t, y, x1, x2):
    pd = {}
    pd['ptid'] = int(tbl['ptid'].values[0])
    pd['t'] = tbl[t].values.copy()
    pd['t'] = np.array([]) if np.all(np.isnan(pd['t'])) else pd['t']
    pd['y'] = tbl[y].values.copy()
    pd['y'] = np.array([]) if np.all(np.isnan(pd['y'])) else pd['y']
    pd['x1'] = np.asarray(tbl.loc[:, x1].drop_duplicates()).ravel()
    pd['x2'] = np.asarray(tbl.loc[:, x2].drop_duplicates()).ravel()
    pd['x2'] = np.r_[1.0, pd['x2']]
    return pd
        
def truncated_data(pd, censor_time):
    obs = pd['t'] <= censor_time
    pdc = deepcopy(pd)
    pdc['t'] = pd['t'][obs]
    pdc['y'] = pd['y'][obs]
    return pdc, pd['t'][~obs]

def unpack_data(patient_data):
    t = patient_data['t']
    x1 = patient_data['x1']
    x2 = patient_data['x2']
    y = patient_data['y']
    return t, x1, x2, y

In [5]:
pfvc_spec = {'t' : 'years_seen_full',
             'y' : 'pfvc',
             'x1': ['female', 'afram'],
             'x2': ['female', 'afram', 'aca', 'scl']}

pfvc    = pd.read_csv('data/benchmark_pfvc.csv')
pfvc_pd = [PatientData(tbl, **pfvc_spec) for _, tbl in pfvc.groupby('ptid')]

In [6]:
tss_spec = {'t' : 'years_seen',
            'y' : 'tss',
            'x1': ['female', 'afram'],
            'x2': ['female', 'afram']}

tss = pd.read_csv('data/benchmark_tss.csv')
tss_match = ['ptid'] + tss_spec['x1']
tss = pd.merge(pfvc[tss_match], tss, 'left', tss_match)
tss_pd = [PatientData(tbl, **tss_spec) for _, tbl in tss.groupby('ptid')]

In [7]:
pdlco_spec = {'t' : 'years_seen',
              'y' : 'pdlco',
              'x1': ['female', 'afram'],
              'x2': ['female', 'afram']}

pdlco = pd.read_csv('data/benchmark_pdc.csv')
pdlco_match = ['ptid'] + pdlco_spec['x1']
pdlco = pd.merge(pfvc[pdlco_match], pdlco, 'left', pdlco_match)
pdlco_pd = [PatientData(tbl, **pdlco_spec) for _, tbl in pdlco.groupby('ptid')]

In [8]:
pv1_spec = {'t' : 'years_seen',
            'y' : 'pfev1',
            'x1': ['female', 'afram'],
            'x2': ['female', 'afram']}

pv1 = pd.read_csv('data/benchmark_pv1.csv')
pv1_match = ['ptid'] + pv1_spec['x1']
pv1 = pd.merge(pfvc[pv1_match], pv1, 'left', pv1_match)
pv1_pd = [PatientData(tbl, **pv1_spec) for _, tbl in pv1.groupby('ptid')]

In [9]:
ef_spec = {'t' : 'years_seen',
           'y' : 'ef',
           'x1': ['female', 'afram'],
           'x2': ['female', 'afram']}

ef = pd.read_csv('data/benchmark_ef.csv')
ef_match = ['ptid'] + ef_spec['x1']
ef = pd.merge(pfvc[ef_match], ef, 'left', ef_match)
ef_pd = [PatientData(tbl, **ef_spec) for _, tbl in ef.groupby('ptid')]

In [10]:
all_pd = {
    'pfvc' : pfvc_pd,
    'tss'  : tss_pd,
    'pdlco': pdlco_pd,
    'pv1'  : pv1_pd,
    'ef'   : ef_pd
}

In [11]:
pfvc_model  = nips15.NipsModel.from_directory('models/pfvc')
tss_model   = nips15.NipsModel.from_directory('models/tss')
pdlco_model = nips15.NipsModel.from_directory('models/pdc')
pv1_model   = nips15.NipsModel.from_directory('models/pv1')
ef_model    = nips15.NipsModel.from_directory('models/ef')

In [12]:
all_models = {
    'pfvc' : pfvc_model,
    'tss'  : tss_model,
    'pdlco': pdlco_model,
    'pv1'  : pv1_model,
    'ef'   : ef_model
}

In [13]:
def make_problem(target, aux, censor, sig_censor, feat_censor, patient_data, models):
    P = np.array([models[target].posterior(*unpack_data(d)) for d in patient_data[target]])
    Q = np.array([models[target].posterior(*unpack_data(truncated_data(d, censor)[0])) for d in patient_data[target]])
    
    Q_sig = []
    for marker in aux:
        Qi = np.array([models[marker].posterior(*unpack_data(truncated_data(d, sig_censor)[0]))
                       for d in patient_data[marker]])
        Q_sig.append(Qi)

    Q_feat = []
    for marker in aux:
        Qi = np.array([models[marker].posterior(*unpack_data(truncated_data(d, feat_censor)[0]))
                       for d in patient_data[marker]])
        Q_feat.append(Qi)
    
    S = [check_significance(Qi, np.argmax(P, axis=1), P.shape[1]) for Qi in Q_sig]
    M = np.concatenate(S, axis=1)
    
    X = np.concatenate(Q_feat, axis=1)
    
    return P, Q, X, M

def fit(P, Q, X, M):
    k  = P.shape[1]
    d  = X.shape[1]
    W0 = np.zeros((k, d))
    C  = offset(Q)
    
    def f(w):
        W = w.reshape(W0.shape)
        y = [softmax.regression_ll(x, y, W, c) for x, y, c in zip(X, P, C)]
        return -sum(y)
    
    def g(w):
        W = w.reshape(W0.shape)
        y = [softmax.regression_ll_grad(x, y, W, c) for x, y, c in zip(X, P, C)]
        y = -sum(y)
        y[~M] = 0.0
        return y.ravel()
    
    s = minimize(f, W0.ravel(), jac=g, method='BFGS')
    W = s.x.reshape(W0.shape)
    Qhat = np.array([softmax.regression_proba(x, W, c) for x, c in zip(X, C)])
    
    return Qhat, W, C, s

def fitcv(nfolds, P, Q, X, M):
    Qhat  = np.zeros_like(Q)
    folds = cross_validation.KFold(P.shape[0], nfolds, shuffle=True, random_state=0)
    W     = []
    C     = offset(Q)
    s     = []
    
    for i, (train, test) in enumerate(folds):
        print('Fitting fold {}.'.format(i))
        Qhat_i, Wi, _, si = fit(P[train], Q[train], X[train], M)
        if not si.success:
            print('Warning: optimization did not terminate:\n{}'.format(si.message))
            
        W.append(Wi)
        s.append(si)
        
        Qhat[test] = np.array([softmax.regression_proba(x, Wi, c) for x, c in zip(X[test], C[test])])
        
    return Qhat, W, C, s

def offset(Q):
    return np.log(Q) - np.log(Q[:, 0][:, np.newaxis])

def xentropy(P, Q=None):
    if Q is None:
        Q = P
    return - np.sum(P * np.log(Q))

def check_significance(Q, z, k):
    pvalues     = np.zeros((k, Q.shape[1]))
    significant = np.zeros((k, Q.shape[1]), dtype=np.bool)
    
    for i in range(1, k):
        c1 = est_bernoullis(Q[z == 0])
        c2 = est_bernoullis(Q[z == i])
        d  = c2[0] - c1[0]
        s  = np.sqrt(c1[2] ** 2 + c2[2] ** 2)
        pvalues[i] = np.abs(d / s)
        significant[i] = pvalues[i] >= 2.0

    return significant

def est_bernoullis(Q):
    n = Q.sum()
    p = Q.sum(axis=0) / n
    v = p * (1 - p)
    s = np.sqrt(v / n)
    return p, v, s

In [14]:
full_cheat = make_problem('pfvc', ['tss', 'pdlco', 'pv1', 'ef'], 1.0, float('inf'), float('inf'), all_pd, all_models)

In [15]:
sig_cheat  = make_problem('pfvc', ['tss', 'pdlco', 'pv1', 'ef'], 1.0, float('inf'), 1.0, all_pd, all_models)

In [16]:
feat_cheat = make_problem('pfvc', ['tss', 'pdlco', 'pv1', 'ef'], 1.0, 1.0, float('inf'), all_pd, all_models)

In [17]:
honest     = make_problem('pfvc', ['tss', 'pdlco', 'pv1', 'ef'], 1.0, 1.0, 1.0, all_pd, all_models)

In [195]:
full_cheat_fit = fit(*full_cheat)
xentropy(full_cheat[0], full_cheat_fit[0])

629.49656310072237

In [204]:
full_cheat_fit_cv = fitcv(20, *full_cheat)
xentropy(full_cheat[0], full_cheat_fit_cv[0])

654.56849879937954

In [196]:
sig_cheat_fit  = fit(*sig_cheat)
xentropy(sig_cheat[0], sig_cheat_fit[0])

775.21876480539697

In [31]:
sig_cheat_fit_cv = fitcv(20, *sig_cheat)
xentropy(sig_cheat[0], sig_cheat_fit_cv[0])

Fitting fold 0.
Fitting fold 1.
Fitting fold 2.
Fitting fold 3.
Fitting fold 4.
Fitting fold 5.
Fitting fold 6.
Fitting fold 7.
Fitting fold 8.
Fitting fold 9.
Fitting fold 10.
Fitting fold 11.
Fitting fold 12.
Fitting fold 13.
Fitting fold 14.
Fitting fold 15.
Fitting fold 16.
Fitting fold 17.
Fitting fold 18.
Fitting fold 19.


809.84875809660787

In [205]:
feat_cheat_fit = fit(*feat_cheat)
xentropy(feat_cheat[0], feat_cheat_fit[0])

655.98553066978707

In [34]:
feat_cheat_fit_cv = fitcv(20, *feat_cheat)
xentropy(feat_cheat[0], feat_cheat_fit_cv[0])

Fitting fold 0.
Fitting fold 1.
Desired error not necessarily achieved due to precision loss.
Fitting fold 2.
Fitting fold 3.
Fitting fold 4.
Fitting fold 5.
Fitting fold 6.
Fitting fold 7.
Fitting fold 8.
Fitting fold 9.
Fitting fold 10.
Fitting fold 11.
Fitting fold 12.
Fitting fold 13.
Fitting fold 14.
Fitting fold 15.
Fitting fold 16.
Fitting fold 17.
Fitting fold 18.
Fitting fold 19.


670.50080824503016

In [39]:
feat_cheat_fit_cv[1][1]

array([[  0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
          0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
          0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
          0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00],
       [  0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
          0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
          0.000e+00,   0.000e+00,  -3.959e-01,  -7.424e-01,   0.000e+00,
          0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00],
       [  0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
         -3.964e-01,   0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00,
          0.000e+00,   0.000e+00,  -1.021e+00,   9.089e-01,   0.000e+00,
          0.000e+00,   0.000e+00,   0.000e+00,   0.000e+00],
       [  0.000e+00,   0.000e+00,   0.000e+00,   3.941e-02,   0.000e+00,
         -7.599e-01,   0.000e+00,   5.849e-02,   0.000e+00,   0.000e+00,
          0.00

In [35]:
subtypes = pd.read_csv('benchmark_pfvc_subtypes.csv')
subtypes['subtype'] = np.argmax(feat_cheat_fit_cv[0], axis=1) + 1
subtypes.to_csv('benchmark_pfvc_1y_subtypes_feat_cheat_cv.csv', index=False)

In [29]:
honest_fit     = fit(*honest)
xentropy(honest[0], honest_fit[0])

782.16033097543652

In [30]:
honest_fit_cv  = fitcv(20, *honest)
xentropy(honest[0], honest_fit_cv[0])

Fitting fold 0.
Fitting fold 1.
Fitting fold 2.
Fitting fold 3.
Fitting fold 4.
Fitting fold 5.
Fitting fold 6.
Fitting fold 7.
Fitting fold 8.
Fitting fold 9.
Fitting fold 10.
Fitting fold 11.
Fitting fold 12.
Fitting fold 13.
Fitting fold 14.
Fitting fold 15.
Fitting fold 16.
Fitting fold 17.
Fitting fold 18.
Fitting fold 19.


806.38916857559275

In [41]:
np.round(honest[0], 2)

array([[ 0.  ,  0.  ,  0.81, ...,  0.01,  0.  ,  0.  ],
       [ 0.  ,  0.39,  0.53, ...,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.6 ,  0.38, ...,  0.  ,  0.  ,  0.  ],
       ..., 
       [ 0.  ,  0.  ,  0.02, ...,  0.23,  0.05,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.84,  0.04,  0.09],
       [ 0.  ,  0.  ,  0.01, ...,  0.09,  0.02,  0.  ]])

## Conclusions

Using full information (`full_cheat`) yields the best training and cross-validated loss. The second best is the feature cheating case (`feat_cheat`). Why is this? It appears that knowing the true posterior probabilities is important, but there is another factor to consider: can we estimate the right features to use accurately? It turns our that the `feat_cheat` feature selection is *approximately* a subset of the `sig_cheat` features. In other words, it misses out on the opportunity to use some of the useful features, but does not include very many irrelevant ones. 

In [32]:
confusion_matrix(sig_cheat[-1].ravel(), feat_cheat[-1].ravel())

array([[90,  3],
       [26, 33]])

What can we take away from this? Estimating the subtype memberships correctly is the most important thing. Are there ways to do this outside of simply observing more data? Not that I can think of, so maybe the way to analyze is to simply see how using the online training helps to make predictions accurate more quickly than if we used the PFVC data alone.