In [1]:
import sys
sys.path.append('/Users/pschulam/Git/mypy')

In [2]:
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nips15
import adjustment

np.set_printoptions(precision=4)
%matplotlib inline

In [3]:
folds_dir = 'models/jmlr/folds'

def load_model(marker, fold, folds_dir=folds_dir):
    param_dir = os.path.join(folds_dir, marker, '{:02d}'.format(fold), 'param')
    return nips15.NipsModel.from_directory(param_dir)

def get_posteriors(model, data):
    P = [model.posterior(*d.unpack()) for d in data]
    P = np.array(P)
    return P

In [4]:
pfvc_spec = {'t' : 'years_seen_full',
             'y' : 'pfvc',
             'x1': ['female', 'afram'],
             'x2': ['female', 'afram', 'aca', 'scl']}

pfvc    = pd.read_csv('data/benchmark_pfvc.csv')
pfvc_pd = [nips15.PatientData.from_tbl(tbl, **pfvc_spec) for _, tbl in pfvc.groupby('ptid')]

tss_spec = {'t' : 'years_seen',
            'y' : 'tss',
            'x1': ['female', 'afram'],
            'x2': ['female', 'afram']}

tss = pd.read_csv('data/benchmark_tss.csv')
tss_match = ['ptid'] + tss_spec['x1']
tss = pd.merge(pfvc[tss_match], tss, 'left', tss_match)
tss_pd = [nips15.PatientData.from_tbl(tbl, **tss_spec) for _, tbl in tss.groupby('ptid')]

pdlco_spec = {'t' : 'years_seen',
              'y' : 'pdlco',
              'x1': ['female', 'afram'],
              'x2': ['female', 'afram']}

pdlco = pd.read_csv('data/benchmark_pdc.csv')
pdlco_match = ['ptid'] + pdlco_spec['x1']
pdlco = pd.merge(pfvc[pdlco_match], pdlco, 'left', pdlco_match)
pdlco_pd = [nips15.PatientData.from_tbl(tbl, **pdlco_spec) for _, tbl in pdlco.groupby('ptid')]

pv1_spec = {'t' : 'years_seen',
            'y' : 'pfev1',
            'x1': ['female', 'afram'],
            'x2': ['female', 'afram']}

pv1 = pd.read_csv('data/benchmark_pv1.csv')
pv1_match = ['ptid'] + pv1_spec['x1']
pv1 = pd.merge(pfvc[pv1_match], pv1, 'left', pv1_match)
pv1_pd = [nips15.PatientData.from_tbl(tbl, **pv1_spec) for _, tbl in pv1.groupby('ptid')]

combined_pd = list(zip(pfvc_pd, tss_pd, pdlco_pd, pv1_pd))

In [5]:
folds = pfvc.loc[:, ['ptid', 'fold']].drop_duplicates()
folds = dict(zip(folds['ptid'], folds['fold']))

In [11]:
def run_experiment(targ, aux, censor_time, fold,
                   patient_data, model_names, folds,
                   feat_censor=None, feat_alpha=0.05):
    
    k = fold
    models = [load_model(n, k) for n in model_names]
    
    feat_censor = censor_time if feat_censor is None else feat_censor
    
    train = [pd for pd in patient_data if not folds[pd[0].ptid] == k]
    test  = [pd for pd in patient_data if     folds[pd[0].ptid] == k]
    
    P = get_posteriors(models[targ], [pd[targ] for pd in train])
    Q = get_posteriors(models[targ], [pd[targ].truncate(censor_time) for pd in train])
    
    XX = []
    MM = []
    for i in aux:
        X = get_posteriors(models[i], [pd[i].truncate(feat_censor) for pd in train])
        M = adjustment.choose_features(P, X, feat_alpha)
        XX.append(X)
        MM.append(M)
        
    X = np.concatenate(XX, axis=1)
    M = np.concatenate(MM, axis=1)
    W = adjustment.fit_adjustment(P, Q, X, M)
    
    Ptest = get_posteriors(models[targ], [pd[targ] for pd in test])
    Qtest = get_posteriors(models[targ], [pd[targ].truncate(censor_time) for pd in test])
    
    XXtest = []
    for i in aux:
        X = get_posteriors(models[i], [pd[i].truncate(feat_censor) for pd in test])
        XXtest.append(X)
        
    Xtest = np.concatenate(XXtest, axis=1)
    Qhat  = adjustment.make_adjustment(W, Qtest, Xtest)
    
    entropy = adjustment.xentropy(Ptest) / Ptest.shape[0]
    entropy_orig = adjustment.xentropy(Ptest, Qtest) / Ptest.shape[0]
    entropy_adju = adjustment.xentropy(Ptest, Qhat)  / Ptest.shape[0]
    
    results = {
        'censor_time' : censor_time,
        'fold'        : fold,
        'entropy'     : entropy,
        'entropy_orig': entropy_orig,
        'entropy_adju': entropy_adju
    }
    
    return results

In [12]:
experiments = itertools.product([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

results = [run_experiment(0, [1, 2], c, k, combined_pd, ['pfvc', 'tss', 'pdc'], folds, feat_alpha=0.01)
           for c, k in experiments]

In [13]:
results_tbl = pd.DataFrame(results)

In [14]:
results_tbl.groupby('censor_time').agg(np.sum)

Unnamed: 0_level_0,entropy,entropy_adju,entropy_orig,fold
censor_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.423415,12.882076,12.833973,55
2,7.423415,11.268248,11.247051,55
3,7.423415,10.387614,10.35633,55
4,7.423415,9.489795,9.456062,55
5,7.423415,9.060934,9.013514,55
6,7.423415,8.720835,8.694756,55
7,7.423415,8.359612,8.33557,55
8,7.423415,8.15186,8.145246,55


In [141]:
experiments = itertools.product([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

results2 = [run_experiment(0, [1, 2, 3], c, k, combined_pd, ['pfvc', 'tss', 'pdc', 'pv1'], folds, feat_alpha=0.01)
            for c, k in experiments]

In [142]:
results2_tbl = pd.DataFrame(results2)

In [143]:
results2_tbl.groupby('censor_time').agg(np.sum)

Unnamed: 0_level_0,entropy,entropy_adju,entropy_orig,fold
censor_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,498.215601,872.328424,863.434045,55
2,498.215601,765.460771,758.171137,55
3,498.215601,705.86804,697.90272,55
4,498.215601,646.692868,636.156694,55
5,498.215601,617.070104,606.224536,55
6,498.215601,589.690825,584.705553,55
7,498.215601,563.717918,561.055252,55
8,498.215601,549.18276,547.704555,55


In [36]:
from scipy.optimize import minimize
from sklearn.cross_validation import KFold

def run_experiment2(targ, aux, censor_time, fold, patient_data, model_names, folds):
    k = fold
    models = [load_model(n, k) for n in model_names]

    train = [pd for pd in patient_data if not folds[pd[0].ptid] == k]
    test  = [pd for pd in patient_data if     folds[pd[0].ptid] == k]
    
    P = get_posteriors(models[targ], [pd[targ] for pd in train])
    Q = get_posteriors(models[targ], [pd[targ].truncate(censor_time) for pd in train])

    XX = []
    for i in aux:
        print('Making features for aux {}'.format(i))
        X = get_posteriors(models[i], [pd[i].truncate(censor_time) for pd in train])
        X = np.c_[ np.ones(X.shape[0]), X[:, 1:] ]
        XX.append(X)
        
    QQdev = [np.zeros_like(Q) for _ in XX]
    dev_folds = KFold(P.shape[0], 4, shuffle=True, random_state=0)
    for i, (dtrain, dtest) in enumerate(dev_folds):
        print('Training dev fold {}'.format(i))
        for m, X in enumerate(XX):
            print('Training marker {}'.format(m))
            W = adjustment.fit_adjustment2(P[dtrain], X[dtrain])
            QQdev[m][dtest] = adjustment.make_adjustment2(W, X[dtest])
            
    weight_attempts   = [interpolate(P, [Q] + QQdev, s) for s in range(100)]
    weight_preplexity = [perplexity(P, [Q] + QQdev, np.log(w)) for w in weight_attempts]
    weights = min(zip(weight_preplexity, weight_attempts))[1]
    
    WW = []
    for i, X in enumerate(XX):
        print('Training marker {} for real!'.format(i))
        W = adjustment.fit_adjustment2(P, X)
        WW.append(W)
        
    Ptest = get_posteriors(models[targ], [pd[targ] for pd in test])
    Qtest = get_posteriors(models[targ], [pd[targ].truncate(censor_time) for pd in test])
    
    XXtest = []
    for i in aux:
        X = get_posteriors(models[i], [pd[i].truncate(censor_time) for pd in test])
        X = np.c_[ np.ones(X.shape[0]), X[:, 1:] ]
        XXtest.append(X)
        
    QQtest = []
    for i, X in enumerate(XXtest):
        Qi = adjustment.make_adjustment2(WW[i], X)
        QQtest.append(Qi)
        
    Qhat = mixture([Qtest] + QQtest, np.log(weights))
    
    entropy = adjustment.xentropy(Ptest) / Ptest.shape[0]
    entropy_orig = adjustment.xentropy(Ptest, Qtest) / Ptest.shape[0]
    entropy_adju = adjustment.xentropy(Ptest, Qhat)  / Ptest.shape[0]
    
    results = {
        'fold'        : fold,
        'censor_time' : censor_time,
        'entropy'     : entropy,
        'entropy_orig': entropy_orig,
        'entropy_adju': entropy_adju,
        'Ptest'       : Ptest,
        'Qtest'       : Qtest,
        'QQtest'      : QQtest,
        'weights'     : weights
    }
    
    return results


def interpolate(P, QQ, seed=1):
    rnd = np.random.RandomState(seed)
    
    M  = len(QQ)
    w  = rnd.uniform(size=M)
    w /= w.sum()
    v  = np.log(w)
    l  = 0.0
    
    obj = lambda x: interpolate_obj(P, QQ, x[:-1], x[-1])
    jac = lambda x: interpolate_jac(P, QQ, x[:-1], x[-1])
    sol = minimize(obj, np.r_[v, l], jac=jac, method='BFGS')
    
    w = np.exp(sol.x[:-1])
    
    return w
    

def interpolate_obj(P, QQ, v, l):
    return perplexity(P, QQ, v) + l * (1 - np.exp(v).sum())


def interpolate_jac(P, QQ, v, l):
    g = np.zeros(v.size + 1)
    g[:-1] = perplexity_jac(P, QQ, v)
    g[-1]  = 1 - np.exp(v).sum()
    return g


def perplexity(P, QQ, v):
    Q = mixture(QQ, v)
    return - np.sum(P * np.log(Q))


def perplexity_jac(P, QQ, v):
    Q = mixture(QQ, v)
    g = np.zeros_like(v)
    
    for i, vi in enumerate(v):
        wi   = np.exp(vi)
        g[i] = wi * np.sum(P * QQ[i] / Q)
        
    return g


def mixture(QQ, v):
    w = np.exp(v)
    Q = np.zeros_like(QQ[0])
    for wi, Qi in zip(w, QQ):
        Q += wi * Qi
    return Q

In [37]:
r = run_experiment2(0, [1, 2], 1.0, 1, combined_pd, ['pfvc', 'tss', 'pdc'], folds)

Making features for aux 1
Making features for aux 2
Training dev fold 0
Training marker 0
Training marker 1
Training dev fold 1
Training marker 0
Training marker 1
Training dev fold 2
Training marker 0
Training marker 1
Training dev fold 3
Training marker 0
Training marker 1
Training marker 0 for real!
Training marker 1 for real!


In [39]:
adjustment.xentropy(r['Ptest'])

60.375734267340718

In [40]:
adjustment.xentropy

(75, 8)

In [50]:
np.round(r['Ptest'], 2)

array([[ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.99,  0.01],
       [ 0.  ,  0.  ,  0.03,  0.95,  0.  ,  0.01,  0.01,  0.  ],
       [ 0.  ,  0.01,  0.29,  0.62,  0.04,  0.01,  0.02,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.17,  0.25,  0.58,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.1 ,  0.31,  0.58],
       [ 0.  ,  0.11,  0.85,  0.04,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.01,  0.84,  0.06,  0.08,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.02,  0.62,  0.35,  0.01,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.01,  0.02,  0.89,  0.04,  0.05,  0.  ],
       [ 0.  ,  0.02,  0.59,  0.38,  0.01,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.02,  0.32,  0.28,  0.3 ,  0.08,  0.  ],
       [ 1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.42,  0.47,  0.1 ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.09,  0.  ,  0.01,  0.89,  0.  ],
       [ 0.  ,  0.  ,  0.

In [54]:
np.round(r['QQtest'][0], 2)

array([[ 0.16,  0.31,  0.15,  0.14,  0.08,  0.04,  0.09,  0.03],
       [ 0.09,  0.23,  0.14,  0.15,  0.12,  0.07,  0.13,  0.05],
       [ 0.09,  0.29,  0.14,  0.15,  0.11,  0.06,  0.11,  0.05],
       [ 0.08,  0.16,  0.14,  0.14,  0.15,  0.1 ,  0.17,  0.05],
       [ 0.  ,  0.12,  0.05,  0.11,  0.18,  0.27,  0.12,  0.15],
       [ 0.09,  0.23,  0.14,  0.15,  0.12,  0.07,  0.13,  0.05],
       [ 0.09,  0.23,  0.14,  0.15,  0.12,  0.07,  0.13,  0.05],
       [ 0.11,  0.3 ,  0.14,  0.15,  0.1 ,  0.05,  0.1 ,  0.04],
       [ 0.09,  0.21,  0.13,  0.12,  0.16,  0.1 ,  0.14,  0.04],
       [ 0.03,  0.2 ,  0.1 ,  0.14,  0.16,  0.14,  0.14,  0.09],
       [ 0.09,  0.23,  0.14,  0.15,  0.12,  0.07,  0.13,  0.05],
       [ 0.09,  0.23,  0.14,  0.15,  0.12,  0.07,  0.13,  0.05],
       [ 0.09,  0.23,  0.14,  0.15,  0.12,  0.07,  0.13,  0.05],
       [ 0.08,  0.29,  0.13,  0.15,  0.11,  0.07,  0.11,  0.05],
       [ 0.04,  0.22,  0.12,  0.15,  0.14,  0.11,  0.14,  0.07],
       [ 0.01,  0.12,  0.

In [48]:
(P * np.log(P)).sum()

-60.375734267340718