In [1]:
import os
import sys

# Directory added to the import path for the modules imported in the next cell
# (presumably where nips15/online/loglin live -- TODO confirm).
# Set the MYPY_DIR environment variable to override the original author's
# machine-specific default without editing the notebook.
MYPY_DIR = os.environ.get('MYPY_DIR', '/Users/pschulam/Git/mypy')

# Guard against growing sys.path with duplicates when this cell is re-run.
if MYPY_DIR not in sys.path:
    sys.path.append(MYPY_DIR)

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nips15
import online
import loglin
%matplotlib inline

In [3]:
# Root directory under which per-marker, per-fold model parameters are stored.
folds_dir = 'models/jmlr/folds'


def load_model(marker, fold, folds_dir=folds_dir):
    """Load the saved model for one marker and one fold.

    The parameters are read from
    ``<folds_dir>/<marker>/<fold as two zero-padded digits>/param``.

    Args:
        marker: Name of the marker sub-directory (e.g. ``'pfvc'``).
        fold: Integer fold number; formatted with ``'{:02d}'``.
        folds_dir: Root directory of the saved folds. Defaults to the
            module-level ``folds_dir`` as it was when this cell ran.

    Returns:
        Whatever ``nips15.NipsModel.from_directory`` returns for that directory.
    """
    fold_label = '{:02d}'.format(fold)
    param_dir = os.path.join(folds_dir, marker, fold_label, 'param')
    model = nips15.NipsModel.from_directory(param_dir)
    return model

In [4]:
# Covariate column groups shared by the marker specifications below.
demographic = ['female', 'afram']
molecular = ['aca', 'scl']


def _load_aux_marker(csv_path, spec, anchor):
    """Read one auxiliary marker table and align it to the anchor table's patients.

    Args:
        csv_path: Path of the marker CSV file.
        spec: Keyword arguments forwarded to ``nips15.PatientData.from_tbl``;
            ``spec['x1']`` also supplies the covariate columns used as merge keys.
        anchor: DataFrame whose ``ptid`` + ``spec['x1']`` columns form the left
            side of the merge (the PFVC table in this notebook).

    Returns:
        A ``(match, merged, patients)`` tuple: the list of merge-key columns, the
        de-duplicated left-merged DataFrame, and one ``PatientData`` per ``ptid``
        group of that DataFrame.
    """
    match = ['ptid'] + spec['x1']
    raw = pd.read_csv(csv_path)
    # Left merge: every key row of the anchor is kept, so key combinations that
    # are absent from the marker file still appear (with missing marker values).
    # The anchor can repeat a patient's keys, hence the drop_duplicates().
    merged = pd.merge(anchor[match], raw, how='left', on=match).drop_duplicates()
    patients = [nips15.PatientData.from_tbl(tbl, **spec) for _, tbl in merged.groupby('ptid')]
    return match, merged, patients


# PFVC is loaded directly and acts as the anchor for every other marker.
pfvc_spec = {'t':'years_seen_full', 'y':'pfvc', 'x1':demographic, 'x2':demographic + molecular}
pfvc = pd.read_csv('data/benchmark_pfvc.csv')
pfvc_pd = [nips15.PatientData.from_tbl(tbl, **pfvc_spec) for _, tbl in pfvc.groupby('ptid')]

tss_spec = {'t':'years_seen', 'y':'tss', 'x1':demographic, 'x2':demographic}
tss_match, tss, tss_pd = _load_aux_marker('data/benchmark_tss.csv', tss_spec, pfvc)

pdlco_spec = {'t':'years_seen', 'y':'pdlco', 'x1':demographic, 'x2':demographic}
pdlco_match, pdlco, pdlco_pd = _load_aux_marker('data/benchmark_pdc.csv', pdlco_spec, pfvc)

pv1_spec = {'t':'years_seen', 'y':'pfev1', 'x1':demographic, 'x2':demographic}
pv1_match, pv1, pv1_pd = _load_aux_marker('data/benchmark_pv1.csv', pv1_spec, pfvc)

sp_spec = {'t':'years_seen', 'y':'rvsp', 'x1':demographic, 'x2':demographic}
sp_match, sp, sp_pd = _load_aux_marker('data/benchmark_sp.csv', sp_spec, pfvc)

In [26]:
def get_ptids(patients):
    """Return the ``ptid`` attribute of each element of ``patients``, in order."""
    # The parameter is deliberately not called `pd`, which would shadow pandas.
    return [p.ptid for p in patients]


def _frame_by_ptid(patients, column):
    """Build a single-column DataFrame holding ``patients``, indexed by their ``ptid``."""
    return pd.DataFrame({'ptid': get_ptids(patients), column: patients}).set_index('ptid')


# One frame per marker; each cell holds that patient's PatientData object.
pfvc_df   = _frame_by_ptid(pfvc_pd,  'pfvc')
tss_df    = _frame_by_ptid(tss_pd,   'tss')
pdlco_df  = _frame_by_ptid(pdlco_pd, 'pdlco')
pv1_df    = _frame_by_ptid(pv1_pd,   'pv1')
sp_df     = _frame_by_ptid(sp_pd,    'rvsp')

In [6]:
# Distinct (ptid, fold) pairs from the PFVC table, indexed by ptid.
fold_columns = ['ptid', 'fold']
folds_df = pfvc[fold_columns].drop_duplicates().set_index('ptid')

In [7]:
# Column-wise join on the ptid index; join='inner' keeps only the ptids that
# are present in every one of the frames.
per_patient_frames = [folds_df, pfvc_df, tss_df, pdlco_df, pv1_df, sp_df]
patient_data = pd.concat(per_patient_frames, axis=1, join='inner')

In [8]:
def make_examples(patient_data, col_names, models, censor_time, aux_censor=None):
    """Build one ``(X, (y_hat, X_unobs))`` example per patient.

    Args:
        patient_data: Mapping/DataFrame such that ``patient_data[name]`` yields
            one history object per patient for each name in ``col_names``.
            Histories must provide ``truncate(time, after=...)`` and ``unpack()``.
        col_names: Marker columns to use; the first one is the primary marker.
        models: Sequence of models, positionally matched to ``col_names``. Only
            ``models[0].posterior`` is called here.
        censor_time: Cutoff passed to ``truncate`` for the primary marker, and
            for every marker when ``aux_censor`` is ``None``.
        aux_censor: Optional cutoff used instead of ``censor_time`` for every
            marker after the first.

    Returns:
        A list with one ``(X, (y_hat, X_unobs))`` tuple per patient, where ``X``
        holds ``truncate(cutoff).unpack()`` for each marker, ``X_unobs`` holds
        ``truncate(cutoff, after=True).unpack()``, and ``y_hat`` is the argmax
        of ``models[0].posterior`` evaluated on the full, untruncated primary
        history.
    """
    def cutoff_for(position):
        # Only non-primary markers (position > 0) may use the auxiliary cutoff.
        if position > 0 and aux_censor is not None:
            return aux_censor
        return censor_time

    examples = []
    for histories in zip(*[patient_data[n] for n in col_names]):
        # presumably truncate(t) keeps data up to t and after=True keeps the
        # rest -- confirm against the PatientData implementation.
        X = [h.truncate(cutoff_for(j)).unpack() for j, h in enumerate(histories)]

        # zip() stops at the shorter sequence, so X_unobs has at most
        # len(models) entries, exactly as before.
        X_unobs = [
            h.truncate(cutoff_for(j), after=True).unpack()
            for j, (_, h) in enumerate(zip(models, histories))
        ]

        # The label comes from the complete primary-marker history.
        p = models[0].posterior(*histories[0].unpack())
        y_hat = np.argmax(p)

        examples.append((X, (y_hat, X_unobs)))

    return examples

In [20]:
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement.
import importlib
importlib.reload(loglin)  # pick up edits to loglin without restarting the kernel
from mypy.util import check_grad

import lbfgs

# NOTE(review): reloading logging presumably resets previously installed
# handlers so that basicConfig() below takes effect -- confirm.
import logging; importlib.reload(logging)
from scipy.optimize import minimize
logging.basicConfig(level=logging.INFO)

# All-zero placeholder table (one row per patient, 8 columns) written to disk.
model_posteriors = np.zeros((patient_data.shape[0], 8))
np.savetxt('param/model_posteriors.dat', model_posteriors)

In [21]:
# Saved-model directory names and the matching patient_data columns,
# paired position by position.
model_names = ['pfvc', 'tss', 'pdc', 'pv1']
col_names   = ['pfvc', 'tss', 'pdlco', 'pv1']

censor = 1.0
folds = list(range(1, 11))
max_iterations = 25
penalty = 1e-4

# Write an all-zero table first; each fold below reloads it, fills in the rows
# of its own test patients and saves it again.
model_posteriors = np.zeros((patient_data.shape[0], 8))
posteriors_file = 'param/l1_full_hac_model_posteriors_{:.01e}'.format(penalty)
np.savetxt(posteriors_file, model_posteriors)

all_models = []

for fold in folds:
    print('Starting fold {}'.format(fold))

    model_posteriors = np.loadtxt(posteriors_file)

    models = [load_model(name, fold) for name in model_names]

    # Boolean row masks: patients assigned to this fold are held out for testing.
    test = patient_data['fold'].values == fold
    train = np.logical_not(test)

    # The same cutoff is used for the primary and the auxiliary markers.
    train_data = make_examples(patient_data[train], col_names, models, censor, censor)
    test_data = make_examples(patient_data[test], col_names, models, censor, censor)

    model = loglin.SubtypeModel(
        penalty,
        models,
        censor,
        regularizer='l1',
        max_iterations=max_iterations,
    )
    model.fit(train_data)

    all_models.append(model)

    # Only the inputs (X) of the held-out examples are passed to the model.
    test_inputs = [X for X, _ in test_data]
    model_posteriors[test, :] = model.proba(test_inputs)
    np.savetxt(posteriors_file, model_posteriors)

INFO:root:Evaluated objective: f(w) = 26.35695883, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05267351
INFO:root:Evaluated objective: f(w) = 26.15758196, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02191881
INFO:root:Evaluated objective: f(w) = 26.12903140, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06415875
INFO:root:Evaluated objective: f(w) = 26.08930550, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02101445
INFO:root:Evaluated objective: f(w) = 26.07366697, ||w||_0 = 323
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02234525
INFO:root:Evaluated objective: f(w) = 26.01737623, ||w||_0 = 316
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01831606
INFO:root:Evaluated objective: f(w) = 25.97311013, ||w||_0 = 313
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.00783914
INFO:root:Evaluated objective: f(w) = 25.93337362, ||w||_0 = 315
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01281174
INFO:root:Evaluated objective: f

Starting fold 1
Starting fold 2

INFO:root:Evaluated objective: f(w) = 26.26935541, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05805989
INFO:root:Evaluated objective: f(w) = 26.07107778, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02653742
INFO:root:Evaluated objective: f(w) = 26.04633670, ||w||_0 = 323
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06642635
INFO:root:Evaluated objective: f(w) = 26.00434905, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01771482
INFO:root:Evaluated objective: f(w) = 25.98938160, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02535107
INFO:root:Evaluated objective: f(w) = 25.93115277, ||w||_0 = 315
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02275045
INFO:root:Evaluated objective: f(w) = 25.90742136, ||w||_0 = 310
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06497174
INFO:root:Evaluated objective: f(w) = 25.86664368, ||w||_0 = 317
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01884132
INFO:root:Evaluated objective: f


Starting fold 3

INFO:root:Evaluated objective: f(w) = 26.42336446, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05927335
INFO:root:Evaluated objective: f(w) = 26.20029188, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02422217
INFO:root:Evaluated objective: f(w) = 26.16987813, ||w||_0 = 322
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06878631
INFO:root:Evaluated objective: f(w) = 26.12525645, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02177782
INFO:root:Evaluated objective: f(w) = 26.10764532, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02931749
INFO:root:Evaluated objective: f(w) = 26.06670618, ||w||_0 = 320
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01612614
INFO:root:Evaluated objective: f(w) = 25.98422352, ||w||_0 = 305
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01657239
INFO:root:Evaluated objective: f(w) = 25.95321499, ||w||_0 = 312
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03694367
INFO:root:Evaluated objective: f


Starting fold 4

INFO:root:Evaluated objective: f(w) = 26.87951263, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05301406
INFO:root:Evaluated objective: f(w) = 26.64929618, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02639057
INFO:root:Evaluated objective: f(w) = 26.59607297, ||w||_0 = 322
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05123454
INFO:root:Evaluated objective: f(w) = 26.56530056, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02207096
INFO:root:Evaluated objective: f(w) = 26.54700290, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02551685
INFO:root:Evaluated objective: f(w) = 26.46679454, ||w||_0 = 308
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02582802
INFO:root:Evaluated objective: f(w) = 26.38487479, ||w||_0 = 311
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03631640
INFO:root:Evaluated objective: f(w) = 26.38307452, ||w||_0 = 308
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04068807



Starting fold 5

INFO:root:Evaluated objective: f(w) = 26.99166720, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05721038
INFO:root:Evaluated objective: f(w) = 26.75639747, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02650106
INFO:root:Evaluated objective: f(w) = 26.75077654, ||w||_0 = 322
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.09407231
INFO:root:Evaluated objective: f(w) = 26.67568665, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02241542
INFO:root:Evaluated objective: f(w) = 26.66080686, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02364775
INFO:root:Evaluated objective: f(w) = 26.58803456, ||w||_0 = 314
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02378496
INFO:root:Evaluated objective: f(w) = 26.54695741, ||w||_0 = 317
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03101217
INFO:root:Evaluated objective: f(w) = 26.52733415, ||w||_0 = 316
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03926401
INFO:root:Evaluated objective: f


Starting fold 6

INFO:root:Evaluated objective: f(w) = 26.81201366, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05137319
INFO:root:Evaluated objective: f(w) = 26.59345236, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02326195
INFO:root:Evaluated objective: f(w) = 26.55043588, ||w||_0 = 323
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05158591
INFO:root:Evaluated objective: f(w) = 26.52022125, ||w||_0 = 324
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02343953
INFO:root:Evaluated objective: f(w) = 26.49634661, ||w||_0 = 322
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02476526
INFO:root:Evaluated objective: f(w) = 26.43045207, ||w||_0 = 318
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01641077
INFO:root:Evaluated objective: f(w) = 26.38731687, ||w||_0 = 310
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03010089
INFO:root:Evaluated objective: f(w) = 26.34603434, ||w||_0 = 311
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02156822
INFO:root:Evaluated objective: f


Starting fold 7

INFO:root:Evaluated objective: f(w) = 25.24231514, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04961935
INFO:root:Evaluated objective: f(w) = 25.02009206, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02251666
INFO:root:Evaluated objective: f(w) = 25.00484680, ||w||_0 = 320
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.08154942
INFO:root:Evaluated objective: f(w) = 24.95221179, ||w||_0 = 323
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02350858
INFO:root:Evaluated objective: f(w) = 24.93909056, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02239466
INFO:root:Evaluated objective: f(w) = 24.88789027, ||w||_0 = 318
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01720155
INFO:root:Evaluated objective: f(w) = 24.84801496, ||w||_0 = 317
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.00967649
INFO:root:Evaluated objective: f(w) = 24.81087972, ||w||_0 = 317
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.00883006
INFO:root:Evaluated objective: f


Starting fold 8

INFO:root:Evaluated objective: f(w) = 25.49161643, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04739973
INFO:root:Evaluated objective: f(w) = 25.28533684, ||w||_0 = 320
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02389555
INFO:root:Evaluated objective: f(w) = 25.35839849, ||w||_0 = 319
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.13086890
INFO:root:Evaluated objective: f(w) = 25.24098538, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04839441
INFO:root:Evaluated objective: f(w) = 25.21407927, ||w||_0 = 322
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01962991
INFO:root:Evaluated objective: f(w) = 25.19138157, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02846381
INFO:root:Evaluated objective: f(w) = 25.13416500, ||w||_0 = 317
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.00847218
INFO:root:Evaluated objective: f(w) = 25.07800300, ||w||_0 = 307
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03477246
INFO:root:Evaluated objective: f


Starting fold 9

INFO:root:Evaluated objective: f(w) = 26.83267680, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05992763
INFO:root:Evaluated objective: f(w) = 26.59428989, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02774203
INFO:root:Evaluated objective: f(w) = 26.54972169, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05981741
INFO:root:Evaluated objective: f(w) = 26.51153483, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01992701
INFO:root:Evaluated objective: f(w) = 26.49153927, ||w||_0 = 323
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02661579
INFO:root:Evaluated objective: f(w) = 26.43156123, ||w||_0 = 319
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01988488
INFO:root:Evaluated objective: f(w) = 26.38428759, ||w||_0 = 308
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04160583
INFO:root:Evaluated objective: f(w) = 26.34843807, ||w||_0 = 319
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02678812
INFO:root:Evaluated objective: f


Starting fold 10

INFO:root:Evaluated objective: f(w) = 26.56936938, ||w||_0 = 326
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05594202
INFO:root:Evaluated objective: f(w) = 26.32807044, ||w||_0 = 321
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02929011
INFO:root:Evaluated objective: f(w) = 26.27771504, ||w||_0 = 322
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06034099
INFO:root:Evaluated objective: f(w) = 26.24133681, ||w||_0 = 323
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02265089
INFO:root:Evaluated objective: f(w) = 26.22248621, ||w||_0 = 325
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02688439
INFO:root:Evaluated objective: f(w) = 26.15876353, ||w||_0 = 315
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02656212
INFO:root:Evaluated objective: f(w) = 26.10409303, ||w||_0 = 309
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04886804
INFO:root:Evaluated objective: f(w) = 26.04892002, ||w||_0 = 311
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04166992
INFO:root:Evaluated objective: f




In [22]:
# Label the posterior matrix: rows follow patient_data's index, columns are p1..p8.
posterior_columns = ['p{}'.format(i) for i in range(1, 9)]
posterior_tbl = pd.DataFrame(
    model_posteriors,
    index=patient_data.index,
    columns=posterior_columns,
)

In [23]:
# Save the labelled posteriors; the file name embeds the penalty in scientific notation.
posteriors_csv = 'l1_full_hac_lgln_posteriors_{:.01e}.csv'.format(penalty)
posterior_tbl.to_csv(posteriors_csv)