In [1]:
import sys
sys.path.append('/Users/pschulam/Git/mypy')

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nips15
import online
import loglin
%matplotlib inline

In [3]:
folds_dir = 'models/jmlr/folds'

def load_model(marker, fold, folds_dir=folds_dir):
    param_dir = os.path.join(folds_dir, marker, '{:02d}'.format(fold), 'param')
    return nips15.NipsModel.from_directory(param_dir)

In [4]:
demographic = ['female', 'afram']
molecular = ['aca', 'scl']

pfvc_spec = {'t':'years_seen_full', 'y':'pfvc', 'x1':demographic, 'x2':demographic + molecular}
pfvc = pd.read_csv('data/benchmark_pfvc.csv')
pfvc_pd = [nips15.PatientData.from_tbl(tbl, **pfvc_spec) for _, tbl in pfvc.groupby('ptid')]

tss_spec = {'t':'years_seen', 'y':'tss', 'x1':demographic, 'x2':demographic}
tss = pd.read_csv('data/benchmark_tss.csv')
tss_match = ['ptid'] + tss_spec['x1']
tss = pd.merge(pfvc[tss_match], tss, 'left', tss_match).drop_duplicates()
tss_pd = [nips15.PatientData.from_tbl(tbl, **tss_spec) for _, tbl in tss.groupby('ptid')]

pdlco_spec = {'t':'years_seen', 'y':'pdlco', 'x1':demographic, 'x2':demographic}
pdlco = pd.read_csv('data/benchmark_pdc.csv')
pdlco_match = ['ptid'] + pdlco_spec['x1']
pdlco = pd.merge(pfvc[pdlco_match], pdlco, 'left', pdlco_match).drop_duplicates()
pdlco_pd = [nips15.PatientData.from_tbl(tbl, **pdlco_spec) for _, tbl in pdlco.groupby('ptid')]

pv1_spec = {'t':'years_seen', 'y':'pfev1', 'x1':demographic, 'x2':demographic}
pv1 = pd.read_csv('data/benchmark_pv1.csv')
pv1_match = ['ptid'] + pv1_spec['x1']
pv1 = pd.merge(pfvc[pv1_match], pv1, 'left', pv1_match).drop_duplicates()
pv1_pd = [nips15.PatientData.from_tbl(tbl, **pv1_spec) for _, tbl in pv1.groupby('ptid')]

sp_spec = {'t':'years_seen', 'y':'rvsp', 'x1':demographic, 'x2':demographic}
sp = pd.read_csv('data/benchmark_sp.csv')
sp_match = ['ptid'] + sp_spec['x1']
sp = pd.merge(pfvc[sp_match], sp, 'left', sp_match).drop_duplicates()
sp_pd = [nips15.PatientData.from_tbl(tbl, **sp_spec) for _, tbl in sp.groupby('ptid')]

In [5]:
get_ptids = lambda pd: [p.ptid for p in pd]
pfvc_df   = pd.DataFrame({'ptid': get_ptids(pfvc_pd),  'pfvc' : pfvc_pd}).set_index('ptid')
tss_df    = pd.DataFrame({'ptid': get_ptids(tss_pd),   'tss'  : tss_pd}).set_index('ptid')
pdlco_df  = pd.DataFrame({'ptid': get_ptids(pdlco_pd), 'pdlco': pdlco_pd}).set_index('ptid')
pv1_df    = pd.DataFrame({'ptid': get_ptids(pv1_pd),   'pv1'  : pdlco_pd}).set_index('ptid')
sp_df     = pd.DataFrame({'ptid': get_ptids(sp_pd),    'rvsp' : sp_pd}).set_index('ptid')

In [6]:
folds_df = pfvc.loc[:, ['ptid', 'fold']].drop_duplicates().set_index('ptid')

In [7]:
patient_data = pd.concat([folds_df, pfvc_df, tss_df, pdlco_df, pv1_df, sp_df], axis=1, join='inner')

In [8]:
def make_examples(patient_data, col_names, models, censor_time, aux_censor=None):
    marker_histories = zip(*[patient_data[n] for n in col_names])
    examples = []
    for i, histories in enumerate(marker_histories):
        X = []
        for j, h in enumerate(histories):
            if j > 0 and aux_censor is not None:
                d_obs = h.truncate(aux_censor).unpack()
            else:
                d_obs = h.truncate(censor_time).unpack()
            X.append(d_obs)
            
        X_unobs = []
        for j, (m, h) in enumerate(zip(models, histories)):
            if j > 0 and aux_censor is not None:
                d_unobs = h.truncate(aux_censor, after=True).unpack()
            else:
                d_unobs = h.truncate(censor_time, after=True).unpack()
                
            X_unobs.append(d_unobs)
            
        p = models[0].posterior(*histories[0].unpack())
        y_hat = np.argmax(p)
        
        y = (y_hat, X_unobs)
            
        ex = (X, y)
        examples.append(ex)
        
    return examples

In [9]:
import imp
imp.reload(loglin)
from mypy.util import check_grad

import lbfgs

import logging; imp.reload(logging)
from scipy.optimize import minimize
logging.basicConfig(level=logging.INFO)

model_posteriors = np.zeros((patient_data.shape[0], 8))
np.savetxt('param/model_posteriors.dat', model_posteriors)

In [None]:
model_names = ['pfvc', 'tss', 'pdc', 'pv1']
col_names   = ['pfvc', 'tss', 'pdlco', 'pv1']

censor = 1.0
folds = [k + 1 for k in range(10)]
max_iterations = 25
penalty = 0.0

model_posteriors = np.zeros((patient_data.shape[0], 8))
posteriors_file = 'param/l1_trim_model_posteriors_{:.01e}'.format(penalty)
np.savetxt(posteriors_file, model_posteriors)

all_models = []

for fold in folds:
    print('Starting fold {}'.format(fold))
    
    model_posteriors = np.loadtxt(posteriors_file)
    
    models = [load_model(m, fold) for m in model_names]
    test = patient_data['fold'].values == fold
    train = ~test

    train_data = make_examples(patient_data[train], col_names, models, censor, censor)
    test_data = make_examples(patient_data[test], col_names, models, censor, censor)

    model = loglin.SubtypeModel(penalty, models, censor, regularizer='l2', maxiter=max_iterations)
    model.fit(train_data)
    
    all_models.append(model)

    model_posteriors[test, :] = model.proba([X for X, _ in test_data])
    np.savetxt(posteriors_file, model_posteriors)

INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06385184
INFO:root:Evaluated objective: f(w) = 26.19277553, ||w||_0 = 62
INFO:root:Evaluated objective: f(w) = 26.16199755, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05596029
INFO:root:Evaluated objective: f(w) = 26.06112196, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02891547
INFO:root:Evaluated objective: f(w) = 26.04087442, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06488199
INFO:root:Evaluated objective: f(w) = 26.01382958, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02812944
INFO:root:Evaluated objective: f(w) = 26.00748994, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01398133
INFO:root:Evaluated objective: f(w) = 26.00000737, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01345417
INFO:root:Evaluated objective: f(w) = 25.98613041, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01340546
INFO:root:Evaluated objective: f(w) = 25

Starting fold 1
Starting fold 2

INFO:root:Evaluated gradient: ||g(w)||_inf = 0.07572374
INFO:root:Evaluated objective: f(w) = 26.21435243, ||w||_0 = 62
INFO:root:Evaluated objective: f(w) = 26.17825543, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06828506
INFO:root:Evaluated objective: f(w) = 26.05092472, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03347872
INFO:root:Evaluated objective: f(w) = 26.16635003, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.13419486
INFO:root:Evaluated objective: f(w) = 26.00077005, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01095925
INFO:root:Evaluated objective: f(w) = 25.99807265, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01403984
INFO:root:Evaluated objective: f(w) = 25.98838577, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.02998577
INFO:root:Evaluated objective: f(w) = 25.96167699, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.05572596
INFO:root:Evaluated objective: f(w) = 25


Starting fold 3

INFO:root:Evaluated gradient: ||g(w)||_inf = 0.07297798
INFO:root:Evaluated objective: f(w) = 26.19601986, ||w||_0 = 62
INFO:root:Evaluated objective: f(w) = 26.15646660, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06394140
INFO:root:Evaluated objective: f(w) = 26.03291911, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03106983
INFO:root:Evaluated objective: f(w) = 26.00190271, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.04663964
INFO:root:Evaluated objective: f(w) = 25.98701163, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01965740
INFO:root:Evaluated objective: f(w) = 25.98065870, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01433840
INFO:root:Evaluated objective: f(w) = 25.96903590, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01423708
INFO:root:Evaluated objective: f(w) = 25.94925450, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01183210
INFO:root:Evaluated objective: f(w) = 25


Starting fold 4

INFO:root:Evaluated gradient: ||g(w)||_inf = 0.07513933
INFO:root:Evaluated objective: f(w) = 26.64875760, ||w||_0 = 62
INFO:root:Evaluated objective: f(w) = 26.60051803, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.06281193
INFO:root:Evaluated objective: f(w) = 26.46289799, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03265067
INFO:root:Evaluated objective: f(w) = 26.42170742, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03603258
INFO:root:Evaluated objective: f(w) = 26.41067364, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03129571
INFO:root:Evaluated objective: f(w) = 26.37609802, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.01402146
INFO:root:Evaluated objective: f(w) = 26.36479229, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.03344019
INFO:root:Evaluated objective: f(w) = 26.35655069, ||w||_0 = 62
INFO:root:Evaluated gradient: ||g(w)||_inf = 0.00836535
INFO:root:Evaluated objective: f(w) = 26

In [35]:
W0 = all_models[0].objective.scorer.weights

In [42]:
np.round(W0.pairwise(1), 2)

array([[-0.69,  0.19, -0.15, -1.21, -0.14,  0.25],
       [-1.15,  1.48, -0.03,  0.39,  0.1 , -0.  ],
       [-0.56,  0.  , -0.09, -0.01, -0.67,  0.29],
       [-0.53,  0.04, -0.83,  0.  , -0.88, -0.86],
       [-0.54,  0.39,  0.  ,  0.  , -0.59, -0.13],
       [-0.01,  0.65, -0.49, -1.27,  0.36,  0.52],
       [ 0.5 ,  0.2 , -0.98, -0.56, -1.23, -0.36],
       [ 1.82, -2.  ,  0.41,  0.  , -0.21, -0.33]])

In [28]:
posterior_tbl = pd.DataFrame(model_posteriors)
posterior_tbl.index = patient_data.index
posterior_tbl.columns = ['p{}'.format(i) for i in range(1, 9)]

In [29]:
posterior_tbl.to_csv('l1_trim_lgln_posteriors_{:.01e}.csv'.format(penalty))