In [1]:
import sys
sys.path.append('/Users/pschulam/Git/mypy')

In [32]:
import os
import pandas as pd
import numpy as np
import nips15
import online

In [24]:
folds_dir = 'models/jmlr/folds'

def load_model(marker, fold, folds_dir=folds_dir):
    param_dir = os.path.join(folds_dir, marker, '{:02d}'.format(fold), 'param')
    return nips15.NipsModel.from_directory(param_dir)

def get_posteriors(model, data, censor=None):
    if censor is None:
        P = [model.posterior(*d.unpack()) for d in data]
    else:
        P = [model.posterior(*d.truncate(censor).unpack()) for d in data]
        
    return np.array(P)

In [6]:
pfvc_spec = {'t' : 'years_seen_full',
             'y' : 'pfvc',
             'x1': ['female', 'afram'],
             'x2': ['female', 'afram', 'aca', 'scl']}

pfvc    = pd.read_csv('data/benchmark_pfvc.csv')
pfvc_pd = [nips15.PatientData.from_tbl(tbl, **pfvc_spec) for _, tbl in pfvc.groupby('ptid')]

tss_spec = {'t' : 'years_seen',
            'y' : 'tss',
            'x1': ['female', 'afram'],
            'x2': ['female', 'afram']}

tss = pd.read_csv('data/benchmark_tss.csv')
tss_match = ['ptid'] + tss_spec['x1']
tss = pd.merge(pfvc[tss_match], tss, 'left', tss_match)
tss_pd = [nips15.PatientData.from_tbl(tbl, **tss_spec) for _, tbl in tss.groupby('ptid')]

pdlco_spec = {'t' : 'years_seen',
              'y' : 'pdlco',
              'x1': ['female', 'afram'],
              'x2': ['female', 'afram']}

pdlco = pd.read_csv('data/benchmark_pdc.csv')
pdlco_match = ['ptid'] + pdlco_spec['x1']
pdlco = pd.merge(pfvc[pdlco_match], pdlco, 'left', pdlco_match)
pdlco_pd = [nips15.PatientData.from_tbl(tbl, **pdlco_spec) for _, tbl in pdlco.groupby('ptid')]

pv1_spec = {'t' : 'years_seen',
            'y' : 'pfev1',
            'x1': ['female', 'afram'],
            'x2': ['female', 'afram']}

pv1 = pd.read_csv('data/benchmark_pv1.csv')
pv1_match = ['ptid'] + pv1_spec['x1']
pv1 = pd.merge(pfvc[pv1_match], pv1, 'left', pv1_match)
pv1_pd = [nips15.PatientData.from_tbl(tbl, **pv1_spec) for _, tbl in pv1.groupby('ptid')]

In [18]:
get_ptids = lambda pd: [p.ptid for p in pd]
pfvc_df   = pd.DataFrame({'ptid': get_ptids(pfvc_pd),  'pfvc' : pfvc_pd}).set_index('ptid')
tss_df    = pd.DataFrame({'ptid': get_ptids(tss_pd),   'tss'  : tss_pd}).set_index('ptid')
pdlco_df  = pd.DataFrame({'ptid': get_ptids(pdlco_pd), 'pdlco': pdlco_pd}).set_index('ptid')
pv1_df    = pd.DataFrame({'ptid': get_ptids(pv1_pd),   'pv1'  : pdlco_pd}).set_index('ptid')

In [25]:
folds_df = pfvc.loc[:, ['ptid', 'fold']].drop_duplicates().set_index('ptid')

In [26]:
patient_data = pd.concat([folds_df, pfvc_df, tss_df, pdlco_df, pv1_df], axis=1, join='inner')

In [42]:
model_names = ['pfvc', 'tss', 'pdc', 'pv1']
col_names   = ['pfvc', 'tss', 'pdlco', 'pv1']

In [40]:
folds = [k + 1 for k in range(10)]
posteriors = np.zeros((patient_data.shape[0], 8))

In [67]:
all_WWaux = []
all_weights = []

for k in folds:
    print('Starting fold {}'.format(k))
    
    models = [load_model(n, k) for n in model_names]
    test   = patient_data['fold'].values == k
    train  = ~test

    Ptrain = get_posteriors(models[0], patient_data['pfvc'][train])
    Qtrain = get_posteriors(models[0], patient_data['pfvc'][train], 1.0)
    QQaux  = []
    for n, m in zip(col_names[1:], models[1:]):
        Qaux = get_posteriors(m, patient_data[n][train])
        QQaux.append(Qaux)
        
    WWaux, weights = online.train_adjustment(Ptrain, Qtrain, QQaux)
    
    Qtest = get_posteriors(models[0], patient_data['pfvc'][test], 1.0)
    QQaux = []
    for n, m in zip(col_names[1:], models[1:]):
        Qaux = get_posteriors(m, patient_data[n][test])
        QQaux.append(Qaux)
        
    posteriors[test, :] = online.apply_adjustment(Qtest, QQaux, WWaux, weights)
    np.savetxt('param/adjusted_posteriors.dat', posteriors)
    
    all_WWaux.append(WWaux)
    all_weights.append(weights)

Starting fold 1
Starting fold 2
Starting fold 3
Starting fold 4
Starting fold 5
Starting fold 6
Starting fold 7
Starting fold 8
Starting fold 9
Starting fold 10


In [73]:
posterior_tbl = pd.DataFrame(posteriors)
posterior_tbl.index = patient_data.index
posterior_tbl.columns = ['p{}'.format(i) for i in range(1, 9)]

In [76]:
posterior_tbl.to_csv('adjusted_posteriors.csv')

In [78]:
!Rscript score_posteriors.R 1.0 benchmark_pfvc_1y_posteriors.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin  mae
1  (1,2] 4.84
2  (2,4] 6.10
3  (4,8] 7.84
4 (8,25] 9.51


In [77]:
!Rscript score_posteriors.R 1.0 adjusted_posteriors.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin  mae
1  (1,2] 4.83
2  (2,4] 5.98
3  (4,8] 7.79
4 (8,25] 9.93
