In [3]:
import pandas as pd
import numpy as np

from zipfile import ZipFile

In [4]:
# Archive holding the raw tables. It is opened once and left open because the
# reader functions defined below use it as their default `zipfile` argument.
ssc_zip = ZipFile('data/scleroderma.zip')

In [5]:
def LongitudinalSpec(index, date, outcome):
    'Bundle the index, date, and outcome column names of a longitudinal table into a dict.'
    spec = dict(index=index, date=date, outcome=outcome)
    return spec

def AlignmentSpec(index, time, date, baseline):
    'Bundle the column names needed to convert measurement dates into time since a baseline date.'
    fields = ('index', 'time', 'date', 'baseline')
    values = (index, time, date, baseline)
    return dict(zip(fields, values))

In [6]:
def read_dates(file='tPtData.csv', zipfile=ssc_zip):
    """Load the per-patient reference dates from a CSV member of the archive.

    Reads `file` from `zipfile`, keeps the `PtID`, `DateFirstSeen` and
    `Date1stSymptom` columns, parses the two date columns, and returns the
    result with columns renamed to `ptid`, `first_seen`, `first_symptom`.
    """
    source_cols = ['PtID', 'DateFirstSeen', 'Date1stSymptom']
    with zipfile.open(file) as handle:
        patients = pd.read_csv(handle)
    dates = patients.loc[:, source_cols]
    # Every selected column except the patient id holds a date.
    for col in source_cols[1:]:
        dates[col] = pd.to_datetime(dates[col])
    dates.columns = ['ptid', 'first_seen', 'first_symptom']
    return dates

def read_longitudinal(stream, names, renames):
    """Read one outcome from a CSV stream as an (index, date, outcome) table.

    `names` gives the source column names and `renames` the output column
    names, both as dicts with keys `index`, `date` and `outcome`. The date
    column is parsed to datetimes and rows with a missing outcome are dropped.
    """
    keys = ('index', 'date', 'outcome')
    raw = pd.read_csv(stream)
    tbl = raw.loc[:, [names[k] for k in keys]]
    tbl.columns = [renames[k] for k in keys]
    date_col, outcome_col = renames['date'], renames['outcome']
    tbl[date_col] = pd.to_datetime(tbl[date_col])
    observed = tbl[outcome_col].notnull()
    return tbl[observed]

def align_longitudinal(dataset, dates, alignment):
    """Express each measurement's date as years elapsed since a baseline date.

    `dataset` is left-joined to `dates` on `alignment['index']`; the elapsed
    whole days between the `alignment['date']` and `alignment['baseline']`
    columns are divided by 365.0 and stored under `alignment['time']`.
    Returns the index column, the new time column, and the third column of
    `dataset` (taken by position as the outcome).
    """
    key, time_col = alignment['index'], alignment['time']
    outcome_col = dataset.columns[2]
    merged = pd.merge(dataset, dates, how='left', on=key)
    elapsed = merged[alignment['date']] - merged[alignment['baseline']]
    merged[time_col] = elapsed.dt.days / 365.0
    return merged[[key, time_col, outcome_col]]

def variables(file, zipfile=ssc_zip):
    'List the column names of a CSV member of the archive, reading only its first data row.'
    with zipfile.open(file) as handle:
        header = pd.read_csv(handle, nrows=1)
    return [column for column in header.columns]

def is_variable(name, file, zipfile=ssc_zip):
    'Return True when `name` is a column of the given CSV member of the archive.'
    with zipfile.open(file) as handle:
        header = pd.read_csv(handle, nrows=1)
    return name in header.columns

In [7]:
def read_from_table(index, date, file, zipfile=ssc_zip, **kwargs):
    """Read a single outcome column from one table of the archive.

    Each keyword argument maps an output name to a source column name. Only
    pairs whose source column exists in `file` are kept, and exactly one must
    remain; otherwise a RuntimeError is raised. The result has columns
    `ptid`, `date`, and the chosen output name.
    """
    matches = [(alias, column) for alias, column in kwargs.items()
               if is_variable(column, file, zipfile)]
    if len(matches) != 1:
        raise RuntimeError('Must specify exactly one outcome column.')
    rename, outcome = matches[0]

    source_spec = LongitudinalSpec(index, date, outcome)
    target_spec = LongitudinalSpec('ptid', 'date', rename)
    with zipfile.open(file) as handle:
        return read_longitudinal(handle, source_spec, target_spec)

def read_from_visits(zipfile=ssc_zip, **kwargs):
    'Read one outcome from tVisit.csv, indexed by PtID and dated by Visit.Date.'
    return read_from_table(index='PtID', date='Visit.Date', file='tVisit.csv',
                           zipfile=zipfile, **kwargs)

def read_from_echos(zipfile=ssc_zip, **kwargs):
    'Read one outcome from tECHO.csv, indexed by PtID and dated by Date.of.ECHO.'
    return read_from_table(index='PtID', date='Date.of.ECHO', file='tECHO.csv',
                           zipfile=zipfile, **kwargs)

def read_from_pfts(zipfile=ssc_zip, **kwargs):
    'Read one outcome from tPFT.csv, indexed by PtID and dated by Date.'
    return read_from_table(index='PtID', date='Date', file='tPFT.csv',
                           zipfile=zipfile, **kwargs)

In [8]:
# Per-patient reference dates, plus the spec used to express every measurement's
# `date` as `years_seen`, i.e. time elapsed since the patient's `first_seen` date.
dates = read_dates()
alignment = AlignmentSpec('ptid', 'years_seen', 'date', 'first_seen')

In [9]:
# Markers read from the visits table (tVisit.csv). Each result is a
# (ptid, years_seen, <marker>) table; the keyword name becomes the marker column name.
rp_ss = align_longitudinal(read_from_visits(rp='RP.Sev.Score'), dates, alignment)
heart_ss = align_longitudinal(read_from_visits(heart='Heart.Sev.Score'), dates, alignment)
gen_ss = align_longitudinal(read_from_visits(general='lkpGeneralScore'), dates, alignment)
kidney_ss = align_longitudinal(read_from_visits(kidney='lkpLabUrineProtein'), dates, alignment)
muscle_ss = align_longitudinal(read_from_visits(muscle='Muscle.Sev.Score'), dates, alignment)
gi_ss = align_longitudinal(read_from_visits(gi='GI.Sev.Score'), dates, alignment)

In [10]:
# Markers read from the echo table (tECHO.csv), aligned the same way as the visit markers.
rvsp = align_longitudinal(read_from_echos(rvsp='RVSP'), dates, alignment)
ejection_frac = align_longitudinal(read_from_echos(ef='Ejection.Fraction'), dates, alignment)

In [11]:
# Markers read from the PFT table (tPFT.csv), aligned the same way as the visit markers.
# NOTE(review): `pfev1fvc` is loaded here but is not listed in `aux_markers` below,
# so it never reaches the feature tables - confirm whether that is intentional.
pfev1 = align_longitudinal(read_from_pfts(pfev1='perc.FEV1.of.predicted'), dates, alignment)
pdlco = align_longitudinal(read_from_pfts(pdlco='perc.DLCO.of.predicted'), dates, alignment)
pfev1fvc = align_longitudinal(read_from_pfts(pfev1fvc='perc.FEV1FVC.of.predicted'), dates, alignment)

In [12]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [13]:
# Benchmark PFVC measurements. `years_seen_full` is renamed to `years_seen`, the
# same time-column name used by the aligned auxiliary markers.
bm_pfvc = pd.read_csv('data/benchmark_pfvc.csv').loc[:, ['ptid', 'years_seen_full', 'pfvc']]
bm_pfvc.columns = ['ptid', 'years_seen', 'pfvc']

In [14]:
bm_ptid = set(bm_pfvc['ptid'].values)
def select_bm(tbl, ptids=bm_ptid):
    """Keep the rows of a marker table that belong to the benchmark cohort.

    Parameters
    ----------
    tbl : DataFrame with at least `ptid` and `years_seen` columns.
    ptids : collection of patient ids to keep (defaults to the benchmark ids).

    Returns
    -------
    A copy of the rows whose `ptid` is in `ptids` and whose `years_seen` is
    non-negative. An empty `tbl` yields an empty table with the same columns.
    """
    # `isin` always produces a boolean row mask. A Python list built by a
    # comprehension is `[]` for an empty table, and `tbl[[]]` selects zero
    # columns rather than zero rows, which then breaks the `years_seen` lookup.
    in_cohort = tbl['ptid'].isin(ptids)
    after_baseline = tbl['years_seen'] >= 0.0
    return tbl[in_cohort & after_baseline].copy()

In [15]:
# Short feature prefixes and the marker tables they label; the two lists are
# positionally paired (they are zipped together wherever they are used).
aux_names   = ['rp',  'hrt',    'gen',  'kid',     'msc',     'gi',  'sp', 'ef',         'pv1', 'pdc' ]
aux_markers = [rp_ss, heart_ss, gen_ss, kidney_ss, muscle_ss, gi_ss, rvsp, ejection_frac, pfev1, pdlco]

In [16]:
# Restrict every marker to the benchmark cohort and write each one to
# data/<prefix>.csv (without the DataFrame index).
bm_auxm = [select_bm(m) for m in aux_markers]
for n, m in zip(aux_names, bm_auxm):
    fn = 'data/{}.csv'.format(n)
    m.to_csv(fn, index=False)

In [636]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score, cross_val_predict, LeaveOneOut
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

def fit_mlm(data, reml=False):
    """Fit a linear mixed model of one marker against `years_seen`.

    The outcome is the third column of `data` (taken by position). The model
    has a fixed effect for `years_seen`, a `years_seen` random-effects formula,
    and groups observations by `data['ptid']`. `reml` is forwarded to the fit.
    Returns the fitted result object.
    """
    outcome_name = str(data.columns[2])
    model = smf.mixedlm(
        '{} ~ years_seen'.format(outcome_name),
        data,
        re_formula='years_seen',
        groups=data['ptid']
    )
    return model.fit(reml=reml)

def add_mlm_features(subtypes, mlm, prefix):
    """Append one marker's per-patient mixed-model coefficients to a feature table.

    Parameters
    ----------
    subtypes : DataFrame indexed by patient id. It is not modified.
    mlm : fitted model result exposing `fe_params` (unpacked as intercept,
        slope) and `random_effects`.
        NOTE(review): assumes `random_effects` is a two-column DataFrame with
        one row per patient - confirm against the installed statsmodels version.
    prefix : str used to name the new columns `<prefix>_intercept` and
        `<prefix>_slope`.

    Returns
    -------
    A new DataFrame: `subtypes` left-joined (on the index) with the renamed
    random effects. Patients without random effects receive the fixed-effect
    intercept and slope instead.
    """
    intercept, slope = mlm.fe_params
    # Rename a copy: `random_effects` belongs to the fitted model, and relabelling
    # it in place would leak this prefix into every later use of the same fit.
    ranef = mlm.random_effects.copy()
    intercept_col, slope_col = ['{}_{}'.format(prefix, n) for n in ('intercept', 'slope')]
    ranef.columns = [intercept_col, slope_col]
    features = pd.merge(subtypes, ranef, how='left', left_index=True, right_index=True)
    # Assign the filled columns back. `fillna(inplace=True)` on a column
    # selection may operate on a temporary and leave the NaNs in the frame.
    features[intercept_col] = features[intercept_col].fillna(intercept)
    features[slope_col] = features[slope_col].fillna(slope)
    return features

def add_val_features(subtypes, mlm, prefix, time):
    """Add one marker's model value at a given time as a `<prefix>_val` column.

    Parameters
    ----------
    subtypes : DataFrame indexed by patient id. The new column is added to
        this object in place, and the same object is returned.
    mlm : fitted model result exposing `fe_params` (unpacked as intercept,
        slope) and `random_effects`.
        NOTE(review): assumes `random_effects` is a two-column DataFrame with
        one row per patient - confirm against the installed statsmodels version.
    prefix : str used to name the new column.
    time : value multiplied by the slope.

    Returns
    -------
    `subtypes`, with `<prefix>_val = intercept + slope * time`, where intercept
    and slope are the patient's random effects, or the fixed effects for
    patients that have none.
    """
    fe_intercept, fe_slope = mlm.fe_params
    # Rename a copy so the fitted model's own random-effects table keeps its labels.
    ranef = mlm.random_effects.copy()
    intercept_col, slope_col = ['{}_{}'.format(prefix, n) for n in ('intercept', 'slope')]
    ranef.columns = [intercept_col, slope_col]
    aligned_ranef = pd.merge(subtypes, ranef, how='left', left_index=True, right_index=True)
    intercept = aligned_ranef[intercept_col].fillna(fe_intercept)
    slope = aligned_ranef[slope_col].fillna(fe_slope)
    subtypes['{}_val'.format(prefix)] = intercept + slope * time
    return subtypes

def naive_model_score(y):
    'Accuracy obtained by always predicting the most frequent label in `y`.'
    majority = np.bincount(y).argmax()
    return np.mean(y == majority)

def ability_to_predict(X, y):
    """Grid-search an SVC on (X, y), scored by leave-one-out accuracy.

    The grid is 10 log-spaced `C` values in [1e-4, 1e2] and 10 log-spaced
    `gamma` values in [1e-2, 1e1]. Returns the fitted GridSearchCV object.

    NOTE(review): `LeaveOneOut(y.size)` and `class_weight='auto'` follow the
    older scikit-learn API that goes with the `sklearn.cross_validation` and
    `sklearn.grid_search` imports in this cell.
    """
    grid = {
        'C': list(np.logspace(-4, 2, 10)),
        'gamma': list(np.logspace(-2, 1, 10)),
    }
    folds = LeaveOneOut(y.size)
    search = GridSearchCV(SVC(class_weight='auto'), grid, scoring='accuracy', cv=folds)
    search.fit(X, y)
    return search

def make_loo_predictions(X, y):
    """Leave-one-out predictions from an SVC that is re-tuned inside every fold.

    For each held-out sample, a 4-fold grid search over `C` and `gamma` is run
    on the remaining samples, and the tuned model predicts the held-out one.
    Returns the array produced by `cross_val_predict`.
    """
    grid = {
        'C': list(np.logspace(-4, 2, 10)),
        'gamma': list(np.logspace(-2, 1, 10)),
    }
    tuned_svc = GridSearchCV(SVC(class_weight='auto'), grid, scoring='accuracy', cv=4, verbose=1)
    outer_folds = LeaveOneOut(y.size)
    return cross_val_predict(tuned_svc, X, y, cv=outer_folds, verbose=1)

def correct_predictions(correct_map, P):
    """Choose a class per row: the top-ranked one, or the runner-up when flagged wrong.

    Parameters
    ----------
    correct_map : 1-D array-like with one entry per row of `P`. Entries equal
        to 1 (or True) mean the top-ranked class is trusted; entries equal to 0
        (or False) mean it is not.
    P : 2-D array of per-class scores, one row per sample, with at least two columns.

    Returns
    -------
    Integer array of class indices: the argmax of the row where `correct_map`
    is 1, and the second-largest column where it is 0. Rows whose flag is
    neither 0 nor 1 are left at 0.
    """
    correct_map = np.asarray(correct_map)
    ranked = np.argsort(P, axis=1)
    # Allocate with the index dtype rather than `zeros_like(correct_map)`:
    # a boolean `correct_map` would otherwise collapse class indices to True/False.
    yhat = np.zeros(correct_map.shape, dtype=ranked.dtype)
    is_correct = correct_map == 1
    is_wrong = correct_map == 0
    yhat[is_correct] = ranked[is_correct, -1]
    yhat[is_wrong] = ranked[is_wrong, -2]
    return yhat

In [614]:
subtypes = pd.read_csv('benchmark_pfvc_subtypes.csv').set_index('ptid')

In [768]:
# Year-1 posterior over subtypes, one row per patient. The first CSV column is
# dropped (presumably the patient id - TODO confirm), and the MAP subtype is the
# column with the largest posterior.
P = pd.read_csv('benchmark_pfvc_1y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
yhat_posterior = np.argmax(P, axis=1)
# Shift the stored labels by one so they are comparable with the 0-based argmax indices.
y = subtypes['subtype'].values - 1

# For each MAP subtype i, `yi` marks whether the MAP estimate was right. The printed
# value is the accuracy of always guessing the more common outcome (right or wrong).
for i in range(P.shape[1]):
    yi = y[yhat_posterior == i] == i
    print(naive_model_score(yi))

0.85
0.725352112676
0.596899224806
0.5
0.603896103896
0.7
0.660377358491
0.772727272727


In [777]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin   mae
1  (1,2]  4.95
2  (2,4]  6.96
3  (4,8]  9.31
4 (8,25] 11.05
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  5.68  9.47  7.15  16.67
2            2  4.01  4.31  6.38   6.68
3            3  4.43  5.48  6.57   6.54
4            4  4.87  7.12 11.05  14.80
5            5  4.95  6.74  9.97  14.57
6            6  5.10  9.07 10.64  10.67
7            7  6.19  9.88 14.49  19.92
8            8  5.45  6.12  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype    1     2     3     4     5     6     7     8
1            1 3.60  8.80 43.35 52.91    NA    NA    NA    NA
2            2 8.02  3.43  7.61 20.67 11.93 31.94    NA    NA
3            3   NA 10.38  3.62 12.09  8.81    NA 27.87    NA
4            4   NA 16.80  7.36  3.18 13.37    NA 16.49    NA
5            5   NA    NA  5.61 11.28  4.64 10.05 28.03 24.56
6            6   NA    NA 21.57    NA 12.62  3.55  9.03 1

## True Features

In [637]:
# One mixed model per auxiliary marker, fitted with reml=True on all benchmark-cohort
# observations; no time censoring is applied in this section.
mlm = [fit_mlm(m, True) for m in bm_auxm]



In [759]:
P = pd.read_csv('benchmark_pfvc_1y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
yhat_posterior = np.argmax(P, axis=1)

In [760]:
# Start from a copy of the subtype table and append, for every marker, its
# `<prefix>_intercept` and `<prefix>_slope` columns from the uncensored fits.
subtype_coef_features_true1 = subtypes.copy()
for m, n in zip(mlm, aux_names):
    subtype_coef_features_true1 = add_mlm_features(subtype_coef_features_true1, m, n)

In [761]:
# 0-based subtype labels.
y = subtype_coef_features_true1['subtype'].values - 1
# Every column after the first becomes a feature - assumes the first column is
# `subtype`, so the label itself is excluded; TODO confirm the column order.
X = np.asarray(subtype_coef_features_true1.iloc[:, 1:])
# The posterior probabilities are appended as extra features before standardizing.
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [762]:
# One tuned classifier per MAP subtype i: among the patients whose MAP estimate is i,
# predict whether that estimate matches the true subtype (`yi` is a boolean target).
coef_models_true = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    coef_models_true.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6
Starting run 7


In [763]:
for m in coef_models_true:
    print(m.best_score_)

1.0
1.0
0.666666666667
1.0
0.837662337662
0.7
1.0
1.0


In [765]:
# Leave-one-out predictions of "is the MAP estimate correct?" for every patient,
# filled in group by group (one group per MAP subtype).
# NOTE(review): `best_estimator_` was selected by a grid search over these same
# samples, so these predictions are probably optimistic - confirm whether a nested
# search (as in `make_loo_predictions`) was intended.
correct_map = np.zeros_like(y)
for i, m in enumerate(coef_models_true):
    ix = yhat_posterior == i
    Xi = X[ix]
    yi = y[ix] == i
    correct_map[ix] = cross_val_predict(m.best_estimator_, Xi, yi, cv=LeaveOneOut(yi.size))
    
# Keep the MAP subtype where it is predicted correct; otherwise fall back to the
# subtype with the second-largest posterior.
yhat_coef_true1 = np.zeros_like(y)
yhat_coef_true1[correct_map == 1] = yhat_posterior[correct_map == 1]
yhat_coef_true1[correct_map == 0] = np.argsort(P, axis=1)[correct_map == 0, -2]

confusion_matrix(y, yhat_coef_true1)

array([[ 20,   0,   1,   0,   3,   0,   0,   0],
       [  0, 119,  13,   0,   6,   1,   0,   0],
       [  2,  12,  81,   7,   7,   2,   0,   0],
       [  1,   2,  21,  73,   9,   2,   1,   0],
       [  0,   0,   1,   6, 109,  11,   6,   3],
       [  0,   0,   0,   3,  12,  44,   1,   2],
       [  0,   3,   2,   8,   7,  11,  41,   0],
       [  0,   0,   0,   0,   0,   1,   1,  17]])

In [766]:
coef_true1_subtypes = pd.DataFrame({'subtype': yhat_coef_true1 + 1}, subtypes.index)
coef_true1_subtypes.to_csv('benchmark_pfvc_1y_subtypes_coef_true_adjustment.csv')

In [767]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes_coef_true_adjustment.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin  mae
1  (1,2] 4.65
2  (2,4] 5.85
3  (4,8] 8.05
4 (8,25] 9.81
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  6.35  8.62  6.73  13.28
2            2  3.60  3.72  4.90   6.91
3            3  4.61  4.79  6.42   7.74
4            4  4.17  5.46  7.72   9.66
5            5  5.16  6.74  9.51  10.35
6            6  4.89  6.83  8.95  13.08
7            7  5.51  8.00 14.78  17.99
8            8  4.30  5.54  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype     1     2     3     4     5     6     7     8
1            1  3.25    NA 16.09    NA 44.00    NA    NA    NA
2            2    NA  3.31  8.95    NA  9.75 36.14    NA    NA
3            3 19.12 10.11  3.58 11.91  5.80 23.81    NA    NA
4            4 26.12 14.73  9.58  4.01 13.07  4.33 18.26    NA
5            5    NA    NA  6.73 10.81  4.44 11.81 26.76 31.99
6            6    NA    NA    NA  4.58 15.36  3.79  8.46

In [778]:
!Rscript compare_subtype_reprediction.R benchmark_pfvc_1y_subtypes_coef_true_adjustment.csv

    repred
pred  1  2  3  4  5  6  7
   1  0  3  0  0  0  0  0
   2  3  0 19  1  0  0  0
   3  0 10  0  6  1  0  0
   4  0  3  4  0 21  0  0
   5  0  0  0 12  0 11  3
   6  0  0  0  0  1  0  2
   7  0  0  1  3  3  4  0
   8  0  0  0  0  0  1  1


## Baseline Features Only

In [749]:
P = pd.read_csv('benchmark_pfvc_1y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
yhat_posterior = np.argmax(P, axis=1)

In [750]:
bm_feat_names = ['female', 'afram', 'aca', 'scl']
bm_features = pd.read_csv('data/benchmark_pfvc.csv').loc[:, ['ptid'] + bm_feat_names]
bm_features = bm_features.drop_duplicates()

In [751]:
baseline_features = subtypes.copy().reset_index()
baseline_features = pd.merge(baseline_features, bm_features, 'inner', on='ptid')
baseline_features = baseline_features.set_index('ptid')

In [752]:
baseline_features.head()

Unnamed: 0_level_0,subtype,female,afram,aca,scl
ptid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,3,1,0,0,0
3,3,1,0,0,0
5,2,1,0,1,0
6,7,1,0,1,0
11,7,0,0,0,0


In [753]:
y = baseline_features['subtype'].values - 1
X = np.asarray(baseline_features.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [754]:
base_models = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    base_models.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6
Starting run 7


In [755]:
for m in base_models:
    print(m.best_score_)

1.0
1.0
0.596899224806
1.0
0.603896103896
0.7
1.0
1.0


In [756]:
correct_map = np.zeros_like(y)
for i, m in enumerate(base_models):
    ix = yhat_posterior == i
    Xi = X[ix]
    yi = y[ix] == i
    correct_map[ix] = cross_val_predict(m.best_estimator_, Xi, yi, cv=LeaveOneOut(yi.size))
    
yhat_base = np.zeros_like(y)
yhat_base[correct_map == 1] = yhat_posterior[correct_map == 1]
yhat_base[correct_map == 0] = np.argsort(P, axis=1)[correct_map == 0, -2]

confusion_matrix(y, yhat_base)

array([[ 20,   0,   3,   0,   1,   0,   0,   0],
       [  0, 109,  23,   0,   7,   0,   0,   0],
       [  2,   1, 101,   3,   4,   0,   0,   0],
       [  1,   0,  30,  55,  21,   2,   0,   0],
       [  0,   0,   3,   0, 120,   5,   7,   1],
       [  0,   0,   2,   0,  26,  30,   2,   2],
       [  0,   2,   6,   1,  21,   3,  39,   0],
       [  0,   0,   0,   0,   1,   0,   1,  17]])

In [757]:
base_subtypes = pd.DataFrame({'subtype': yhat_base + 1}, subtypes.index)
base_subtypes.to_csv('benchmark_pfvc_1y_subtypes_base_adjustment.csv')

In [758]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes_base_adjustment.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin  mae
1  (1,2] 4.87
2  (2,4] 6.64
3  (4,8] 8.53
4 (8,25] 9.08
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  5.47  8.05  6.55  13.21
2            2  3.70  3.87  5.22   6.14
3            3  3.81  3.80  4.80   5.39
4            4  4.43  6.47  9.40  11.78
5            5  4.87  6.17  8.40   8.58
6            6  6.35 10.20 11.89  12.02
7            7  6.34 10.72 16.91  19.01
8            8  4.47  6.12  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype     1     2     3     4     5     6     7     8
1            1  3.25    NA 28.67    NA 37.54    NA    NA    NA
2            2    NA  3.36  7.87    NA 12.68    NA    NA    NA
3            3 19.12  6.00  3.60 12.95  8.43    NA    NA    NA
4            4 26.12    NA  8.45  3.64 13.37  4.33    NA    NA
5            5    NA    NA  5.97    NA  4.28 15.24 25.58 34.28
6            6    NA    NA 21.57    NA 13.77  4.14  9.21

In [780]:
!Rscript compare_subtype_reprediction.R benchmark_pfvc_1y_subtypes_base_adjustment.csv

    repred
pred  1  2  3  4  5  6  7
   1  0  3  0  0  0  0  0
   2  3  0 19  1  0  0  0
   4  0  3  4  0 21  0  0
   6  0  0  0  0  3  0  3
   7  0  0  1  3  3  4  0
   8  0  0  0  0  0  1  1


## Year 1 Intercept/Slope Features

In [769]:
P = pd.read_csv('benchmark_pfvc_1y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
yhat_posterior = np.argmax(P, axis=1)

In [770]:
def censor(markers, time):
    'Return only the rows of `markers` whose `years_seen` is at most `time`.'
    return markers[markers['years_seen'] <= time]

# One mixed model per marker using only observations with years_seen <= 1.0.
# `reml` is left at its default (False) here, unlike the uncensored `mlm` fits,
# which pass True.
mlm1 = [fit_mlm(censor(m, 1.0)) for m in bm_auxm]

In [771]:
subtype_features1 = subtypes.copy()
for m, n in zip(mlm1, aux_names):
    subtype_features1 = add_mlm_features(subtype_features1, m, n)

y = subtype_features1['subtype'].values - 1
X = np.asarray(subtype_features1.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [772]:
coef_models = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    coef_models.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6
Starting run 7


In [773]:
for m in coef_models:
    print(m.best_score_)

1.0
1.0
0.604651162791
1.0
0.62987012987
0.72
1.0
1.0


In [774]:
# Leave-one-out predictions of "is the MAP estimate correct?", filled in per MAP subtype.
correct_map = np.zeros_like(y)
for i, m in enumerate(coef_models):
    ix = yhat_posterior == i
    Xi = X[ix]
    yi = y[ix] == i
    correct_map[ix] = cross_val_predict(m.best_estimator_, Xi, yi, cv=LeaveOneOut(yi.size))

# Keep the MAP subtype where it is predicted correct; otherwise use the runner-up.
yhat_coef1 = np.zeros_like(y)
yhat_coef1[correct_map == 1] = yhat_posterior[correct_map == 1]
yhat_coef1[correct_map == 0] = np.argsort(P, axis=1)[correct_map == 0, -2]

# NOTE(review): this matrix is transposed, so rows index the predicted subtype and
# columns the true subtype. The "True Features" and "Baseline Features Only"
# sections display the untransposed matrix - keep this in mind when comparing.
confusion_matrix(y, yhat_coef1).T

array([[ 20,   0,   2,   1,   0,   0,   0,   0],
       [  0, 111,   2,   0,   0,   0,   2,   0],
       [  3,  21, 100,  30,   3,   2,   6,   0],
       [  0,   0,   3,  60,   9,   1,   1,   0],
       [  1,   4,   2,  14,  99,  18,  13,   1],
       [  0,   3,   2,   2,  11,  39,  10,   0],
       [  0,   0,   0,   2,  12,   2,  40,   1],
       [  0,   0,   0,   0,   2,   0,   0,  17]])

In [775]:
coef1_subtypes = pd.DataFrame({'subtype': yhat_coef1 + 1}, subtypes.index)
coef1_subtypes.to_csv('benchmark_pfvc_1y_subtypes_coef_adjustment.csv')

In [776]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes_coef_adjustment.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin   mae
1  (1,2]  4.81
2  (2,4]  6.48
3  (4,8]  8.57
4 (8,25] 10.20
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  5.47  8.05  6.55  13.21
2            2  3.90  4.00  5.30   7.10
3            3  4.08  4.23  5.58   6.64
4            4  4.32  6.09  8.93  11.89
5            5  5.44  7.12  9.74  11.50
6            6  5.68  8.98  9.87  10.82
7            7  5.48  9.42 16.64  22.71
8            8  4.47  6.12  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype     1     2     3     4     5     6     7     8
1            1  3.25    NA 28.67    NA 37.54    NA    NA    NA
2            2    NA  3.34  8.00    NA  8.42 35.75    NA    NA
3            3 19.12 12.70  3.58 12.95 11.96 23.81    NA    NA
4            4 26.12    NA  8.45  3.54 14.39  4.33 18.26    NA
5            5    NA    NA  5.97 12.99  4.15 12.05 25.43 32.72
6            6    NA    NA 21.57  6.05 15.16  3.54 

In [781]:
!Rscript compare_subtype_reprediction.R benchmark_pfvc_1y_subtypes_coef_adjustment.csv

    repred
pred  1  2  3  4  5  6  7
   1  0  3  0  0  0  0  0
   2  3  0 19  1  0  0  0
   3  0  2  0  0  0  0  0
   4  0  3  4  0 21  0  0
   5  0  0  0  5  0  6  2
   6  0  0  0  0  3  0  2
   7  0  0  1  3  3  4  0
   8  0  0  0  0  0  1  1


**Conclusions**: It seems that, for each subtype, we can train a classifier that predicts whether the MAP estimate is correct. If we use these classifiers to make correctness predictions and, whenever the MAP estimate is predicted to be incorrect, fall back to the second most likely subtype in the posterior, then the accuracy of classification at year 1 improves (see the evaluation output in the cell above). This does not say anything about whether the changes in subtype prediction accuracy have a strong impact on the accuracy of trajectory predictions.

### Evaluate Intercept/Slope Features

The evaluation below gives a lower bound on the prediction error. If we knew which subtype an individual belongs to after observing all of the data, then we could make very accurate predictions on average (MAE below 5 PFVC).

In [627]:
!Rscript score_predictions.R benchmark_pfvc_subtypes.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin  mae
1  (1,2] 3.14
2  (2,4] 3.48
3  (4,8] 4.07
4 (8,25] 4.41
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  2.82  3.87  2.98   4.77
2            2  2.65  3.06  3.23   3.55
3            3  3.23  3.12  3.49   3.85
4            4  2.96  3.30  4.37   4.78
5            5  3.72  3.94  4.64   4.63
6            6  3.21  3.51  3.91   4.99
7            7  2.90  3.64  5.40   6.66
8            8  3.61  5.13  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype    1    2    3    4    5    6    7    8
1            1 3.69   NA   NA   NA   NA   NA   NA   NA
2            2   NA 3.16   NA   NA   NA   NA   NA   NA
3            3   NA   NA 3.46   NA   NA   NA   NA   NA
4            4   NA   NA   NA 3.85   NA   NA   NA   NA
5            5   NA   NA   NA   NA 4.27   NA   NA   NA
6            6   NA   NA   NA   NA   NA 3.86   NA   NA
7            7   NA   NA   NA   NA   NA   NA 4.36

These are the predictive accuracies that we get using the posterior MAP estimates of subtypes.

In [628]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin   mae
1  (1,2]  4.95
2  (2,4]  6.96
3  (4,8]  9.31
4 (8,25] 11.05
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  5.68  9.47  7.15  16.67
2            2  4.01  4.31  6.38   6.68
3            3  4.43  5.48  6.57   6.54
4            4  4.87  7.12 11.05  14.80
5            5  4.95  6.74  9.97  14.57
6            6  5.10  9.07 10.64  10.67
7            7  6.19  9.88 14.49  19.92
8            8  5.45  6.12  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype    1     2     3     4     5     6     7     8
1            1 3.60  8.80 43.35 52.91    NA    NA    NA    NA
2            2 8.02  3.43  7.61 20.67 11.93 31.94    NA    NA
3            3   NA 10.38  3.62 12.09  8.81    NA 27.87    NA
4            4   NA 16.80  7.36  3.18 13.37    NA 16.49    NA
5            5   NA    NA  5.61 11.28  4.64 10.05 28.03 24.56
6            6   NA    NA 21.57    NA 12.62  3.55  9.03 1

These are the predictive accuracies we get using the adjusted subtype predictions. Overall, we see that there are slight improvements, but nothing dramatic.

In [629]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes_coef_adjustment.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin   mae
1  (1,2]  4.81
2  (2,4]  6.48
3  (4,8]  8.57
4 (8,25] 10.20
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  5.47  8.05  6.55  13.21
2            2  3.90  4.00  5.30   7.10
3            3  4.08  4.23  5.58   6.64
4            4  4.32  6.09  8.93  11.89
5            5  5.44  7.12  9.74  11.50
6            6  5.68  8.98  9.87  10.82
7            7  5.48  9.42 16.64  22.71
8            8  4.47  6.12  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype     1     2     3     4     5     6     7     8
1            1  3.25    NA 28.67    NA 37.54    NA    NA    NA
2            2    NA  3.34  8.00    NA  8.42 35.75    NA    NA
3            3 19.12 12.70  3.58 12.95 11.96 23.81    NA    NA
4            4 26.12    NA  8.45  3.54 14.39  4.33 18.26    NA
5            5    NA    NA  5.97 12.99  4.15 12.05 25.43 32.72
6            6    NA    NA 21.57  6.05 15.16  3.54 

In [709]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes_coef_true_adjustment.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin  mae
1  (1,2] 4.65
2  (2,4] 5.85
3  (4,8] 8.05
4 (8,25] 9.81
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  6.35  8.62  6.73  13.28
2            2  3.60  3.72  4.90   6.91
3            3  4.61  4.79  6.42   7.74
4            4  4.17  5.46  7.72   9.66
5            5  5.16  6.74  9.51  10.35
6            6  4.89  6.83  8.95  13.08
7            7  5.51  8.00 14.78  17.99
8            8  4.30  5.54  3.48   1.53
Source: local data frame [8 x 9]

  true_subtype     1     2     3     4     5     6     7     8
1            1  3.25    NA 16.09    NA 44.00    NA    NA    NA
2            2    NA  3.31  8.95    NA  9.75 36.14    NA    NA
3            3 19.12 10.11  3.58 11.91  5.80 23.81    NA    NA
4            4 26.12 14.73  9.58  4.01 13.07  4.33 18.26    NA
5            5    NA    NA  6.73 10.81  4.44 11.81 26.76 31.99
6            6    NA    NA    NA  4.58 15.36  3.79  8.46

### Value at Year 1 Features

In [539]:
subtype_val_features1 = subtypes.copy()
for m, n in zip(mlm1, aux_names):
    subtype_val_features1 = add_val_features(subtype_val_features1, m, n, 1.0)

In [540]:
y = subtype_val_features1['subtype'].values - 1
X = np.asarray(subtype_val_features1.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [541]:
val_models = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    val_models.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [542]:
for m in val_models:
    print(m.best_score_)

1.0
0.746031746032
1.0
1.0
0.597122302158
1.0
1.0


In [543]:
correct_map = np.zeros_like(y)
for i, m in enumerate(val_models):
    ix = yhat_posterior == i
    Xi = X[ix]
    yi = y[ix] == i
    correct_map[ix] = cross_val_predict(m.best_estimator_, Xi, yi, cv=LeaveOneOut(yi.size))

In [544]:
yhat_val1 = np.zeros_like(y)
yhat_val1[correct_map == 1] = yhat_posterior[correct_map == 1]
yhat_val1[correct_map == 0] = np.argsort(P, axis=1)[correct_map == 0, -2]

In [545]:
confusion_matrix(y, yhat_val1).T

array([[ 18,   1,   1,   0,   0,   0,   0],
       [  4, 111,  17,  16,   1,   2,   0],
       [  1,   7, 121,   1,   0,   5,   0],
       [  0,   6,   2,  88,   4,   2,   0],
       [  1,   3,   7,  24, 112,  22,   9],
       [  0,   0,   0,   2,   8,  42,   1],
       [  0,   0,   0,   0,   2,   0,  31]])

In [546]:
val1_subtypes = pd.DataFrame({'subtype': yhat_val1 + 1}, subtypes.index)
val1_subtypes.to_csv('benchmark_pfvc_1y_subtypes_val_adjustment.csv')

In [633]:
!Rscript score_predictions.R benchmark_pfvc_1y_subtypes_val_adjustment.csv

Loading required package: methods
Source: local data frame [4 x 2]

     bin   mae
1  (1,2]  5.09
2  (2,4]  6.87
3  (4,8]  9.08
4 (8,25] 10.15
Source: local data frame [8 x 5]

  true_subtype (1,2] (2,4] (4,8] (8,25]
1            1  5.27  8.84  6.84  12.34
2            2  3.57  3.81  5.33   6.33
3            3  3.86  4.38  5.61   6.41
4            4  4.46  5.65  8.48  11.28
5            5  4.62  5.29  7.38   8.19
6            6  6.97 12.25 16.43  17.24
7            7  7.57 12.08 19.10  24.02
8            8  5.00  7.38  6.79   7.29
Source: local data frame [8 x 8]

  true_subtype     1     2     3     4     5     6     7
1            1  3.35 17.33 16.09    NA 37.54    NA    NA
2            2 16.77  3.19  6.38 20.13 20.39    NA    NA
3            3 16.94 10.83  3.51 15.40  9.53    NA    NA
4            4    NA 18.31  6.52  3.73 14.54  3.34    NA
5            5    NA    NA  3.40  8.18  4.68 15.57 38.06
6            6    NA 34.52    NA  4.18 12.77  1.84 13.26
7            7    NA 27.02 26.

## Year 2 Features

In [462]:
mlm2 = [fit_mlm(censor(m, 2.0)) for m in bm_auxm]



In [465]:
P = pd.read_csv('benchmark_pfvc_2y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
yhat_posterior = np.argmax(P, axis=1)

In [466]:
y = subtypes['subtype'].values - 1
confusion_matrix(y, yhat_posterior).T

array([[ 19,   2,   0,   0,   0,   0,   0],
       [  3, 101,  25,   5,   0,   0,   0],
       [  1,  23, 111,  19,   0,   2,   0],
       [  1,   1,  11,  76,  20,   8,   0],
       [  0,   1,   1,  21, 102,   7,   4],
       [  0,   0,   0,  10,   2,  52,   1],
       [  0,   0,   0,   0,   3,   4,  36]])

In [467]:
for i in range(P.shape[1]):
    yi = y[yhat_posterior == i] == i
    print(naive_model_score(yi))

0.904761904762
0.753731343284
0.711538461538
0.649572649573
0.75
0.8
0.837209302326


### Intercept/Slope Features

In [468]:
subtype_features2 = subtypes.copy()
for m, n in zip(mlm2, aux_names):
    subtype_features2 = add_mlm_features(subtype_features2, m, n)

In [469]:
# Take the labels from the same year-2 feature table that supplies the features.
# `subtype_features` (no suffix) is not defined by any cell in this notebook, so
# referencing it fails on a fresh kernel.
y = subtype_features2['subtype'].values - 1
# Every column after the first becomes a feature - assumes the first column is
# `subtype`; TODO confirm the column order.
X = np.asarray(subtype_features2.iloc[:, 1:])
# The year-2 posterior probabilities are appended as extra features before standardizing.
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [470]:
coef_models2 = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    coef_models2.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [471]:
for m in coef_models2:
    print(m.best_score_)

0.904761904762
1.0
0.711538461538
0.709401709402
1.0
0.861538461538
0.976744186047


### Value at Year 2 Features

In [472]:
subtype_val_features2 = subtypes.copy()
for m, n in zip(mlm2, aux_names):
    subtype_val_features2 = add_val_features(subtype_val_features2, m, n, 2.0)

In [473]:
y = subtype_val_features2['subtype'].values - 1
X = np.asarray(subtype_val_features2.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [475]:
val_models2 = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    val_models2.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [476]:
for m in val_models2:
    print(m.best_score_)

0.904761904762
1.0
0.724358974359
0.692307692308
1.0
0.815384615385
0.93023255814
