In [187]:
import pandas as pd
import numpy as np

from zipfile import ZipFile

In [3]:
ssc_zip = ZipFile('data/scleroderma.zip')

In [35]:
def LongitudinalSpec(index, date, outcome):
    """Bundle the column names that describe a longitudinal dataset.

    Returns a plain dict with the keys 'index', 'date' and 'outcome', each
    mapped to the corresponding argument.
    """
    return dict(index=index, date=date, outcome=outcome)

def AlignmentSpec(index, time, date, baseline):
    """Bundle the column names needed to align longitudinal data in time.

    Returns a plain dict with the keys 'index', 'time', 'date' and
    'baseline', each mapped to the corresponding argument.
    """
    return dict(index=index, time=time, date=date, baseline=baseline)

In [89]:
def read_dates(file='tPtData.csv', zipfile=ssc_zip):
    """Load the per-patient reference dates from a CSV member of `zipfile`.

    Keeps the 'PtID', 'DateFirstSeen' and 'Date1stSymptom' columns, parses
    the two date columns with `pd.to_datetime`, and returns them renamed to
    'ptid', 'first_seen' and 'first_symptom'.
    """
    source_cols = ['PtID', 'DateFirstSeen', 'Date1stSymptom']
    with zipfile.open(file) as stream:
        dates = pd.read_csv(stream).loc[:, source_cols]
    # Every selected column except the patient id holds a date.
    for date_col in source_cols[1:]:
        dates[date_col] = pd.to_datetime(dates[date_col])
    dates.columns = ['ptid', 'first_seen', 'first_symptom']
    return dates

def read_longitudinal(stream, names, renames):
    """Read one longitudinal outcome from a CSV stream.

    `names` and `renames` are dicts with the keys 'index', 'date' and
    'outcome' (see LongitudinalSpec): `names` gives the column names in the
    CSV, `renames` the names used in the result. The date column is parsed
    with `pd.to_datetime`, and rows whose outcome is missing are dropped.
    """
    fields = ('index', 'date', 'outcome')
    source_cols = [names[field] for field in fields]
    target_cols = [renames[field] for field in fields]

    tbl = pd.read_csv(stream).loc[:, source_cols]
    tbl.columns = target_cols

    date_col = renames['date']
    tbl[date_col] = pd.to_datetime(tbl[date_col])

    observed = tbl[renames['outcome']].notnull()
    return tbl[observed]

def align_longitudinal(dataset, dates, alignment):
    """Re-express observation dates as years elapsed since a baseline date.

    `dataset` is left-merged with `dates` on `alignment['index']`. The new
    column `alignment['time']` is the whole-day difference between the
    `alignment['date']` and `alignment['baseline']` columns divided by 365.0.
    Returns the index column, the time column, and the third column of
    `dataset` (taken to be the outcome).
    """
    key = alignment['index']
    time_col = alignment['time']
    outcome_col = dataset.columns[2]

    merged = pd.merge(dataset, dates, how='left', on=key)
    elapsed = merged[alignment['date']] - merged[alignment['baseline']]
    merged[time_col] = elapsed.dt.days / 365.0
    return merged[[key, time_col, outcome_col]]

def variables(file, zipfile=ssc_zip):
    """Return the column names of a CSV member of `zipfile` as a list.

    Only the header and the first data row are read.
    """
    with zipfile.open(file) as stream:
        header = pd.read_csv(stream, nrows=1)
    return header.columns.tolist()

def is_variable(name, file, zipfile=ssc_zip):
    """Return whether `name` is a column of the CSV member `file` in `zipfile`.

    Only the header and the first data row are read.
    """
    with zipfile.open(file) as stream:
        header = pd.read_csv(stream, nrows=1)
    return name in header.columns

In [94]:
def read_from_table(index, date, file, zipfile=ssc_zip, **kwargs):
    """Read a single outcome column from a CSV member of `zipfile`.

    The outcome is given as one keyword argument, `new_name='Column.In.File'`.
    Keyword arguments whose value is not a column of `file` are ignored.
    The result has the columns 'ptid', 'date' and `new_name`.

    Raises:
        RuntimeError: unless exactly one keyword argument names a column
            that exists in `file`.
    """
    matches = [
        (new_name, column)
        for new_name, column in kwargs.items()
        if is_variable(column, file, zipfile)
    ]
    if len(matches) != 1:
        raise RuntimeError('Must specify exactly one outcome column.')
    rename, outcome = matches[0]

    source_spec = LongitudinalSpec(index, date, outcome)
    target_spec = LongitudinalSpec('ptid', 'date', rename)

    with zipfile.open(file) as stream:
        return read_longitudinal(stream, source_spec, target_spec)

def read_from_visits(zipfile=ssc_zip, **kwargs):
    """Read one outcome from 'tVisit.csv', keyed by 'PtID' and dated by 'Visit.Date'."""
    return read_from_table(
        index='PtID', date='Visit.Date', file='tVisit.csv', zipfile=zipfile, **kwargs
    )

def read_from_echos(zipfile=ssc_zip, **kwargs):
    """Read one outcome from 'tECHO.csv', keyed by 'PtID' and dated by 'Date.of.ECHO'."""
    return read_from_table(
        index='PtID', date='Date.of.ECHO', file='tECHO.csv', zipfile=zipfile, **kwargs
    )

def read_from_pfts(zipfile=ssc_zip, **kwargs):
    """Read one outcome from 'tPFT.csv', keyed by 'PtID' and dated by 'Date'."""
    return read_from_table(
        index='PtID', date='Date', file='tPFT.csv', zipfile=zipfile, **kwargs
    )

In [79]:
# Per-patient reference dates (columns: ptid, first_seen, first_symptom).
dates = read_dates()
# Align every marker on 'ptid', measuring time as 'years_seen': years between
# each observation's 'date' and the patient's 'first_seen' date.
alignment = AlignmentSpec('ptid', 'years_seen', 'date', 'first_seen')

In [99]:
# Markers recorded in the visit table (tVisit.csv). Each keyword renames the
# source column on the right to the short outcome name on the left, and the
# result is aligned to years since first seen.
rp_ss = align_longitudinal(read_from_visits(rp='RP.Sev.Score'), dates, alignment)
heart_ss = align_longitudinal(read_from_visits(heart='Heart.Sev.Score'), dates, alignment)
gen_ss = align_longitudinal(read_from_visits(general='lkpGeneralScore'), dates, alignment)
kidney_ss = align_longitudinal(read_from_visits(kidney='lkpLabUrineProtein'), dates, alignment)
muscle_ss = align_longitudinal(read_from_visits(muscle='Muscle.Sev.Score'), dates, alignment)
gi_ss = align_longitudinal(read_from_visits(gi='GI.Sev.Score'), dates, alignment)

In [101]:
# Markers recorded in the echo table (tECHO.csv), aligned to years since first seen.
rvsp = align_longitudinal(read_from_echos(rvsp='RVSP'), dates, alignment)
ejection_frac = align_longitudinal(read_from_echos(ef='Ejection.Fraction'), dates, alignment)

In [105]:
# Markers recorded in the PFT table (tPFT.csv), aligned to years since first seen.
pfev1 = align_longitudinal(read_from_pfts(pfev1='perc.FEV1.of.predicted'), dates, alignment)
pdlco = align_longitudinal(read_from_pfts(pdlco='perc.DLCO.of.predicted'), dates, alignment)
pfev1fvc = align_longitudinal(read_from_pfts(pfev1fvc='perc.FEV1FVC.of.predicted'), dates, alignment)

In [249]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [250]:
# Benchmark table: keep the patient id, time and pfvc columns, and rename
# 'years_seen_full' to 'years_seen' so it matches the aligned marker tables.
bm_pfvc = pd.read_csv('data/benchmark_pfvc.csv').loc[:, ['ptid', 'years_seen_full', 'pfvc']]
bm_pfvc.columns = ['ptid', 'years_seen', 'pfvc']

In [251]:
# Patient ids present in the benchmark table.
bm_ptid = set(bm_pfvc['ptid'].values)
def select_bm(tbl, ptids=bm_ptid):
    """Restrict a marker table to benchmark patients and non-negative times.

    Keeps the rows of `tbl` whose 'ptid' is in `ptids` and whose
    'years_seen' is >= 0.0, and returns them as an independent copy.
    """
    # Vectorized membership test instead of a Python-level loop over rows.
    in_benchmark = tbl['ptid'].isin(ptids)
    on_or_after_baseline = tbl['years_seen'] >= 0.0
    return tbl[in_benchmark & on_or_after_baseline].copy()

In [436]:
# Short feature prefixes, paired position-by-position with the marker tables below.
# NOTE(review): pfev1fvc is loaded earlier in the notebook but is not listed
# here — confirm the omission is intentional.
aux_names   = ['rp',  'hrt',    'gen',  'kid',     'msc',     'gi',  'sp', 'ef',         'pv1', 'pdc' ]
aux_markers = [rp_ss, heart_ss, gen_ss, kidney_ss, muscle_ss, gi_ss, rvsp, ejection_frac, pfev1, pdlco]

In [433]:
bm_auxm = [select_bm(m) for m in aux_markers]

In [448]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score, cross_val_predict, LeaveOneOut
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

def fit_mlm(data):
    """Fit a linear mixed model of one marker against time.

    The third column of `data` is used as the outcome and regressed on
    'years_seen', with `re_formula='years_seen'` and observations grouped by
    'ptid'. The model is fit with `reml=False` and the fitted result is
    returned.
    """
    response = str(data.columns[2])
    spec = smf.mixedlm(
        '{} ~ years_seen'.format(response),
        data,
        re_formula='years_seen',
        groups=data['ptid'],
    )
    return spec.fit(reml=False)

def add_mlm_features(subtypes, mlm, prefix):
    """Append per-patient intercept and slope columns taken from a fitted model.

    Args:
        subtypes: DataFrame indexed by patient id.
        mlm: fitted model exposing `fe_params` (two values: intercept, slope)
            and `random_effects` (a two-column DataFrame indexed like
            `subtypes`).
        prefix: marker name used to build the new column names.

    Returns:
        A new DataFrame equal to `subtypes` plus '<prefix>_intercept' and
        '<prefix>_slope'. Patients with no row in `mlm.random_effects` get
        the corresponding `fe_params` value. Neither `subtypes` nor `mlm`
        is modified.

    NOTE(review): if `random_effects` holds per-group deviations from the
    fixed effects, filling the gaps with the fixed-effect values mixes two
    scales — confirm this is intended.
    """
    fixed_intercept, fixed_slope = mlm.fe_params
    intercept_col = '{}_intercept'.format(prefix)
    slope_col = '{}_slope'.format(prefix)

    # Rename a copy: assigning to `.columns` on `mlm.random_effects` itself
    # would rename the table held by the fitted model.
    ranef = mlm.random_effects.copy()
    ranef.columns = [intercept_col, slope_col]

    subtypes = pd.merge(subtypes, ranef, how='left', left_index=True, right_index=True)
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection, which may act on a temporary and leave the NaNs.
    subtypes[intercept_col] = subtypes[intercept_col].fillna(fixed_intercept)
    subtypes[slope_col] = subtypes[slope_col].fillna(fixed_slope)
    return subtypes

def add_val_features(subtypes, mlm, prefix, time):
    """Add the model-implied marker value at `time` as a '<prefix>_val' column.

    Args:
        subtypes: DataFrame indexed by patient id. The new column is added
            to this object, which is also returned.
        mlm: fitted model exposing `fe_params` (two values: intercept, slope)
            and `random_effects` (a two-column DataFrame indexed like
            `subtypes`).
        prefix: marker name used to build the new column name.
        time: time at which to evaluate `intercept + slope * time`.

    Returns:
        `subtypes` with '<prefix>_val' added. Patients with no row in
        `mlm.random_effects` use the `fe_params` intercept and slope.

    NOTE(review): if `random_effects` holds per-group deviations from the
    fixed effects, filling the gaps with the fixed-effect values mixes two
    scales — confirm this is intended.
    """
    fixed_intercept, fixed_slope = mlm.fe_params
    intercept_col = '{}_intercept'.format(prefix)
    slope_col = '{}_slope'.format(prefix)

    # Rename a copy: assigning to `.columns` on `mlm.random_effects` itself
    # would rename the table held by the fitted model.
    ranef = mlm.random_effects.copy()
    ranef.columns = [intercept_col, slope_col]

    aligned = pd.merge(subtypes, ranef, how='left', left_index=True, right_index=True)
    intercepts = aligned[intercept_col].fillna(fixed_intercept)
    slopes = aligned[slope_col].fillna(fixed_slope)
    subtypes['{}_val'.format(prefix)] = intercepts + slopes * time
    return subtypes

def naive_model_score(y):
    """Accuracy obtained by always predicting the most frequent label in `y`.

    `y` must be accepted by `np.bincount` (non-negative integers or booleans).
    """
    majority_label = np.bincount(y).argmax()
    return np.mean(majority_label == y)

def ability_to_predict(X, y):
    """Grid-search an SVC on (X, y) with leave-one-out accuracy.

    Searches 10 log-spaced values each for 'C' (1e-4 to 1e2) and 'gamma'
    (1e-2 to 1e1) and returns the fitted GridSearchCV object.

    NOTE(review): `LeaveOneOut(n)` and `class_weight='auto'` follow the API
    of the `sklearn.cross_validation` / `sklearn.grid_search` modules this
    notebook imports; later scikit-learn releases changed both — confirm
    before upgrading.
    """
    grid = {
        'C': list(np.logspace(-4, 2, 10)),
        'gamma': list(np.logspace(-2, 1, 10)),
    }
    search = GridSearchCV(
        SVC(class_weight='auto'),
        grid,
        scoring='accuracy',
        cv=LeaveOneOut(y.size),
    )
    return search.fit(X, y)

In [431]:
subtypes = pd.read_csv('benchmark_pfvc_subtypes.csv').set_index('ptid')

## True Features

In [None]:
mlm = [fit_mlm(m) for m in bm_auxm]

In [None]:
subtype_val_features_true_1 = subtypes.copy()
for m, n in zip(mlm, aux_names):
    subtype_val_features_true_1 = add_val_features(subtype_val_features_true_1, m, n, 1.0)

In [None]:
subtype_val_features_true_2 = subtypes.copy()
for m, n in zip(mlm, aux_names):
    subtype_val_features_true_2 = add_val_features(subtype_val_features_true_2, m, n, 2.0)

## Year 1 Features

In [434]:
def censor(markers, time):
    """Keep only the rows of `markers` observed at or before `time`.

    Rows are selected where the 'years_seen' column is <= `time`.
    """
    observed_by_then = markers['years_seen'].le(time)
    return markers[observed_by_then]

In [435]:
mlm1 = [fit_mlm(censor(m, 1.0)) for m in bm_auxm]

In [298]:
# Year-1 posteriors: drop the first column of the file and keep the rest as
# a plain array (one column per subtype, as the loops over P.shape[1] assume).
P = pd.read_csv('benchmark_pfvc_1y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
# Most probable subtype per row (0-based). Fixed typo: np.argamx -> np.argmax.
yhat_posterior = np.argmax(P, axis=1)

In [458]:
y = subtypes['subtype'].values - 1
confusion_matrix(y, yhat_posterior).T

array([[ 17,   2,   0,   0,   0,   0,   0],
       [  4,  93,  22,   6,   0,   1,   0],
       [  2,  28, 105,  27,   1,   3,   0],
       [  1,   2,  15,  68,  30,  10,   2],
       [  0,   3,   6,  22,  82,  20,   6],
       [  0,   0,   0,   8,   7,  35,   2],
       [  0,   0,   0,   0,   7,   4,  31]])

In [459]:
for i in range(P.shape[1]):
    yi = y[yhat_posterior == i] == i
    print(naive_model_score(yi))

0.894736842105
0.738095238095
0.632530120482
0.53125
0.589928057554
0.673076923077
0.738095238095


### Intercept/Slope Features

In [437]:
subtype_features1 = subtypes.copy()
for m, n in zip(mlm1, aux_names):
    subtype_features1 = add_mlm_features(subtype_features1, m, n)

In [438]:
y = subtype_features1['subtype'].values - 1
X = np.asarray(subtype_features1.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [447]:
coef_models = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    coef_models.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [464]:
for m in coef_models:
    print(m.best_score_)

1.0
0.738095238095
1.0
1.0
0.647482014388
1.0
1.0


### Value at Year 1 Features

In [443]:
subtype_val_features1 = subtypes.copy()
for m, n in zip(mlm1, aux_names):
    subtype_val_features1 = add_val_features(subtype_val_features1, m, n, 1.0)

In [394]:
# Build the design matrix for the value-at-year-1 features: every column after
# 'subtype', followed by the posterior columns in P, then standardized.
# NOTE(review): the execution counts suggest hidden state. This cell last ran
# as In [394], but subtype_val_features1 was rebuilt later (In [443]) and X was
# overwritten by the intercept/slope cell (In [438]) before the fitting loop
# ran (In [444]). That loop therefore appears to have used the intercept/slope
# X, which would explain why its scores match that section exactly. Re-run
# this cell immediately before fitting val_models — TODO confirm.
y = subtype_val_features1['subtype'].values - 1
X = np.asarray(subtype_val_features1.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [444]:
val_models = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    val_models.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [474]:
for m in val_models:
    print(m.best_score_)

1.0
0.738095238095
1.0
1.0
0.647482014388
1.0
1.0


## Year 2 Features

In [462]:
mlm2 = [fit_mlm(censor(m, 2.0)) for m in bm_auxm]



In [465]:
P = pd.read_csv('benchmark_pfvc_2y_posteriors.csv')
P = np.asarray(P.iloc[:, 1:])
yhat_posterior = np.argmax(P, axis=1)

In [466]:
y = subtypes['subtype'].values - 1
confusion_matrix(y, yhat_posterior).T

array([[ 19,   2,   0,   0,   0,   0,   0],
       [  3, 101,  25,   5,   0,   0,   0],
       [  1,  23, 111,  19,   0,   2,   0],
       [  1,   1,  11,  76,  20,   8,   0],
       [  0,   1,   1,  21, 102,   7,   4],
       [  0,   0,   0,  10,   2,  52,   1],
       [  0,   0,   0,   0,   3,   4,  36]])

In [467]:
for i in range(P.shape[1]):
    yi = y[yhat_posterior == i] == i
    print(naive_model_score(yi))

0.904761904762
0.753731343284
0.711538461538
0.649572649573
0.75
0.8
0.837209302326


### Intercept/Slope Features

In [468]:
subtype_features2 = subtypes.copy()
for m, n in zip(mlm2, aux_names):
    subtype_features2 = add_mlm_features(subtype_features2, m, n)

In [469]:
# Year-2 intercept/slope design matrix. Labels are shifted to be 0-based.
# Fixed: the labels were read from `subtype_features`, a name never defined in
# this notebook (it only worked through leftover kernel state); they now come
# from the same frame as the features.
y = subtype_features2['subtype'].values - 1
# Every column after 'subtype', followed by the posterior columns in P.
X = np.asarray(subtype_features2.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [470]:
coef_models2 = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    coef_models2.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [471]:
for m in coef_models2:
    print(m.best_score_)

0.904761904762
1.0
0.711538461538
0.709401709402
1.0
0.861538461538
0.976744186047


### Value at Year 2 Features

In [472]:
subtype_val_features2 = subtypes.copy()
for m, n in zip(mlm2, aux_names):
    subtype_val_features2 = add_val_features(subtype_val_features2, m, n, 2.0)

In [473]:
y = subtype_val_features2['subtype'].values - 1
X = np.asarray(subtype_val_features2.iloc[:, 1:])
X = np.concatenate((X, P), axis=1)
X = StandardScaler().fit_transform(X)

In [475]:
val_models2 = []
for i in range(P.shape[1]):
    print('Starting run {}'.format(i))
    Xi = X[yhat_posterior == i]
    yi = y[yhat_posterior == i] == i
    fi = ability_to_predict(Xi, yi)
    val_models2.append(fi)

Starting run 0
Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6


In [476]:
for m in val_models2:
    print(m.best_score_)

0.904761904762
1.0
0.724358974359
0.692307692308
1.0
0.815384615385
0.93023255814
