# Clot Burden Prediction

We perform a univariable regression for each combination of target variable and explanatory variable. We then perform multivariable regressions. For these, we consider body composition features only, cardiopulmonary features only, and then a composite model containing both. For each of these, we perform three forms of feature selection: (1) recursive feature elimination with cross-validation, (2) forward sequential feature selection with cross-validation, and (3) backward feature selection with cross-validation. For each group of selected features, we also perform sensitivity analyses controlling for gender, for age, and for both gender and age.

# Imports

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm.notebook import trange, tqdm

# Custom / Local
from config import model_config
from regression import reg

# Stats
from scipy.stats import shapiro
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector,
)
from sklearn.metrics import (
    make_scorer, r2_score
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression, LassoCV
)
from sklearn.model_selection import (
	train_test_split, cross_val_score,
    RepeatedKFold
)
from sklearn.pipeline import (
	Pipeline
)
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Global vars
# Seed handed to the cross-validation splitter below so that fold assignments are reproducible.
SEED = 123
# NOTE(review): not referenced in the cells visible here — presumably a hold-out fraction; confirm or remove.
TEST_SIZE = 0.25
# Diverging colormap object (as_cmap=True); not referenced in the cells visible here.
HEATMAP_COLORS = sns.diverging_palette(h_neg=250, h_pos=359, as_cmap=True)
# p-value threshold used to call a coefficient statistically significant.
SIGNIFICANCE_CUTOFF = 0.05
CV_FOLDS = 5
# CV_FOLDS-fold split repeated 10 times, i.e. 50 train/validation splits per cross-validated score.
CUSTOM_CV = RepeatedKFold(n_splits=CV_FOLDS, n_repeats=10, random_state=SEED)


# Cov type
# Options:
#  - robust
#  - clustered
# 'robust' makes the statsmodels fits use HC3 standard errors; 'clustered' makes them use
# cluster-robust standard errors grouped by `pe_numbers`. The same value also selects the
# name of the CSV written at the end of the notebook.
COV_TYPE = 'robust'


# Import/Preprocess Data

In [6]:
# Load the pre-built modelling bundle (design matrix, targets and feature-name lists).
# NOTE(review): pickle.load runs arbitrary code from the file — presumably this pickle is
# produced by this project; confirm it never comes from an untrusted source.
with Path('../data/prediction_data.pkl').open('rb') as f:
    data = pickle.load(f)

# Pull each component out of the bundle; .get() yields None for a missing key.
X, y = data.get('X'), data.get('y')
body_features = data.get('body_features')
cardio_features = data.get('cardio_features')
control_features = data.get('controls')
all_features = body_features + cardio_features + control_features

# One item per line: the two shapes followed by the three feature-name lists.
print(X.shape, y.shape, body_features, cardio_features, control_features, sep='\n')

(91, 37)
(91, 7)
['volume_visceral_fat', 'density_visceral_fat', 'mass_visceral_fat', 'volume_subcutaneous_fat', 'density_subcutaneous_fat', 'mass_subcutaneous_fat', 'volume_intermuscular_fat', 'density_intermuscular_fat', 'mass_intermuscular_fat', 'volume_muscle', 'density_muscle', 'mass_muscle', 'volume_bone', 'density_bone', 'mass_bone', 'bmi', 'bsa']
['emphysema_volume_950hu', 'lung_volume', 'extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'intrapulmonary_artery_volume', 'intrapulmonary_vein_volume', 'artery_vein_ratio', 'bv5', 'bv10', 'pb_larger_10', 'pv_diameter', 'a_diameter', 'pv_a', 'heart_volume', 'airway_volume', 'airway_ratio', 'ild_volume', 'ild_ratio']
['age', 'gender_cl_Male']


In [7]:
# Strip the trailing two characters from every row label of y; the result is later used
# as the grouping variable for cluster-robust standard errors.
pe_numbers = y.index.str.slice(stop=-2)
pe_numbers

Index(['PE1', 'PE12', 'PE12', 'PE14', 'PE15', 'PE15', 'PE15', 'PE15', 'PE16',
       'PE16', 'PE17', 'PE18', 'PE18', 'PE18', 'PE18', 'PE19', 'PE2', 'PE20',
       'PE21', 'PE22', 'PE22', 'PE22', 'PE23', 'PE23', 'PE24', 'PE24', 'PE25',
       'PE25', 'PE27', 'PE27', 'PE28', 'PE3', 'PE3', 'PE3', 'PE31', 'PE32',
       'PE32', 'PE32', 'PE32', 'PE33', 'PE34', 'PE34', 'PE36', 'PE36', 'PE37',
       'PE37', 'PE37', 'PE4', 'PE40', 'PE40', 'PE41', 'PE41', 'PE41', 'PE41',
       'PE42', 'PE42', 'PE43', 'PE44', 'PE45', 'PE47', 'PE48', 'PE48', 'PE49',
       'PE5', 'PE51', 'PE51', 'PE51', 'PE52', 'PE52', 'PE52', 'PE52', 'PE52',
       'PE52', 'PE53', 'PE54', 'PE54', 'PE54', 'PE56', 'PE56', 'PE6', 'PE6',
       'PE6', 'PE6', 'PE6', 'PE6', 'PE6', 'PE7', 'PE8', 'PE8', 'PE8', 'PE9'],
      dtype='object')

# OLS Regression Functions

In [8]:
def get_params(model, X, y):
    """Fit ``model`` on ``(X, y)`` and return its parameters as a value-sorted Series.

    The Series is indexed by ``model.feature_names_in_`` plus a ``'const'`` entry
    holding ``model.intercept_`` (overwriting any existing ``'const'`` coefficient),
    which makes it directly comparable with statsmodels ``params``.

    Note: ``model`` is fitted in place as a side effect.
    """
    model.fit(X, y)
    params = pd.Series(data=model.coef_, index=model.feature_names_in_)
    # The intercept takes the 'const' slot so the labels line up with statsmodels.
    params.loc['const'] = model.intercept_
    return params.sort_values(ascending=True)

In [9]:
def model_residual_correlation(model):
    """Return the Pearson correlation between residuals and their row position.

    ``model`` must expose a ``resid`` sequence; position is simply 0..n-1.
    """
    residuals = model.resid
    positions = np.arange(len(residuals))
    correlation_matrix = np.corrcoef(positions, residuals)
    return correlation_matrix[1, 0]

In [10]:
def fit_model(X, y):
    """Fit a statsmodels OLS (for inference) and a matching sklearn OLS (for CV).

    Params:
        - X: design matrix; callers pass it with the constant column already added.
        - y: target Series (``y.name`` is used in the mismatch message).

    Returns:
        ``(model_sm, model_sk)`` where ``model_sm`` is the fitted statsmodels result
        using the covariance estimator selected by ``COV_TYPE`` and ``model_sk`` is a
        ``LinearRegression`` that has already been fitted on ``(X, y)`` (by
        ``get_params``), so callers may call ``model_sk.score`` directly.

    Raises:
        ValueError: if ``COV_TYPE`` is neither ``'robust'`` nor ``'clustered'``.
    """
    # Fit statsmodels model for pvalues and coef
    if COV_TYPE == 'robust':
        model_sm = sm.OLS(y, X).fit(cov_type='HC3')
    elif COV_TYPE == 'clustered':
        # Cluster groups come from the module-level `pe_numbers`, which must align with the rows of X.
        model_sm = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': pe_numbers})
    else:
        # Fail loudly instead of hitting an UnboundLocalError on `model_sm` further down.
        raise ValueError(
            f"Unsupported COV_TYPE {COV_TYPE!r}; expected 'robust' or 'clustered'."
        )
    # Define sklearn model for CV evaluation
    model_sk = LinearRegression(fit_intercept=True, n_jobs=-1)
    # Check that model params match (both sides are sorted by value, then compared positionally)
    sk_model_params = get_params(model_sk, X, y)
    sm_model_params = model_sm.params.sort_values()
    params_match = np.isclose(sk_model_params, sm_model_params, atol=1e-5)
    if not np.all(params_match):
        print(f"Regressions on {y.name} did not match for sklearn and statsmodels. CV scores may differ.")
    return model_sm, model_sk

In [11]:
def store_model_results(model_sm, model_sk, X, y):
    """
    Summarise one fitted model as a single-row DataFrame indexed by (y, model_dfn).

    Params:
        - model_sm: statsmodel model for nobs, residuals and the F-test p-value.
        - model_sk: fitted sklearn model, scored on the training data and cross-validated.
        - X: X data.
        - y: y data.

    Cross-validated R^2 scores below zero are floored at zero before the mean and
    standard deviation are taken.
    """
    cv_scores = cross_val_score(model_sk, X, y, scoring='r2', cv=CUSTOM_CV, n_jobs=-1)
    floored_scores = np.maximum(cv_scores, 0.0)

    summary = {
        'y': y.name,
        # Wrapped in a list so the tuple of column names is one cell, not one row per name.
        'model_dfn': [tuple(X.columns.values)],
        'nobs': model_sm.nobs,
        'shapiro_resid_pvalue': shapiro(model_sm.resid).pvalue,
        'metric_train': model_sk.score(X, y),
        'metric_cv_mean': floored_scores.mean(),
        'metric_cv_std': floored_scores.std(),
        'fpvalue': model_sm.f_pvalue,
    }
    return pd.DataFrame(summary).set_index(['y', 'model_dfn'])

In [12]:
def store_coef_results(model_sm, y):
    """
    Reshape a model's coefficients into a single wide row indexed by (y, model_dfn).

    Params:
        - model_sm: statsmodel model for coefs and pvalues.
        - y: y data (only ``y.name`` is used).

    Columns are named ``coef_<term>``, ``pval_<term>`` and ``signif_<term>``.
    """
    params = model_sm.params
    model_terms = tuple(params.index)

    long_form = (
        pd.DataFrame(
            {
                # Same model definition repeated on every term row so it can act as a pivot key.
                'model_dfn': [model_terms] * len(params),
                'coef': params,
                'pval': model_sm.pvalues,
            }
        )
        .assign(signif=lambda frame: frame['pval'].apply(reg.add_significance))
        .reset_index(names='x')
        .assign(y=y.name)
    )

    wide_form = long_form.pivot(
        index=['y', 'model_dfn'],
        columns=['x'],
        values=['coef', 'pval', 'signif'],
    )
    # Flatten the (statistic, term) column MultiIndex into 'statistic_term'.
    wide_form.columns = ['_'.join(level_pair) for level_pair in wide_form.columns]
    return wide_form

In [13]:
def combine_model_results(model_sm, model_sk, X, y):
    """Return model-level metrics and per-term coefficient columns side by side.

    Both pieces are produced by ``store_model_results`` and ``store_coef_results``
    and must have the same number of rows.
    """
    pieces = [
        store_model_results(model_sm, model_sk, X, y),
        store_coef_results(model_sm, y),
    ]
    assert len(pieces[0]) == len(pieces[1])
    return pd.concat(pieces, axis=1)

In [14]:
def _fit_ols_by_cov_type(y, X):
    """Fit a statsmodels OLS of ``y`` on ``X`` with the covariance estimator chosen by ``COV_TYPE``."""
    if COV_TYPE == 'robust':
        return sm.OLS(y, X).fit(cov_type='HC3')
    if COV_TYPE == 'clustered':
        # Cluster groups come from the module-level `pe_numbers`, which must align with the rows of X.
        return sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': pe_numbers})
    raise ValueError(
        f"Unsupported COV_TYPE {COV_TYPE!r}; expected 'robust' or 'clustered'."
    )


def backward_stepwise_selection(X, y, cutoff):
    """Backward elimination on p-values.

    Starting from all columns of ``X``, repeatedly drop the feature with the largest
    p-value and re-fit until every remaining feature has a p-value below ``cutoff``.
    Every fit, including the re-fits, uses the covariance estimator selected by
    ``COV_TYPE``.

    Params:
        - X: DataFrame of candidate features (without a constant; one is added here).
        - y: target values.
        - cutoff: p-value threshold a feature must beat to be kept.

    Returns:
        ``(current_varlist, progression)`` where ``current_varlist`` is the list of
        surviving feature names and ``progression`` is a list with one dict per fitted
        model mapping feature name -> ``(coef, pvalue)``. If elimination would remove
        the last feature, the last fitted model's feature is returned even though it
        is not significant.

    Raises:
        ValueError: if ``COV_TYPE`` is neither ``'robust'`` nor ``'clustered'``.
    """
    # Work on copies so the caller's data is never modified
    X_temp = sm.add_constant(X.copy())
    y_temp = y.copy()

    def _coef_table(fitted):
        # Position 0 is the constant added by sm.add_constant; it is never a candidate for removal.
        return pd.DataFrame({
            'coefs': fitted.params[1:],
            'pvals': fitted.pvalues[1:]
        })

    def _snapshot(table):
        return dict(zip(table.index.values, zip(table['coefs'].values, table['pvals'].values)))

    # Fit initial model
    df_temp = _coef_table(_fit_ols_by_cov_type(y_temp, X_temp))
    current_varlist = list(df_temp.index.values)

    # Store progression as a list of dicts, one per fitted model
    progression = [_snapshot(df_temp)]

    # Iterate until all are stat signif
    while not np.all(df_temp['pvals'] < cutoff):

        # Drop the variable with the highest pvalue
        new_vars = df_temp.drop(index=df_temp['pvals'].idxmax()).index.values

        # If remaining varlist is empty, break and return the last regression results
        if len(new_vars) == 0:
            break

        # Subset X to new list of variables
        X_temp = sm.add_constant(X_temp.loc[:, new_vars])

        # Re-fit model with the same covariance estimator as the initial fit
        df_temp = _coef_table(_fit_ols_by_cov_type(y_temp, X_temp))
        progression.append(_snapshot(df_temp))
        current_varlist = list(df_temp.index.values)

    return current_varlist, progression

# Example: backward elimination over the body-composition features for total clot burden
feat_out, prog = backward_stepwise_selection(
    X=X[body_features],
    y=np.array(y['total_clot_burden']).ravel(),
    cutoff=SIGNIFICANCE_CUTOFF,
)
print(feat_out)

['volume_intermuscular_fat', 'mass_intermuscular_fat', 'bsa']


### Example

In [15]:
# Walk one (target, feature) pair through the full fit -> summarise pipeline.
target, features = 'total_clot_burden', 'density_visceral_fat'
y_temp = y[target]
X_temp = sm.add_constant(X[features])
model_sm, model_sk = fit_model(X_temp, y_temp)
# Bare expression so the notebook renders the one-row summary.
combine_model_results(model_sm, model_sk, X_temp, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,fpvalue,coef_const,coef_density_visceral_fat,pval_const,pval_density_visceral_fat,signif_const,signif_density_visceral_fat
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
total_clot_burden,"(const, density_visceral_fat)",91.0,8.131611e-09,0.04507,0.024305,0.038637,0.058144,0.0,-0.212296,1.0,0.05494,,


# Perform univariable regressions

In [16]:
# Fit one OLS per (target, feature) pair. The one-row summaries are collected in a list
# and concatenated once, rather than re-copying a growing DataFrame on every iteration.
univariable_frames = []

for target in tqdm(model_config.num_targets):
    y_temp = y[target]
    for feature in all_features:
        # Fit model
        X_temp = sm.add_constant(X[feature])
        model_sm, model_sk = fit_model(X_temp, y_temp)

        # Store results
        univariable_frames.append(
            combine_model_results(model_sm, model_sk, X_temp, y_temp)
        )

univariable_results = pd.concat(univariable_frames, axis=0)

print(univariable_results.shape)

univariable_results = univariable_results.reset_index()
univariable_results['selection_method'] = 'All'
# model_dfn is the tuple ('const', <feature>); keep only the feature name.
univariable_results['model_dfn'] = univariable_results['model_dfn'].apply(lambda x: x[1])
univariable_results['category'] = 'univariable_' + univariable_results['model_dfn']
univariable_results['controls'] = 'None'
# Lookup key format: <category>%<selection_method>%<y>%<controls>
univariable_results.index = univariable_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
univariable_results.index.name = 'Lookup'

print(univariable_results.shape)

univariable_results.tail()

  0%|          | 0/7 [00:00<?, ?it/s]

(259, 120)
(259, 125)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,fpvalue,coef_const,coef_volume_visceral_fat,...,signif_ild_ratio,coef_age,pval_age,signif_age,coef_gender_cl_Male,pval_gender_cl_Male,signif_gender_cl_Male,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_airway_ratio%All%centralartery%None,centralartery,airway_ratio,91.0,3.707857e-11,0.024585,0.015885,0.025125,0.295278,0.0,,...,,,,,,,,All,univariable_airway_ratio,
univariable_ild_volume%All%centralartery%None,centralartery,ild_volume,91.0,5.457319e-12,0.018797,0.008864,0.017819,0.118259,0.0,,...,,,,,,,,All,univariable_ild_volume,
univariable_ild_ratio%All%centralartery%None,centralartery,ild_ratio,91.0,1.343951e-11,0.028731,0.012513,0.024327,0.064465,0.0,,...,,,,,,,,All,univariable_ild_ratio,
univariable_age%All%centralartery%None,centralartery,age,91.0,5.922423e-13,0.002368,0.000242,0.001009,0.631711,0.0,,...,,0.048662,0.63053,,,,,All,univariable_age,
univariable_gender_cl_Male%All%centralartery%None,centralartery,gender_cl_Male,91.0,1.794152e-12,0.011807,0.021077,0.028239,0.134966,-0.112343,,...,,,,,0.329782,0.131425,,All,univariable_gender_cl_Male,


# Perform multivariable regressions

In [17]:
# Exploratory LassoCV fit on a single target to see which features survive shrinkage.
target = 'total_clot_burden'

lasso = LassoCV(
    alphas=np.logspace(-2, 5, 100),
    cv=CUSTOM_CV,
    fit_intercept=True,
    max_iter=10_000,
)
lasso.fit(X, y[target])
print(lasso.alpha_)

# Features whose Lasso coefficient was not shrunk to (numerically) zero.
coefs = pd.DataFrame({'coef': lasso.coef_}, index=lasso.feature_names_in_)
remaining_features = coefs.index[~np.isclose(coefs['coef'], 0.0)].values
print(remaining_features)

# Outer cross-validation of the whole LassoCV estimator; note this uses cv=5 rather than CUSTOM_CV.
cvscore = cross_val_score(estimator=lasso, X=X, y=y[target], cv=5, scoring='r2')
np.mean(cvscore)

0.03678379771828634
['volume_visceral_fat' 'density_subcutaneous_fat'
 'density_intermuscular_fat' 'density_muscle' 'mass_muscle' 'density_bone'
 'bsa' 'emphysema_volume_950hu' 'extrapulmonary_artery_volume'
 'extrapulmonary_vein_volume' 'artery_vein_ratio' 'bv5' 'bv10' 'pv_a'
 'heart_volume' 'airway_ratio' 'ild_volume' 'ild_ratio' 'age']


0.3855750708802656

In [18]:
# For every target: select features with LassoCV, then re-fit an OLS on the survivors.
# The one-row summaries are collected in a list and concatenated once after the loop.
multivariable_frames = []

for target in tqdm(model_config.num_targets):
    y_temp = y[target]

    lasso = LassoCV(
        alphas=np.logspace(-3, 4, 100),
        cv=CUSTOM_CV,
        fit_intercept=True,
        max_iter=100_000,
        tol=0.001
    )
    lasso.fit(X, y_temp)

    print(f"{target:<27s} alpha={lasso.alpha_:.3f}")

    # Keep the features whose Lasso coefficient is not (numerically) zero
    coefs = pd.DataFrame(
        {'coef': lasso.coef_},
        index=lasso.feature_names_in_
    )
    remaining_features_lasso = coefs.loc[~np.isclose(coefs['coef'], 0.0), :].index.values

    # Fit models
    X_temp_lasso = sm.add_constant(X[remaining_features_lasso])
    model_sm_lasso, model_sk_lasso = fit_model(X_temp_lasso, y_temp)

    # Collect model/coef information through the same helper the univariable loop uses
    multivariable_frames.append(
        combine_model_results(model_sm_lasso, model_sk_lasso, X_temp_lasso, y_temp)
    )

multivariable_results = pd.concat(multivariable_frames, axis=0)


  0%|          | 0/7 [00:00<?, ?it/s]

total_clot_burden           alpha=0.036
superior_right              alpha=0.069
superior_left               alpha=0.050
middle_right                alpha=0.095
inferior_right              alpha=0.069
inferior_left               alpha=0.059
centralartery               alpha=0.036


In [19]:
# Tag every LassoCV row with its descriptors, then key the table by a composite lookup string.
multivariable_results = multivariable_results.reset_index().assign(
    selection_method='LassoCV',
    category='composite',
    controls='None',
)
# Lookup key format: <category>%<selection_method>%<y>%<controls>
multivariable_results.index = pd.Index(
    multivariable_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1),
    name='Lookup',
)
print(multivariable_results.shape)
multivariable_results.head()

(7, 98)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,fpvalue,coef_age,coef_airway_ratio,...,pval_volume_intermuscular_fat,signif_mass_intermuscular_fat,signif_volume_bone,signif_volume_intermuscular_fat,coef_airway_volume,pval_airway_volume,signif_airway_volume,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
composite%LassoCV%total_clot_burden%None,total_clot_burden,"(const, volume_visceral_fat, density_subcutane...",91.0,6.30007e-05,0.689377,0.386181,0.234205,3.145411e-13,-0.169979,0.263638,...,,,,,,,,LassoCV,composite,
composite%LassoCV%superior_right%None,superior_right,"(const, density_visceral_fat, density_muscle, ...",91.0,0.0219501,0.546695,0.258156,0.165825,7.381225e-08,,0.130926,...,,,,,,,,LassoCV,composite,
composite%LassoCV%superior_left%None,superior_left,"(const, volume_visceral_fat, density_visceral_...",91.0,2.341108e-10,0.478301,0.118004,0.144586,0.000969562,-0.13052,0.180648,...,,,,,,,,LassoCV,composite,
composite%LassoCV%middle_right%None,middle_right,"(const, density_muscle, mass_muscle, bsa, emph...",91.0,8.079073e-08,0.42399,0.152009,0.158093,3.961097e-05,,0.167614,...,,,,,,,,LassoCV,composite,
composite%LassoCV%inferior_right%None,inferior_right,"(const, volume_visceral_fat, density_subcutane...",91.0,4.261428e-07,0.505452,0.183601,0.211538,0.0008050648,,0.11977,...,,,,,,,,LassoCV,composite,


# Combine Univariable and Multivariable regression results

In [20]:
# Stack both result tables and write them to the CSV that matches the covariance type in use.
ols_results = pd.concat([univariable_results, multivariable_results], axis=0)
_csv_by_cov_type = {
    'robust': '../output/regressions/ols_results_robust.csv',
    'clustered': '../output/regressions/ols_results_clustered.csv',
}
# Any other COV_TYPE writes nothing.
if COV_TYPE in _csv_by_cov_type:
    ols_results.to_csv(_csv_by_cov_type[COV_TYPE])