# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm.notebook import trange, tqdm

# Custom / Local
from config import model_config
from regression import reg

# Stats
from scipy.stats import shapiro
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
)
from sklearn.model_selection import (
	train_test_split, cross_val_score,
)
from sklearn.pipeline import (
	Pipeline
)
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Global vars
# NOTE(review): SEED and TEST_SIZE are not referenced by any cell visible in this
# notebook section — confirm whether they are still needed.
SEED = 123
TEST_SIZE = 0.25
# Diverging seaborn palette, returned as a colormap object because as_cmap=True.
HEATMAP_COLORS = sns.diverging_palette(h_neg=250, h_pos=359, as_cmap=True)
# NOTE(review): presumably the p-value threshold used to flag significance; it is
# not referenced by the visible cells — confirm against reg.add_significance.
SIGNIFICANCE_CUTOFF = 0.05
# Number of cross-validation folds passed to cross_val_score in store_model_results.
CV_FOLDS = 10

# Import/Preprocess Data

In [2]:
# Load the pre-built prediction dataset from a pickle file (path is relative to
# the notebook's working directory).
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# come from a trusted source.
with open(Path('../data/prediction_data.pkl'), 'rb') as f:
	data = pickle.load(f)
	
# Pull the individual pieces out of the loaded mapping. .get() returns None for
# a missing key, so a missing entry only surfaces later (e.g. at the `+` below).
X = data.get('X')
y = data.get('y')
body_features = data.get('body_features')
cardio_features = data.get('cardio_features')
control_features = data.get('controls')
# Concatenate the three feature-name lists into one list of all predictors.
all_features = body_features + cardio_features + control_features

# Sanity check: shapes of X / y and the contents of each feature list.
print(X.shape)
print(y.shape)
print(body_features)
print(cardio_features)
print(control_features)

(95, 17)
(95, 21)
['density_visceral_fat', 'mass_visceral_fat', 'density_intermuscular_fat', 'density_muscle', 'density_bone', 'bmi']
['emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']
['age', 'gender_cl_Male']


# OLS Regression Functions

We loop through each of the numerical target variables (the various clot burden measures for different locations, as well as the total clot burden) and perform a simple single-variable OLS regression on each feature. Shown below are the statistically significant results at the 95% confidence level.

In [3]:
def get_params(model, X, y):
    """Fit ``model`` on (X, y) and return its parameters as a value-sorted pd.Series.

    The coefficients are labelled with ``model.feature_names_in_`` and the
    intercept is stored under the label 'const' (replacing any existing 'const'
    entry), so the result can be compared with statsmodels ``params``.

    Note: ``model`` is fitted in place as a side effect of this call.
    """
    model.fit(X, y)
    params = pd.Series(model.coef_, index=model.feature_names_in_)
    # Mirror the statsmodels naming convention for the intercept term.
    params.loc['const'] = model.intercept_
    ordered = params.sort_values()
    return ordered

In [4]:
def model_residual_correlation(model):
    """Return the correlation between ``model.resid`` and its positional order.

    The residuals are correlated against 0, 1, ..., n-1 using np.corrcoef, and
    the off-diagonal entry of the resulting 2x2 matrix is returned.
    """
    residuals = model.resid
    positions = np.arange(len(residuals))
    corr_matrix = np.corrcoef(positions, residuals)
    return corr_matrix[1, 0]

In [5]:
def fit_model(X, y):
    """Fit a statsmodels OLS model with HC3 robust SEs and a matching sklearn OLS model.

    Params:
        - X: design matrix. Callers in this notebook pass the output of
          sm.add_constant, so it already carries a 'const' column.
        - y: target values.

    Returns:
        (model_sm, model_sk): the fitted statsmodels OLS results (used for
        coefficients and p-values) and the fitted sklearn LinearRegression
        (used for cross-validated evaluation).

    Raises:
        AssertionError: if the parameters of the two fits disagree.
    """
    # Fit statsmodels model for pvalues and coef
    model_sm = sm.OLS(y, X).fit(cov_type='HC3')
    # Define sklearn model for CV evaluation
    model_sk = LinearRegression(fit_intercept=True, n_jobs=-1)
    # Fit the sklearn model explicitly. Previously the fit only happened inside
    # the assert expression, so running with `python -O` (asserts stripped)
    # returned an unfitted model_sk.
    params_sk = get_params(model_sk, X, y)
    # Align the statsmodels params to the sklearn labels so each coefficient is
    # compared with its namesake, instead of comparing value-sorted arrays
    # position by position. A label missing from the statsmodels fit becomes
    # NaN and fails the check.
    params_sm = model_sm.params.reindex(params_sk.index)
    # Check that model params match
    assert np.allclose(params_sk, params_sm), (
        "sklearn and statsmodels OLS parameters do not match"
    )
    return model_sm, model_sk

In [6]:
def store_model_results(model_sm, model_sk, X, y):
    """
    Build a one-row summary of a fitted model, indexed by (y, model_dfn).

    Params:
        - model_sm: statsmodel model for nobs and residuals.
        - model_sk: fitted sklearn model, scored in-sample and cross validated.
        - X: X data (its column names become the 'model_dfn' tuple).
        - y: y data (its name becomes the 'y' index level).

    Returns:
        pd.DataFrame with columns nobs, shapiro_resid_pvalue, metric_train,
        metric_cv_mean and metric_cv_std.
    """
    # Cross-validated negative MSE, one value per fold.
    fold_scores = cross_val_score(
        model_sk, X, y,
        scoring='neg_mean_squared_error',
        cv=CV_FOLDS, n_jobs=-1
    )
    # p-value of the Shapiro-Wilk test applied to the residuals.
    resid_normality_pvalue = shapiro(model_sm.resid).pvalue
    # The tuple of column names identifies the model specification.
    definition = tuple(X.columns.values)

    summary = pd.DataFrame(
        {
            'y': y.name,
            'model_dfn': [definition],
            'nobs': model_sm.nobs,
            'shapiro_resid_pvalue': resid_normality_pvalue,
            # NOTE(review): this is the estimator's default .score(), while the
            # CV metrics below are MSE-based — confirm the two are meant to
            # be on different scales.
            'metric_train': model_sk.score(X, y),
            # abs() turns the negated MSE values back into positive errors.
            'metric_cv_mean': np.mean(np.abs(fold_scores)),
            'metric_cv_std': np.std(fold_scores),
        }
    )
    return summary.set_index(['y', 'model_dfn'])

In [7]:
def store_coef_results(model_sm, y):
    """
    Build a one-row, wide table of coefficient results indexed by (y, model_dfn).

    Params:
        - model_sm: statsmodel model supplying params and pvalues.
        - y: y data (its name becomes the 'y' index level).

    Returns:
        pd.DataFrame with one column per (statistic, term) pair, named
        coef_<term>, pval_<term> and signif_<term>.
    """
    params = model_sm.params
    definition = tuple(params.index)

    # Long format: one row per model term, labelled by the params index.
    per_term = pd.DataFrame(
        {
            'model_dfn': [definition] * len(params),
            'coef': params,
            'pval': model_sm.pvalues,
        },
    )
    per_term = per_term.assign(signif=per_term['pval'].apply(reg.add_significance))
    per_term = per_term.reset_index(names='x').assign(y=y.name)

    # Wide format: spread the terms across columns, one row per model.
    wide = per_term.pivot(index=['y', 'model_dfn'], columns=['x'], values=['coef', 'pval', 'signif'])
    # Flatten the (statistic, term) column pairs into '<statistic>_<term>'.
    wide.columns = list(map('_'.join, wide.columns))
    return wide

### Example

In [8]:
# Worked example: one target regressed on one feature.
target = 'total_clot_burden'
# NOTE: `features` is a single column name here, so X[features] selects one
# column and add_constant adds the 'const' term to it.
features = 'density_visceral_fat'
X_temp = sm.add_constant(X[features])
y_temp = y[target]
# Returns the statsmodels fit (coefs / p-values) and the sklearn model (CV).
model_sm, model_sk = fit_model(X_temp, y_temp)

In [9]:
# Display the one-row model summary for the example fit above.
store_model_results(model_sm, model_sk, X_temp, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
total_clot_burden,"(const, density_visceral_fat)",95.0,0.000139,0.022689,1.038515,0.368592


In [10]:
# Display the one-row coefficient table for the example fit above.
store_coef_results(model_sm, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,coef_const,coef_density_visceral_fat,pval_const,pval_density_visceral_fat,signif_const,signif_density_visceral_fat
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
total_clot_burden,"(const, density_visceral_fat)",-0.0,-0.150629,1.0,0.204898,,


# Perform univariable regressions

In [11]:
# Collect the per-model frames in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each iteration and relies on concatenating onto an empty DataFrame.
univariate_model_frames = []
univariate_coef_frames = []

for target in tqdm(model_config.num_targets):
    for feature in all_features:
        # Fit model: one target regressed on a constant plus a single feature
        X_temp = sm.add_constant(X[feature])
        y_temp = y[target]
        model_sm, model_sk = fit_model(X_temp, y_temp)

        # Collect model information
        univariate_model_frames.append(store_model_results(model_sm, model_sk, X_temp, y_temp))

        # Collect coef information
        univariate_coef_frames.append(store_coef_results(model_sm, y_temp))

# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no targets or features to fit.
univariate_models = (
    pd.concat(univariate_model_frames, axis=0) if univariate_model_frames else pd.DataFrame()
)
univariate_coefs = (
    pd.concat(univariate_coef_frames, axis=0) if univariate_coef_frames else pd.DataFrame()
)
print(univariate_models.shape)
print(univariate_coefs.shape)

  0%|          | 0/21 [00:00<?, ?it/s]

(357, 5)
(357, 54)


In [12]:
# Join model-level and coefficient-level results (both indexed by y / model_dfn),
# then add the descriptive columns used to build the lookup key.
univariate_results = (
    univariate_models
    .join(univariate_coefs, how='left', validate='1:1')
    .reset_index()
    .assign(
        selection_method='All',
        # model_dfn is a (const, feature) tuple; keep only the feature name.
        model_dfn=lambda frame: frame['model_dfn'].apply(lambda x: x[1]),
        category=lambda frame: 'univariable_' + frame['model_dfn'],
        controls='None',
    )
)
# Lookup key: category%selection_method%y%controls
univariate_results.index = (
    univariate_results[['category', 'selection_method', 'y', 'controls']]
    .apply('%'.join, axis=1)
)
univariate_results.index.name = 'Lookup'
print(univariate_results.shape)
univariate_results.to_csv('../output/regressions/univariate.csv')
univariate_results

(357, 64)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,coef_const,coef_density_visceral_fat,pval_const,...,signif_ild_volume,coef_age,pval_age,signif_age,coef_gender_cl_Male,pval_gender_cl_Male,signif_gender_cl_Male,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_density_visceral_fat%All%total_clot_burden%None,total_clot_burden,density_visceral_fat,95.0,1.390171e-04,0.022689,1.038515,0.368592,-0.0,-0.150629,1.0,...,,,,,,,,All,univariable_density_visceral_fat,
univariable_mass_visceral_fat%All%total_clot_burden%None,total_clot_burden,mass_visceral_fat,95.0,2.612391e-06,0.000016,1.070149,0.347454,-0.0,,1.0,...,,,,,,,,All,univariable_mass_visceral_fat,
univariable_density_intermuscular_fat%All%total_clot_burden%None,total_clot_burden,density_intermuscular_fat,95.0,6.753050e-05,0.029813,1.040085,0.320862,-0.0,,1.0,...,,,,,,,,All,univariable_density_intermuscular_fat,
univariable_density_muscle%All%total_clot_burden%None,total_clot_burden,density_muscle,95.0,2.308583e-04,0.036184,1.041125,0.368510,-0.0,,1.0,...,,,,,,,,All,univariable_density_muscle,
univariable_density_bone%All%total_clot_burden%None,total_clot_burden,density_bone,95.0,8.104790e-05,0.035796,1.037565,0.312428,-0.0,,1.0,...,,,,,,,,All,univariable_density_bone,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
univariable_heart_volume%All%posterior_basal_ls10%None,posterior_basal_ls10,heart_volume,95.0,2.768651e-12,0.003644,1.022249,0.385297,0.0,,1.0,...,,,,,,,,All,univariable_heart_volume,
univariable_airway_ratio%All%posterior_basal_ls10%None,posterior_basal_ls10,airway_ratio,95.0,9.057779e-13,0.000268,1.037180,0.417025,0.0,,1.0,...,,,,,,,,All,univariable_airway_ratio,
univariable_ild_volume%All%posterior_basal_ls10%None,posterior_basal_ls10,ild_volume,95.0,2.841119e-12,0.002488,1.043721,0.436442,0.0,,1.0,...,,,,,,,,All,univariable_ild_volume,
univariable_age%All%posterior_basal_ls10%None,posterior_basal_ls10,age,95.0,9.613406e-12,0.006554,1.015112,0.415131,0.0,,1.0,...,,0.080955,0.443772,,,,,All,univariable_age,


# Perform multivariable regressions

In [13]:
# Candidate feature sets, keyed by the category label used in the results.
feature_options = {
    'body': body_features,
    'cardio': cardio_features,
    'composite': body_features + cardio_features,
}

# Control variables appended to the selected features; None means no controls.
control_options = {
    'None': None,
    'Age': ['age'],
    'Gender': ['gender_cl_Male'],
    'Age and Gender': ['age', 'gender_cl_Male'],
}

# Base estimator shared by all selectors below.
ols = LinearRegression(fit_intercept=True)

# Feature selectors, keyed by the selection-method label used in the results.
# All of them use CV_FOLDS so selection and evaluation (store_model_results)
# share one fold count instead of a separately hard-coded 10.
# NOTE: the key 'refcv' (for the RFECV selector) is looked up by this spelling
# in later cells, so it is kept as is.
selector_options = {
    'refcv': RFECV(
        estimator=ols, 
        step=1, 
        scoring='neg_mean_squared_error', 
        cv=CV_FOLDS,
        n_jobs=-1,
    ),
    'sfs_fwd': SequentialFeatureSelector(
        estimator=ols,
        n_features_to_select='auto', 
        tol=0.01, 
        scoring='neg_mean_squared_error', 
        direction='forward', 
        cv=CV_FOLDS,
        n_jobs=-1,
    ),
    'sfs_bwd': SequentialFeatureSelector(
        estimator=ols,
        n_features_to_select='auto', 
        tol=0.01, 
        scoring='neg_mean_squared_error', 
        direction='backward', 
        cv=CV_FOLDS,
        n_jobs=-1
    ),
}

def get_selected_features(selector, X, y):
    """Fit ``selector`` on (X, y) and return the names of the features it kept.

    The selector is fitted in place; the names come from its
    ``get_feature_names_out()`` and are returned as a flat list.
    """
    selector.fit(X, y)
    kept_names = selector.get_feature_names_out()
    return [name for name in kept_names.flatten()]

# Example
# get_selected_features(selector_options['refcv'], X[body_features], y['total_clot_burden'])

## Feature selection for all models

In [14]:
# Nested result layout: feature_option_dict[feature set][target][controls][selector]
# -> list of feature names to use for that model.
feature_option_dict = dict()
for feature_key, feature_option in tqdm(feature_options.items()):

    # Create dictionary to store target results
    target_dict = dict()

    for target in tqdm(model_config.num_targets):
        # Run every configured selector once per (feature set, target).
        # Iterating selector_options (rather than naming each selector by hand)
        # keeps this cell in step with the model-running loop, which also
        # iterates selector_options.
        selected_features = {
            selector_key: get_selected_features(selector, X[feature_option], y[target])
            for selector_key, selector in selector_options.items()
        }

        # Create dictionary to store control results
        control_dict = dict()

        for control_key, control_option in control_options.items():
            # Controls are appended after selection, so they are always kept.
            # `control_option or []` handles the no-controls case, and the `+`
            # builds a fresh list so entries never alias each other.
            control_dict[control_key] = {
                selector_key: selected + (control_option or [])
                for selector_key, selected in selected_features.items()
            }

        target_dict[target] = control_dict

    feature_option_dict[feature_key] = target_dict

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

## Run Models

In [15]:
# Shallow copy: only the top-level dict is duplicated; the nested
# target / controls / selector dicts are still shared with feature_option_dict.
model_definitions = feature_option_dict.copy()

# Spot-check a few entries. Lookup path: [feature set][target][controls][selector].
print("Testing out a few...")
print(model_definitions['cardio']['apical_ls1']['Age']['sfs_bwd'])
print(model_definitions['cardio']['apical_ls1']['Age']['refcv'])
print(model_definitions['cardio']['apical_ls1']['Age']['sfs_fwd'])
print(model_definitions['cardio']['apical_ls1']['None']['sfs_bwd'])
print(model_definitions['cardio']['apical_ls1']['None']['refcv'])
print(model_definitions['cardio']['apical_ls1']['None']['sfs_fwd'])

Testing out a few...
['artery_vein_ratio', 'bv10', 'pb_larger_10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume', 'age']
['emphysema_volume_950hu', 'artery_vein_ratio', 'bv10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume', 'age']
['artery_vein_ratio', 'pv_a', 'heart_volume', 'airway_ratio', 'age']
['artery_vein_ratio', 'bv10', 'pb_larger_10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']
['emphysema_volume_950hu', 'artery_vein_ratio', 'bv10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']
['artery_vein_ratio', 'pv_a', 'heart_volume', 'airway_ratio']


In [16]:
# Collect per-model frames in lists and concatenate once at the end. Growing
# three DataFrames with pd.concat inside the nested loops re-copies every
# earlier row on each iteration.
multivariable_model_frames = []
multivariable_coef_frames = []
multivariable_result_frames = []

for feature_key in tqdm(feature_options):
    for target in tqdm(model_config.num_targets):
        for control_key in control_options:
            for selector_key in selector_options:
                # Unwrap features
                features = model_definitions[feature_key][target][control_key][selector_key]

                # Fit model
                X_temp = sm.add_constant(X[features])
                y_temp = y[target]
                model_sm, model_sk = fit_model(X_temp, y_temp)

                # Collect model information
                temp_models = store_model_results(model_sm, model_sk, X_temp, y_temp)
                multivariable_model_frames.append(temp_models)

                # Collect coef information
                temp_coefs = store_coef_results(model_sm, y_temp)
                multivariable_coef_frames.append(temp_coefs)

                # Combine model and coef information
                temp_results = pd.concat([temp_models, temp_coefs], axis=1)
                temp_results = temp_results.reset_index()
                # Flatten the tuple of term names into one '_'-joined string.
                temp_results['model_dfn'] = temp_results['model_dfn'].apply('_'.join)
                temp_results['category'] = feature_key
                temp_results['controls'] = control_key
                temp_results['selection_method'] = selector_key
                # Lookup key: category%selection_method%y%controls
                temp_results.index = temp_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
                temp_results.index.name = 'Lookup'

                # Combine
                multivariable_result_frames.append(temp_results)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# model was fitted.
multivariable_models = (
    pd.concat(multivariable_model_frames, axis=0) if multivariable_model_frames else pd.DataFrame()
)
multivariable_coefs = (
    pd.concat(multivariable_coef_frames, axis=0) if multivariable_coef_frames else pd.DataFrame()
)
multivariable_results = (
    pd.concat(multivariable_result_frames, axis=0) if multivariable_result_frames else pd.DataFrame()
)

print(multivariable_models.shape)
print(multivariable_coefs.shape)
print(multivariable_results.shape)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

(756, 5)
(756, 54)
(756, 64)


# Combine Univariate and Multivariable regression results

In [17]:
# Stack the univariate and multivariable result tables (both indexed by the
# 'Lookup' key) and export the combined table.
ols_results = pd.concat([univariate_results, multivariable_results], axis=0)
ols_results.to_csv('../output/regressions/ols_results.csv')