# PE Resolution Classification

We perform a univariable classifications using each explanatory variable. We then perform multivariable classifications. For these, we focus on body composition only, cardiopulmonary features only, and then a composite model. For each of these, we perform three forms of feature selection, using (1) recursive feature elimination with cross validation, (2) forward sequential feature selection with cross validation, and (3) backward feature selection with cross validation. For these groups of selected features, we also perform sensitivities controlling for gender, age, and both gender and age.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm.notebook import trange, tqdm
from config import model_config

from scipy.stats import shapiro
from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
    LogisticRegressionCV
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
	roc_curve, roc_auc_score, auc, RocCurveDisplay,
    brier_score_loss
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score, cross_val_predict, KFold, StratifiedKFold,
    RepeatedStratifiedKFold
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)
import statsmodels.api as sm
from xgboost import XGBClassifier

from regression import reg

In [3]:
SEED = 123
TEST_SIZE = 0.25
CV_FOLDS = 5
CUSTOM_CV = RepeatedStratifiedKFold(n_splits=CV_FOLDS, n_repeats=10, random_state=SEED)
print(f"n repeats (R): {CUSTOM_CV.n_repeats}")
print(f"n folds: {CUSTOM_CV.cvargs['n_splits']}")
R = CUSTOM_CV.n_repeats
K = CUSTOM_CV.cvargs['n_splits']
SE_ADJ_FACTOR = np.sqrt( (1 / (K*R)) + ( (1/K) / ( 1-(1/K) ) ) )
print(f"SE adjustment factor: {SE_ADJ_FACTOR:.3f}")
print("See: https://stats.stackexchange.com/questions/305804/get-ci-and-p-values-for-cross-validated-performance-measures-auc-rho")

HEATMAP_COLORS = sns.diverging_palette(h_neg=359, h_pos=250, as_cmap=True)
plt.style.use('ggplot')

USE_INITIAL = True
USE_CLUSTERED_SE = False

n repeats (R): 10
n folds: 5
SE adjustment factor: 0.520
See: https://stats.stackexchange.com/questions/305804/get-ci-and-p-values-for-cross-validated-performance-measures-auc-rho


# Import Data

In [4]:
if USE_INITIAL:
    with open(Path('../data/classification_data_initial.pkl'), 'rb') as f:
    	data = pickle.load(f)
else:
    with open(Path('../data/classification_data_all.pkl'), 'rb') as f:
    	data = pickle.load(f)
	
X = data.get('X')
y = data.get('y').squeeze()
body_features = data.get('body_features')
cardio_features = data.get('cardio_features')
control_features = data.get('controls')
clot_features = data.get('clot_features')
all_features = body_features + cardio_features + control_features + clot_features

print(X.shape)
print(y.shape)
print(body_features)
print(cardio_features)
print(control_features)
print(clot_features)

(45, 40)
(45,)
['volume_visceral_fat', 'density_visceral_fat', 'mass_visceral_fat', 'volume_subcutaneous_fat', 'density_subcutaneous_fat', 'mass_subcutaneous_fat', 'volume_intermuscular_fat', 'density_intermuscular_fat', 'mass_intermuscular_fat', 'volume_muscle', 'density_muscle', 'mass_muscle', 'volume_bone', 'density_bone', 'mass_bone', 'bmi']
['emphysema_volume_950hu', 'lung_volume', 'extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'intrapulmonary_artery_volume', 'intrapulmonary_vein_volume', 'artery_vein_ratio', 'bv5', 'bv10', 'pb_larger_10', 'pv_diameter', 'a_diameter', 'pv_a', 'heart_volume', 'airway_volume', 'airway_ratio']
['age', 'gender_cl_Male']
['superior_right', 'superior_left', 'middle_right', 'inferior_right', 'inferior_left', 'centralartery']


In [5]:
pe_numbers = y.index.str[:-2]
print(len(pe_numbers))
pe_numbers

45


Index(['PE1', 'PE12', 'PE14', 'PE15', 'PE16', 'PE17', 'PE18', 'PE19', 'PE2',
       'PE20', 'PE21', 'PE22', 'PE23', 'PE24', 'PE25', 'PE27', 'PE28', 'PE3',
       'PE31', 'PE32', 'PE33', 'PE34', 'PE35', 'PE36', 'PE37', 'PE4', 'PE40',
       'PE41', 'PE42', 'PE43', 'PE44', 'PE45', 'PE47', 'PE48', 'PE49', 'PE5',
       'PE51', 'PE52', 'PE53', 'PE54', 'PE56', 'PE6', 'PE7', 'PE8', 'PE9'],
      dtype='object')

In [6]:
y.head(2)

PE1_0     1
PE12_0    0
Name: resolved_pe, dtype: int64

In [7]:
y.value_counts(dropna=False)

0    29
1    16
Name: resolved_pe, dtype: int64

In [8]:
X[clot_features].head()

Unnamed: 0,superior_right,superior_left,middle_right,inferior_right,inferior_left,centralartery
PE1_0,2.554945,0.620985,2.490367,0.536218,0.406558,1.317463
PE12_0,-0.116409,-0.542577,0.723262,-0.488418,-0.50415,-0.622559
PE14_0,-0.634061,-0.506953,-0.597167,-0.518845,0.071844,-0.457526
PE15_0,-0.7776,-0.542577,-0.585189,-0.513589,-0.575543,-0.622403
PE16_0,0.654504,6.566739,0.75048,0.164219,1.336248,1.170846


# Logit Regression Functions

In [9]:
def get_sorted_params(fitted_model):
    """Returns pd.Series of coefs for comparison with statsmodels params."""
    coef = pd.Series(
        np.array(fitted_model.coef_).flatten(), 
        index=np.array(fitted_model.feature_names_in_).flatten()
    )
    # print(fitted_model.get_params().get('fit_intercept'))
    if fitted_model.get_params().get('fit_intercept'):
        coef['const'] = fitted_model.intercept_[0]
        
    return coef.sort_index()

In [10]:
def check_params_equal(model_sm, model_sk):
    """Checks whether the coefficients from an sklearn and statsmodel regression are the same"""
    sorted_params = get_sorted_params(model_sk)
    coefs_are_equal = np.all(np.isclose(sorted_params, model_sm.params.sort_index(), atol=1e-04))
    return coefs_are_equal

In [11]:
def model_residual_correlation(model):
    """Returns measure of correlation."""
    return np.corrcoef(np.arange(len(model.resid)), model.resid)[1, 0]

In [12]:
def fit_model(X, y):
    """Fit statsmodels OLS model with robust SEs and sklearn OLS model."""
    
    # Fit statsmodels
    model_sm = sm.GLM(y.copy(), sm.add_constant(X.copy()), family=sm.families.Binomial())
    if USE_CLUSTERED_SE:
        model_sm = model_sm.fit(cov_type='cluster', cov_kwds={'groups': pe_numbers})
    else: 
        model_sm = model_sm.fit(cov_type='HC3')

    # Fit sklearn 
    model_sk = LogisticRegression(
        random_state=SEED,
        fit_intercept=True,
        max_iter=5_000, 
        penalty=None, 
        solver='lbfgs',
    )
    model_sk.fit(X.copy(), y.copy())

    # Check coefs equal
    params_are_equal = check_params_equal(model_sm, model_sk)
    if not params_are_equal:
        print("\nModels did not have same coefs")
        print(get_sorted_params(model_sk))
        print(model_sm.params.sort_index())
        print("---------------------------------")
    return model_sm, model_sk

In [13]:
def store_model_results(model_sm, model_sk, X, y):
    """
    Params:
        - model_sm: statsmodel model for coefs, pvalues, and residuals.
        - model_sk: sklearn model for cross validation
        - X: X data.
        - y: y data.
    """
    # Calculate CV scores
    cv_scores = cross_val_score(
        model_sk, X, y, 
        scoring='roc_auc', 
        cv=CV_FOLDS, n_jobs=-1
    )
    # Store model results
    cv_mean = np.mean(cv_scores)
    cv_std = np.std(cv_scores)
    
    model_results = pd.DataFrame(
        {
            'y': y.name,
            'model_dfn': [('const',) + tuple(X.columns.values)],
            'nobs': model_sm.nobs,
            'shapiro_resid_pvalue': np.nan,
            'metric_train': model_sk.score(X, y),
            'metric_cv_mean': cv_mean,
            'metric_cv_std': cv_std,
            'metric_cv_se': cv_std / np.sqrt(model_sm.nobs),
            'metric_cv_se_adj': cv_std / np.sqrt(model_sm.nobs) * SE_ADJ_FACTOR
        }
    )
    # Set model index
    model_results = model_results.set_index(['y', 'model_dfn'])
    return model_results

In [14]:
def store_coef_results(model_sm, y):
    """
    Params:
        - model_sm: statsmodel model for coefs, pvalues, and residuals.
        - y: y data.
    """
    results = pd.DataFrame(
        {
            'model_dfn': [tuple(model_sm.params.index) for _ in range(len(model_sm.params))],
            'coef': model_sm.params, 
            'pval': model_sm.pvalues,
        },
    )
    results['signif'] = results['pval'].apply(reg.add_significance)
    results = results.reset_index(names='x')
    results['y'] = y.name
    results = results.pivot(index=['y', 'model_dfn'], columns=['x'], values=['coef', 'pval', 'signif'])
    results.columns = ['_'.join(idx) for idx in results.columns]
    return results

In [15]:
def combine_model_results(model_sm, model_sk, X, y):
    model_results = store_model_results(model_sm, model_sk, X, y)
    coef_results = store_coef_results(model_sm, y)
    assert model_results.shape[0] == coef_results.shape[0] 
    combined_results = pd.concat([model_results, coef_results], axis=1)
    return combined_results

## Example

In [16]:
target = 'resolved_pe'
features = ['pv_a', 'density_bone']
X_temp = X[features]
y_temp = y.copy()
model_sm, model_sk = fit_model(X_temp, y_temp)
combine_model_results(model_sm, model_sk, X_temp, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_const,coef_density_bone,coef_pv_a,pval_const,pval_density_bone,pval_pv_a,signif_const,signif_density_bone,signif_pv_a
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
resolved_pe,"(const, pv_a, density_bone)",45,,0.733333,0.586667,0.282266,0.042078,0.021864,-0.654106,-0.533357,-0.214508,0.053716,0.065609,0.564215,,,


# Perform univariable regressions 

In [17]:
univariable_results = pd.DataFrame()

for feature in tqdm(all_features):

    X_temp = X[[feature]]
    y_temp = y.copy()
    model_sm, model_sk = fit_model(X_temp, y_temp)

    univariable_results = pd.concat(
        [univariable_results, combine_model_results(model_sm, model_sk, X_temp, y_temp)],
        axis=0
    )

print(univariable_results.shape)

univariable_results = univariable_results.reset_index()
univariable_results['selection_method'] = 'All'
univariable_results['model_dfn'] = univariable_results['model_dfn'].apply(lambda x: x[1])
univariable_results['category'] = 'univariable_' + univariable_results['model_dfn']
univariable_results['controls'] = 'None'
univariable_results.index = univariable_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
univariable_results.index.name = 'Lookup'
print(univariable_results.shape)
univariable_results.tail()

  0%|          | 0/40 [00:00<?, ?it/s]

(40, 130)
(40, 135)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_const,...,signif_inferior_right,coef_inferior_left,pval_inferior_left,signif_inferior_left,coef_centralartery,pval_centralartery,signif_centralartery,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_superior_left%All%resolved_pe%None,resolved_pe,superior_left,45,,0.644444,0.524444,0.127347,0.018984,0.009864,-0.543565,...,,,,,,,,All,univariable_superior_left,
univariable_middle_right%All%resolved_pe%None,resolved_pe,middle_right,45,,0.644444,0.437778,0.096826,0.014434,0.0075,-0.600882,...,,,,,,,,All,univariable_middle_right,
univariable_inferior_right%All%resolved_pe%None,resolved_pe,inferior_right,45,,0.644444,0.588889,0.249938,0.037259,0.01936,-0.669609,...,,,,,,,,All,univariable_inferior_right,
univariable_inferior_left%All%resolved_pe%None,resolved_pe,inferior_left,45,,0.644444,0.258889,0.107186,0.015978,0.008303,-0.58724,...,,-0.037174,0.896264,,,,,All,univariable_inferior_left,
univariable_centralartery%All%resolved_pe%None,resolved_pe,centralartery,45,,0.644444,0.426667,0.134366,0.02003,0.010408,-0.570735,...,,,,,-0.053122,0.830146,,All,univariable_centralartery,


In [18]:
univariable_age_results = pd.DataFrame()

for feature in tqdm(all_features):
    
    if feature in model_config.controls_encoded:
        continue
        
    X_temp = X[[feature, 'age']]
    y_temp = y.copy()
    model_sm, model_sk = fit_model(X_temp, y_temp)

    univariable_age_results = pd.concat(
        [univariable_age_results, combine_model_results(model_sm, model_sk, X_temp, y_temp)],
        axis=0
    )

print(univariable_age_results.shape)

univariable_age_results = univariable_age_results.reset_index()
univariable_age_results['selection_method'] = 'All'
univariable_age_results['model_dfn'] = univariable_age_results['model_dfn'].apply(lambda x: x[1])
univariable_age_results['category'] = 'univariable_' + univariable_age_results['model_dfn']
univariable_age_results['controls'] = 'age'
univariable_age_results.index = univariable_age_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
univariable_age_results.index.name = 'Lookup'
print(univariable_age_results.shape)
univariable_age_results.tail()

  0%|          | 0/40 [00:00<?, ?it/s]

(38, 127)
(38, 132)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_age,...,signif_inferior_right,coef_inferior_left,pval_inferior_left,signif_inferior_left,coef_centralartery,pval_centralartery,signif_centralartery,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_superior_left%All%resolved_pe%age,resolved_pe,superior_left,45,,0.755556,0.826667,0.153735,0.022918,0.011908,0.953226,...,,,,,,,,All,univariable_superior_left,age
univariable_middle_right%All%resolved_pe%age,resolved_pe,middle_right,45,,0.666667,0.773333,0.1702,0.025372,0.013184,0.906992,...,,,,,,,,All,univariable_middle_right,age
univariable_inferior_right%All%resolved_pe%age,resolved_pe,inferior_right,45,,0.666667,0.737778,0.156284,0.023297,0.012106,0.872082,...,,,,,,,,All,univariable_inferior_right,age
univariable_inferior_left%All%resolved_pe%age,resolved_pe,inferior_left,45,,0.688889,0.694444,0.099381,0.014815,0.007698,0.907563,...,,0.018364,0.953067,,,,,All,univariable_inferior_left,age
univariable_centralartery%All%resolved_pe%age,resolved_pe,centralartery,45,,0.711111,0.796667,0.14638,0.021821,0.011339,0.923484,...,,,,,-0.131185,0.608545,,All,univariable_centralartery,age


In [19]:
univariable_gender_results = pd.DataFrame()

for feature in tqdm(all_features):
    
    if feature in model_config.controls_encoded:
        continue
        
    X_temp = X[[feature, 'gender_cl_Male']]
    y_temp = y.copy()
    model_sm, model_sk = fit_model(X_temp, y_temp)

    univariable_gender_results = pd.concat(
        [univariable_gender_results, combine_model_results(model_sm, model_sk, X_temp, y_temp)],
        axis=0
    )

print(univariable_gender_results.shape)

univariable_gender_results = univariable_gender_results.reset_index()
univariable_gender_results['selection_method'] = 'All'
univariable_gender_results['model_dfn'] = univariable_gender_results['model_dfn'].apply(lambda x: x[1])
univariable_gender_results['category'] = 'univariable_' + univariable_gender_results['model_dfn']
univariable_gender_results['controls'] = 'gender'
univariable_gender_results.index = univariable_gender_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
univariable_gender_results.index.name = 'Lookup'
print(univariable_gender_results.shape)
univariable_gender_results.tail()

  0%|          | 0/40 [00:00<?, ?it/s]

(38, 127)
(38, 132)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_const,...,signif_inferior_right,coef_inferior_left,pval_inferior_left,signif_inferior_left,coef_centralartery,pval_centralartery,signif_centralartery,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_superior_left%All%resolved_pe%gender,resolved_pe,superior_left,45,,0.777778,0.746667,0.078457,0.011696,0.006077,-1.455498,...,,,,,,,,All,univariable_superior_left,gender
univariable_middle_right%All%resolved_pe%gender,resolved_pe,middle_right,45,,0.711111,0.68,0.163019,0.024301,0.012627,-1.460528,...,,,,,,,,All,univariable_middle_right,gender
univariable_inferior_right%All%resolved_pe%gender,resolved_pe,inferior_right,45,,0.711111,0.64,0.224032,0.033397,0.017353,-1.414674,...,,,,,,,,All,univariable_inferior_right,gender
univariable_inferior_left%All%resolved_pe%gender,resolved_pe,inferior_left,45,,0.733333,0.686667,0.192116,0.028639,0.014881,-1.463834,...,,-0.299552,0.469904,,,,,All,univariable_inferior_left,gender
univariable_centralartery%All%resolved_pe%gender,resolved_pe,centralartery,45,,0.688889,0.664444,0.199431,0.029729,0.015448,-1.3916,...,,,,,-0.184639,0.563862,,All,univariable_centralartery,gender


# Perform multivariable regressions

## Feature selection

### Lasso Regularization

In [20]:
model_dfns = [
    control_features, 
    body_features, 
    cardio_features, 
    clot_features, 
    all_features
]
model_dfn_names = [
    'Demo', 
    'Body', 
    'Cardio', 
    'Clot', 
    'All'
]
model_dfns_remaining = dict()

MAX_NUM_REGRESSORS = len(y) // 10
print(f"MAX_NUM_REGRESSORS: {MAX_NUM_REGRESSORS}")

y_temp = y.copy()

multivariable_results = pd.DataFrame()
for i, feats in enumerate(model_dfns):

    low_Cs = -2
    high_Cs = 2

    logitCV = LogisticRegressionCV(
        Cs=np.logspace(low_Cs, high_Cs, 50), 
        cv=CUSTOM_CV, 
        penalty='l1', 
        solver='liblinear', 
        max_iter=5_000, 
        scoring='roc_auc',
        fit_intercept=True,
        random_state=SEED,
        n_jobs=-1
    )
        
    # Select features
    X_init = X.loc[:, feats]

    more_or_less_than_needed = True
    while more_or_less_than_needed:
        logitCV.fit(X_init, y_temp)
        coefs = pd.DataFrame(
            {'coef': np.squeeze(logitCV.coef_)},
            index=logitCV.feature_names_in_
        )
        remaining_features = list(coefs[coefs['coef'] != 0].index.values)
        if len(remaining_features) > MAX_NUM_REGRESSORS:
            high_Cs -= 0.05 / (MAX_NUM_REGRESSORS / (len(remaining_features) - MAX_NUM_REGRESSORS))
        elif len(remaining_features) == 0:
            low_Cs += 0.2
        else:
            more_or_less_than_needed = False
        logitCV.set_params(**{'Cs': np.logspace(low_Cs, high_Cs, 50)})
        print(f"C_range=({10**low_Cs:.3f}, {10**high_Cs:.3f}), C={logitCV.C_.item():.3f}, {remaining_features}")

    print(f"{i}, {logitCV.C_.item():.3f}, {remaining_features}")
    print("---------------------------------------------------------")
    model_dfns_remaining[model_dfn_names[i]] = remaining_features
    
    # Fit model with selected features
    X_selected = X.loc[:, remaining_features]
    model_sm, model_sk = fit_model(X_selected, y_temp)

    # Store results
    temp_results = combine_model_results(model_sm, model_sk, X_selected, y_temp)
    temp_results['category'] = 'composite_'+ model_dfn_names[i]
    multivariable_results = pd.concat(
        [multivariable_results, temp_results], 
        axis=0
    )

multivariable_results

MAX_NUM_REGRESSORS: 4
C_range=(0.010, 100.000), C=12.649, ['age', 'gender_cl_Male']
0, 12.649, ['age', 'gender_cl_Male']
---------------------------------------------------------
C_range=(0.010, 100.000), C=0.295, ['density_visceral_fat', 'density_bone']
1, 0.295, ['density_visceral_fat', 'density_bone']
---------------------------------------------------------
C_range=(0.010, 84.140), C=1.931, ['extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_volume', 'airway_ratio']
C_range=(0.010, 70.795), C=1.749, ['extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_volume', 'airway_ratio']
C_range=(0.010, 59.566), C=1.899, ['extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_volume', 'airway_ratio

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_age,coef_const,coef_gender_cl_Male,...,signif_heart_volume,coef_inferior_left,coef_inferior_right,coef_superior_left,pval_inferior_left,pval_inferior_right,pval_superior_left,signif_inferior_left,signif_inferior_right,signif_superior_left
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
resolved_pe,"(const, age, gender_cl_Male)",45,,0.711111,0.801111,0.125038,0.01864,0.009685,0.896204,-1.715932,1.697251,...,,,,,,,,,,
resolved_pe,"(const, density_visceral_fat, density_bone)",45,,0.622222,0.714444,0.22797,0.033984,0.017658,,-0.810352,,...,,,,,,,,,,
resolved_pe,"(const, a_diameter, heart_volume, airway_volume, airway_ratio)",45,,0.844444,0.877778,0.146566,0.021849,0.011353,,-1.657819,,...,*,,,,,,,,,
resolved_pe,"(const, superior_left, inferior_right, inferior_left)",45,,0.666667,0.541111,0.10986,0.016377,0.00851,,-0.616034,,...,,-0.387879,0.683331,-0.263409,0.396049,0.092501,0.385163,,,
resolved_pe,"(const, a_diameter, heart_volume, airway_ratio, age)",45,,0.8,0.823333,0.177861,0.026514,0.013777,1.07567,-1.781446,,...,*,,,,,,,,,


In [21]:
y_temp = y.copy()

feats = ['a_diameter', 'heart_volume', 'airway_ratio', 'superior_left']
model_dfns_remaining['Custom'] = feats

# Fit model with selected features
X_selected = X.loc[:, feats]

model_sm, model_sk = fit_model(X_selected, y_temp)

# Store results
temp_results = combine_model_results(model_sm, model_sk, X_selected, y_temp)
temp_results['category'] = 'composite_custom'

multivariable_results = pd.concat(
        [multivariable_results, temp_results], 
        axis=0
    )

multivariable_results

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_age,coef_const,coef_gender_cl_Male,...,signif_heart_volume,coef_inferior_left,coef_inferior_right,coef_superior_left,pval_inferior_left,pval_inferior_right,pval_superior_left,signif_inferior_left,signif_inferior_right,signif_superior_left
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
resolved_pe,"(const, age, gender_cl_Male)",45,,0.711111,0.801111,0.125038,0.01864,0.009685,0.896204,-1.715932,1.697251,...,,,,,,,,,,
resolved_pe,"(const, density_visceral_fat, density_bone)",45,,0.622222,0.714444,0.22797,0.033984,0.017658,,-0.810352,,...,,,,,,,,,,
resolved_pe,"(const, a_diameter, heart_volume, airway_volume, airway_ratio)",45,,0.844444,0.877778,0.146566,0.021849,0.011353,,-1.657819,,...,*,,,,,,,,,
resolved_pe,"(const, superior_left, inferior_right, inferior_left)",45,,0.666667,0.541111,0.10986,0.016377,0.00851,,-0.616034,,...,,-0.387879,0.683331,-0.263409,0.396049,0.092501,0.385163,,,
resolved_pe,"(const, a_diameter, heart_volume, airway_ratio, age)",45,,0.8,0.823333,0.177861,0.026514,0.013777,1.07567,-1.781446,,...,*,,,,,,,,,
resolved_pe,"(const, a_diameter, heart_volume, airway_ratio, superior_left)",45,,0.844444,0.778889,0.205793,0.030678,0.015941,,-1.32626,,...,*,,,-0.796484,,,0.208523,,,


In [22]:
multivariable_results = multivariable_results.reset_index()
multivariable_results['selection_method'] = 'LassoCV'
multivariable_results['controls'] = 'None'
multivariable_results.index = multivariable_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
multivariable_results.index.name = 'Lookup'
print(multivariable_results.shape)
multivariable_results.tail()

(6, 48)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_age,...,coef_inferior_right,coef_superior_left,pval_inferior_left,pval_inferior_right,pval_superior_left,signif_inferior_left,signif_inferior_right,signif_superior_left,selection_method,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
composite_Body%LassoCV%resolved_pe%None,resolved_pe,"(const, density_visceral_fat, density_bone)",45,,0.622222,0.714444,0.22797,0.033984,0.017658,,...,,,,,,,,,LassoCV,
composite_Cardio%LassoCV%resolved_pe%None,resolved_pe,"(const, a_diameter, heart_volume, airway_volum...",45,,0.844444,0.877778,0.146566,0.021849,0.011353,,...,,,,,,,,,LassoCV,
composite_Clot%LassoCV%resolved_pe%None,resolved_pe,"(const, superior_left, inferior_right, inferio...",45,,0.666667,0.541111,0.10986,0.016377,0.00851,,...,0.683331,-0.263409,0.396049,0.092501,0.385163,,,,LassoCV,
composite_All%LassoCV%resolved_pe%None,resolved_pe,"(const, a_diameter, heart_volume, airway_ratio...",45,,0.8,0.823333,0.177861,0.026514,0.013777,1.07567,...,,,,,,,,,LassoCV,
composite_custom%LassoCV%resolved_pe%None,resolved_pe,"(const, a_diameter, heart_volume, airway_ratio...",45,,0.844444,0.778889,0.205793,0.030678,0.015941,,...,,-0.796484,,,0.208523,,,,LassoCV,


# Combine univariable and multivariable results

In [23]:
logit_results = pd.concat(
    [
        univariable_results,
        univariable_age_results,
        univariable_gender_results,
        multivariable_results
    ], axis=0
)

fname = 'logit_results'
if USE_INITIAL:
    fname += '_initial'
else: 
    fname += '_all'
if USE_CLUSTERED_SE:
    fname += '_clustered'
else: 
    fname += '_robust'
    
logit_results.to_csv(f'../output/regressions/{fname}.csv')

logit_results.head()

Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,metric_cv_se,metric_cv_se_adj,coef_const,...,signif_inferior_right,coef_inferior_left,pval_inferior_left,signif_inferior_left,coef_centralartery,pval_centralartery,signif_centralartery,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_volume_visceral_fat%All%resolved_pe%None,resolved_pe,volume_visceral_fat,45,,0.622222,0.51,0.162253,0.024187,0.012568,-0.632674,...,,,,,,,,All,univariable_volume_visceral_fat,
univariable_density_visceral_fat%All%resolved_pe%None,resolved_pe,density_visceral_fat,45,,0.666667,0.654444,0.104728,0.015612,0.008112,-0.771831,...,,,,,,,,All,univariable_density_visceral_fat,
univariable_mass_visceral_fat%All%resolved_pe%None,resolved_pe,mass_visceral_fat,45,,0.622222,0.51,0.162253,0.024187,0.012568,-0.632411,...,,,,,,,,All,univariable_mass_visceral_fat,
univariable_volume_subcutaneous_fat%All%resolved_pe%None,resolved_pe,volume_subcutaneous_fat,45,,0.644444,0.537778,0.229933,0.034276,0.017811,-0.59409,...,,,,,,,,All,univariable_volume_subcutaneous_fat,
univariable_density_subcutaneous_fat%All%resolved_pe%None,resolved_pe,density_subcutaneous_fat,45,,0.644444,0.37,0.095942,0.014302,0.007432,-0.601785,...,,,,,,,,All,univariable_density_subcutaneous_fat,


In [25]:
classifier = LogisticRegression(
    random_state=SEED,
    fit_intercept=True,
    max_iter=1_000, 
    penalty=None, 
    solver='newton-cg',
)

predictions = pd.DataFrame()
ground_truths = pd.DataFrame()

for i, model_name in enumerate(model_dfn_names):

    feat = model_dfns_remaining[model_name]
    X_temp = X.loc[:, feat].reset_index(drop=True)
    y_temp = y.copy().reset_index(drop=True)

    predicted_probs = cross_val_predict(
        estimator=classifier,
        X=X_temp,
        y=y_temp,
        cv=5,
        method='predict_proba',
        n_jobs=-1
    )
    predicted_probs = pd.Series(
        predicted_probs[:, 1],
        name=model_name
    )
    
    predictions = pd.concat(
        [predictions, predicted_probs],
        axis=1
    )

    truth = pd.Series(y_temp, name=f"ground_truth({model_name}")
    ground_truths = pd.concat(
        [ground_truths, y_temp],
        axis=1
    )

ground_truths['matching'] = ground_truths.eq(ground_truths.iloc[:, 0], axis=0).all(1)
assert np.all(ground_truths['matching'])

predictions = pd.concat([predictions, ground_truths.iloc[:, 0]], axis=1)
predictions = predictions.rename(columns={"resolved_pe": "ground_truth"})
predictions

Unnamed: 0,Demo,Body,Cardio,Clot,All,Custom,ground_truth
0,0.290358,0.582352,0.436193,0.310603,0.196829,0.516945,1
1,0.215027,0.382247,0.281783,0.357208,0.084044,0.187286,0
2,0.012505,0.191218,0.41575,0.294507,0.034187,0.438639,0
3,0.003425,0.126771,0.000851,0.362047,9.2e-05,0.0026,0
4,0.016822,0.278788,0.021975,0.059156,0.007601,2.7e-05,0
5,0.270132,0.158316,0.37618,0.058388,0.190028,0.013721,0
6,0.078482,0.280973,0.789503,0.280119,0.32815,0.656739,1
7,0.048574,0.133857,0.000459,0.218015,1.3e-05,0.000491,0
8,0.099959,0.333403,0.648324,0.530886,0.154094,0.394365,1
9,0.627261,0.712981,0.901868,0.278839,0.972942,0.955441,0


In [26]:
predictions.to_csv('../output/eval/logit_CV_predictions.csv', index=False)

In [38]:
pd.DataFrame(
    [
        brier_score_loss(predictions['ground_truth'], predictions['Demo']),
        brier_score_loss(predictions['ground_truth'], predictions['Body']),
        brier_score_loss(predictions['ground_truth'], predictions['Cardio']),
        brier_score_loss(predictions['ground_truth'], predictions['Clot']),
        brier_score_loss(predictions['ground_truth'], predictions['All']),
        brier_score_loss(predictions['ground_truth'], predictions['Custom']),
    ],
    index=['Demo', 'Body', 'Cardio', 'Clot', 'All', 'Custom'],
    columns=['Brier Score']
).transpose().to_csv('../output/eval/brier_scores.csv', index=True)