# PE Resolution Classification

We perform a univariable classifications using each explanatory variable. We then perform multivariable classifications. For these, we focus on body composition only, cardiopulmonary features only, and then a composite model. For each of these, we perform three forms of feature selection, using (1) recursive feature elimination with cross validation, (2) forward sequential feature selection with cross validation, and (3) backward feature selection with cross validation. For these groups of selected features, we also perform sensitivities controlling for gender, age, and both gender and age.

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm.notebook import trange, tqdm
from config import model_config

from scipy.stats import shapiro
from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
	roc_curve, roc_auc_score
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score, cross_val_predict, KFold,
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)
import statsmodels.api as sm

from regression import reg

In [2]:
SEED = 123
TEST_SIZE = 0.25
CV_FOLDS = 10

HEATMAP_COLORS = sns.diverging_palette(h_neg=359, h_pos=250, as_cmap=True)

# Import Data

In [87]:
with open(Path('../data/classification_data.pkl'), 'rb') as f:
	data = pickle.load(f)
	
X = data.get('X')
y = data.get('y').squeeze()
body_features = data.get('body_features')
cardio_features = data.get('cardio_features')
control_features = data.get('controls')
all_features = body_features + cardio_features + control_features

print(X.shape)
print(y.shape)
print(body_features)
print(cardio_features)
print(control_features)

(43, 17)
(43,)
['density_visceral_fat', 'mass_visceral_fat', 'density_intermuscular_fat', 'density_muscle', 'density_bone', 'bmi']
['emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']
['age', 'gender_cl_Male']


In [73]:
y.head(2)

PE1_0     1.0
PE12_0    0.0
Name: resolved_pe, dtype: float64

In [61]:
y.value_counts(dropna=False)

resolved_pe
0.0            28
1.0            15
dtype: int64

# Logit Regression Functions

In [69]:
def get_params(model, X, y):
    """Returns pd.Series of coefs for comparison with statsmodels params."""
    y = np.array(y).ravel()
    model.fit(X, y)
    coef = pd.Series(np.squeeze(model.coef_), index=np.squeeze(model.feature_names_in_))
    # coef['const'] = model.intercept_
    return coef.sort_index()

In [8]:
def model_residual_correlation(model):
    """Returns measure of correlation."""
    return np.corrcoef(np.arange(len(model.resid)), model.resid)[1, 0]

In [77]:
def fit_model(X, y):
    """Fit statsmodels OLS model with robust SEs and sklearn OLS model."""
    # Fit statsmodels model for pvalues and coef
    model_sm = sm.Logit(y, X).fit(cov_type='HC3', disp=False)
    # Define sklearn model for CV evaluation
    model_sk = LogisticRegression(
        random_state=SEED,
        fit_intercept=False,
        max_iter=10_000, 
        tol=0.000001,
        penalty=None, 
        solver='newton-cg',
    )
    # Check that model params match
    # print(get_params(model_sk, X, y))
    # print(model_sm.params.sort_values())
    # print(np.isclose(get_params(model_sk, X, y), model_sm.params.sort_values()))
    assert np.all(np.isclose(get_params(model_sk, X, y), model_sm.params.sort_index()))
    return model_sm, model_sk

In [81]:
def store_model_results(model_sm, model_sk, X, y):
    """
    Params:
        - model_sm: statsmodel model for coefs, pvalues, and residuals.
        - model_sk: sklearn model for cross validation
        - X: X data.
        - y: y data.
    """
    # Calculate CV scores
    cv_scores = cross_val_score(
        model_sk, X, y, 
        scoring='roc_auc', 
        cv=CV_FOLDS, n_jobs=-1
    )
    # Store model results
    model_results = pd.DataFrame(
        {
            'y': y.name,
            'model_dfn': [tuple(X.columns.values)],
            'nobs': model_sm.nobs,
            'shapiro_resid_pvalue': np.nan,
            'metric_train': model_sk.score(X, y),
            'metric_cv_mean': np.mean(np.abs(cv_scores)),
            'metric_cv_std': np.std(cv_scores),
        }
    )
    # Set model index
    model_results = model_results.set_index(['y', 'model_dfn'])
    return model_results

In [11]:
def store_coef_results(model_sm, y):
    """
    Params:
        - model_sm: statsmodel model for coefs, pvalues, and residuals.
        - y: y data.
    """
    results = pd.DataFrame(
        {
            'model_dfn': [tuple(model_sm.params.index) for _ in range(len(model_sm.params))],
            'coef': model_sm.params, 
            'pval': model_sm.pvalues,
        },
    )
    results['signif'] = results['pval'].apply(reg.add_significance)
    results = results.reset_index(names='x')
    results['y'] = y.name
    results = results.pivot(index=['y', 'model_dfn'], columns=['x'], values=['coef', 'pval', 'signif'])
    results.columns = ['_'.join(idx) for idx in results.columns]
    return results

## Example

In [88]:
target = 'resolved_pe'
features = 'density_bone'
X_temp = sm.add_constant(X[features])
y_temp = y.copy()
model_sm, model_sk = fit_model(X_temp, y_temp)

In [89]:
store_model_results(model_sm, model_sk, X_temp, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
resolved_pe,"(const, density_bone)",43,,0.651163,0.708333,0.281982


In [90]:
store_coef_results(model_sm, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,coef_const,coef_density_bone,pval_const,pval_density_bone,signif_const,signif_density_bone
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resolved_pe,"(const, density_bone)",-0.757331,-0.665165,0.03019,0.031173,*,*


# Perform univariable regressions 

In [92]:
univariate_models = pd.DataFrame()
univariate_coefs = pd.DataFrame()

for feature in tqdm(all_features):
    # Fit model
    X_temp = sm.add_constant(X[feature])
    y_temp = y.copy()
    model_sm, model_sk = fit_model(X_temp, y_temp)

    # Collect model information
    univariate_models = pd.concat([univariate_models, store_model_results(model_sm, model_sk, X_temp, y_temp)], 
                                  axis=0)

    # Collect coef information
    univariate_coefs = pd.concat([univariate_coefs, store_coef_results(model_sm, y_temp)], 
                                 axis=0)
    
print(univariate_models.shape)
print(univariate_coefs.shape)

  0%|          | 0/17 [00:00<?, ?it/s]

(17, 5)
(17, 54)


In [93]:
univariate_results = univariate_models.join(univariate_coefs, how='left', validate='1:1')
univariate_results = univariate_results.reset_index()
univariate_results['selection_method'] = 'All'
univariate_results['model_dfn'] = univariate_results['model_dfn'].apply(lambda x: x[1])
univariate_results['category'] = 'univariable_' + univariate_results['model_dfn']
univariate_results['controls'] = 'None'
univariate_results.index = univariate_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
univariate_results.index.name = 'Lookup'
print(univariate_results.shape)
univariate_results

(17, 64)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,coef_const,coef_density_visceral_fat,pval_const,...,signif_ild_volume,coef_age,pval_age,signif_age,coef_gender_cl_Male,pval_gender_cl_Male,signif_gender_cl_Male,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_density_visceral_fat%All%resolved_pe%None,resolved_pe,density_visceral_fat,43,,0.627907,0.5375,0.223956,-0.678737,-0.282329,0.036291,...,,,,,,,,All,univariable_density_visceral_fat,
univariable_mass_visceral_fat%All%resolved_pe%None,resolved_pe,mass_visceral_fat,43,,0.627907,0.616667,0.197906,-0.702599,,0.032814,...,,,,,,,,All,univariable_mass_visceral_fat,
univariable_density_intermuscular_fat%All%resolved_pe%None,resolved_pe,density_intermuscular_fat,43,,0.651163,0.433333,0.366667,-0.624545,,0.051216,...,,,,,,,,All,univariable_density_intermuscular_fat,
univariable_density_muscle%All%resolved_pe%None,resolved_pe,density_muscle,43,,0.651163,0.35,0.174005,-0.623175,,0.051808,...,,,,,,,,All,univariable_density_muscle,
univariable_density_bone%All%resolved_pe%None,resolved_pe,density_bone,43,,0.651163,0.708333,0.281982,-0.757331,,0.03019,...,,,,,,,,All,univariable_density_bone,
univariable_bmi%All%resolved_pe%None,resolved_pe,bmi,43,,0.651163,0.5,0.333333,-0.652601,,0.042747,...,,,,,,,,All,univariable_bmi,
univariable_emphysema_volume_950hu%All%resolved_pe%None,resolved_pe,emphysema_volume_950hu,43,,0.651163,0.383333,0.316667,-0.625839,,0.050798,...,,,,,,,,All,univariable_emphysema_volume_950hu,
univariable_extrapulmonary_vein_volume%All%resolved_pe%None,resolved_pe,extrapulmonary_vein_volume,43,,0.651163,0.175,0.168531,-0.625914,,0.061355,...,,,,,,,,All,univariable_extrapulmonary_vein_volume,
univariable_artery_vein_ratio%All%resolved_pe%None,resolved_pe,artery_vein_ratio,43,,0.651163,0.608333,0.30288,-0.483087,,0.168392,...,,,,,,,,All,univariable_artery_vein_ratio,
univariable_bv10%All%resolved_pe%None,resolved_pe,bv10,43,,0.651163,0.608333,0.357557,-0.648974,,0.046105,...,,,,,,,,All,univariable_bv10,


# Perform multivariable regressions

In [94]:
feature_options = {
    'body': body_features,
    'cardio': cardio_features,
    'composite': body_features + cardio_features,
}

control_options = {
    'None': None,
    'Age': ['age'],
    'Gender': ['gender_cl_Male'],
    'Age and Gender': ['age', 'gender_cl_Male'],
}

ols = LinearRegression(fit_intercept=True)

selector_options = {
    # 'univariable': None,
    'rfecv': RFECV(
        estimator=ols, 
        step=1, 
        scoring='neg_mean_squared_error', 
        cv=10,
        n_jobs=-1,
    ),
    'sfs_fwd': SequentialFeatureSelector(
        estimator=ols,
        n_features_to_select='auto', 
        tol=0.01, 
        scoring='neg_mean_squared_error', 
        direction='forward', 
        cv=10,
        n_jobs=-1,
    ),
    'sfs_bwd': SequentialFeatureSelector(
        estimator=ols,
        n_features_to_select='auto', 
        tol=0.01, 
        scoring='neg_mean_squared_error', 
        direction='backward', 
        cv=10,
        n_jobs=-1
    ),
}

def get_selected_features(selector, X, y):
    selector.fit(X, y)
    return list(selector.get_feature_names_out().flatten())

## Feature selection

In [95]:
feature_option_dict = dict()
for feature_key, feature_option in tqdm(feature_options.items()):
    
    # Create dictionary to store target results
    target_dict = dict()
    
    # Select features (varying methods)
    y_temp = y.copy()
    rfecv_feat = get_selected_features(selector_options['rfecv'], X[feature_option], y_temp)
    sfs_fwd_feat = get_selected_features(selector_options['sfs_fwd'], X[feature_option], y_temp)
    sfs_bwd_feat = get_selected_features(selector_options['sfs_bwd'], X[feature_option], y_temp)
    
    # Create dictionary to store control results
    control_dict = dict()
    
    for control_key, control_option in control_options.items():
        
        selector_dict = dict()
        # Store rfecv results
        selector_dict['rfecv'] = rfecv_feat if control_option is None else rfecv_feat + control_option 
        # Store SFS fwd
        selector_dict['sfs_fwd'] = sfs_fwd_feat if control_option is None else sfs_fwd_feat + control_option 
        # Store SFS bwd
        selector_dict['sfs_bwd'] = sfs_bwd_feat if control_option is None else sfs_bwd_feat + control_option 
        
        control_dict[control_key] = selector_dict
    
    target_dict[y_temp.name] = control_dict
        
    feature_option_dict[feature_key] = target_dict

  0%|          | 0/3 [00:00<?, ?it/s]

## Run models

In [96]:
model_definitions = feature_option_dict.copy()

print("Testing out a few...")
print(model_definitions['cardio']['resolved_pe']['Age']['sfs_bwd'])
print(model_definitions['cardio']['resolved_pe']['Age']['rfecv'])
print(model_definitions['cardio']['resolved_pe']['Age']['sfs_fwd'])
print(model_definitions['cardio']['resolved_pe']['None']['sfs_bwd'])
print(model_definitions['cardio']['resolved_pe']['None']['rfecv'])
print(model_definitions['cardio']['resolved_pe']['None']['sfs_fwd'])

Testing out a few...
['emphysema_volume_950hu', 'artery_vein_ratio', 'heart_volume', 'airway_ratio', 'ild_volume', 'age']
['artery_vein_ratio', 'heart_volume', 'airway_ratio', 'age']
['ild_volume', 'age']
['emphysema_volume_950hu', 'artery_vein_ratio', 'heart_volume', 'airway_ratio', 'ild_volume']
['artery_vein_ratio', 'heart_volume', 'airway_ratio']
['ild_volume']


In [97]:
multivariable_models = pd.DataFrame()
multivariable_coefs = pd.DataFrame()
multivariable_results = pd.DataFrame()

for feature_key, feature_option in tqdm(feature_options.items()):
    for control_key, control_option in control_options.items():
        for selector_key, selector_option in selector_options.items():
            # Unwrap features
            X_temp = sm.add_constant(X[features])
            y_temp = y.copy()
            features = model_definitions[feature_key][y_temp.name][control_key][selector_key]

            # Fit model
            model_sm, model_sk = fit_model(X_temp, y_temp)
    
            # Collect model information
            temp_models = store_model_results(model_sm, model_sk, X_temp, y_temp)
            multivariable_models = pd.concat([multivariable_models, temp_models], axis=0)
    
            # Collect coef information
            temp_coefs = store_coef_results(model_sm, y_temp)
            multivariable_coefs = pd.concat([multivariable_coefs, temp_coefs], axis=0)

            # Combine model and coef information
            temp_results = pd.concat([temp_models, temp_coefs], axis=1)
            temp_results = temp_results.reset_index()
            temp_results['model_dfn'] = temp_results['model_dfn'].apply('_'.join)
            temp_results['category'] = feature_key
            temp_results['controls'] = control_key
            temp_results['selection_method'] = selector_key
            temp_results.index = temp_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
            temp_results.index.name = 'Lookup'

            # Combine
            multivariable_results = pd.concat([multivariable_results, temp_results], axis=0)

print(multivariable_models.shape)
print(multivariable_coefs.shape)
print(multivariable_results.shape)

  0%|          | 0/3 [00:00<?, ?it/s]

(36, 5)
(36, 39)
(36, 49)


# Combine univariable and multivariable results

In [98]:
ols_results = pd.concat([univariate_results, multivariable_results], axis=0)
ols_results.to_csv('../output/regressions/logit_results.csv')

# OLD -------------->

# Logit Regression

In [6]:
X_temp = sm.add_constant(X.loc[:, ['heart_volume', 'age']])
model = sm.Logit(y, X_temp).fit(cov_type='HC3', disp=False)
model.pvalues['heart_volume']

0.0678289785659096

In [7]:
def combine_feat_and_controls(features=None, controls=None):
	if isinstance(features, str):
		feat_set = set([features])
	elif isinstance(features, list):
		feat_set = set(features)
	if controls is None:
		control_set = set()
	else:
		control_set = set(controls)
	all_feat = list(feat_set.union(control_set))
	return all_feat

In [8]:
def logit_regression(X, y, feat=None, controls=None):
	all_feat = combine_feat_and_controls(feat, controls)

	# Fit statsmodels model for pvalues
	X_temp = sm.add_constant(X.loc[:, all_feat])
	model = sm.Logit(y, X_temp).fit(cov_type='HC3', disp=False)
	
	# Fit sklearn model for cross validation
	model_for_cv = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')

	logit_probs = model.predict()
	fpr, tpr, thresholds = roc_curve(y, logit_probs)
	auc = roc_auc_score(y, logit_probs)
	cv_auc = np.mean(cross_val_score(model_for_cv, X_temp, y, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1))

	return (
		model.params, 
		model.pvalues,
		fpr,
		tpr,
		thresholds,
		auc,
		cv_auc
    )

In [9]:
def logit_regressions(feature_pool, controls):
	logit_coefs = dict()
	logit_pvals = dict()
	logit_auc = dict()
	logit_cv_auc = dict()

	for feature in feature_pool:
		(
			coef, pvalues,
			fpr, tpr, thresholds, 
			auc, cv_auc
		) = logit_regression(X, y, feature, controls)

		logit_coefs[feature] = coef[feature]
		logit_pvals[feature] = pvalues[feature]
		logit_auc[feature] = auc
		logit_cv_auc[feature] = cv_auc

	if controls is None:
		controls_str = 'None'
	else:
		controls_str = '_'.join(controls)

	logit_results_df = pd.DataFrame(
		{
			'coef': logit_coefs,
			'pvalue': logit_pvals,
			'auc': logit_auc,
			'cv_auc': logit_cv_auc,
			'controls': controls_str,
		}
	)
	return logit_results_df

In [10]:
def plot_roc(fpr, tpr, auc, custom_title=None):
	fig, ax = plt.subplots()
	ax.plot([0, 1], ls='--', c='r')
	ax.plot([0, 0], [1, 0], c='0.8')
	ax.plot([1, 0], [1, 1], c='0.8')
	ax.plot(fpr, tpr, label=f'ROC Curve (AUC={auc:.3})')
	if custom_title is not None:
		ax.set_title(f'Y=Resolved PE, {custom_title}', fontsize=16)
	else:
		ax.set_title(f'Y=Resolved PE', fontsize=16)
	ax.set_xlabel('False Positive Rate')
	ax.set_ylabel('True Positive Rate')
	plt.legend(fontsize=14)
	plt.show()

In [11]:
def select_features_REFCV(model, scoring, features, target):
	feature_selection_results = dict()
	selector = RFECV(
		estimator=model,
		step=1,
		scoring=scoring,
		cv=CV_FOLDS
	)
	selector.fit(X[features], np.array(y).ravel())
	feature_selection_results[target] = list(selector.get_feature_names_out())
	return feature_selection_results

def select_features_SFS(model, scoring, features, target, direction):
	feature_selection_results = dict()
	selector = SequentialFeatureSelector(
		estimator=model,
		n_features_to_select='auto',
		tol=0.01,
		scoring=scoring,
		direction=direction,
		cv=10
	)
	selector.fit(X[features], np.array(y).ravel())
	feature_selection_results[target] = list(selector.get_feature_names_out())
	return feature_selection_results

In [42]:
def evaluate_multivariable_logit(feature_selection_results, selection_method, model_type, controls=None):
    
    if controls is None:
        controls_str = 'None'
    else:
        controls_str = '_'.join(controls)
    
    target = 'resolved_pe'
    x_vars = combine_feat_and_controls(list(feature_selection_results[target]), controls)
    X_temp = X.loc[:, x_vars]
    y_temp = y.loc[:, target]
    
    # Fit model (statsmodels, for p-values)
    model_sm = sm.Logit(y_temp, sm.add_constant(X_temp)).fit(cov_type='HC3', disp=False)
    
    # Fit models (sklearn, for CV scores)
    model_sk = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')
    model_sk_l2 = LogisticRegression(random_state=SEED, max_iter=10_000, penalty='l2', solver='liblinear')
    model_sk_l1 = LogisticRegression(random_state=SEED, max_iter=10_000, penalty='l1', solver='saga')
    model_sk_elastic = LogisticRegression(random_state=SEED, max_iter=10_000, penalty='elasticnet', solver='saga', l1_ratio=0.5)
    
    # Get score for simple model
    model_sk.fit(X_temp, np.array(y_temp).ravel())
    logit_probs = model_sk.predict_proba(X_temp)
    fpr, tpr, thresholds = roc_curve(np.array(y_temp).ravel(), logit_probs[:, 1])
    train_auc = roc_auc_score(np.array(y_temp).ravel(), logit_probs[:, 1])
    
    # Get cross validation scores for variously regularized models
    model_sk_cv_auc = cross_val_score(model_sk, X_temp, y_temp, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1)
    
    # Store results in df
    model_results = pd.DataFrame({
        'coef': model_sm.params,
        'pval': model_sm.pvalues,
        'nobs': model_sm.nobs,
    })
    model_results['signif'] = model_results['pval'].apply(reg.add_significance)
    model_results['train_auc'] = train_auc
    model_results['cv_auc_mean'] = np.mean(model_sk_cv_auc)
    model_results['cv_auc_std'] = np.std(model_sk_cv_auc)
    
    # Create new index
    model_results.index = pd.MultiIndex.from_tuples(
        list(zip(
            [model_type for _ in range(model_results.shape[0])],
            [selection_method for _ in range(model_results.shape[0])],
            [target for _ in range(model_results.shape[0])], 
            model_results.index,
            [controls_str for _ in range(model_results.shape[0])],
        )),
        names=('model_type', 'feature_selection_method', 'y', 'X', 'controls')
    )
    return model_results

## Univariate Analysis

### Body Composition

In [23]:
# Get logit results
logit_body_dfs = list()
logit_body_dfs.append(logit_regressions(body_features, controls=None))
logit_body_dfs.append(logit_regressions(body_features, controls=['age']))
logit_body_dfs.append(logit_regressions(body_features, controls=['gender_cl_Male']))
logit_body_dfs.append(logit_regressions(body_features, controls=['gender_cl_Male', 'age']))

# Combine and reset index
all_logit_body_dfs = pd.concat(logit_body_dfs, axis=0)
all_logit_body_dfs['x_var'] = all_logit_body_dfs.index
all_logit_body_dfs.index = all_logit_body_dfs.index + '_' + all_logit_body_dfs['controls']
all_logit_body_dfs.index.name = 'Lookup'

# Add significance
all_logit_body_dfs['signif'] = all_logit_body_dfs['pvalue'].apply(reg.add_significance)

# Export and show snippet
# all_logit_body_dfs.to_csv('../output/regressions/logit_body.csv')
all_logit_body_dfs.tail()

Unnamed: 0_level_0,coef,pvalue,auc,cv_auc,controls,x_var,signif
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mass_visceral_fat_gender_cl_Male_age,-0.232165,0.604739,0.804762,0.675,gender_cl_Male_age,mass_visceral_fat,
density_intermuscular_fat_gender_cl_Male_age,0.05554,0.856964,0.795238,0.708333,gender_cl_Male_age,density_intermuscular_fat,
density_muscle_gender_cl_Male_age,-0.104961,0.854202,0.792857,0.741667,gender_cl_Male_age,density_muscle,
density_bone_gender_cl_Male_age,-0.349649,0.375176,0.807143,0.725,gender_cl_Male_age,density_bone,
bmi_gender_cl_Male_age,-0.010298,0.977295,0.790476,0.683333,gender_cl_Male_age,bmi,


### Cardiopulmonary

In [24]:
# Get logit results
logit_cardio_dfs = list()
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=None))
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=['age']))
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=['gender_cl_Male']))
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=['gender_cl_Male', 'age']))

# Combine and reset index
all_logit_cardio_dfs = pd.concat(logit_cardio_dfs, axis=0)
all_logit_cardio_dfs['x_var'] = all_logit_cardio_dfs.index
all_logit_cardio_dfs.index = all_logit_cardio_dfs.index + '_' + all_logit_cardio_dfs['controls']
all_logit_cardio_dfs.index.name = 'Lookup'

# Add significance
all_logit_cardio_dfs['signif'] = all_logit_cardio_dfs['pvalue'].apply(reg.add_significance)

# Export and show snippet
# all_logit_cardio_dfs.to_csv('../output/regressions/logit_cardio.csv')
all_logit_cardio_dfs.tail()

Unnamed: 0_level_0,coef,pvalue,auc,cv_auc,controls,x_var,signif
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pb_larger_10_gender_cl_Male_age,-0.155669,0.627738,0.790476,0.725,gender_cl_Male_age,pb_larger_10,
pv_a_gender_cl_Male_age,1.057619,0.084248,0.840476,0.75,gender_cl_Male_age,pv_a,
heart_volume_gender_cl_Male_age,0.322187,0.458636,0.802381,0.708333,gender_cl_Male_age,heart_volume,
airway_ratio_gender_cl_Male_age,0.205298,0.646258,0.802381,0.7,gender_cl_Male_age,airway_ratio,
ild_volume_gender_cl_Male_age,0.348984,0.419294,0.811905,0.775,gender_cl_Male_age,ild_volume,


### Export results

In [25]:
logit_results = pd.concat([all_logit_body_dfs, all_logit_cardio_dfs], axis=0)
logit_results.to_csv('../output/regressions/logit_univariate_all.csv')

## Multivariable Analysis

In [16]:
# Define model for CV feat selection
logit = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')
logit_target = 'resolved_pe'

In [17]:
# List to store results
multivariable_results = dict()

### Recursive Feature Selection

In [33]:
# Cardio
refcv_cardio_features = select_features_REFCV(
    model=logit, 
    scoring='roc_auc', 
    features=cardio_features, 
    target=logit_target
)
print(refcv_cardio_features)

# Body
refcv_body_features = select_features_REFCV(
    model=logit, 
    scoring='roc_auc', 
    features=body_features, 
    target=logit_target
)
print(refcv_body_features)

# Composite
refcv_composite_features = select_features_REFCV(
    model=logit, 
    scoring='roc_auc', 
    features=body_features + cardio_features, 
    target=logit_target
)
print(refcv_composite_features)

{'resolved_pe': ['artery_vein_ratio', 'heart_volume', 'airway_ratio']}
{'resolved_pe': ['density_visceral_fat', 'density_muscle', 'density_bone', 'bmi']}
{'resolved_pe': ['density_bone', 'artery_vein_ratio', 'airway_ratio']}


### Sequential Feature Selection (Backward)

In [34]:
# Cardio
sfs_bwd_cardio_features = select_features_SFS(
    model=logit, 
    scoring='roc_auc', 
    features=cardio_features, 
    target=logit_target, 
    direction='backward'
)
print(sfs_bwd_cardio_features)

# Body
sfs_bwd_body_features = select_features_SFS(
    model=logit, 
    scoring='roc_auc', 
    features=body_features, 
    target=logit_target, 
    direction='backward'
)
print(sfs_bwd_body_features)

# Composite
sfs_bwd_composite_features = select_features_SFS(
    model=logit, 
    scoring='roc_auc', 
    features=body_features + cardio_features, 
    target=logit_target, 
    direction='backward'
)
print(sfs_bwd_composite_features)

{'resolved_pe': ['extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']}
{'resolved_pe': ['density_visceral_fat', 'density_muscle', 'density_bone', 'bmi']}
{'resolved_pe': ['density_visceral_fat', 'mass_visceral_fat', 'density_muscle', 'density_bone', 'bmi', 'emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'airway_ratio', 'ild_volume']}


### Sequential Feature Selection (Forward)

In [35]:
# Cardio
sfs_fwd_cardio_features = select_features_SFS(
    model=logit, 
    scoring='roc_auc', 
    features=cardio_features, 
    target=logit_target, 
    direction='forward'
)
print(sfs_fwd_cardio_features)

# Body
sfs_fwd_body_features = select_features_SFS(
    model=logit, 
    scoring='roc_auc', 
    features=body_features, 
    target=logit_target, 
    direction='forward'
)
print(sfs_fwd_body_features)

# Composite
sfs_fwd_composite_features = select_features_SFS(
    model=logit, 
    scoring='roc_auc', 
    features=body_features + cardio_features, 
    target=logit_target, 
    direction='forward'
)
print(sfs_fwd_composite_features)

{'resolved_pe': ['heart_volume']}
{'resolved_pe': ['density_visceral_fat', 'density_bone']}
{'resolved_pe': ['heart_volume']}


### Export

In [44]:
selection_results_options = [
    ('refcv',   'cardio',    refcv_cardio_features),
    ('refcv',   'body',      refcv_body_features),
    ('refcv',   'composite', refcv_composite_features),
    ('sfs_bwd', 'cardio',    sfs_bwd_cardio_features),
    ('sfs_bwd', 'body',      sfs_bwd_body_features),
    ('sfs_bwd', 'composite', sfs_bwd_composite_features),
    ('sfs_fwd', 'cardio',    sfs_fwd_cardio_features),
    ('sfs_fwd', 'body',      sfs_fwd_body_features),
    ('sfs_fwd', 'composite', sfs_fwd_composite_features),
]

control_options = [
    None, 
    ['age'],
    ['gender_cl_Male'],
    ['age', 'gender_cl_Male']
]

logit_multi_results = pd.DataFrame()
for selection_result in selection_results_options:
    for control in control_options:
        logit_multi_results = pd.concat(
            [logit_multi_results, 
             evaluate_multivariable_logit(
                 feature_selection_results=selection_result[2], 
                 selection_method=selection_result[0], 
                 model_type=selection_result[1],
                 controls=control)
            ],
            axis=0
        )

index_names = ['model_type', 'feature_selection_method', 'y', 'X', 'controls']
logit_multi_results = logit_multi_results.reset_index(names=index_names)
logit_multi_results.index = logit_multi_results[index_names].apply('_'.join, axis=1)
logit_multi_results.index.name = 'Lookup'

logit_multi_results.to_csv('../output/regressions/logit_multivariable_all.csv')

logit_multi_results.tail()

Unnamed: 0_level_0,model_type,feature_selection_method,y,X,controls,coef,pval,nobs,signif,train_auc,cv_auc_mean,cv_auc_std
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
composite_sfs_fwd_resolved_pe_heart_volume_gender_cl_Male,composite,sfs_fwd,resolved_pe,heart_volume,gender_cl_Male,0.190836,0.627564,43,,0.754762,0.758333,0.34651
composite_sfs_fwd_resolved_pe_const_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,const,age_gender_cl_Male,-1.759531,0.004237,43,**,0.802381,0.708333,0.371278
composite_sfs_fwd_resolved_pe_gender_cl_Male_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,gender_cl_Male,age_gender_cl_Male,1.505519,0.075164,43,,0.802381,0.708333,0.371278
composite_sfs_fwd_resolved_pe_heart_volume_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,heart_volume,age_gender_cl_Male,0.322187,0.458636,43,,0.802381,0.708333,0.371278
composite_sfs_fwd_resolved_pe_age_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,age,age_gender_cl_Male,0.777716,0.032155,43,*,0.802381,0.708333,0.371278


# OLD