# PE Resolution Classification

We perform a univariable classifications using each explanatory variable. We then perform multivariable classifications. For these, we focus on body composition only, cardiopulmonary features only, and then a composite model. For each of these, we perform three forms of feature selection, using (1) recursive feature elimination with cross validation, (2) forward sequential feature selection with cross validation, and (3) backward feature selection with cross validation. For these groups of selected features, we also perform sensitivities controlling for gender, age, and both gender and age.

In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm.notebook import trange, tqdm
from config import model_config

from scipy.stats import shapiro
from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
	roc_curve, roc_auc_score
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score, cross_val_predict, KFold,
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)
import statsmodels.api as sm

from regression import reg

In [138]:
SEED = 123
TEST_SIZE = 0.25
CV_FOLDS = 5

HEATMAP_COLORS = sns.diverging_palette(h_neg=359, h_pos=250, as_cmap=True)

# Import Data

In [139]:
with open(Path('../data/classification_data.pkl'), 'rb') as f:
	data = pickle.load(f)
	
X = data.get('X')
y = data.get('y').squeeze()
body_features = data.get('body_features')
cardio_features = data.get('cardio_features')
control_features = data.get('controls')
all_features = body_features + cardio_features + control_features

print(X.shape)
print(y.shape)
print(body_features)
print(cardio_features)
print(control_features)

(95, 17)
(95,)
['density_visceral_fat', 'density_intermuscular_fat', 'volume_bone', 'density_bone', 'bmi']
['emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']
['age', 'gender_cl_Male']


In [140]:
pe_numbers = y.index.str[:-2]
pe_numbers

Index(['PE1', 'PE12', 'PE12', 'PE14', 'PE15', 'PE15', 'PE15', 'PE15', 'PE16',
       'PE16', 'PE17', 'PE18', 'PE18', 'PE18', 'PE18', 'PE19', 'PE2', 'PE20',
       'PE21', 'PE21', 'PE22', 'PE22', 'PE22', 'PE23', 'PE23', 'PE23', 'PE24',
       'PE25', 'PE25', 'PE25', 'PE27', 'PE27', 'PE28', 'PE3', 'PE3', 'PE31',
       'PE32', 'PE32', 'PE32', 'PE32', 'PE32', 'PE33', 'PE34', 'PE34', 'PE35',
       'PE35', 'PE36', 'PE36', 'PE37', 'PE37', 'PE37', 'PE4', 'PE40', 'PE40',
       'PE40', 'PE41', 'PE41', 'PE41', 'PE41', 'PE42', 'PE42', 'PE43', 'PE44',
       'PE45', 'PE47', 'PE48', 'PE48', 'PE49', 'PE49', 'PE5', 'PE51', 'PE51',
       'PE51', 'PE51', 'PE52', 'PE52', 'PE52', 'PE52', 'PE52', 'PE52', 'PE52',
       'PE53', 'PE6', 'PE6', 'PE6', 'PE6', 'PE6', 'PE6', 'PE6', 'PE7', 'PE7',
       'PE8', 'PE8', 'PE8', 'PE9'],
      dtype='object')

In [141]:
y.head(2)

PE1_0     1
PE12_0    0
Name: resolved_pe, dtype: int64

In [142]:
y.value_counts(dropna=False)

0    69
1    26
Name: resolved_pe, dtype: int64

# Logit Regression Functions

In [143]:
def get_params(model, X, y):
    """Returns pd.Series of coefs for comparison with statsmodels params."""
    y = np.array(y).ravel()
    model.fit(X, y)
    coef = pd.Series(np.squeeze(model.coef_), index=np.squeeze(model.feature_names_in_))
    # coef['const'] = model.intercept_
    return coef.sort_index()

In [144]:
def model_residual_correlation(model):
    """Returns measure of correlation."""
    return np.corrcoef(np.arange(len(model.resid)), model.resid)[1, 0]

In [145]:
def fit_model(X, y):
    """Fit statsmodels OLS model with robust SEs and sklearn OLS model."""
    # Fit statsmodels model for pvalues and coef
    
    # For clustered standard errors:
    # model_sm = sm.Logit(y, X).fit(cov_type='cluster', disp=False, cov_kwds={'groups': pe_numbers})

    # For robust standard errors:
    model_sm = sm.Logit(y, X).fit(cov_type='HC3', disp=False)

    
    # Define sklearn model for CV evaluation
    model_sk = LogisticRegression(
        random_state=SEED,
        fit_intercept=False,
        max_iter=10_000, 
        tol=0.000001,
        penalty=None, 
        solver='newton-cg',
    )
    # Check that model params match
    # print(get_params(model_sk, X, y))
    # print(model_sm.params.sort_values())
    # print(np.isclose(get_params(model_sk, X, y), model_sm.params.sort_values()))
    assert np.all(np.isclose(get_params(model_sk, X, y), model_sm.params.sort_index()))
    return model_sm, model_sk

In [146]:
def store_model_results(model_sm, model_sk, X, y):
    """
    Params:
        - model_sm: statsmodel model for coefs, pvalues, and residuals.
        - model_sk: sklearn model for cross validation
        - X: X data.
        - y: y data.
    """
    # Calculate CV scores
    cv_scores = cross_val_score(
        model_sk, X, y, 
        scoring='roc_auc', 
        cv=CV_FOLDS, n_jobs=-1
    )
    # Store model results
    model_results = pd.DataFrame(
        {
            'y': y.name,
            'model_dfn': [tuple(X.columns.values)],
            'nobs': model_sm.nobs,
            'shapiro_resid_pvalue': np.nan,
            'metric_train': model_sk.score(X, y),
            'metric_cv_mean': np.mean(np.abs(cv_scores)),
            'metric_cv_std': np.std(cv_scores),
        }
    )
    # Set model index
    model_results = model_results.set_index(['y', 'model_dfn'])
    return model_results

In [147]:
def store_coef_results(model_sm, y):
    """
    Params:
        - model_sm: statsmodel model for coefs, pvalues, and residuals.
        - y: y data.
    """
    results = pd.DataFrame(
        {
            'model_dfn': [tuple(model_sm.params.index) for _ in range(len(model_sm.params))],
            'coef': model_sm.params, 
            'pval': model_sm.pvalues,
        },
    )
    results['signif'] = results['pval'].apply(reg.add_significance)
    results = results.reset_index(names='x')
    results['y'] = y.name
    results = results.pivot(index=['y', 'model_dfn'], columns=['x'], values=['coef', 'pval', 'signif'])
    results.columns = ['_'.join(idx) for idx in results.columns]
    return results

## Example

In [148]:
target = 'resolved_pe'
features = 'density_bone'
X_temp = sm.add_constant(X[features])
y_temp = y.copy()
model_sm, model_sk = fit_model(X_temp, y_temp)

In [149]:
store_model_results(model_sm, model_sk, X_temp, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
resolved_pe,"(const, density_bone)",95,,0.715789,0.675311,0.188679


In [150]:
store_coef_results(model_sm, y_temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,coef_const,coef_density_bone,pval_const,pval_density_bone,signif_const,signif_density_bone
y,model_dfn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resolved_pe,"(const, density_bone)",-1.114081,-0.818988,0.008661,0.02519,**,*


# Perform univariable regressions 

In [151]:
univariate_models = pd.DataFrame()
univariate_coefs = pd.DataFrame()

for feature in tqdm(all_features):
    # Fit model
    X_temp = sm.add_constant(X[feature])
    y_temp = y.copy()
    model_sm, model_sk = fit_model(X_temp, y_temp)

    # Collect model information
    univariate_models = pd.concat([univariate_models, store_model_results(model_sm, model_sk, X_temp, y_temp)], 
                                  axis=0)

    # Collect coef information
    univariate_coefs = pd.concat([univariate_coefs, store_coef_results(model_sm, y_temp)], 
                                 axis=0)
    
print(univariate_models.shape)
print(univariate_coefs.shape)

  0%|          | 0/17 [00:00<?, ?it/s]

(17, 5)
(17, 54)


In [152]:
univariate_results = univariate_models.join(univariate_coefs, how='left', validate='1:1')
univariate_results = univariate_results.reset_index()
univariate_results['selection_method'] = 'All'
univariate_results['model_dfn'] = univariate_results['model_dfn'].apply(lambda x: x[1])
univariate_results['category'] = 'univariable_' + univariate_results['model_dfn']
univariate_results['controls'] = 'None'
univariate_results.index = univariate_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
univariate_results.index.name = 'Lookup'
print(univariate_results.shape)
univariate_results

(17, 64)


Unnamed: 0_level_0,y,model_dfn,nobs,shapiro_resid_pvalue,metric_train,metric_cv_mean,metric_cv_std,coef_const,coef_density_visceral_fat,pval_const,...,signif_ild_volume,coef_age,pval_age,signif_age,coef_gender_cl_Male,pval_gender_cl_Male,signif_gender_cl_Male,selection_method,category,controls
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
univariable_density_visceral_fat%All%resolved_pe%None,resolved_pe,density_visceral_fat,95,,0.705263,0.578315,0.254388,-1.020967,-0.457844,0.021826,...,,,,,,,,All,univariable_density_visceral_fat,
univariable_density_intermuscular_fat%All%resolved_pe%None,resolved_pe,density_intermuscular_fat,95,,0.726316,0.37619,0.059666,-0.976028,,0.018604,...,,,,,,,,All,univariable_density_intermuscular_fat,
univariable_volume_bone%All%resolved_pe%None,resolved_pe,volume_bone,95,,0.726316,0.399853,0.243733,-0.988099,,0.027182,...,,,,,,,,All,univariable_volume_bone,
univariable_density_bone%All%resolved_pe%None,resolved_pe,density_bone,95,,0.715789,0.675311,0.188679,-1.114081,,0.008661,...,,,,,,,,All,univariable_density_bone,
univariable_bmi%All%resolved_pe%None,resolved_pe,bmi,95,,0.726316,0.373407,0.141654,-0.980277,,0.021291,...,,,,,,,,All,univariable_bmi,
univariable_emphysema_volume_950hu%All%resolved_pe%None,resolved_pe,emphysema_volume_950hu,95,,0.726316,0.6237,0.066266,-0.979907,,0.017294,...,,,,,,,,All,univariable_emphysema_volume_950hu,
univariable_extrapulmonary_vein_volume%All%resolved_pe%None,resolved_pe,extrapulmonary_vein_volume,95,,0.726316,0.427399,0.235213,-0.994293,,0.021471,...,,,,,,,,All,univariable_extrapulmonary_vein_volume,
univariable_artery_vein_ratio%All%resolved_pe%None,resolved_pe,artery_vein_ratio,95,,0.726316,0.375604,0.156595,-0.976367,,0.019837,...,,,,,,,,All,univariable_artery_vein_ratio,
univariable_bv10%All%resolved_pe%None,resolved_pe,bv10,95,,0.726316,0.4,0.077196,-0.976056,,0.018444,...,,,,,,,,All,univariable_bv10,
univariable_pb_larger_10%All%resolved_pe%None,resolved_pe,pb_larger_10,95,,0.726316,0.31663,0.084454,-0.976709,,0.01815,...,,,,,,,,All,univariable_pb_larger_10,


# Perform multivariable regressions

In [153]:
feature_options = {
    'body': body_features,
    'cardio': cardio_features,
    'composite': body_features + cardio_features,
}

control_options = {
    'None': None,
    'Age': ['age'],
    'Gender': ['gender_cl_Male'],
    'Age and Gender': ['age', 'gender_cl_Male'],
}

ols = LinearRegression(fit_intercept=True)

selector_options = {
    # 'univariable': None,
    'rfecv': RFECV(
        estimator=ols, 
        step=1, 
        scoring='neg_mean_squared_error', 
        cv=10,
        n_jobs=-1,
    ),
    'sfs_fwd': SequentialFeatureSelector(
        estimator=ols,
        n_features_to_select='auto', 
        tol=0.01, 
        scoring='neg_mean_squared_error', 
        direction='forward', 
        cv=10,
        n_jobs=-1,
    ),
    'sfs_bwd': SequentialFeatureSelector(
        estimator=ols,
        n_features_to_select='auto', 
        tol=0.01, 
        scoring='neg_mean_squared_error', 
        direction='backward', 
        cv=10,
        n_jobs=-1
    ),
}

def get_selected_features(selector, X, y):
    selector.fit(X, y)
    return list(selector.get_feature_names_out().flatten())

get_selected_features(selector_options['sfs_bwd'], X[body_features], y)

['density_intermuscular_fat', 'volume_bone', 'density_bone', 'bmi']

## Feature selection

In [154]:
feature_option_dict = dict()
for feature_key, feature_option in tqdm(feature_options.items()):
    
    # Create dictionary to store target results
    target_dict = dict()
    
    # Select features (varying methods)
    y_temp = y.copy()
    rfecv_feat = get_selected_features(selector_options['rfecv'], X[feature_option], y_temp)
    sfs_fwd_feat = get_selected_features(selector_options['sfs_fwd'], X[feature_option], y_temp)
    sfs_bwd_feat = get_selected_features(selector_options['sfs_bwd'], X[feature_option], y_temp)
    
    # Create dictionary to store control results
    control_dict = dict()
    
    for control_key, control_option in tqdm(control_options.items()):
        
        selector_dict = dict()
        # Store rfecv results
        selector_dict['rfecv'] = rfecv_feat if control_option is None else rfecv_feat + control_option 
        # Store SFS fwd
        selector_dict['sfs_fwd'] = sfs_fwd_feat if control_option is None else sfs_fwd_feat + control_option 
        # Store SFS bwd
        selector_dict['sfs_bwd'] = sfs_bwd_feat if control_option is None else sfs_bwd_feat + control_option 
        
        control_dict[control_key] = selector_dict
    
    target_dict[y_temp.name] = control_dict
        
    feature_option_dict[feature_key] = target_dict

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

## Run models

In [155]:
model_definitions = feature_option_dict.copy()

print("Testing out a few...")
print(model_definitions['body']['resolved_pe']['Age']['sfs_bwd'])
print(model_definitions['body']['resolved_pe']['Age']['rfecv'])
print(model_definitions['body']['resolved_pe']['Age']['sfs_fwd'])
print(model_definitions['body']['resolved_pe']['None']['sfs_bwd'])
print(model_definitions['body']['resolved_pe']['None']['rfecv'])
print(model_definitions['body']['resolved_pe']['None']['sfs_fwd'])
print(model_definitions['composite']['resolved_pe']['None']['sfs_bwd'])
print(model_definitions['composite']['resolved_pe']['None']['rfecv'])
print(model_definitions['composite']['resolved_pe']['None']['sfs_fwd'])

Testing out a few...
['density_intermuscular_fat', 'volume_bone', 'density_bone', 'bmi', 'age']
['density_bone', 'age']
['density_bone', 'age']
['density_intermuscular_fat', 'volume_bone', 'density_bone', 'bmi']
['density_bone']
['density_bone']
['density_intermuscular_fat', 'volume_bone', 'density_bone', 'emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'a_diameter', 'airway_ratio', 'ild_volume']
['density_bone']
['a_diameter']


In [156]:
multivariable_models = pd.DataFrame()
multivariable_coefs = pd.DataFrame()
multivariable_results = pd.DataFrame()

for feature_key, feature_option in tqdm(feature_options.items()):
    for control_key, control_option in tqdm(control_options.items()):
        for selector_key, selector_option in selector_options.items():
            # Unwrap features
            y_temp = y.copy()
            features = model_definitions[feature_key][y_temp.name][control_key][selector_key]
            X_temp = sm.add_constant(X[features])

            # Fit model
            model_sm, model_sk = fit_model(X_temp, y_temp)
    
            # Collect model information
            temp_models = store_model_results(model_sm, model_sk, X_temp, y_temp)
            multivariable_models = pd.concat([multivariable_models, temp_models], axis=0)
    
            # Collect coef information
            temp_coefs = store_coef_results(model_sm, y_temp)
            multivariable_coefs = pd.concat([multivariable_coefs, temp_coefs], axis=0)

            # Combine model and coef information
            temp_results = pd.concat([temp_models, temp_coefs], axis=1)
            temp_results = temp_results.reset_index()
            temp_results['model_dfn'] = temp_results['model_dfn'].apply('_'.join)
            temp_results['category'] = feature_key
            temp_results['controls'] = control_key
            temp_results['selection_method'] = selector_key
            temp_results.index = temp_results[['category', 'selection_method', 'y', 'controls']].apply('%'.join, axis=1)
            temp_results.index.name = 'Lookup'

            # Combine
            multivariable_results = pd.concat([multivariable_results, temp_results], axis=0)

print(multivariable_models.shape)
print(multivariable_coefs.shape)
print(multivariable_results.shape)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

(36, 5)
(36, 48)
(36, 58)


# Combine univariable and multivariable results

In [157]:
ols_results = pd.concat([univariate_results, multivariable_results], axis=0)
ols_results.to_csv('../output/regressions/logit_results.csv')