In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm import tqdm
from config import model_config

from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
	roc_curve, roc_auc_score
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score, cross_val_predict, KFold,
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)
import statsmodels.api as sm

from regression import reg

In [2]:
# Seed passed as random_state to the LogisticRegression estimators below
SEED = 123
# Presumably the hold-out fraction for a train/test split; not referenced in the visible cells
TEST_SIZE = 0.25
# Number of cross-validation folds used for CV scoring and RFECV
CV_FOLDS = 10

# Diverging seaborn colormap; not referenced in the visible cells
HEATMAP_COLORS = sns.diverging_palette(h_neg=359, h_pos=250, as_cmap=True)

# Import Data

In [3]:
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open(Path('../data/classification_data.pkl'), 'rb') as f:
	data = pickle.load(f)

# Index the keys directly so a missing entry fails here with a KeyError
# instead of silently propagating None into later cells.
X = data['X']
y = data['y']
body_features = data['body_features']
cardio_features = data['cardio_features']

print(X.shape)
print(y.shape)
print(body_features)
print(cardio_features)

X.head()

(43, 17)
(43, 1)
['density_visceral_fat', 'mass_visceral_fat', 'density_intermuscular_fat', 'density_muscle', 'density_bone', 'bmi']
['emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']


Unnamed: 0,density_visceral_fat,mass_visceral_fat,density_intermuscular_fat,density_muscle,density_bone,bmi,emphysema_volume_950hu,extrapulmonary_vein_volume,artery_vein_ratio,bv10,pb_larger_10,pv_a,heart_volume,airway_ratio,ild_volume,age,gender_cl_Male
PE1_0,-0.664818,-1.165809,-0.147435,-0.256701,-1.777867,0.203818,-0.106865,2.137818,-1.150427,0.494738,1.912364,1.263457,1.782875,-0.309816,-0.721954,-0.014636,1.0
PE12_0,-0.041982,1.031574,-0.799236,-0.453552,-0.626438,-0.164653,-0.476252,0.372377,0.021395,0.979777,0.403948,1.315332,0.19527,0.391257,1.080036,-0.282771,1.0
PE14_0,1.297917,-0.722682,1.549786,0.317693,-0.27097,-0.478136,1.156899,0.394634,-0.15787,0.451135,0.093329,0.009487,0.217828,-0.568843,-1.825349,-1.199497,0.0
PE15_0,1.177249,-1.684023,1.401407,1.052018,0.819345,-1.881309,-0.465319,-0.547764,0.329446,0.485155,-0.773213,0.272429,-1.091411,0.044701,-0.201688,-2.011256,0.0
PE16_0,-2.262411,1.481516,-1.468937,-0.35476,1.241557,1.294754,-0.753382,-0.176271,-1.989854,-1.067518,0.07641,-0.024398,-0.798537,1.291209,1.186939,-1.006231,0.0


In [4]:
# Preview the first rows of the target frame
y.head()

Unnamed: 0,resolved_pe
PE1_0,1.0
PE12_0,0.0
PE14_0,0.0
PE15_0,0.0
PE16_0,0.0


In [5]:
# Class balance of the target; dropna=False also counts missing labels
y.value_counts(dropna=False)

resolved_pe
0.0            28
1.0            15
dtype: int64

# Logit Regression

In [6]:
# One-off check: logit of the target on heart_volume and age (plus a constant),
# fitted with the HC3 robust covariance estimator.
X_temp = sm.add_constant(X.loc[:, ['heart_volume', 'age']])
model = sm.Logit(y, X_temp).fit(cov_type='HC3', disp=False)
# p-value of the heart_volume coefficient
model.pvalues['heart_volume']

0.0678289785659096

In [7]:
def combine_feat_and_controls(features=None, controls=None):
	"""Merge feature and control column names into one de-duplicated list.

	Parameters
	----------
	features : str, iterable of str, or None
		A single column name, several column names, or None for no features.
	controls : str, iterable of str, or None
		Control column name(s) to append, or None for no controls.

	Returns
	-------
	list of str
		Features first, then controls, each name appearing once, in
		first-seen order.
	"""
	def _as_name_list(names):
		# A bare string is one column name, not an iterable of characters.
		if names is None:
			return []
		if isinstance(names, str):
			return [names]
		return list(names)

	# dict.fromkeys de-duplicates while preserving insertion order, so the
	# resulting column order is reproducible from run to run (a set is not).
	return list(dict.fromkeys(_as_name_list(features) + _as_name_list(controls)))

In [8]:
def logit_regression(X, y, feat=None, controls=None):
	"""Fit a logit of y on the given feature(s) plus controls and score it.

	A statsmodels Logit (with constant, HC3 covariance) supplies the
	coefficients, p-values and in-sample ROC; an unpenalized sklearn
	LogisticRegression supplies the cross-validated ROC AUC.

	Parameters
	----------
	X : DataFrame containing the feature and control columns.
	y : binary target aligned with X (single-column DataFrame or Series).
	feat : column name or list of column names to include.
	controls : list of control column names, or None.

	Returns
	-------
	tuple
		(params, pvalues, fpr, tpr, thresholds, auc, cv_auc) where params and
		pvalues come from the statsmodels fit, fpr/tpr/thresholds/auc from its
		in-sample predictions, and cv_auc is the mean ROC AUC over CV_FOLDS folds.
	"""
	all_feat = combine_feat_and_controls(feat, controls)
	X_feat = X.loc[:, all_feat]

	# Fit statsmodels model for pvalues (statsmodels needs an explicit constant)
	model = sm.Logit(y, sm.add_constant(X_feat)).fit(cov_type='HC3', disp=False)

	# In-sample ROC from the statsmodels fit
	logit_probs = model.predict()
	fpr, tpr, thresholds = roc_curve(y, logit_probs)
	auc = roc_auc_score(y, logit_probs)

	# Fit sklearn model for cross validation. sklearn fits its own intercept,
	# so it gets the design WITHOUT the constant column (a constant column plus
	# an intercept is rank-deficient), and a 1-D target as sklearn expects.
	model_for_cv = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')
	cv_scores = cross_val_score(
		model_for_cv, X_feat, np.asarray(y).ravel(),
		scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1
	)
	cv_auc = np.mean(cv_scores)

	return (
		model.params,
		model.pvalues,
		fpr,
		tpr,
		thresholds,
		auc,
		cv_auc
	)

In [9]:
def logit_regressions(feature_pool, controls):
	"""Run one logit per feature in feature_pool and tabulate the results.

	Each feature is fitted separately (together with the controls) against the
	notebook-level X and y via logit_regression. The returned DataFrame is
	indexed by feature name with columns coef, pvalue, auc, cv_auc and
	controls, where controls is the '_'-joined control names or 'None'.
	"""
	# logit_regression returns (params, pvalues, fpr, tpr, thresholds, auc, cv_auc)
	fits = {
		name: logit_regression(X, y, name, controls)
		for name in feature_pool
	}

	controls_str = 'None' if controls is None else '_'.join(controls)

	return pd.DataFrame(
		{
			'coef': {name: fit[0][name] for name, fit in fits.items()},
			'pvalue': {name: fit[1][name] for name, fit in fits.items()},
			'auc': {name: fit[5] for name, fit in fits.items()},
			'cv_auc': {name: fit[6] for name, fit in fits.items()},
			'controls': controls_str,
		}
	)

In [10]:
def plot_roc(fpr, tpr, auc, custom_title=None):
	"""Plot a ROC curve together with reference lines and show the figure.

	fpr and tpr are the curve coordinates, auc is printed in the legend label
	with three significant digits, and custom_title (if given) is appended to
	the fixed 'Y=Resolved PE' title.
	"""
	if custom_title is None:
		title = 'Y=Resolved PE'
	else:
		title = f'Y=Resolved PE, {custom_title}'

	fig, ax = plt.subplots()
	# Reference lines: dashed red diagonal, then the grey left and top edges
	ax.plot([0, 1], ls='--', c='r')
	ax.plot([0, 0], [1, 0], c='0.8')
	ax.plot([1, 0], [1, 1], c='0.8')
	# The curve itself
	ax.plot(fpr, tpr, label=f'ROC Curve (AUC={auc:.3})')
	ax.set_title(title, fontsize=16)
	ax.set_xlabel('False Positive Rate')
	ax.set_ylabel('True Positive Rate')
	ax.legend(fontsize=14)
	plt.show()

In [11]:
def select_features_REFCV(model, scoring, features, target):
	"""Select features with RFECV and return them keyed by target.

	The selector is fitted on the notebook-level X restricted to `features`
	and the flattened notebook-level y, dropping one feature per step and
	scoring with `scoring` over CV_FOLDS folds. `target` is only used as the
	key of the returned single-entry dict.
	"""
	rfecv = RFECV(estimator=model, step=1, scoring=scoring, cv=CV_FOLDS)
	rfecv.fit(X[features], np.array(y).ravel())
	return {target: list(rfecv.get_feature_names_out())}

def select_features_SFS(model, scoring, features, target, direction):
	"""Select features with SequentialFeatureSelector and return them keyed by target.

	The selector is fitted on the notebook-level X restricted to `features`
	and the flattened notebook-level y. `direction` is 'forward' or
	'backward'; with n_features_to_select='auto' the search stops once the
	score changes by less than tol=0.01. `target` is only used as the key of
	the returned single-entry dict.
	"""
	selector = SequentialFeatureSelector(
		estimator=model,
		n_features_to_select='auto',
		tol=0.01,
		scoring=scoring,
		direction=direction,
		# Use the shared fold count so this selector stays in sync with RFECV
		cv=CV_FOLDS
	)
	selector.fit(X[features], np.array(y).ravel())
	return {target: list(selector.get_feature_names_out())}

In [42]:
def evaluate_multivariable_logit(feature_selection_results, selection_method, model_type, controls=None):
    """Fit and score a multivariable logit on a selected feature set.

    Parameters
    ----------
    feature_selection_results : dict
        Maps the target name 'resolved_pe' to the selected feature names.
    selection_method : str
        Label stored in the 'feature_selection_method' index level.
    model_type : str
        Label stored in the 'model_type' index level.
    controls : list of str or None
        Control columns added to the selected features.

    Returns
    -------
    DataFrame
        One row per statsmodels coefficient (including the constant) with
        columns coef, pval, nobs, signif, train_auc, cv_auc_mean, cv_auc_std,
        indexed by (model_type, feature_selection_method, y, X, controls).
        The AUC columns describe the whole model and repeat on every row.

    Reads the notebook-level X and y.
    """
    controls_str = 'None' if controls is None else '_'.join(controls)

    target = 'resolved_pe'
    x_vars = combine_feat_and_controls(list(feature_selection_results[target]), controls)
    X_temp = X.loc[:, x_vars]
    y_temp = y.loc[:, target]
    y_flat = np.array(y_temp).ravel()

    # Fit model (statsmodels, for p-values)
    model_sm = sm.Logit(y_temp, sm.add_constant(X_temp)).fit(cov_type='HC3', disp=False)

    # Unpenalized sklearn model, used for the training and CV AUC
    model_sk = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')

    # In-sample AUC from the probability of the positive class
    model_sk.fit(X_temp, y_flat)
    train_auc = roc_auc_score(y_flat, model_sk.predict_proba(X_temp)[:, 1])

    # Cross-validated AUC of the same unpenalized model
    model_sk_cv_auc = cross_val_score(model_sk, X_temp, y_temp, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1)

    # Store results in df
    model_results = pd.DataFrame({
        'coef': model_sm.params,
        'pval': model_sm.pvalues,
        'nobs': model_sm.nobs,
    })
    model_results['signif'] = model_results['pval'].apply(reg.add_significance)
    model_results['train_auc'] = train_auc
    model_results['cv_auc_mean'] = np.mean(model_sk_cv_auc)
    model_results['cv_auc_std'] = np.std(model_sk_cv_auc)

    # Create new index: the run labels repeated on every coefficient row
    model_results.index = pd.MultiIndex.from_tuples(
        [
            (model_type, selection_method, target, x_name, controls_str)
            for x_name in model_results.index
        ],
        names=('model_type', 'feature_selection_method', 'y', 'X', 'controls')
    )
    return model_results

## Univariate Analysis

### Body Composition

In [23]:
# Univariate logits for every body-composition feature, once per control specification
logit_body_dfs = [
    logit_regressions(body_features, controls=control_set)
    for control_set in (None, ['age'], ['gender_cl_Male'], ['gender_cl_Male', 'age'])
]

# Stack the per-specification frames, keep the feature name as a column,
# and key each row by "<feature>_<controls>"
all_logit_body_dfs = pd.concat(logit_body_dfs, axis=0).assign(x_var=lambda frame: frame.index)
all_logit_body_dfs.index = all_logit_body_dfs.index + '_' + all_logit_body_dfs['controls']
all_logit_body_dfs.index.name = 'Lookup'

# Significance marker derived from each p-value
all_logit_body_dfs['signif'] = all_logit_body_dfs['pvalue'].apply(reg.add_significance)

# Show snippet (the per-family CSV export below is disabled)
# all_logit_body_dfs.to_csv('../output/regressions/logit_body.csv')
all_logit_body_dfs.tail()

Unnamed: 0_level_0,coef,pvalue,auc,cv_auc,controls,x_var,signif
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mass_visceral_fat_gender_cl_Male_age,-0.232165,0.604739,0.804762,0.675,gender_cl_Male_age,mass_visceral_fat,
density_intermuscular_fat_gender_cl_Male_age,0.05554,0.856964,0.795238,0.708333,gender_cl_Male_age,density_intermuscular_fat,
density_muscle_gender_cl_Male_age,-0.104961,0.854202,0.792857,0.741667,gender_cl_Male_age,density_muscle,
density_bone_gender_cl_Male_age,-0.349649,0.375176,0.807143,0.725,gender_cl_Male_age,density_bone,
bmi_gender_cl_Male_age,-0.010298,0.977295,0.790476,0.683333,gender_cl_Male_age,bmi,


### Cardiopulmonary

In [24]:
# Univariate logits for every cardiopulmonary feature, once per control specification
logit_cardio_dfs = [
    logit_regressions(cardio_features, controls=control_set)
    for control_set in (None, ['age'], ['gender_cl_Male'], ['gender_cl_Male', 'age'])
]

# Stack the per-specification frames, keep the feature name as a column,
# and key each row by "<feature>_<controls>"
all_logit_cardio_dfs = pd.concat(logit_cardio_dfs, axis=0).assign(x_var=lambda frame: frame.index)
all_logit_cardio_dfs.index = all_logit_cardio_dfs.index + '_' + all_logit_cardio_dfs['controls']
all_logit_cardio_dfs.index.name = 'Lookup'

# Significance marker derived from each p-value
all_logit_cardio_dfs['signif'] = all_logit_cardio_dfs['pvalue'].apply(reg.add_significance)

# Show snippet (the per-family CSV export below is disabled)
# all_logit_cardio_dfs.to_csv('../output/regressions/logit_cardio.csv')
all_logit_cardio_dfs.tail()

Unnamed: 0_level_0,coef,pvalue,auc,cv_auc,controls,x_var,signif
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pb_larger_10_gender_cl_Male_age,-0.155669,0.627738,0.790476,0.725,gender_cl_Male_age,pb_larger_10,
pv_a_gender_cl_Male_age,1.057619,0.084248,0.840476,0.75,gender_cl_Male_age,pv_a,
heart_volume_gender_cl_Male_age,0.322187,0.458636,0.802381,0.708333,gender_cl_Male_age,heart_volume,
airway_ratio_gender_cl_Male_age,0.205298,0.646258,0.802381,0.7,gender_cl_Male_age,airway_ratio,
ild_volume_gender_cl_Male_age,0.348984,0.419294,0.811905,0.775,gender_cl_Male_age,ild_volume,


### Export results

In [25]:
# Stack the body-composition and cardiopulmonary univariate results and write them to one CSV
logit_results = pd.concat(
    (all_logit_body_dfs, all_logit_cardio_dfs),
    axis=0,
)
logit_results.to_csv('../output/regressions/logit_univariate_all.csv')

## Multivariable Analysis

In [16]:
# Unpenalized logistic regression passed as the estimator to the
# cross-validated feature selectors (RFECV / SequentialFeatureSelector) below
logit = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')
# Target name; used as the key of each feature-selection result dict
logit_target = 'resolved_pe'

In [17]:
# Dict to store results (not referenced again in the visible cells)
multivariable_results = dict()

### Recursive Feature Elimination (RFECV)

In [33]:
# RFECV on the cardio, body, and composite (body + cardio) feature pools, in that order
refcv_cardio_features, refcv_body_features, refcv_composite_features = (
    select_features_REFCV(
        model=logit,
        scoring='roc_auc',
        features=feature_pool,
        target=logit_target
    )
    for feature_pool in (cardio_features, body_features, body_features + cardio_features)
)

print(refcv_cardio_features)
print(refcv_body_features)
print(refcv_composite_features)

{'resolved_pe': ['artery_vein_ratio', 'heart_volume', 'airway_ratio']}
{'resolved_pe': ['density_visceral_fat', 'density_muscle', 'density_bone', 'bmi']}
{'resolved_pe': ['density_bone', 'artery_vein_ratio', 'airway_ratio']}


### Sequential Feature Selection (Backward)

In [34]:
# Backward sequential selection on the cardio, body, and composite (body + cardio) pools, in that order
sfs_bwd_cardio_features, sfs_bwd_body_features, sfs_bwd_composite_features = (
    select_features_SFS(
        model=logit,
        scoring='roc_auc',
        features=feature_pool,
        target=logit_target,
        direction='backward'
    )
    for feature_pool in (cardio_features, body_features, body_features + cardio_features)
)

print(sfs_bwd_cardio_features)
print(sfs_bwd_body_features)
print(sfs_bwd_composite_features)

{'resolved_pe': ['extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']}
{'resolved_pe': ['density_visceral_fat', 'density_muscle', 'density_bone', 'bmi']}
{'resolved_pe': ['density_visceral_fat', 'mass_visceral_fat', 'density_muscle', 'density_bone', 'bmi', 'emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'airway_ratio', 'ild_volume']}


### Sequential Feature Selection (Forward)

In [35]:
# Forward sequential selection on the cardio, body, and composite (body + cardio) pools, in that order
sfs_fwd_cardio_features, sfs_fwd_body_features, sfs_fwd_composite_features = (
    select_features_SFS(
        model=logit,
        scoring='roc_auc',
        features=feature_pool,
        target=logit_target,
        direction='forward'
    )
    for feature_pool in (cardio_features, body_features, body_features + cardio_features)
)

print(sfs_fwd_cardio_features)
print(sfs_fwd_body_features)
print(sfs_fwd_composite_features)

{'resolved_pe': ['heart_volume']}
{'resolved_pe': ['density_visceral_fat', 'density_bone']}
{'resolved_pe': ['heart_volume']}


### Export

In [44]:
# (selection method, feature family, selected-features dict) for every selection run above
selection_results_options = [
    ('refcv',   'cardio',    refcv_cardio_features),
    ('refcv',   'body',      refcv_body_features),
    ('refcv',   'composite', refcv_composite_features),
    ('sfs_bwd', 'cardio',    sfs_bwd_cardio_features),
    ('sfs_bwd', 'body',      sfs_bwd_body_features),
    ('sfs_bwd', 'composite', sfs_bwd_composite_features),
    ('sfs_fwd', 'cardio',    sfs_fwd_cardio_features),
    ('sfs_fwd', 'body',      sfs_fwd_body_features),
    ('sfs_fwd', 'composite', sfs_fwd_composite_features),
]

# Control specifications each selected feature set is re-fitted with
control_options = [
    None, 
    ['age'],
    ['gender_cl_Male'],
    ['age', 'gender_cl_Male']
]

# Evaluate every (selection result, control set) pair and concatenate once,
# rather than growing a DataFrame with pd.concat inside the loop.
logit_multi_results = pd.concat(
    [
        evaluate_multivariable_logit(
            feature_selection_results=selected_features,
            selection_method=selection_method,
            model_type=model_type,
            controls=control,
        )
        for selection_method, model_type, selected_features in selection_results_options
        for control in control_options
    ],
    axis=0
)

# Flatten the MultiIndex into columns and build a single '_'-joined lookup key per row
index_names = ['model_type', 'feature_selection_method', 'y', 'X', 'controls']
logit_multi_results = logit_multi_results.reset_index(names=index_names)
logit_multi_results.index = logit_multi_results[index_names].apply('_'.join, axis=1)
logit_multi_results.index.name = 'Lookup'

logit_multi_results.to_csv('../output/regressions/logit_multivariable_all.csv')

logit_multi_results.tail()

Unnamed: 0_level_0,model_type,feature_selection_method,y,X,controls,coef,pval,nobs,signif,train_auc,cv_auc_mean,cv_auc_std
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
composite_sfs_fwd_resolved_pe_heart_volume_gender_cl_Male,composite,sfs_fwd,resolved_pe,heart_volume,gender_cl_Male,0.190836,0.627564,43,,0.754762,0.758333,0.34651
composite_sfs_fwd_resolved_pe_const_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,const,age_gender_cl_Male,-1.759531,0.004237,43,**,0.802381,0.708333,0.371278
composite_sfs_fwd_resolved_pe_gender_cl_Male_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,gender_cl_Male,age_gender_cl_Male,1.505519,0.075164,43,,0.802381,0.708333,0.371278
composite_sfs_fwd_resolved_pe_heart_volume_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,heart_volume,age_gender_cl_Male,0.322187,0.458636,43,,0.802381,0.708333,0.371278
composite_sfs_fwd_resolved_pe_age_age_gender_cl_Male,composite,sfs_fwd,resolved_pe,age,age_gender_cl_Male,0.777716,0.032155,43,*,0.802381,0.708333,0.371278


# OLD