In [330]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm import tqdm
from config import model_config

from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
	roc_curve, roc_auc_score
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score, cross_val_predict, KFold,
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)
import statsmodels.api as sm

from regression import reg

In [331]:
SEED = 123
TEST_SIZE = 0.25
CV_FOLDS = 10

HEATMAP_COLORS = sns.diverging_palette(h_neg=359, h_pos=250, as_cmap=True)

# Import Data

In [332]:
with open(Path('../data/classification_data.pkl'), 'rb') as f:
	data = pickle.load(f)
	
X = data.get('X')
y = data.get('y')
body_features = data.get('body_features')
cardio_features = data.get('cardio_features')

print(X.shape)
print(y.shape)
print(body_features)
print(cardio_features)

X.head()

(43, 20)
(43, 1)
['density_visceral_fat', 'mass_visceral_fat', 'mass_subcutaneous_fat', 'volume_intermuscular_fat', 'density_intermuscular_fat', 'volume_bone', 'density_bone']
['emphysema_volume_950hu', 'extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv5', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']


Unnamed: 0,density_visceral_fat,mass_visceral_fat,mass_subcutaneous_fat,volume_intermuscular_fat,density_intermuscular_fat,volume_bone,density_bone,emphysema_volume_950hu,extrapulmonary_artery_volume,extrapulmonary_vein_volume,artery_vein_ratio,bv5,pb_larger_10,a_diameter,pv_a,heart_volume,airway_ratio,ild_volume,age,gender_cl_Male
PE1_0,-0.804596,-0.969463,0.497278,1.349906,-0.066012,1.477535,-1.765047,0.43893,5.199338,1.999251,-1.002616,0.069569,1.82216,0.991202,1.210445,1.850031,-0.337287,-0.526597,-0.112621,1.0
PE12_0,-0.132313,0.916458,-0.046799,1.242486,-0.804596,1.040478,-0.614999,0.087296,0.840123,0.333656,0.030102,0.738867,0.526128,-0.635812,1.25252,0.160081,0.476079,0.91547,-0.382953,1.0
PE14_0,1.412188,-0.616023,-0.01443,-0.973609,1.354934,0.845303,-0.23061,0.919527,-0.291114,0.480681,-0.150563,0.096692,0.103346,-0.177808,0.107054,0.173045,-0.634757,-5.199338,-1.003148,0.0
PE15_0,1.25212,-5.199338,-1.714405,-1.607818,1.25212,-1.002303,0.854752,0.107185,-0.850771,-0.550969,0.291864,0.510249,-0.823013,-1.651436,0.332595,-1.000827,0.098444,-0.090063,-5.199338,0.0
PE16_0,-2.028069,1.610495,1.703935,1.761314,-1.619856,0.802369,1.249852,-5.199338,-1.251416,-0.114048,-2.02288,-0.859783,0.072388,-1.234177,0.058014,-0.672627,1.599289,1.20117,-0.851059,0.0


In [333]:
y.head()

Unnamed: 0,resolved_pe
PE1_0,1.0
PE12_0,0.0
PE14_0,0.0
PE15_0,0.0
PE16_0,0.0


In [334]:
y.value_counts(dropna=False)

resolved_pe
0.0            28
1.0            15
dtype: int64

# Logit Regression

In [335]:
def combine_feat_and_controls(feat, controls=None):
	all_feat = list()
	all_feat.append(feat)
	if controls is not None:
		for control in controls:
			all_feat.append(control)
	return all_feat

def logit_regression(X, y, feat=None, controls=None):
	all_feat = combine_feat_and_controls(feat, controls)

	# Fit statsmodels model for pvalues
	X_temp = sm.add_constant(X.loc[:, all_feat])
	model = sm.Logit(y, X_temp).fit(cov_type='HC3', disp=False)
	
	# Fit sklearn model for cross validation
	model_for_cv = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')

	logit_probs = model.predict()
	fpr, tpr, thresholds = roc_curve(y, logit_probs)
	auc = roc_auc_score(y, logit_probs)
	cv_auc = np.mean(cross_val_score(model_for_cv, X_temp, y, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1))

	return (
		model.params, 
		model.pvalues,
		fpr,
		tpr,
		thresholds,
		auc,
		cv_auc
	)

def logit_regressions(feature_pool, controls):
	logit_coefs = dict()
	logit_pvals = dict()
	logit_auc = dict()
	logit_cv_auc = dict()

	for feature in feature_pool:
		(
			coef, pvalues,
			fpr, tpr, thresholds, 
			auc, cv_auc
		) = logit_regression(X, y, feature, controls)

		logit_coefs[feature] = coef[0]
		logit_pvals[feature] = pvalues[0]
		logit_auc[feature] = auc
		logit_cv_auc[feature] = cv_auc

	if controls is None:
		controls_str = 'None'
	else:
		controls_str = ', '.join(controls)

	logit_results_df = pd.DataFrame(
		{
			'coef': logit_coefs,
			'pvalue': logit_pvals,
			'auc': logit_auc,
			'cv_auc': logit_cv_auc,
			'controls': controls_str,
		}
	)
	return logit_results_df

def plot_roc(fpr, tpr, auc, custom_title=None):
	fig, ax = plt.subplots()
	ax.plot([0, 1], ls='--', c='r')
	ax.plot([0, 0], [1, 0], c='0.8')
	ax.plot([1, 0], [1, 1], c='0.8')
	ax.plot(fpr, tpr, label=f'ROC Curve (AUC={auc:.3})')
	if custom_title is not None:
		ax.set_title(f'Y=Resolved PE, {custom_title}', fontsize=16)
	else:
		ax.set_title(f'Y=Resolved PE', fontsize=16)
	ax.set_xlabel('False Positive Rate')
	ax.set_ylabel('True Positive Rate')
	plt.legend(fontsize=14)
	plt.show()
	
def select_features_REFCV(model, scoring, features, target):
	feature_selection_results = dict()
	selector = RFECV(
		estimator=model,
		step=1,
		scoring=scoring,
		cv=CV_FOLDS
	)
	selector.fit(X[features], np.array(y).ravel())
	feature_selection_results[target] = list(selector.get_feature_names_out())
	return feature_selection_results

def select_features_SFS(model, scoring, features, target, direction):
	feature_selection_results = dict()
	selector = SequentialFeatureSelector(
		estimator=model,
		n_features_to_select='auto',
		tol=0.01,
		scoring=scoring,
		direction=direction,
		cv=10
	)
	selector.fit(X[features], np.array(y).ravel())
	feature_selection_results[target] = list(selector.get_feature_names_out())
	return feature_selection_results

def evaluate_multivariable_logit(feature_selection_results):
	regression_df = pd.DataFrame()
	target = 'resolved_pe'
	x_vars = feature_selection_results[target]
	X_temp = X.loc[:, x_vars]
	y_temp = y.loc[:, target]

	# Fit model (statsmodels, for p-values)
	model_sm = sm.Logit(y_temp, sm.add_constant(X_temp)).fit(cov_type='HC3', disp=False)

	# Fit models (sklearn, for CV scores)
	model_sk = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')
	model_sk_l2 = LogisticRegression(random_state=SEED, max_iter=10_000, penalty='l2', solver='liblinear')
	model_sk_l1 = LogisticRegression(random_state=SEED, max_iter=10_000, penalty='l1', solver='saga')
	model_sk_elastic = LogisticRegression(random_state=SEED, max_iter=10_000, penalty='elasticnet', solver='saga', l1_ratio=0.5)

	# Get score for simple model
	model_sk.fit(X_temp, np.array(y_temp).ravel())
	logit_probs = model_sk.predict_proba(X_temp)
	fpr, tpr, thresholds = roc_curve(np.array(y_temp).ravel(), logit_probs[:, 1])
	model_sk_auc = roc_auc_score(np.array(y_temp).ravel(), logit_probs[:, 1])

	# Get cross validation scores for variously regularized models
	model_sk_cv_auc = cross_val_score(model_sk, X_temp, y_temp, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1)
	model_sk_l2_cv_auc = cross_val_score(model_sk_l2, X_temp, y_temp, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1)
	model_sk_l1_cv_auc = cross_val_score(model_sk_l1, X_temp, y_temp, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1)
	model_sk_elastic_cv_auc = cross_val_score(model_sk_elastic, X_temp, y_temp, scoring='roc_auc', cv=CV_FOLDS, n_jobs=-1)

	# Store metrics in a df
	metrics_df = pd.DataFrame(
		[
			[np.mean(model_sk_auc), np.nan],
			[np.mean(model_sk_cv_auc), np.std(model_sk_cv_auc)],
			[np.mean(model_sk_l2_cv_auc), np.std(model_sk_l2_cv_auc)],
			[np.mean(model_sk_l1_cv_auc), np.std(model_sk_l1_cv_auc)],
			[np.mean(model_sk_elastic_cv_auc), np.std(model_sk_elastic_cv_auc)],
		],
		columns=['Mean AUC', 'Std AUC'],
		index=['All Data', 'No Regularization, CV-10', 'L2 Regularization, CV-10', 'L1 Regularization, CV-10', 'ElasticNet Regularization, CV-10']
	)

	# Store results in df
	res = pd.DataFrame({
		'coef': model_sm.params,
		'pval': model_sm.pvalues,
		'nobs': model_sm.nobs,
	})
	res['signif'] = res['pval'].apply(reg.add_significance)

	# Create new index
	res.index = pd.MultiIndex.from_tuples(
		list(zip([target for _ in range(res.shape[0])], res.index)),
		names=('y', 'X')
	)
	res.index.names

	# Add to final df
	regression_df = pd.concat((regression_df, res), axis=0)
	######################################################################################
	# Show results
	print("="*80)
	print(metrics_df)
	print("="*80)
	print("="*80)
	print(regression_df)
	print("="*80)
	print("\n")
	######################################################################################
	return dict(
		evaluation_metrics = metrics_df,
		regression_results = regression_df
	)



## Univariate Analysis

### Body Composition

In [336]:
# Get logit results
logit_body_dfs = list()
logit_body_dfs.append(logit_regressions(body_features, controls=None))
logit_body_dfs.append(logit_regressions(body_features, controls=['age']))
logit_body_dfs.append(logit_regressions(body_features, controls=['gender_cl_Male']))
logit_body_dfs.append(logit_regressions(body_features, controls=['gender_cl_Male', 'age']))

# Combine and reset index
all_logit_body_dfs = pd.concat(logit_body_dfs, axis=0)
all_logit_body_dfs['x_var'] = all_logit_body_dfs.index
all_logit_body_dfs.index = all_logit_body_dfs.index + ' (' + all_logit_body_dfs['controls'] + ')'

# Add significance
all_logit_body_dfs['signif'] = all_logit_body_dfs['pvalue'].apply(reg.add_significance)

# Export and show snippet
# all_logit_body_dfs.to_csv('../output/regressions/logit_body.csv')
all_logit_body_dfs.tail()

Unnamed: 0,coef,pvalue,auc,cv_auc,controls,x_var,signif
"mass_subcutaneous_fat (gender_cl_Male, age)",-1.725819,0.003343,0.804762,0.658333,"gender_cl_Male, age",mass_subcutaneous_fat,**
"volume_intermuscular_fat (gender_cl_Male, age)",-1.799,0.002745,0.788095,0.725,"gender_cl_Male, age",volume_intermuscular_fat,**
"density_intermuscular_fat (gender_cl_Male, age)",-1.777164,0.002837,0.8,0.725,"gender_cl_Male, age",density_intermuscular_fat,**
"volume_bone (gender_cl_Male, age)",-1.860463,0.00127,0.816667,0.716667,"gender_cl_Male, age",volume_bone,**
"density_bone (gender_cl_Male, age)",-1.830273,0.003253,0.821429,0.708333,"gender_cl_Male, age",density_bone,**


### Cardiopulmonary

In [337]:
# Get logit results
logit_cardio_dfs = list()
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=None))
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=['age']))
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=['gender_cl_Male']))
logit_cardio_dfs.append(logit_regressions(cardio_features, controls=['gender_cl_Male', 'age']))

# Combine and reset index
all_logit_cardio_dfs = pd.concat(logit_cardio_dfs, axis=0)
all_logit_cardio_dfs['x_var'] = all_logit_cardio_dfs.index
all_logit_cardio_dfs.index = all_logit_cardio_dfs.index + ' (' + all_logit_cardio_dfs['controls'] + ')'

# Add significance
all_logit_cardio_dfs['signif'] = all_logit_cardio_dfs['pvalue'].apply(reg.add_significance)

# Export and show snippet
# all_logit_cardio_dfs.to_csv('../output/regressions/logit_cardio.csv')
all_logit_cardio_dfs.tail()

Unnamed: 0,coef,pvalue,auc,cv_auc,controls,x_var,signif
"a_diameter (gender_cl_Male, age)",-1.799159,0.001435,0.790476,0.683333,"gender_cl_Male, age",a_diameter,**
"pv_a (gender_cl_Male, age)",-1.930897,0.001761,0.838095,0.783333,"gender_cl_Male, age",pv_a,**
"heart_volume (gender_cl_Male, age)",-1.747486,0.005053,0.802381,0.75,"gender_cl_Male, age",heart_volume,**
"airway_ratio (gender_cl_Male, age)",-1.766234,0.002736,0.8,0.708333,"gender_cl_Male, age",airway_ratio,**
"ild_volume (gender_cl_Male, age)",-1.769054,0.004748,0.807143,0.725,"gender_cl_Male, age",ild_volume,**


### Export results

In [338]:
logit_results = pd.concat([all_logit_body_dfs, all_logit_cardio_dfs], axis=0)
logit_results.to_csv('../output/regressions/logit_body_cardio.csv')

## Multivariable Analysis

In [339]:
# Define model for CV feat selection
logit = LogisticRegression(random_state=SEED, max_iter=10_000, penalty=None, solver='lbfgs')
logit_target = 'resolved_pe'

In [340]:
# List to store results
multivariable_results = dict()

### Recursive Feature Selection

In [341]:
# Cardio
refcv_cardio_features = select_features_REFCV(model=logit, scoring='roc_auc', features=cardio_features, target=logit_target)
print(refcv_cardio_features)

# Body
refcv_body_features = select_features_REFCV(model=logit, scoring='roc_auc', features=body_features, target=logit_target)
print(refcv_body_features)

{'resolved_pe': ['extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'bv5', 'pb_larger_10', 'pv_a', 'heart_volume']}
{'resolved_pe': ['mass_subcutaneous_fat', 'volume_intermuscular_fat', 'density_bone']}


In [342]:
refcv_cardio = evaluate_multivariable_logit(refcv_cardio_features)
refcv_body = evaluate_multivariable_logit(refcv_body_features)

multivariable_results['refcv_cardio'] = refcv_cardio
multivariable_results['refcv_body'] = refcv_body

                                  Mean AUC   Std AUC
All Data                          0.845238       NaN
No Regularization, CV-10          0.816667  0.311359
L2 Regularization, CV-10          0.833333  0.316228
L1 Regularization, CV-10          0.733333  0.300000
ElasticNet Regularization, CV-10  0.783333  0.307770
                                              coef      pval  nobs signif
y           X                                                            
resolved_pe const                        -0.754472  0.135693    43       
            extrapulmonary_artery_volume  1.961426  0.015018    43      *
            extrapulmonary_vein_volume   -1.493965  0.001389    43     **
            bv5                          -1.005753  0.149677    43       
            pb_larger_10                 -0.947489  0.021349    43      *
            pv_a                         -1.496098  0.104889    43       
            heart_volume                  1.473341  0.020709    43      *


              

### Sequential Feature Selection

In [343]:
# Cardio
sfs_cardio_features = select_features_SFS(model=logit, scoring='roc_auc', features=cardio_features, target=logit_target, direction='backward')
print(sfs_cardio_features)

# Body
sfs_body_features = select_features_SFS(model=logit, scoring='roc_auc', features=body_features, target=logit_target, direction='backward')
print(sfs_body_features)

{'resolved_pe': ['extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'bv5', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']}
{'resolved_pe': ['mass_subcutaneous_fat', 'volume_intermuscular_fat', 'density_intermuscular_fat', 'density_bone']}


In [344]:
sfs_cardio = evaluate_multivariable_logit(sfs_cardio_features)
sfs_body = evaluate_multivariable_logit(sfs_body_features)

multivariable_results['sfs_cardio'] = sfs_cardio
multivariable_results['sfs_body'] = sfs_body

                                  Mean AUC   Std AUC
All Data                          0.866667       NaN
No Regularization, CV-10          0.816667  0.311359
L2 Regularization, CV-10          0.783333  0.307770
L1 Regularization, CV-10          0.716667  0.289156
ElasticNet Regularization, CV-10  0.750000  0.300463
                                              coef      pval  nobs signif
y           X                                                            
resolved_pe const                        -1.020137  0.146111    43       
            extrapulmonary_artery_volume  2.076252  0.027555    43      *
            extrapulmonary_vein_volume   -1.186079  0.006435    43     **
            bv5                          -0.551734  0.275579    43       
            pv_a                         -1.634756  0.140521    43       
            heart_volume                  0.806650  0.174680    43       
            airway_ratio                  0.728829  0.043949    43      *
            ild_

### Export

In [345]:
metric_df = pd.DataFrame()

for key, result_dict in multivariable_results.items():
	temp_metric_df = result_dict['evaluation_metrics'].copy()
	feature_selection_method = [key] * temp_metric_df.shape[0]
	model_type = ['Logit'] * temp_metric_df.shape[0]
	temp_metric_df.index = pd.MultiIndex.from_tuples(zip(model_type, feature_selection_method, temp_metric_df.index))
	metric_df = pd.concat([metric_df, temp_metric_df], axis=0)

index_names = ['model_type', 'feature_selection_method', 'regularization']
metric_df = metric_df.reset_index(names=index_names)
metric_df.index = metric_df[index_names].apply('_'.join, axis=1)
metric_df.index.name = 'Lookup'

metric_df.to_csv('../output/regressions/logit_multi_metrics.csv')
metric_df.tail()

Unnamed: 0_level_0,model_type,feature_selection_method,regularization,Mean AUC,Std AUC
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logit_sfs_body_All Data,Logit,sfs_body,All Data,0.814286,
"Logit_sfs_body_No Regularization, CV-10",Logit,sfs_body,"No Regularization, CV-10",0.833333,0.144338
"Logit_sfs_body_L2 Regularization, CV-10",Logit,sfs_body,"L2 Regularization, CV-10",0.766667,0.213437
"Logit_sfs_body_L1 Regularization, CV-10",Logit,sfs_body,"L1 Regularization, CV-10",0.75,0.226691
"Logit_sfs_body_ElasticNet Regularization, CV-10",Logit,sfs_body,"ElasticNet Regularization, CV-10",0.75,0.226691


In [346]:
regression_results_df = pd.DataFrame()

for key, result_dict in multivariable_results.items():
	temp_regression_results_df = result_dict['regression_results'].copy()
	feature_selection_method = [key] * temp_regression_results_df.shape[0]
	model_type = ['Logit'] * temp_regression_results_df.shape[0]
	temp_regression_results_df.index = pd.MultiIndex.from_tuples(
		zip(
			model_type, 
			feature_selection_method, 
			temp_regression_results_df.index.get_level_values(0),
			temp_regression_results_df.index.get_level_values(1),
		)
	)
	regression_results_df = pd.concat([regression_results_df, temp_regression_results_df], axis=0)
	
index_names = ['model_type', 'feature_selection_method', 'y_var', 'x_var']
regression_results_df = regression_results_df.reset_index(names=index_names)
regression_results_df.index = regression_results_df[index_names].apply('_'.join, axis=1)
regression_results_df.index.name = 'Lookup'

regression_results_df.to_csv('../output/regressions/logit_multi_regressions.csv')
regression_results_df.tail()

Unnamed: 0_level_0,model_type,feature_selection_method,y_var,x_var,coef,pval,nobs,signif
Lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Logit_sfs_body_resolved_pe_const,Logit,sfs_body,resolved_pe,const,-0.996538,0.02063,43,*
Logit_sfs_body_resolved_pe_mass_subcutaneous_fat,Logit,sfs_body,resolved_pe,mass_subcutaneous_fat,-1.840717,0.012208,43,*
Logit_sfs_body_resolved_pe_volume_intermuscular_fat,Logit,sfs_body,resolved_pe,volume_intermuscular_fat,1.191364,0.120587,43,
Logit_sfs_body_resolved_pe_density_intermuscular_fat,Logit,sfs_body,resolved_pe,density_intermuscular_fat,-0.050414,0.853542,43,
Logit_sfs_body_resolved_pe_density_bone,Logit,sfs_body,resolved_pe,density_bone,-1.120219,0.003075,43,**


# OLD