In [1]:
from pathlib import Path
import os
import pickle
from config import model_config

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import (
	BaseEstimator, TransformerMixin
)
from sklearn.compose import (
	ColumnTransformer
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer, FunctionTransformer, 
    PowerTransformer
)
from statsmodels.stats.outliers_influence import variance_inflation_factor



# Random seed constant. NOTE(review): not referenced by any cell visible in this
# notebook — presumably intended for stochastic steps; confirm or remove.
SEED = 123

In [2]:
# Load the pickled frame, discard the two masked study-date columns, and keep
# only the rows that have a total clot burden recorded.
raw_path = Path('../data/df_clean_w_outliers.pkl')
df = (
    pd.read_pickle(raw_path)
    .drop(columns=['study_date_mask_cl', 'study_date_mask_pe'])
    .dropna(subset='total_clot_burden')
)
# Recode the resolution status to 0 (unresolved) / 1 (resolved).
df['resolved_pe'] = df['resolved_pe'].map({'Unresolved': 0, 'Resolved': 1})
print(df.shape)
df.head()

(104, 116)


Unnamed: 0,pe_study_number,patient_id,gender_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,...,peak_paao2,peak_fick_co,peak_vd_vt,peak_cavo2_a_art_hb,peak_pa_elastance_ea_mmhg_ml_m2,hyperventilation,hyperventilation_num,normal,dob_mask_cl,study_age_cl
PE1_0,PE1,HB046619,M,3.0,174.0,11.9319,0.399756,0.412309,0.326371,0.653708,...,2.905439,15.604727,0.13,0.88597,1.170445,1,1.0,0,1977-12-13,49.886379
PE12_0,PE12,HB046622,M,4.0,264.0,0.102822,0.041968,0.0,0.002098,0.405691,...,19.274715,16.523663,0.205714,0.799035,0.540537,1,1.0,0,1986-05-02,45.412731
PE12_1,PE12,HB046622,M,2.0,95.0,1.39117,0.0,0.0,0.0,0.114441,...,19.274715,16.523663,0.205714,0.799035,0.540537,1,1.0,0,1986-05-02,45.412731
PE14_0,PE14,HB046616,F,4.0,111.0,0.825384,0.0,0.0,0.0,0.097104,...,3.302519,18.254263,0.21,0.737339,0.650262,1,1.0,0,1997-04-05,31.616701
PE15_0,PE15,HB046618,F,3.0,315.0,0.0,0.0,0.0,0.0,0.0,...,23.80438,11.849891,0.26,0.778135,0.559831,0,0.0,0,2010-08-21,18.699521


# Preprocessing 

## Pipelines

The primary pipeline passes all numeric data through a Yeo-Johnson power transformer with standardization, so that each variable is approximately normally distributed with zero mean and unit variance. This is especially important for the target variables (clot burdens), because they often do not follow a normal distribution. Categorical features are passed through a one-hot encoder (the resulting indicator columns are power-transformed together with the numeric data). The categorical label is passed through a label encoder.


In [3]:
# Column groups by dtype: int/float columns are treated as numeric; category
# columns (minus the categorical targets) are the categorical features.
num_columns = df.select_dtypes(['int', 'float']).columns.tolist()
categorical_index = df.select_dtypes(['category']).columns
cat_columns = categorical_index.difference(model_config.cat_targets).tolist()
all_columns = [*num_columns, *cat_columns]

In [4]:
# Columns that must be non-null: targets, both feature groups and the controls.
cols = [
    *model_config.cat_targets,
    *model_config.num_targets,
    *model_config.body_feat,
    *model_config.cardiopulmonary_feat,
    *model_config.controls,
]

# Complete cases with respect to the columns listed above.
df_nonnull = df.dropna(subset=cols)
df_nonnull.shape

(95, 116)

In [5]:
# Preprocessing objects: a label encoder for the categorical target, a
# standardising Yeo-Johnson power transform, and a one-hot encoder that drops
# the first level of each categorical feature.
label_encoder = LabelEncoder()
transformer = PowerTransformer(method='yeo-johnson', standardize=True)
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

# Encode the categorical target as integer labels, keeping the row index.
target_labels = label_encoder.fit_transform(
    df_nonnull[model_config.cat_targets].squeeze()
)
y_temp = pd.Series(target_labels, index=df_nonnull.index, name='resolved_pe')

# One-hot encode the categorical features.
dummy_values = encoder.fit_transform(df_nonnull[cat_columns])
df_temp_cat = pd.DataFrame(
    dummy_values,
    index=df_nonnull.index,
    columns=encoder.get_feature_names_out(),
)

# Power-transform the numeric columns together with the indicator columns,
# then restore the column names and row index on the transformed array.
df_temp_num = pd.concat([df_nonnull[num_columns], df_temp_cat], axis=1)
transformed_values = transformer.fit_transform(df_temp_num)
df_temp_num = pd.DataFrame(
    transformed_values,
    columns=df_temp_num.columns,
    index=df_temp_num.index,
)

# Final preprocessed frame: encoded target first, then all transformed columns.
df_pp = pd.concat([y_temp, df_temp_num], axis=1)
df_pp.head()

Unnamed: 0,resolved_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,medial_rs5,superior_rs6,...,inappropriate_o2_extraction_1.0,normal_1,normal_study_1,preload_insufficiency_1.0,preload_insufficiency_nan,race_White,resting_hfpef_1.0,resting_pah_1,surpassed_ventilatory_ceiling_1.0,systemic_htn_response_1.0
PE1_0,1,-0.264415,0.466237,1.281445,1.806963,2.116559,2.230291,1.966303,2.395154,2.299967,...,-0.695971,-0.781133,-0.781133,1.877181,-0.235702,0.41574,-0.282038,-0.180579,-0.342997,-1.198289
PE12_0,0,0.255038,1.246744,-0.839974,-0.099962,-0.513168,-0.478392,1.605058,1.245726,-0.492346,...,-0.695971,-0.781133,-0.781133,-0.532714,-0.235702,0.41574,-0.282038,-0.180579,-0.342997,-1.198289
PE12_1,0,-1.13914,-0.802803,0.156289,-0.597556,-0.513168,-0.519294,0.416968,-0.496983,-0.626172,...,-0.695971,-0.781133,-0.781133,-0.532714,-0.235702,0.41574,-0.282038,-0.180579,-0.342997,-1.198289
PE14_0,0,0.255038,-0.460189,-0.143623,-0.597556,-0.513168,-0.519294,0.288128,-0.496983,-0.626172,...,1.436842,-0.781133,-0.781133,-0.532714,-0.235702,0.41574,-0.282038,-0.180579,-0.342997,0.834523
PE15_0,0,-0.264415,1.556086,-0.999449,-0.597556,-0.513168,-0.519294,-0.669858,-0.496983,-0.626172,...,1.436842,-0.781133,-0.781133,1.877181,-0.235702,0.41574,-0.282038,-0.180579,-0.342997,-1.198289


In [6]:
def plot_distributions(data, features, fname, pre_or_post, rows, cols, figsize=(12, 4)):
    """Save a grid of histograms, one panel per feature, under ``../figures``.

    Parameters
    ----------
    data : DataFrame-like
        Object indexable by feature name (``data[feat_name]``).
    features : sequence of str
        Feature names to plot, one per panel, filled row by row.
    fname : str
        Output file name inside ``../figures``. The ``.png`` extension is
        optional: it is appended only when missing.
    pre_or_post : str
        Text interpolated into the figure title
        (``'Distributions: {pre_or_post}-processing'``).
    rows, cols : int
        Shape of the subplot grid. Features beyond ``rows * cols`` are not
        drawn; unused panels are left empty.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.
    """
    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid (where
    # plt.subplots would otherwise return a bare Axes) works as well.
    fig, axs = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    # zip stops at the shorter of (panels, features).
    for ax, feat_name in zip(axs.ravel(), features):
        ax.hist(data[feat_name], bins=20)
        ax.set_title(feat_name, fontsize=10)
    fig.suptitle(f'Distributions: {pre_or_post}-processing')
    fig.tight_layout()
    # Accept names with or without the extension so that 'x.png' is not
    # written as 'x.png.png'.
    stem = fname[:-len('.png')] if fname.endswith('.png') else fname
    fig.savefig(f'../figures/{stem}.png')
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)

# One entry per saved figure: (frame, features, file name, stage, grid rows, figure size).
# Each feature group is plotted before (df) and after (df_pp) preprocessing.
distribution_jobs = (
    (df, model_config.body_feat, 'distribution_body_pre.png', 'pre', 3, (12, 4)),
    (df_pp, model_config.body_feat, 'distribution_body_post.png', 'post', 3, (12, 4)),
    (df, model_config.cardiopulmonary_feat, 'distribution_cardio_pre.png', 'pre', 3, (12, 4)),
    (df_pp, model_config.cardiopulmonary_feat, 'distribution_cardio_post.png', 'post', 3, (12, 4)),
    (df, model_config.num_targets, 'distribution_targets_pre.png', 'pre', 4, (12, 5)),
    (df_pp, model_config.num_targets, 'distribution_targets_post.png', 'post', 4, (12, 5)),
)
for frame, feature_names, file_name, stage, n_rows, size in distribution_jobs:
    plot_distributions(frame, feature_names, file_name, stage, n_rows, 6, figsize=size)

In [7]:
# Split the preprocessed frame: Y holds the numeric and categorical targets,
# X holds every remaining column (in sorted order, as returned by `difference`).
targets = [*model_config.num_targets, *model_config.cat_targets]
feature_columns = df_pp.columns.difference(targets)

X = df_pp[feature_columns]
Y = df_pp[targets]

print(f"X.shape: {X.shape}")
print(f"Y.shape: {Y.shape}")

X.shape: (95, 87)
Y.shape: (95, 22)


### Variance Inflation Factor Elimination

We loop over several VIF elimination thresholds to see which variables remain at each cutoff. From there, we select a specific cutoff and eliminate features from the body composition, cardiopulmonary, and control variables.

In [8]:
def sequential_VIF(df, threshold):
    """Iteratively drop the highest-VIF column until every VIF is <= threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; the VIF of each column is computed against all others.
        The caller's frame is not modified.
    threshold : float
        Largest variance inflation factor allowed to remain.

    Returns
    -------
    pandas.DataFrame
        One column named ``'VIF<={threshold}'`` holding the VIF of each
        surviving feature, indexed by feature name (index name ``'Variable'``).
        The index is always the feature names, including when no feature had
        to be removed.
    """
    def _vif_table(frame):
        # VIF of every column of `frame`, labelled by column name.
        values = frame.values
        return pd.Series(
            [variance_inflation_factor(values, i) for i in range(frame.shape[-1])],
            index=frame.columns,
            dtype=float,
        )

    vif = _vif_table(df)
    # max()/idxmax() skip NaN; an empty table yields NaN and ends the loop.
    while vif.max() > threshold:
        df = df.drop(vif.idxmax(), axis=1)
        vif = _vif_table(df)

    # Label by feature name unconditionally (previously this happened only
    # inside the loop, leaving a positional index when nothing was dropped).
    return vif.rename_axis('Variable').to_frame(name=f'VIF<={threshold}')

#### Body Composition VIF Feature Elimination

#### Cardiopulmonary VIF Feature Elimination 

#### All Numerical VIF Feature Elimination 

In [9]:
# Iterable of thresholds
thresholds = np.arange(2, 6.5, 0.5)

# Create dataframe to store results
feat_index = model_config.body_feat + model_config.cardiopulmonary_feat + model_config.controls_encoded
all_num_vif_df = pd.DataFrame(
	index=feat_index
)

# Loop through thresholds
for thresh in thresholds:
	# Subset to cardiopulmonary data
	feat_df = df_pp.loc[:, feat_index].dropna(axis=0, how='any')
	vif = sequential_VIF(feat_df, thresh)
	all_num_vif_df = pd.concat((all_num_vif_df, vif), axis=1)

all_num_vif_df.to_csv('../output/regressions/vif_feature_elimination.csv')

In [10]:
VIF_CUTOFF = 3.5

# Features that survived elimination at the chosen cut-off are the ones with a
# non-null entry in the matching 'VIF<=...' column.
surviving_vif = all_num_vif_df[f"VIF<={VIF_CUTOFF}"].dropna()
uncorrelated_feat = surviving_vif.index.tolist()

# Split the survivors back into their original feature groups.
uncorrelated_body_feat = [
    name for name in uncorrelated_feat if name in model_config.body_feat
]
uncorrelated_cardio_feat = [
    name for name in uncorrelated_feat if name in model_config.cardiopulmonary_feat
]
uncorrelated_controls = [
    name for name in uncorrelated_feat if name in model_config.controls_encoded
]

print(f"Body feat:\n{uncorrelated_body_feat}")
print(f"\nCardio feat:\n{uncorrelated_cardio_feat}")
print(f"\nControls:\n{uncorrelated_controls}")

Body feat:
['density_visceral_fat', 'density_intermuscular_fat', 'volume_bone', 'density_bone', 'bmi']

Cardio feat:
['emphysema_volume_950hu', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv10', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']

Controls:
['age', 'gender_cl_Male']


In [11]:
# Pairwise correlations among the features retained after VIF elimination,
# computed on the complete-case feature matrix and saved to CSV.
uncorrelated_features = [
    *uncorrelated_body_feat,
    *uncorrelated_cardio_feat,
    *uncorrelated_controls,
]

retained_frame = feat_df.loc[:, uncorrelated_features]
post_vif_corr = retained_frame.corr()
post_vif_corr.to_csv('../output/regressions/post_vif_corr.csv')
post_vif_corr

Unnamed: 0,density_visceral_fat,density_intermuscular_fat,volume_bone,density_bone,bmi,emphysema_volume_950hu,extrapulmonary_vein_volume,artery_vein_ratio,bv10,pb_larger_10,a_diameter,pv_a,heart_volume,airway_ratio,ild_volume,age,gender_cl_Male
density_visceral_fat,1.0,0.4749,-0.282723,0.119525,-0.248317,-0.052039,-0.264514,0.161648,-0.072867,-0.312365,-0.343782,-0.125083,-0.291607,-0.118713,-0.106771,-0.293805,-0.276747
density_intermuscular_fat,0.4749,1.0,0.105116,-0.28473,-0.109116,-0.175005,-0.307024,0.090303,-0.390661,0.071313,0.13359,-0.334997,0.042096,0.002069,0.004683,0.007652,0.080641
volume_bone,-0.282723,0.105116,1.0,-0.049216,0.164692,-0.047271,0.232174,-0.133144,-0.194251,0.470024,0.197637,-0.15265,0.608244,0.047382,0.043464,0.187785,0.550064
density_bone,0.119525,-0.28473,-0.049216,1.0,-0.055446,0.026801,0.094461,-0.036928,0.198911,-0.216137,-0.536506,0.124393,-0.24878,-0.150023,0.075323,-0.443538,-0.269084
bmi,-0.248317,-0.109116,0.164692,-0.055446,1.0,-0.317974,-0.021268,0.066075,-0.278526,0.534097,0.08756,-0.064201,0.256326,-0.239791,0.091729,0.082041,-0.159457
emphysema_volume_950hu,-0.052039,-0.175005,-0.047271,0.026801,-0.317974,1.0,0.141239,0.093418,0.45152,-0.357728,-0.019412,0.080336,-0.127615,-0.016814,-0.447791,0.185432,-0.057444
extrapulmonary_vein_volume,-0.264514,-0.307024,0.232174,0.094461,-0.021268,0.141239,1.0,-0.44455,0.457665,0.035134,0.06731,0.318772,0.286687,0.057001,-0.140956,0.106826,0.256262
artery_vein_ratio,0.161648,0.090303,-0.133144,-0.036928,0.066075,0.093418,-0.44455,1.0,-0.043309,0.103381,-0.138109,0.06952,-0.078048,-0.309405,-0.197841,-0.045256,-0.298028
bv10,-0.072867,-0.390661,-0.194251,0.198911,-0.278526,0.45152,0.457665,-0.043309,1.0,-0.291455,-0.100551,0.510574,-0.041319,-0.047151,-0.436209,-0.12372,-0.098391
pb_larger_10,-0.312365,0.071313,0.470024,-0.216137,0.534097,-0.357728,0.035134,0.103381,-0.291455,1.0,0.283404,0.0652,0.544185,-0.15166,0.096435,0.071137,0.245219


#### Ensure all necessary columns are in data

In [12]:
# Features retained after VIF elimination; shared by every column list below.
selected_features = uncorrelated_body_feat + uncorrelated_cardio_feat + uncorrelated_controls

# Targets of each task followed by the retained features.
all_needed_columns = [
    *model_config.cat_targets,
    *model_config.num_targets,
    *selected_features,
]
prediction_needed_columns = [*model_config.num_targets, *selected_features]
classification_needed_columns = [*model_config.cat_targets, *selected_features]

# Every requested column must exist in the preprocessed frame.
available_columns = set(df_pp.columns)
for needed in (all_needed_columns, prediction_needed_columns, classification_needed_columns):
    assert set(needed) <= available_columns

## Prediction

### Drop missing values for needed columns

In [13]:
# Keep only the columns needed for prediction and drop rows with any missing value.
df_prediction = df_pp.loc[:, prediction_needed_columns].dropna()
# Use the VIF-filtered controls: these are the control columns that were
# selected into `prediction_needed_columns` and that are recorded under
# `controls` below. Using the unfiltered `model_config.controls_encoded`
# would raise a KeyError whenever VIF elimination removed a control.
prediction_features = uncorrelated_body_feat + uncorrelated_cardio_feat + uncorrelated_controls
X_prediction = df_prediction.loc[:, prediction_features]
y_prediction = df_prediction.loc[:, model_config.num_targets]

print(f"X.shape: {X_prediction.shape}")
print(f"y.shape: {y_prediction.shape}")

# Bundle exported for the prediction models: design matrix, numeric targets,
# and the names of the columns belonging to each feature group.
prediction_data = dict(
    X = X_prediction,
    y = y_prediction,
    body_features = uncorrelated_body_feat,
    cardio_features = uncorrelated_cardio_feat,
    controls = uncorrelated_controls
)

X.shape: (95, 17)
y.shape: (95, 21)


### Export

In [14]:
# Serialise the prediction bundle to disk.
prediction_path = Path('../data/prediction_data.pkl')
with prediction_path.open('wb') as handle:
    pickle.dump(prediction_data, handle)

## Classification

### Drop missing values for needed columns

In [15]:
# Keep only the columns needed for classification and drop rows with any missing value.
df_classification = df_pp.loc[:, classification_needed_columns].dropna()
# Disabled alternative kept for reference: restrict to rows where df.pe_obs == 0
# df_classification = df_pp.loc[df.pe_obs==0, classification_needed_columns].dropna()
# Use the VIF-filtered controls: these are the control columns that were
# selected into `classification_needed_columns` and that are recorded under
# `controls` below. Using the unfiltered `model_config.controls_encoded`
# would raise a KeyError whenever VIF elimination removed a control.
classification_features = uncorrelated_body_feat + uncorrelated_cardio_feat + uncorrelated_controls
X_classification = df_classification.loc[:, classification_features]
y_classification = df_classification.loc[:, model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

# Bundle exported for the classification models: design matrix, categorical
# target, and the names of the columns belonging to each feature group.
classification_data = dict(
    X = X_classification,
    y = y_classification,
    body_features = uncorrelated_body_feat,
    cardio_features = uncorrelated_cardio_feat,
    controls = uncorrelated_controls
)

X.shape: (95, 17)
y.shape: (95, 1)


In [16]:
# NaN count per column of X, restricted to the columns that have any.
missing_per_column = X.isna().sum()
missing_per_column[missing_per_column > 0]

peak_cavo2_a_art_hb                 1
peak_pa_elastance_ea_mmhg_ml_m2     1
peak_paao2                          1
peak_pvr_wu                         2
ve_vco2_slope                      10
vo2_work_slope_output              14
dtype: int64

### Export

In [17]:
# Serialise the classification bundle to disk.
classification_path = Path('../data/classification_data.pkl')
with classification_path.open('wb') as handle:
    pickle.dump(classification_data, handle)