In [159]:
from pathlib import Path
import os
import pickle
from config import model_config

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import (
	BaseEstimator, TransformerMixin
)
from sklearn.compose import (
	ColumnTransformer
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer, FunctionTransformer
)
from statsmodels.stats.outliers_influence import variance_inflation_factor



# Fixed seed, passed as random_state to the QuantileTransformer so preprocessing is reproducible
SEED = 123

# Preprocessing 

In [160]:
# Load the cleaned dataset, discard the two study-date mask columns and keep
# only the rows that have a recorded total clot burden.
# NOTE(review): unpickling can execute arbitrary code - only read trusted files.
df = (
	pd.read_pickle(Path('../data/df_clean.pkl'))
	.drop(columns=['study_date_mask_cl', 'study_date_mask_pe'])
	.dropna(subset=['total_clot_burden'])
)
print(df.shape)
df.head()

(99, 104)


Unnamed: 0,a_diameter,age,airway_ratio,airway_volume,anterior_basal_rs8,anterior_ls3,anterior_rs3,anteromedial_basal_ls7_8,apical_ls1,apical_rs1,...,ve_vco2_slope,vo2_hr_peak_percent_,vo2_ml_kg_min_at_at,vo2_work_slope_output,volume_bone,volume_intermuscular_fat,volume_muscle,volume_subcutaneous_fat,volume_visceral_fat,weight_kg
PE1_0,26.7526,49.0,0.011805,0.066788,0.201809,0.206637,0.326371,0.06566,0.0,0.399756,...,,45.0,9.6,,2.03983,0.986196,5.87431,6.59418,0.242408,150.0
PE12_0,21.0758,45.0,0.014465,0.06984,0.395899,0.0,0.002098,0.0,0.0,0.041968,...,29.557,75.0,10.7,8.038,1.85461,0.904912,6.00182,4.8675,1.98806,111.5
PE12_1,22.9284,45.0,0.018161,0.047697,0.0,0.0,0.0,0.0,0.0,0.0,...,29.557,75.0,10.7,8.038,2.02425,0.629943,8.01657,6.28189,2.44348,111.5
PE14_0,22.3061,31.0,0.010895,0.054162,0.238598,0.0,0.0,0.0,0.0,0.0,...,32.776,92.0,10.2,7.718,1.68335,0.347496,5.08316,4.88387,0.470233,85.55
PE15_0,17.006,18.0,0.013111,0.029738,0.0,0.0,0.0,0.0,0.0,0.0,...,28.446,80.0,17.1,,1.16264,0.168386,3.62723,1.53712,0.017716,55.0


## Pipelines

In [161]:
# Numeric columns: everything typed as int or float
num_columns = df.select_dtypes(include=['int', 'float']).columns.tolist()

# Categorical predictors: category-typed columns minus the categorical targets
categorical_index = df.select_dtypes(include=['category']).columns
cat_columns = categorical_index.difference(model_config.cat_targets).tolist()

all_columns = [*num_columns, *cat_columns]

In [162]:
# Columns that must be fully observed: targets, both feature groups and controls
cols = (
	model_config.cat_targets
	+ model_config.num_targets
	+ model_config.body_feat
	+ model_config.cardiopulmonary_feat
	+ model_config.controls
)

# Complete-case subset: discard any row with a missing value in those columns
df_nonnull = df.dropna(subset=cols)
df_nonnull.shape

(90, 104)

In [163]:
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
	"""Integer-encode a single label column so it can be used inside a ColumnTransformer.

	``LabelEncoder`` expects a 1-D array, while ``ColumnTransformer`` hands each
	transformer a 2-D column block. This wrapper flattens the single column,
	learns the label -> integer mapping once in ``fit`` and reuses that same
	mapping in ``transform``, then returns the codes as an ``(n_samples, 1)``
	column so they can be stacked with the other transformers' outputs.
	"""

	@staticmethod
	def _as_1d(X):
		"""Return ``X`` as a 1-D array; accepts 1-D input or a single-column 2-D block."""
		values = np.asarray(X)
		if values.ndim == 2 and values.shape[1] == 1:
			# Select the column explicitly (np.squeeze would also collapse a
			# single-row input down to a 0-d scalar)
			values = values[:, 0]
		if values.ndim != 1:
			raise ValueError(
				f"LabelEncoderTransformer expects exactly one column, got input of shape {values.shape}"
			)
		return values

	def fit(self, X, y=None):
		"""Learn the label -> integer mapping from ``X``; ``y`` is ignored."""
		self.encoder_ = LabelEncoder().fit(self._as_1d(X))
		return self

	def transform(self, X, y=None):
		"""Encode ``X`` with the mapping learned in ``fit``.

		Returns an integer array of shape ``(n_samples, 1)``.
		Raises ``NotFittedError`` when called before ``fit``.
		"""
		encoder = getattr(self, 'encoder_', None)
		if encoder is None:
			# Local import: only needed on this error path
			from sklearn.exceptions import NotFittedError
			raise NotFittedError(
				"This LabelEncoderTransformer instance is not fitted yet. Call 'fit' first."
			)
		return encoder.transform(self._as_1d(X)).reshape(-1, 1)

	def get_feature_names_out(self, X, y=None):
		"""Return the input feature names unchanged (one output column per input column)."""
		return X


# Numeric columns: standardise, then map onto a normal distribution via quantiles
numeric_steps = [
	('Scaler', StandardScaler()),
	(
		'QuantileTransformer',
		QuantileTransformer(
			n_quantiles=20,
			output_distribution='normal',
			random_state=SEED,
		),
	),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical predictors: one-hot encode, dropping the first level of each column
categorical_steps = [
	('Encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its transformer; columns not listed pass through untouched
column_routes = [
	('label_encoder', LabelEncoderTransformer(), model_config.cat_targets),
	('num_pipeline', num_pipeline, num_columns),
	('cat_pipeline', cat_pipeline, cat_columns),
]
preprocessor = ColumnTransformer(
	transformers=column_routes,
	remainder='passthrough',
)

In [164]:
# Fit the preprocessing on the complete-case rows and transform them in one pass
preprocessed = preprocessor.fit_transform(df_nonnull)
# ColumnTransformer can return a scipy sparse matrix when the stacked output is
# mostly zeros; densify so the values can be wrapped in a DataFrame
if hasattr(preprocessed, 'toarray'):
	preprocessed = preprocessed.toarray()

# Get the feature names for the entire ColumnTransformer, in output order.
# Assumes every column selection is a list of labels or positions, as in the
# preprocessor defined in this notebook.
all_feature_names = list()
for transformer_name, transformer, columns in preprocessor.transformers_:
	is_placeholder = isinstance(transformer, str)  # 'drop' / 'passthrough'
	# Dropped or empty selections contribute no output columns (and an empty
	# selection leaves its transformer unfitted, so it cannot be queried)
	if (is_placeholder and transformer == 'drop') or len(columns) == 0:
		continue
	# Remainder columns may be reported as positions; translate them to labels
	column_labels = [
		col if isinstance(col, str) else df_nonnull.columns[col]
		for col in columns
	]
	if is_placeholder:
		# 'passthrough' is a plain string with no get_feature_names_out method;
		# its columns keep their original names
		all_feature_names.extend(column_labels)
	else:
		all_feature_names.extend(transformer.get_feature_names_out(column_labels))

df_pp = pd.DataFrame(
	preprocessed,
	index=df_nonnull.index,
	columns=all_feature_names
)

df_pp.head()

Unnamed: 0,resolved_pe,a_diameter,age,airway_ratio,airway_volume,anterior_basal_rs8,anterior_ls3,anterior_rs3,anteromedial_basal_ls7_8,apical_ls1,...,inappropriate_o2_extraction_1.0,normal_1,normal_study_1,preload_insufficiency_1.0,preload_insufficiency_nan,race_White,resting_hfpef_1.0,resting_pah_1,surpassed_ventilatory_ceiling_1.0,systemic_htn_response_1.0
PE1_0,0.0,0.947761,-0.099108,-0.337578,1.208424,1.145468,1.006726,1.682629,0.722378,-5.199338,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_0,1.0,-0.640149,-0.382953,0.469267,1.439863,1.620714,-5.199338,0.523349,-5.199338,-5.199338,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_1,1.0,0.030429,-0.382953,1.594807,0.302507,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE14_0,1.0,-0.20047,-1.008759,-0.634274,0.600126,1.250845,-5.199338,-5.199338,-5.199338,-5.199338,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
PE15_0,1.0,-1.654127,-5.199338,0.077566,-1.009554,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [165]:
# Every target column: numeric targets first, then categorical targets
targets = model_config.num_targets + model_config.cat_targets

# Predictors are all remaining preprocessed columns
feature_columns = df_pp.columns.difference(targets)
X = df_pp[feature_columns]

# Targets
Y = df_pp[targets]

print(f"X.shape: {X.shape}")
print(f"Y.shape: {Y.shape}")

X.shape: (90, 81)
Y.shape: (90, 22)


### Variance Inflation Factor Elimination

We will loop through several VIF elimination thresholds to see which variables remain at each cutoff. We will then select one cutoff and use it to eliminate features from both the body composition and the cardiopulmonary data.

In [166]:
# VIF threshold whose column is read from the threshold-sweep tables when selecting the final features
VIF_CUTOFF = 4

In [167]:
def sequential_VIF(df, threshold):
	"""Iteratively drop the column with the highest VIF until all VIFs are <= threshold.

	Parameters
	----------
	df : pandas.DataFrame
		Numeric feature matrix. The caller's frame is not modified.
	threshold : number
		Largest variance inflation factor allowed to remain.

	Returns
	-------
	pandas.DataFrame
		One column, labelled ``threshold``, holding the final VIF of every
		surviving feature; the index (named ``'Variable'``) holds the feature
		names.

	Notes
	-----
	The values are passed to ``variance_inflation_factor`` as-is; no constant
	column is added here.
	NOTE(review): confirm the inputs are centred, otherwise VIFs may be inflated.
	"""
	def _vif_table(frame):
		# VIF of every column, indexed by feature name so the result always
		# aligns on names, even when no column needs to be eliminated
		return pd.DataFrame(
			{'VIF': [variance_inflation_factor(frame.values, i) for i in range(frame.shape[-1])]},
			index=pd.Index(frame.columns, name='Variable'),
		)

	vif = _vif_table(df)
	while vif['VIF'].max() > threshold:
		# Remove the worst offender (first one on ties), then re-evaluate the rest
		df = df.drop(columns=vif['VIF'].idxmax())
		vif = _vif_table(df)

	# Label the column with the threshold so several results can be concatenated side by side
	vif.columns = [threshold]

	return vif

#### Body Composition VIF Feature Elimination

In [168]:
# Iterable of thresholds (2 through 10)
thresholds = np.arange(2, 11)

# Subset to body data only; this does not depend on the threshold, so build it once
body_df = df_pp.loc[:, model_config.body_feat].dropna(axis=0, how='any')

# One single-column VIF table per threshold
body_vif_tables = [sequential_VIF(body_df, thresh) for thresh in thresholds]

# Concatenate once, starting from an empty frame indexed by every body feature
# so that eliminated features still appear (as NaN) in the result
body_vif_df = pd.concat(
	[pd.DataFrame(index=model_config.body_feat), *body_vif_tables],
	axis=1,
)

body_vif_df

Unnamed: 0,2,3,4,5,6,7,8,9,10
volume_visceral_fat,,,,,,,,,
density_visceral_fat,1.89381,2.068432,2.907728,3.228633,3.228633,3.27326,3.582931,3.582931,3.582931
mass_visceral_fat,1.881658,1.990025,2.396552,2.451086,2.451086,2.463734,2.635936,2.635936,2.635936
volume_subcutaneous_fat,,,,,,,,,
density_subcutaneous_fat,,,3.370755,3.789219,3.789219,3.986596,4.663525,4.663525,4.663525
mass_subcutaneous_fat,,,,4.376631,4.376631,4.432205,5.724696,5.724696,5.724696
volume_intermuscular_fat,,2.851287,2.901099,3.315116,3.315116,3.350177,3.457404,3.457404,3.457404
density_intermuscular_fat,1.75865,1.840109,2.097487,2.0995,2.0995,2.101524,2.153714,2.153714,2.153714
mass_intermuscular_fat,,,,,,,,,
volume_muscle,,,,,,,,,


In [169]:
# Body features that still carry a VIF value at the chosen cutoff
body_vif_at_cutoff = body_vif_df[VIF_CUTOFF].dropna()
uncorrelated_body_feat = body_vif_at_cutoff.index.tolist()
uncorrelated_body_feat

['density_visceral_fat',
 'mass_visceral_fat',
 'density_subcutaneous_fat',
 'volume_intermuscular_fat',
 'density_intermuscular_fat',
 'density_muscle',
 'mass_muscle',
 'density_bone',
 'mass_bone',
 'bsa']

#### Cardiopulmonary VIF Feature Elimination 

In [170]:
# Iterable of thresholds (2 through 10)
thresholds = np.arange(2, 11)

# Subset to cardiopulmonary data; this does not depend on the threshold, so build it once
cardio_df = df_pp.loc[:, model_config.cardiopulmonary_feat].dropna(axis=0, how='any')

# One single-column VIF table per threshold
cardio_vif_tables = [sequential_VIF(cardio_df, thresh) for thresh in thresholds]

# Concatenate once, starting from an empty frame indexed by every cardiopulmonary
# feature so that eliminated features still appear (as NaN) in the result
cardio_vif_df = pd.concat(
	[pd.DataFrame(index=model_config.cardiopulmonary_feat), *cardio_vif_tables],
	axis=1,
)

cardio_vif_df

Unnamed: 0,2,3,4,5,6,7,8,9,10
emphysema_volume_950hu,1.614103,1.650683,1.650683,1.703475,1.703475,1.703627,1.703627,1.703627,1.703627
lung_volume,,,,,,,,,
extrapulmonary_artery_volume,1.750693,2.324961,2.324961,2.656478,2.656478,2.664066,2.664066,2.664066,2.664066
extrapulmonary_vein_volume,1.824991,1.950475,1.950475,2.891703,2.891703,2.950722,2.950722,2.950722,2.950722
intrapulmonary_artery_volume,,,,4.397801,4.397801,4.609991,4.609991,4.609991,4.609991
intrapulmonary_vein_volume,,,,,,,,,
artery_vein_ratio,1.455782,1.462753,1.462753,1.505552,1.505552,1.515181,1.515181,1.515181,1.515181
bv5,1.69634,2.393507,2.393507,2.563207,2.563207,5.258391,5.258391,5.258391,5.258391
bv10,,,,,,6.731174,6.731174,6.731174,6.731174
pb_larger_10,1.832267,2.076968,2.076968,2.092354,2.092354,2.094179,2.094179,2.094179,2.094179


In [171]:
# Cardiopulmonary features that still carry a VIF value at the chosen cutoff
cardio_vif_at_cutoff = cardio_vif_df[VIF_CUTOFF].dropna()
uncorrelated_cardio_feat = cardio_vif_at_cutoff.index.tolist()
uncorrelated_cardio_feat

['emphysema_volume_950hu',
 'extrapulmonary_artery_volume',
 'extrapulmonary_vein_volume',
 'artery_vein_ratio',
 'bv5',
 'pb_larger_10',
 'a_diameter',
 'pv_a',
 'heart_volume',
 'airway_volume',
 'airway_ratio',
 'ild_volume']

In [172]:
# Predictors shared by both tasks: VIF-filtered body and cardiopulmonary
# features plus the encoded control variables
selected_features = (
	uncorrelated_body_feat
	+ uncorrelated_cardio_feat
	+ model_config.controls_encoded
)

# Each column list is its targets followed by the shared predictors
all_needed_columns = model_config.cat_targets + model_config.num_targets + selected_features
prediction_needed_columns = model_config.num_targets + selected_features
classification_needed_columns = model_config.cat_targets + selected_features

# Check that every needed column exists in the preprocessed frame, and name
# the offenders when one does not
for list_name, needed_columns in (
	('all_needed_columns', all_needed_columns),
	('prediction_needed_columns', prediction_needed_columns),
	('classification_needed_columns', classification_needed_columns),
):
	missing_columns = [col for col in needed_columns if col not in df_pp.columns]
	assert not missing_columns, f"{list_name} not found in df_pp: {missing_columns}"

## Prediction

### Drop missing values for needed columns

In [173]:
# Predictors used for the numeric-target task
prediction_features = uncorrelated_body_feat + uncorrelated_cardio_feat + model_config.controls_encoded

# Keep only the needed columns, then drop every row with a missing value in any of them
df_prediction = df_pp[prediction_needed_columns].dropna()
X_prediction = df_prediction[prediction_features]
y_prediction = df_prediction[model_config.num_targets]

print(f"X.shape: {X_prediction.shape}")
print(f"y.shape: {y_prediction.shape}")

# Bundle the matrices with the feature lists that produced them
prediction_data = {
	'X': X_prediction,
	'y': y_prediction,
	'body_features': uncorrelated_body_feat,
	'cardio_features': uncorrelated_cardio_feat,
}

X.shape: (90, 24)
y.shape: (90, 21)


### Export

In [174]:
# Serialise the prediction bundle to disk
prediction_path = Path('../data/prediction_data.pkl')
with prediction_path.open('wb') as f:
    pickle.dump(prediction_data, f)

## Classification

### Drop missing values for needed columns

In [175]:
# Predictors used for the categorical-target task
classification_features = uncorrelated_body_feat + uncorrelated_cardio_feat + model_config.controls_encoded

# Row filter pe_obs == 0, taken from the raw frame `df`.
# NOTE(review): `df` has more rows than `df_pp`; this relies on .loc aligning
# the boolean Series to df_pp by index label - confirm the index is unique.
pe_obs_is_zero = df['pe_obs'] == 0

# Keep the filtered rows and needed columns, then drop rows with any missing value
df_classification = df_pp.loc[pe_obs_is_zero, classification_needed_columns].dropna()
X_classification = df_classification[classification_features]
y_classification = df_classification[model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

# Bundle the matrices with the feature lists that produced them
classification_data = {
	'X': X_classification,
	'y': y_classification,
	'body_features': uncorrelated_body_feat,
	'cardio_features': uncorrelated_cardio_feat,
}

X.shape: (42, 24)
y.shape: (42, 1)


### Export

In [176]:
# Serialise the classification bundle to disk
classification_path = Path('../data/classification_data.pkl')
with classification_path.open('wb') as f:
    pickle.dump(classification_data, f)