In [2]:
from pathlib import Path
import os
import pickle
from config import model_config

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import (
	BaseEstimator, TransformerMixin
)
from sklearn.compose import (
	ColumnTransformer
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer, FunctionTransformer
)
from statsmodels.stats.outliers_influence import variance_inflation_factor



# Fixed seed; passed as random_state to the QuantileTransformer below so the
# preprocessing output is reproducible across runs.
SEED = 123

# Preprocessing 

In [3]:
# Load ../data/df_clean.pkl, discard the two study_date_mask_* columns, keep only
# rows with a non-missing total_clot_burden, and recode resolved_pe as 0/1.
df = (
	pd.read_pickle(Path('../data/df_clean.pkl'))
	.drop(columns=['study_date_mask_cl', 'study_date_mask_pe'])
	.dropna(subset='total_clot_burden')
	.assign(
		resolved_pe=lambda frame: frame['resolved_pe'].map({'Unresolved': 0, 'Resolved': 1})
	)
)
print(df.shape)
df.head()

(99, 104)


Unnamed: 0,a_diameter,age,airway_ratio,airway_volume,anterior_basal_rs8,anterior_ls3,anterior_rs3,anteromedial_basal_ls7_8,apical_ls1,apical_rs1,...,ve_vco2_slope,vo2_hr_peak_percent_,vo2_ml_kg_min_at_at,vo2_work_slope_output,volume_bone,volume_intermuscular_fat,volume_muscle,volume_subcutaneous_fat,volume_visceral_fat,weight_kg
PE1_0,26.7526,49.0,0.011805,0.066788,0.201809,0.206637,0.326371,0.06566,0.0,0.399756,...,,45.0,9.6,,2.03983,0.986196,5.87431,6.59418,0.242408,150.0
PE12_0,21.0758,45.0,0.014465,0.06984,0.395899,0.0,0.002098,0.0,0.0,0.041968,...,29.557,75.0,10.7,8.038,1.85461,0.904912,6.00182,4.8675,1.98806,111.5
PE12_1,22.9284,45.0,0.018161,0.047697,0.0,0.0,0.0,0.0,0.0,0.0,...,29.557,75.0,10.7,8.038,2.02425,0.629943,8.01657,6.28189,2.44348,111.5
PE14_0,22.3061,31.0,0.010895,0.054162,0.238598,0.0,0.0,0.0,0.0,0.0,...,32.776,92.0,10.2,7.718,1.68335,0.347496,5.08316,4.88387,0.470233,85.55
PE15_0,17.006,18.0,0.013111,0.029738,0.0,0.0,0.0,0.0,0.0,0.0,...,28.446,80.0,17.1,,1.16264,0.168386,3.62723,1.53712,0.017716,55.0


## Pipelines

The primary pipeline passes all numeric data through a standard scaler followed by a quantile transformer (to obtain approximately normal distributions). The quantile transformer is especially important for the target variables (clot burdens), which often do not follow a normal distribution. Categorical features are passed through a one-hot encoder, and the categorical target labels are passed through a label encoder.


In [4]:
# Numeric columns are every int/float column; categorical columns are every
# 'category' column except the categorical targets (Index.difference sorts them).
numeric_frame = df.select_dtypes(['int', 'float'])
categorical_frame = df.select_dtypes(['category'])

num_columns = numeric_frame.columns.tolist()
cat_columns = categorical_frame.columns.difference(model_config.cat_targets).tolist()
all_columns = [*num_columns, *cat_columns]

In [5]:
# Every column that must be complete for a row to be usable: targets, both
# feature groups, and the control variables.
cols = [
	*model_config.cat_targets,
	*model_config.num_targets,
	*model_config.body_feat,
	*model_config.cardiopulmonary_feat,
	*model_config.controls,
]

# Keep only rows with no missing value in any of those columns
df_nonnull = df.dropna(subset=cols)
df_nonnull.shape

(90, 104)

In [6]:
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
	"""Adapter that lets ``LabelEncoder`` be used on one column inside a ``ColumnTransformer``.

	``LabelEncoder`` expects a 1-D array, whereas ``ColumnTransformer`` hands each
	transformer a 2-D block and expects a 2-D block back. This class flattens a
	single-column input, encodes it, and returns an ``(n_samples, 1)`` array.

	The label-to-integer mapping is learned in ``fit`` and stored on
	``label_encoder_`` so that ``transform`` applies the same mapping to any later
	data instead of re-deriving it from whatever is being transformed.
	"""

	@staticmethod
	def _as_1d(X):
		"""Return ``X`` as a 1-D array when it is a single column; otherwise unchanged.

		Slicing the one column out (rather than ``np.squeeze``) keeps a one-row
		input 1-D instead of collapsing it to a 0-d scalar. Inputs with several
		columns are passed through so ``LabelEncoder`` rejects them itself.
		"""
		values = np.asarray(X)
		if values.ndim == 2 and values.shape[1] == 1:
			return values[:, 0]
		return np.atleast_1d(values)

	def fit(self, X, y=None):
		"""Learn the label-to-integer mapping from ``X``; ``y`` is ignored. Returns ``self``."""
		self.label_encoder_ = LabelEncoder().fit(self._as_1d(X))
		return self

	def transform(self, X, y=None):
		"""Encode ``X`` and return the codes as an ``(n_samples, 1)`` array."""
		values = self._as_1d(X)
		encoder = getattr(self, 'label_encoder_', None)
		if encoder is None:
			# Unfitted instance: keep the previous stateless behaviour (encode from
			# the data at hand) so direct transform() calls continue to work.
			return LabelEncoder().fit_transform(values).reshape(-1, 1)
		return encoder.transform(values).reshape(-1, 1)

	def get_feature_names_out(self, X, y=None):
		"""Return the input feature names unchanged (one output column per input column)."""
		return X


# Numeric columns: standardise, then map onto a normal distribution by quantiles
# (seeded so the result is reproducible).
num_pipeline = Pipeline([
	('Scaler', StandardScaler()),
	('QuantileTransformer', QuantileTransformer(n_quantiles=20, output_distribution='normal', random_state=SEED)),
])

# Categorical columns: one-hot encode, dropping the first level of each feature;
# categories unseen during fit are ignored rather than raising.
cat_pipeline = Pipeline([
	('Encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Output column order is: label-encoded targets, numeric block, one-hot block,
# then any column not listed above (passed through untouched).
preprocessor = ColumnTransformer(
	transformers=[
		('label_encoder', LabelEncoderTransformer(), model_config.cat_targets),
		('num_pipeline', num_pipeline, num_columns),
		('cat_pipeline', cat_pipeline, cat_columns),
	],
	remainder='passthrough',
	# The one-hot block is sparse; with the default threshold the stacked result
	# can come back as a scipy sparse matrix when overall density is low. The
	# result is wrapped in a pandas DataFrame afterwards, so always return dense.
	sparse_threshold=0,
)

In [7]:
# Fit the preprocessor and transform in one pass (result is a plain 2-D array)
pp_array = preprocessor.fit_transform(df_nonnull)

# Collect the output column names block by block, in the order the
# ColumnTransformer stacks its fitted transformers.
all_feature_names = []
for transformer_name, transformer, columns in preprocessor.transformers_:
	if isinstance(transformer, str):
		# String placeholders have no get_feature_names_out method.
		if transformer == 'drop':
			# Dropped columns contribute nothing to the output
			continue
		# 'passthrough': columns are emitted unchanged. They may be recorded as
		# positional indices rather than labels, so resolve those to names.
		all_feature_names.extend(
			df_nonnull.columns[col] if isinstance(col, (int, np.integer)) else col
			for col in columns
		)
	else:
		all_feature_names.extend(transformer.get_feature_names_out(columns))

# Guard against silently mislabelled columns
assert len(all_feature_names) == pp_array.shape[1], (
	f"Collected {len(all_feature_names)} feature names for {pp_array.shape[1]} output columns"
)

df_pp = pd.DataFrame(
	pp_array,
	index=df_nonnull.index,
	columns=all_feature_names
)

df_pp.head()

Unnamed: 0,resolved_pe,a_diameter,age,airway_ratio,airway_volume,anterior_basal_rs8,anterior_ls3,anterior_rs3,anteromedial_basal_ls7_8,apical_ls1,...,inappropriate_o2_extraction_1.0,normal_1,normal_study_1,preload_insufficiency_1.0,preload_insufficiency_nan,race_White,resting_hfpef_1.0,resting_pah_1,surpassed_ventilatory_ceiling_1.0,systemic_htn_response_1.0
PE1_0,1.0,0.947761,-0.099108,-0.337578,1.208424,1.145468,1.006726,1.682629,0.722378,-5.199338,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_0,0.0,-0.640149,-0.382953,0.469267,1.439863,1.620714,-5.199338,0.523349,-5.199338,-5.199338,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_1,0.0,0.030429,-0.382953,1.594807,0.302507,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE14_0,0.0,-0.20047,-1.008759,-0.634274,0.600126,1.250845,-5.199338,-5.199338,-5.199338,-5.199338,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
PE15_0,0.0,-1.654127,-5.199338,0.077566,-1.009554,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
# Target columns: numeric targets first, then categorical targets
targets = [*model_config.num_targets, *model_config.cat_targets]

# X holds every non-target column (Index.difference returns them sorted);
# Y holds the targets in the order listed above.
feature_columns = df_pp.columns.difference(targets)
X = df_pp[feature_columns]
Y = df_pp[targets]

print(f"X.shape: {X.shape}")
print(f"Y.shape: {Y.shape}")

X.shape: (90, 81)
Y.shape: (90, 22)


### Variance Inflation Factor Elimination

We loop through multiple VIF elimination thresholds to see which variables remain at each cutoff. From there, we select a specific cutoff and eliminate features from both the body composition and cardiopulmonary data.

In [9]:
# VIF threshold used to choose the final feature set: it selects the
# `VIF<={VIF_CUTOFF}` column of the threshold sweep below, so it must be one of
# the swept thresholds (np.arange(2, 11), i.e. 2 through 10).
VIF_CUTOFF = 4

In [10]:
def sequential_VIF(df, threshold):
	"""Drop the highest-VIF column repeatedly until every remaining VIF is <= threshold.

	Parameters
	----------
	df : pandas.DataFrame
		Candidate features, one per column. The caller's frame is left untouched;
		columns are removed from a new frame on each step.
		NOTE(review): presumably expected to contain no missing values - the
		visible caller drops incomplete rows before calling.
	threshold : int or float
		Largest VIF a retained variable may have.

	Returns
	-------
	pandas.DataFrame
		One column named ``VIF<={threshold}`` holding the final VIF of each
		surviving variable, indexed by variable name (index name ``Variable``).
		The index is variable names even when nothing had to be removed, so the
		result always aligns with frames indexed by feature name.
	"""
	def _vif_table(frame):
		# VIF of every column of `frame`, labelled by column name
		scores = [variance_inflation_factor(frame.values, i) for i in range(frame.shape[-1])]
		return pd.Series(
			scores,
			index=pd.Index(frame.columns, name='Variable'),
			name=f'VIF<={threshold}',
			dtype=float,
		)

	vif = _vif_table(df)
	# Stops once all VIFs are within the threshold; an empty table has a NaN max,
	# which also ends the loop.
	while vif.max() > threshold:
		# Remove the single worst offender, then re-score what is left
		df = df.drop(columns=vif.idxmax())
		vif = _vif_table(df)

	return vif.to_frame()

#### Body Composition VIF Feature Elimination

#### Cardiopulmonary VIF Feature Elimination 

#### All Numerical VIF Feature Elimination 

In [11]:
# Iterable of thresholds
thresholds = np.arange(2, 11)

# Create dataframe to store results
feat_index = model_config.body_feat + model_config.cardiopulmonary_feat + model_config.controls_encoded
all_num_vif_df = pd.DataFrame(
	index=feat_index
)

# Loop through thresholds
for thresh in thresholds:
	# Subset to cardiopulmonary data
	feat_df = df_pp.loc[:, feat_index].dropna(axis=0, how='any')
	vif = sequential_VIF(feat_df, thresh)
	all_num_vif_df = pd.concat((all_num_vif_df, vif), axis=1)

all_num_vif_df

Unnamed: 0,VIF<=2,VIF<=3,VIF<=4,VIF<=5,VIF<=6,VIF<=7,VIF<=8,VIF<=9,VIF<=10
volume_visceral_fat,,,,,,,,,
density_visceral_fat,,2.137784,2.910636,2.916892,4.218997,4.218997,4.332478,4.48649,4.486681
mass_visceral_fat,,,3.233776,3.99744,4.039271,4.039271,4.2033,4.317846,4.437557
volume_subcutaneous_fat,,,,,,,,,
density_subcutaneous_fat,,,,,5.715687,5.715687,5.852498,6.469742,6.493655
mass_subcutaneous_fat,,,,,,,,,9.08206
volume_intermuscular_fat,,2.761452,2.881564,2.883939,3.292126,3.292126,3.611331,3.684012,3.95561
density_intermuscular_fat,1.413689,2.358416,2.442214,2.534243,2.762342,2.762342,2.853812,2.888946,2.995675
mass_intermuscular_fat,,,,,,,,,
volume_muscle,,,,,,,,,


In [12]:
# Features that survived elimination at the chosen cutoff are the non-NaN rows
# of that cutoff's column in the sweep table.
cutoff_column = f"VIF<={VIF_CUTOFF}"
uncorrelated_feat = all_num_vif_df[cutoff_column].dropna().index.tolist()

# Split the survivors back into their original groups, preserving survivor order
feature_groups = {
	'body': model_config.body_feat,
	'cardio': model_config.cardiopulmonary_feat,
	'controls': model_config.controls_encoded,
}
kept_by_group = {
	group: [feat for feat in uncorrelated_feat if feat in members]
	for group, members in feature_groups.items()
}
uncorrelated_body_feat = kept_by_group['body']
uncorrelated_cardio_feat = kept_by_group['cardio']
uncorrelated_controls = kept_by_group['controls']

print(f"Body feat:\n{uncorrelated_body_feat}")
print(f"\nCardio feat:\n{uncorrelated_cardio_feat}")
print(f"\nControls:\n{uncorrelated_controls}")

Body feat:
['density_visceral_fat', 'mass_visceral_fat', 'volume_intermuscular_fat', 'density_intermuscular_fat', 'density_bone', 'mass_bone', 'bmi']

Cardio feat:
['emphysema_volume_950hu', 'lung_volume', 'extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv5', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']

Controls:
['age', 'gender_cl_Male']


#### Ensure all necessary columns are in data

In [13]:
# Features shared by every downstream dataset: the VIF survivors of each group
selected_features = uncorrelated_body_feat + uncorrelated_cardio_feat + uncorrelated_controls

# Column sets: everything, numeric targets + features, categorical targets + features
all_needed_columns = model_config.cat_targets + model_config.num_targets + selected_features
prediction_needed_columns = model_config.num_targets + selected_features
classification_needed_columns = model_config.cat_targets + selected_features

# Every needed column must exist in the preprocessed frame
for needed in (all_needed_columns, prediction_needed_columns, classification_needed_columns):
	assert set(needed).issubset(set(df_pp.columns))

## Prediction

### Drop missing values for needed columns

In [14]:
# Drop columns for ols
df_prediction = df_pp.loc[:, prediction_needed_columns].dropna()
prediction_features = uncorrelated_body_feat + uncorrelated_cardio_feat + model_config.controls_encoded
X_prediction = df_prediction.loc[:, prediction_features]
y_prediction = df_prediction.loc[:, model_config.num_targets]

print(f"X.shape: {X_prediction.shape}")
print(f"y.shape: {y_prediction.shape}")

prediction_data = dict(
	X = X_prediction,
	y = y_prediction,
	body_features = uncorrelated_body_feat,
	cardio_features = uncorrelated_cardio_feat
)

X.shape: (90, 21)
y.shape: (90, 21)


### Export

In [15]:
# Serialise the prediction bundle to ../data/prediction_data.pkl
prediction_path = Path('../data/prediction_data.pkl')
with prediction_path.open('wb') as handle:
    pickle.dump(prediction_data, handle)

## Classification

### Drop missing values for needed columns

In [16]:
# Drop columns for ols
df_classification = df_pp.loc[df.pe_obs==0, classification_needed_columns].dropna()
classification_features = uncorrelated_body_feat + uncorrelated_cardio_feat + model_config.controls_encoded
X_classification = df_classification.loc[:, classification_features]
y_classification = df_classification.loc[:, model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

classification_data = dict(
	X = X_classification,
	y = y_classification,
	body_features = uncorrelated_body_feat,
	cardio_features = uncorrelated_cardio_feat
)

X.shape: (42, 21)
y.shape: (42, 1)


### Export

In [17]:
# Serialise the classification bundle to ../data/classification_data.pkl
classification_path = Path('../data/classification_data.pkl')
with classification_path.open('wb') as handle:
    pickle.dump(classification_data, handle)