In [1]:
from pathlib import Path
import os
import pickle
from config import model_config

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import (
	BaseEstimator, TransformerMixin
)
from sklearn.compose import (
	ColumnTransformer
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer, FunctionTransformer
)
from statsmodels.stats.outliers_influence import variance_inflation_factor



# Seed passed as `random_state` to the QuantileTransformer so preprocessing is reproducible
SEED = 123

# Preprocessing 

In [2]:
# Load the cleaned dataset, discard the masked study-date columns, and keep
# only rows that have a total clot burden recorded
df = (
	pd.read_pickle(Path('../data/df_clean_w_outliers.pkl'))
	.drop(columns=['study_date_mask_cl', 'study_date_mask_pe'])
	.dropna(subset='total_clot_burden')
)
# Encode the resolution status as 0 (Unresolved) / 1 (Resolved)
df['resolved_pe'] = df['resolved_pe'].map({'Unresolved': 0, 'Resolved': 1})
print(df.shape)
df.head()

(104, 113)


Unnamed: 0,pe_study_number,patient_id,gender_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,...,peak_cao2,peak_cvo2,peak_cavo2,peak_paao2,peak_fick_co,peak_vd_vt,peak_cavo2_a_art_hb,normal,dob_mask_cl,study_age_cl
PE1_0,PE1,HB046619,M,3.0,174.0,11.9319,0.399756,0.412309,0.326371,0.653708,...,20.46894,6.825,13.64394,2.905439,15.604727,0.13,0.88597,0,1977-12-13,49.886379
PE12_0,PE12,HB046622,M,4.0,264.0,0.102822,0.041968,0.0,0.002098,0.405691,...,20.68356,8.21862,12.46494,19.274715,16.523663,0.205714,0.799035,0,1986-05-02,45.412731
PE12_1,PE12,HB046622,M,2.0,95.0,1.39117,0.0,0.0,0.0,0.114441,...,20.68356,8.21862,12.46494,19.274715,16.523663,0.205714,0.799035,0,1986-05-02,45.412731
PE14_0,PE14,HB046616,F,4.0,111.0,0.825384,0.0,0.0,0.0,0.097104,...,16.34052,7.41872,8.9218,3.302519,18.254263,0.21,0.737339,0,1997-04-05,31.616701
PE15_0,PE15,HB046618,F,3.0,315.0,0.0,0.0,0.0,0.0,0.0,...,20.79862,8.5819,12.21672,23.80438,11.849891,0.26,0.778135,0,2010-08-21,18.699521


## Pipelines

The primary pipeline passes all numeric data through a standard scaler and then a quantile transformer (to obtain approximately normal distributions). The quantile transformer is especially important for the target variables (clot burdens), because these variables often do not follow a normal distribution. Categorical features are passed through a one-hot encoder, and the categorical target labels are passed through a label encoder.


In [3]:
# Integer- and float-typed columns
num_columns = df.select_dtypes(include=['int', 'float']).columns.tolist()
# Category-typed columns, excluding the categorical targets
cat_columns = (
	df.select_dtypes(include=['category'])
	.columns
	.difference(model_config.cat_targets)
	.tolist()
)
all_columns = [*num_columns, *cat_columns]

In [4]:
# Columns that must be fully observed: targets, both feature families, and controls
cols = [
	*model_config.cat_targets,
	*model_config.num_targets,
	*model_config.body_feat,
	*model_config.cardiopulmonary_feat,
	*model_config.controls,
]

# Keep only rows with no missing value in any of those columns
df_nonnull = df.dropna(subset=cols)
df_nonnull.shape

(95, 113)

In [5]:
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
	"""Label-encode a single column so it can be used inside a ColumnTransformer.

	The label-to-integer mapping is learned in ``fit`` and reused in
	``transform``, so data transformed after fitting receives the same codes
	as the data the transformer was fitted on.
	"""

	@staticmethod
	def _as_1d(X):
		"""Return ``X`` as a 1-D array; accepts 1-D input or a single column.

		Raises:
			ValueError: if ``X`` is neither 1-D nor of shape (n_samples, 1).
		"""
		values = np.asarray(X)
		if values.ndim == 1:
			return values
		if values.ndim == 2 and values.shape[1] == 1:
			# Slice rather than squeeze so a single-row input stays 1-D
			return values[:, 0]
		raise ValueError(
			f"LabelEncoderTransformer expects a single column, got shape {values.shape}"
		)

	def fit(self, X, y=None):
		"""Learn the label-to-integer mapping from ``X``; returns ``self``."""
		self.label_encoder_ = LabelEncoder().fit(self._as_1d(X))
		return self

	def transform(self, X, y=None):
		"""Encode ``X`` and return an array of shape (n_samples, 1).

		Uses the mapping learned in ``fit``. If the transformer has not been
		fitted, the mapping is learned from ``X`` itself, which keeps the
		previous fit-on-the-fly behaviour for direct ``transform`` calls.
		"""
		labels = self._as_1d(X)
		encoder = getattr(self, 'label_encoder_', None)
		if encoder is None:
			encoded = LabelEncoder().fit_transform(labels)
		else:
			encoded = encoder.transform(labels)
		return encoded.reshape(-1, 1)

	def get_feature_names_out(self, X, y=None):
		"""Return the supplied feature names unchanged (one output per input column)."""
		return X


# Numeric columns: standardize, then quantile-transform to a normal output distribution
num_pipeline = Pipeline(steps=[
	('Scaler', StandardScaler()),
	(
		'QuantileTransformer',
		QuantileTransformer(
			n_quantiles=20,
			output_distribution='normal',
			random_state=SEED,
		),
	),
])

# Categorical columns: one-hot encode, dropping the first level of each column
cat_pipeline = Pipeline(steps=[
	('Encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group to its transformer; columns not listed are dropped
preprocessor = ColumnTransformer(
	transformers=[
		('label_encoder', LabelEncoderTransformer(), model_config.cat_targets),
		('num_pipeline', num_pipeline, num_columns),
		('cat_pipeline', cat_pipeline, cat_columns),
	],
	remainder='drop',
)

In [6]:
# Fit the preprocessor on the complete-case rows and transform them
pp_values = preprocessor.fit_transform(df_nonnull)

# Gather output feature names from every fitted transformer, in transformer
# order; string placeholders (e.g. the 'drop' remainder) produce no columns
all_feature_names = [
	feature_name
	for _, fitted_transformer, input_columns in preprocessor.transformers_
	if not isinstance(fitted_transformer, str)
	for feature_name in fitted_transformer.get_feature_names_out(input_columns)
]

df_pp = pd.DataFrame(pp_values, index=df_nonnull.index, columns=all_feature_names)

df_pp.head()

Unnamed: 0,resolved_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,medial_rs5,superior_rs6,...,inappropriate_o2_extraction_1.0,normal_1,normal_study_1,preload_insufficiency_1.0,preload_insufficiency_nan,race_White,resting_hfpef_1.0,resting_pah_1,surpassed_ventilatory_ceiling_1.0,systemic_htn_response_1.0
PE1_0,1.0,-0.199201,0.285171,0.935442,1.255586,1.258075,1.285853,1.272841,1.292814,1.515897,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_0,0.0,0.266994,1.311472,-0.230049,0.486956,-5.199338,0.452713,1.196428,1.040493,0.294385,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_1,0.0,-5.199338,-0.825438,0.174075,-5.199338,-5.199338,-5.199338,0.608237,-5.199338,-5.199338,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE14_0,0.0,0.266994,-0.366106,0.001879,-5.199338,-5.199338,-5.199338,0.571489,-5.199338,-5.199338,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
PE15_0,0.0,-0.199201,1.616647,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
targets = [*model_config.num_targets, *model_config.cat_targets]
# Features: every preprocessed column that is not a target
X = df_pp[df_pp.columns.difference(targets)]
# Targets: numeric targets followed by categorical targets
Y = df_pp[targets]

print(f"X.shape: {X.shape}")
print(f"Y.shape: {Y.shape}")

X.shape: (95, 84)
Y.shape: (95, 22)


### Variance Inflation Factor Elimination

We will loop through several VIF elimination thresholds to see which variables remain at each cutoff. From there, we will select a specific cutoff and eliminate features from both the body composition and cardiopulmonary data.

In [8]:
def sequential_VIF(df, threshold):
	"""Iteratively drop the highest-VIF column until every VIF is <= ``threshold``.

	Args:
		df: DataFrame of numeric features (no missing values); not modified.
		threshold: Largest variance inflation factor allowed to remain.

	Returns:
		A single-column DataFrame named ``f'VIF<={threshold}'`` holding the VIF
		of each surviving feature, indexed by feature name (index name
		``'Variable'``). The index is set even when nothing is eliminated, so
		results for different thresholds can be aligned on feature name.
	"""
	def _compute_vif(frame):
		# VIF of every column of `frame`, keyed by column name
		values = frame.values
		return pd.Series(
			[variance_inflation_factor(values, i) for i in range(frame.shape[-1])],
			index=pd.Index(frame.columns, name='Variable'),
			dtype=float,
		)

	vif = _compute_vif(df)
	# An empty frame gives max() == NaN, which compares False and ends the loop
	while vif.max() > threshold:
		df = df.drop(columns=vif.idxmax())
		vif = _compute_vif(df)

	return vif.to_frame(name=f'VIF<={threshold}')

#### Body Composition VIF Feature Elimination

#### Cardiopulmonary VIF Feature Elimination 

#### All Numerical VIF Feature Elimination 

In [18]:
# Iterable of thresholds
thresholds = np.arange(2, 6.5, 0.5)

# Candidate features: body composition, cardiopulmonary, and encoded controls
feat_index = model_config.body_feat + model_config.cardiopulmonary_feat + model_config.controls_encoded

# The complete-case feature matrix does not depend on the threshold, so build it once
feat_df = df_pp.loc[:, feat_index].dropna(axis=0, how='any')

# One single-column VIF frame per threshold
vif_by_threshold = [sequential_VIF(feat_df, thresh) for thresh in thresholds]

# Align every threshold's result on the full feature list in one concat;
# a feature eliminated at a given threshold is left as NaN in that column
all_num_vif_df = pd.concat(
	[pd.DataFrame(index=feat_index), *vif_by_threshold],
	axis=1,
)

# Create the output directory if needed so the export works on a fresh checkout
vif_output_path = Path('../output/regressions/vif_feature_elimination.csv')
vif_output_path.parent.mkdir(parents=True, exist_ok=True)
all_num_vif_df.to_csv(vif_output_path)

In [10]:
VIF_CUTOFF = 3.5

# Features with a non-null VIF at the chosen cutoff
surviving_vif = all_num_vif_df[f"VIF<={VIF_CUTOFF}"].dropna()
uncorrelated_feat = surviving_vif.index.tolist()

# Split the surviving features back into their original groups
uncorrelated_body_feat = [name for name in uncorrelated_feat if name in model_config.body_feat]
uncorrelated_cardio_feat = [name for name in uncorrelated_feat if name in model_config.cardiopulmonary_feat]
uncorrelated_controls = [name for name in uncorrelated_feat if name in model_config.controls_encoded]

print(f"Body feat:\n{uncorrelated_body_feat}")
print(f"\nCardio feat:\n{uncorrelated_cardio_feat}")
print(f"\nControls:\n{uncorrelated_controls}")

Body feat:
['density_visceral_fat', 'mass_visceral_fat', 'mass_subcutaneous_fat', 'volume_intermuscular_fat', 'density_intermuscular_fat', 'volume_bone', 'density_bone']

Cardio feat:
['emphysema_volume_950hu', 'extrapulmonary_artery_volume', 'extrapulmonary_vein_volume', 'artery_vein_ratio', 'bv5', 'pb_larger_10', 'a_diameter', 'pv_a', 'heart_volume', 'airway_ratio', 'ild_volume']

Controls:
['age', 'gender_cl_Male']


In [11]:
# Pairwise correlations among the retained body-composition features (untransformed data)
df.loc[:, uncorrelated_body_feat].corr()

Unnamed: 0,density_visceral_fat,mass_visceral_fat,mass_subcutaneous_fat,volume_intermuscular_fat,density_intermuscular_fat,volume_bone,density_bone
density_visceral_fat,1.0,-0.494764,-0.321168,-0.536278,0.487487,-0.175516,0.20509
mass_visceral_fat,-0.494764,1.0,0.557478,0.558337,-0.031025,0.562404,-0.298748
mass_subcutaneous_fat,-0.321168,0.557478,1.0,0.73564,-0.0548,0.136581,-0.251577
volume_intermuscular_fat,-0.536278,0.558337,0.73564,1.0,-0.357389,0.256359,-0.226943
density_intermuscular_fat,0.487487,-0.031025,-0.0548,-0.357389,1.0,0.129775,-0.231777
volume_bone,-0.175516,0.562404,0.136581,0.256359,0.129775,1.0,-0.090211
density_bone,0.20509,-0.298748,-0.251577,-0.226943,-0.231777,-0.090211,1.0


#### Ensure all necessary columns are in data

In [12]:
# Predictors retained after VIF elimination
selected_features = uncorrelated_body_feat + uncorrelated_cardio_feat + uncorrelated_controls

all_needed_columns = model_config.cat_targets + model_config.num_targets + selected_features
prediction_needed_columns = model_config.num_targets + selected_features
classification_needed_columns = model_config.cat_targets + selected_features

# Check that every required column exists in the preprocessed data
available_columns = set(df_pp.columns)
for needed in (all_needed_columns, prediction_needed_columns, classification_needed_columns):
	assert set(needed) <= available_columns

## Prediction

### Drop missing values for needed columns

In [13]:
# Keep rows with no missing values across the numeric targets and selected features
df_prediction = df_pp.loc[:, prediction_needed_columns].dropna()
# Use the VIF-filtered controls so the feature list matches the columns in
# `prediction_needed_columns`; indexing with `model_config.controls_encoded`
# raises a KeyError as soon as a control is eliminated by the VIF cutoff
prediction_features = uncorrelated_body_feat + uncorrelated_cardio_feat + uncorrelated_controls
X_prediction = df_prediction.loc[:, prediction_features]
y_prediction = df_prediction.loc[:, model_config.num_targets]

print(f"X.shape: {X_prediction.shape}")
print(f"y.shape: {y_prediction.shape}")

prediction_data = dict(
	X = X_prediction,
	y = y_prediction,
	body_features = uncorrelated_body_feat,
	cardio_features = uncorrelated_cardio_feat
)

X.shape: (95, 20)
y.shape: (95, 21)


### Export

In [14]:
# Persist the prediction inputs as a pickle
prediction_path = Path('../data/prediction_data.pkl')
with prediction_path.open('wb') as pkl_file:
    pickle.dump(prediction_data, pkl_file)

## Classification

### Drop missing values for needed columns

In [15]:
# Drop columns for ols
df_classification = df_pp.loc[df.pe_obs==0, classification_needed_columns].dropna()
classification_features = uncorrelated_body_feat + uncorrelated_cardio_feat + model_config.controls_encoded
X_classification = df_classification.loc[:, classification_features]
y_classification = df_classification.loc[:, model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

classification_data = dict(
	X = X_classification,
	y = y_classification,
	body_features = uncorrelated_body_feat,
	cardio_features = uncorrelated_cardio_feat
)

X.shape: (43, 20)
y.shape: (43, 1)


In [16]:
# Missing-value count per column of X, limited to columns that have any
missing_per_column = X.isna().sum()
missing_per_column[missing_per_column > 0]

peak_cavo2_a_art_hb       1
peak_paao2                1
peak_pvr_wu               2
ve_vco2_slope            10
vo2_work_slope_output    14
dtype: int64

### Export

In [17]:
# Persist the classification inputs as a pickle
classification_path = Path('../data/classification_data.pkl')
with classification_path.open('wb') as pkl_file:
    pickle.dump(classification_data, pkl_file)