In [1]:
from pathlib import Path
import os
import pickle
from config import model_config

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import (
	BaseEstimator, TransformerMixin
)
from sklearn.compose import (
	ColumnTransformer
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer, FunctionTransformer, 
    PowerTransformer
)
from statsmodels.stats.outliers_influence import variance_inflation_factor



# Presumably the random seed for stochastic steps; no cell visible in this
# notebook references it — TODO confirm it is still needed.
SEED = 123

In [2]:
# Load the cleaned data set, discard the two study-date mask columns, keep only
# rows with a recorded total clot burden, and recode the outcome as 0/1.
df = (
    pd.read_pickle(Path('../data/df_clean_w_outliers.pkl'))
    .drop(columns=['study_date_mask_cl', 'study_date_mask_pe'])
    .dropna(subset='total_clot_burden')
    .assign(
        resolved_pe=lambda frame: frame['resolved_pe'].map({'Unresolved': 0, 'Resolved': 1})
    )
)
print(df.shape)
df.head()

(110, 117)


Unnamed: 0,pe_study_number,patient_id,gender_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,...,peak_fick_co,peak_vd_vt,peak_cavo2_a_art_hb,peak_pa_elastance_ea_mmhg_ml_m2,hyperventilation,hyperventilation_num,mpap_co_ratio,normal,dob_mask_cl,study_age_cl
PE1_0,PE1,HB046619,M,3.0,174.0,11.9319,0.399756,0.412309,0.326371,0.653708,...,15.604727,0.13,0.88597,1.170445,1,1.0,1.537996,0,1977-12-13,49.886379
PE12_0,PE12,HB046622,M,4.0,264.0,0.102822,0.041968,0.0,0.002098,0.405691,...,16.523663,0.205714,0.799035,0.540537,1,1.0,1.391943,0,1986-05-02,45.412731
PE12_1,PE12,HB046622,M,2.0,95.0,1.39117,0.0,0.0,0.0,0.114441,...,16.523663,0.205714,0.799035,0.540537,1,1.0,1.391943,0,1986-05-02,45.412731
PE14_0,PE14,HB046616,F,4.0,111.0,0.825384,0.0,0.0,0.0,0.097104,...,18.254263,0.21,0.737339,0.650262,1,1.0,1.58867,0,1997-04-05,31.616701
PE15_0,PE15,HB046618,F,3.0,315.0,0.0,0.0,0.0,0.0,0.0,...,11.849891,0.26,0.778135,0.559831,0,0.0,1.940946,0,2010-08-21,18.699521


# Preprocessing 

## Pipelines

The primary pipeline passes all numeric data through a power transformer (Yeo-Johnson, with standardization) so that each variable is approximately normally distributed with zero mean and unit variance. This transformation is especially important for the target variables (clot burdens), because the clot burden variables often do not follow a normal distribution. Categorical features are passed through a one-hot encoder; note that the resulting indicator columns are then passed through the same power transformer as the numeric columns. The categorical label is passed through a label encoder.


In [3]:
# Numeric predictors/targets: every int or float column.
num_columns = df.select_dtypes(include=['int', 'float']).columns.tolist()

# Categorical predictors: every category-typed column except the categorical targets.
cat_columns = (
    df.select_dtypes(include=['category'])
    .columns
    .difference(model_config.cat_targets)
    .tolist()
)

all_columns = [*num_columns, *cat_columns]

In [4]:
# Columns that must be fully observed for a row to be kept.
cols = [
    *model_config.cat_targets,
    *model_config.num_targets,
    *model_config.body_feat,
    *model_config.cardiopulmonary_feat,
    *model_config.controls,
]

# Complete-case subset over those columns.
df_nonnull = df.dropna(subset=cols)
df_nonnull.shape

(99, 117)

In [5]:
# Preprocessing components (a plain StandardScaler was the alternative considered
# for `transformer`).
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
transformer = PowerTransformer(method='yeo-johnson', standardize=True)
label_encoder = LabelEncoder()

# Integer-encode the categorical target.
encoded_target = label_encoder.fit_transform(
    df_nonnull[model_config.cat_targets].squeeze()
)
y_temp = pd.Series(encoded_target, index=df_nonnull.index, name='resolved_pe')

# One-hot encode the categorical predictors (first level dropped).
onehot_values = encoder.fit_transform(df_nonnull[cat_columns])
df_temp_cat = pd.DataFrame(
    onehot_values,
    index=df_nonnull.index,
    columns=encoder.get_feature_names_out(),
)

# Power-transform the numeric columns together with the one-hot columns.
untransformed = pd.concat([df_nonnull[num_columns], df_temp_cat], axis=1)
df_temp_num = pd.DataFrame(
    transformer.fit_transform(untransformed),
    columns=untransformed.columns,
    index=untransformed.index,
)

# Final preprocessed frame: encoded target first, then all transformed columns.
df_pp = pd.concat([y_temp, df_temp_num], axis=1)
df_pp.head()

Unnamed: 0,resolved_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,medial_rs5,superior_rs6,...,inappropriate_o2_extraction_1.0,normal_1,normal_study_1,preload_insufficiency_1.0,preload_insufficiency_nan,race_White,resting_hfpef_1.0,resting_pah_1,surpassed_ventilatory_ceiling_1.0,systemic_htn_response_1.0
PE1_0,1,-0.213019,0.407237,1.286589,1.8731,2.143067,2.303267,2.017511,2.413267,2.339397,...,-0.707107,-0.789272,-0.755929,1.927248,-0.230633,0.40584,-0.275839,-0.230633,-0.371391,-1.189384
PE12_0,0,0.307231,1.216025,-0.839523,-0.068045,-0.51586,-0.466738,1.666104,1.267476,-0.479203,...,-0.707107,-0.789272,-0.755929,-0.518875,-0.230633,0.40584,-0.275839,-0.230633,-0.371391,-1.189384
PE12_1,0,-1.093862,-0.849457,0.16293,-0.58579,-0.51586,-0.509499,0.468959,-0.500176,-0.619258,...,-0.707107,-0.789272,-0.755929,-0.518875,-0.230633,0.40584,-0.275839,-0.230633,-0.371391,-1.189384
PE14_0,0,0.307231,-0.516685,-0.13829,-0.58579,-0.51586,-0.509499,0.336786,-0.500176,-0.619258,...,1.414214,-0.789272,-0.755929,-0.518875,-0.230633,0.40584,-0.275839,-0.230633,-0.371391,0.840771
PE15_0,0,-0.213019,1.545168,-1.000427,-0.58579,-0.51586,-0.509499,-0.655578,-0.500176,-0.619258,...,1.414214,-0.789272,-0.755929,1.927248,-0.230633,0.40584,-0.275839,-0.230633,-0.371391,-1.189384


In [6]:
def plot_distributions(data, features, fname, pre_or_post, rows, cols, figsize=(12, 4)):
    """Save a grid of 20-bin histograms, one panel per feature, under ``../figures``.

    Parameters
    ----------
    data : mapping of column name -> values (e.g. a pandas DataFrame)
        Source of the values; indexed as ``data[feature_name]``.
    features : sequence of str
        Column names to plot, in panel order. Only the first ``rows * cols``
        entries are drawn.
    fname : str
        Output file name inside ``../figures``. A ``.png`` extension is added
        only when the name does not already end in ``.png``, so both
        ``'plot'`` and ``'plot.png'`` produce ``plot.png``.
    pre_or_post : str
        Label inserted into the figure title
        (``'Distributions: {pre_or_post}-processing'``).
    rows, cols : int
        Shape of the subplot grid.
    figsize : tuple of float, default (12, 4)
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    # squeeze=False always yields a 2-D array of Axes, so a 1x1 grid works too.
    fig, axs = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    for i, ax in enumerate(axs.ravel()):
        if i < len(features):
            feat_name = features[i]
            ax.hist(data[feat_name], bins=20)
            ax.set_title(feat_name, fontsize=10)
        else:
            # Hide panels that have no feature instead of leaving empty frames.
            ax.set_visible(False)
    fig.suptitle(f'Distributions: {pre_or_post}-processing')
    fig.tight_layout()

    # Avoid the doubled extension ("name.png.png") when fname already has one.
    out_path = Path('../figures') / fname
    if out_path.suffix.lower() != '.png':
        out_path = out_path.with_name(out_path.name + '.png')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path)
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# One entry per figure:
# (frame, feature list, file name, stage label, grid rows, grid cols, extra kwargs)
distribution_plots = [
    (df, model_config.body_feat, 'distribution_body_pre.png', 'pre', 3, 6, {}),
    (df_pp, model_config.body_feat, 'distribution_body_post.png', 'post', 3, 6, {}),
    (df, model_config.cardiopulmonary_feat, 'distribution_cardio_pre.png', 'pre', 3, 6, {}),
    (df_pp, model_config.cardiopulmonary_feat, 'distribution_cardio_post.png', 'post', 3, 6, {}),
    (df, model_config.num_targets, 'distribution_targets_pre.png', 'pre', 4, 6, {'figsize': (12, 5)}),
    (df_pp, model_config.num_targets, 'distribution_targets_post.png', 'post', 4, 6, {'figsize': (12, 5)}),
]

for plot_frame, plot_feats, plot_name, plot_stage, n_rows, n_cols, plot_kwargs in distribution_plots:
    plot_distributions(plot_frame, plot_feats, plot_name, plot_stage, n_rows, n_cols, **plot_kwargs)

In [7]:
targets = [*model_config.num_targets, *model_config.cat_targets]

# Predictors: every preprocessed column that is not a target.
feature_columns = df_pp.columns.difference(targets)
X = df_pp.loc[:, feature_columns]

# Responses: numeric targets followed by the categorical target(s).
Y = df_pp.loc[:, targets]

print(f"X.shape: {X.shape}")
print(f"Y.shape: {Y.shape}")

X.shape: (99, 88)
Y.shape: (99, 22)


### Variance Inflation Factor Elimination

We will loop through multiple VIF elimination thresholds to see which variables remain at each cutoff. From there, we will select a specific cutoff and eliminate features from both the body composition and cardiopulmonary data.

In [8]:
def sequential_VIF(df, threshold):
    """Drop the highest-VIF column repeatedly until every VIF is <= ``threshold``.

    On each pass the variance inflation factor of every remaining column is
    computed, and the column with the largest value is removed if that value
    exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate predictors. ``df.values`` is handed to
        ``variance_inflation_factor``, so the columns are assumed to be numeric
        and free of missing values — TODO confirm with callers. The caller's
        frame is not modified (columns are dropped from copies).
    threshold : float
        Largest VIF a column may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        One row per surviving column, indexed by column name (index name
        ``'Variable'``), with a single column named ``f'VIF<={threshold}'``.
        The index holds the variable names even when no column had to be
        removed.
    """
    def _vif_table(frame):
        # VIF of every column of `frame`, keyed by column name.
        values = frame.values
        return pd.Series(
            [variance_inflation_factor(values, i) for i in range(frame.shape[-1])],
            index=frame.columns.rename('Variable'),
            name=f'VIF<={threshold}',
            dtype=float,
        )

    vif = _vif_table(df)
    while vif.max() > threshold:
        # Remove the single worst offender, then recompute: dropping one
        # column changes the VIF of all the others.
        df = df.drop(columns=vif.idxmax())
        vif = _vif_table(df)

    return vif.to_frame()

#### All Numerical VIF Feature Elimination 

#### Ensure all necessary columns are in data

In [9]:
# Predictors shared by every model: body composition, cardiopulmonary, encoded controls.
all_features = (
    model_config.body_feat
    + model_config.cardiopulmonary_feat
    + model_config.controls_encoded
)

# Column sets required by each downstream task (targets first, then predictors).
all_needed_columns = model_config.cat_targets + model_config.num_targets + all_features
prediction_needed_columns = model_config.num_targets + all_features
classification_needed_columns = model_config.cat_targets + all_features

# Every configured column name must exist in the preprocessed frame.
available_columns = set(df_pp.columns)
assert set(all_needed_columns) <= available_columns
assert set(prediction_needed_columns) <= available_columns
assert set(classification_needed_columns) <= available_columns

## Prediction

### Drop missing values for needed columns

In [10]:
# Keep the columns needed for prediction and drop any row with a missing value.
df_prediction = df_pp[prediction_needed_columns].dropna()

X_prediction = df_prediction[all_features]
y_prediction = df_prediction[model_config.num_targets]

print(f"X.shape: {X_prediction.shape}")
print(f"y.shape: {y_prediction.shape}")

# Bundle the matrices with the feature-group names for export.
prediction_data = {
    'X': X_prediction,
    'y': y_prediction,
    'body_features': model_config.body_feat,
    'cardio_features': model_config.cardiopulmonary_feat,
    'controls': model_config.controls_encoded,
}

X.shape: (99, 37)
y.shape: (99, 21)


### Export

In [11]:
# Serialize the prediction bundle next to the other data files.
with Path('../data/prediction_data.pkl').open('wb') as pkl_file:
    pickle.dump(prediction_data, pkl_file)

## Classification

### Drop missing values for needed columns

### Export all observations

In [12]:
# Keep the columns needed for classification and drop any row with a missing value.
df_classification = df_pp[classification_needed_columns].dropna()

X_classification = df_classification[all_features]
y_classification = df_classification[model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

# Bundle the matrices with the feature-group names for export.
classification_data = {
    'X': X_classification,
    'y': y_classification,
    'body_features': model_config.body_feat,
    'cardio_features': model_config.cardiopulmonary_feat,
    'controls': model_config.controls_encoded,
}

with Path('../data/classification_data_all.pkl').open('wb') as pkl_file:
    pickle.dump(classification_data, pkl_file)

X.shape: (99, 37)
y.shape: (99, 1)


### Export initial observations

In [13]:
# Select the rows with pe_obs == 0 (the "initial observations" of this section).
# The mask is built from df_nonnull, the untransformed frame that df_pp was
# derived from, because it has exactly df_pp's index. Building it from the
# larger, unfiltered df would rely on implicit index alignment inside .loc.
is_initial_obs = df_nonnull['pe_obs'].eq(0)

# Keep the columns needed for classification and drop any row with a missing value.
# NOTE: this rebinds df_classification / X_classification / y_classification /
# classification_data, which the "all observations" cell also assigns.
df_classification = df_pp.loc[is_initial_obs, classification_needed_columns].dropna()
X_classification = df_classification.loc[:, all_features]
y_classification = df_classification.loc[:, model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

# Bundle the matrices with the feature-group names for export.
classification_data = dict(
    X=X_classification,
    y=y_classification,
    body_features=model_config.body_feat,
    cardio_features=model_config.cardiopulmonary_feat,
    controls=model_config.controls_encoded,
)

with open(Path('../data/classification_data_initial.pkl'), 'wb') as f:
    pickle.dump(classification_data, f)

X.shape: (45, 37)
y.shape: (45, 1)
