In [1]:
from pathlib import Path
import os
import pickle
from config import model_config

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import (
	BaseEstimator, TransformerMixin
)
from sklearn.compose import (
	ColumnTransformer
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer, FunctionTransformer, 
    PowerTransformer
)
from statsmodels.stats.outliers_influence import variance_inflation_factor



# Fixed random seed for reproducibility.
# NOTE(review): not referenced in the cells visible here — confirm it is used downstream.
SEED = 123

In [2]:
# Load the cleaned dataset, discard the two masked study-date columns, keep
# only rows that have a total clot burden, and recode the outcome to 0/1.
df = (
    pd.read_pickle(Path('../data/df_clean_w_outliers.pkl'))
    .drop(columns=['study_date_mask_cl', 'study_date_mask_pe'])
    .dropna(subset='total_clot_burden')
    .assign(
        resolved_pe=lambda frame: frame['resolved_pe'].map(
            {'Unresolved': 0, 'Resolved': 1}
        )
    )
)
print(df.shape)
df.head()

(107, 123)


Unnamed: 0,pe_number_clean,patient_id,gender_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,...,peak_vd_vt,peak_cavo2_a_art_hb,peak_pa_elastance_ea_mmhg_ml_m2,hyperventilation,vq_date,hyperventilation_num,mpap_co_ratio,normal,dob_mask_cl,study_age_cl
PE1_0,PE1,HB046619,M,3.0,174.0,12.7304,0.476038,0.839102,0.30899,0.784063,...,0.13,0.88597,1.170445,1.0,2027-09-30,1.0,1.537996,0,1977-12-13,49.886379
PE12_0,PE12,HB046622,M,4.0,264.0,0.0,0.025181,0.0,0.032875,0.510612,...,0.205714,0.799035,0.540537,1.0,2031-09-25,1.0,1.391943,0,1986-05-02,45.412731
PE12_1,PE12,HB046622,M,2.0,95.0,5.26785,0.33617,0.075102,0.0,0.461339,...,0.205714,0.799035,0.540537,1.0,2031-09-25,1.0,1.391943,0,1986-05-02,45.412731
PE14_0,PE14,HB046616,F,3.0,167.0,1.08294,0.0,0.0,0.0,0.0,...,0.21,0.737339,0.650262,1.0,2028-11-16,1.0,1.58867,0,1997-04-05,31.616701
PE15_0,PE15,HB046618,F,3.0,330.0,0.001022,0.0,0.0,0.0,0.005451,...,0.26,0.778135,0.559831,0.0,2029-04-18,0.0,1.940946,0,2010-08-21,18.699521


# Preprocessing 

## Pipelines

The primary pipeline passes all numeric data through a standard scaler; a power/quantile transformer (to bring distributions closer to normal) is the intended second step. This transformation is especially important for the target variables (clot burdens), which often do not follow a normal distribution. Note that the cell below currently applies only `StandardScaler`; the `PowerTransformer` line is commented out. Categorical features are passed through a one-hot encoder, and labels are passed through a label encoder.


In [3]:
# Numeric feature names, in the order they appear in df.
num_columns = df.select_dtypes(['int', 'float']).columns.tolist()

# Categorical feature names, excluding the categorical target(s).
categorical_index = df.select_dtypes(['category']).columns
cat_columns = categorical_index.difference(model_config.cat_targets).tolist()

all_columns = num_columns + cat_columns

In [4]:
# Every column that must be present (non-null) for a row to be modelled.
cols = (
    model_config.cat_targets
    + model_config.num_targets
    + model_config.body_feat
    + model_config.cardiopulmonary_feat
    + model_config.controls
    + model_config.clot_feat
)

# Keep only rows that are complete across all of those columns.
df_nonnull = df.dropna(subset=cols)
df_nonnull.shape

(96, 123)

In [5]:
# Estimators used for the three column groups.
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
# Alternative numeric transform, currently disabled:
# transformer = PowerTransformer(method='yeo-johnson', standardize=True)
transformer = StandardScaler()
label_encoder = LabelEncoder()

# Target: label-encode the categorical target column.
target_values = df_nonnull[model_config.cat_targets].squeeze()
y_temp = pd.Series(
    label_encoder.fit_transform(target_values),
    index=df_nonnull.index,
    name='resolved_pe',
)

# Categorical features: one-hot encode, keeping the original row index.
encoded_cat = encoder.fit_transform(df_nonnull[cat_columns])
df_temp_cat = pd.DataFrame(
    encoded_cat,
    index=df_nonnull.index,
    columns=encoder.get_feature_names_out(),
)

# Numeric features: scale, keeping the original column labels and row index.
numeric_frame = df_nonnull[num_columns]
df_temp_num = pd.DataFrame(
    transformer.fit_transform(numeric_frame),
    columns=numeric_frame.columns,
    index=numeric_frame.index,
)

# Assemble: numeric + categorical features, then prepend the encoded target.
df_temp_all = pd.concat([df_temp_num, df_temp_cat], axis=1)
df_pp = pd.concat([y_temp, df_temp_all], axis=1)
df_pp.head()

Unnamed: 0,resolved_pe,series_id,slice_number,centralartery,apical_rs1,posterior_rs2,anterior_rs3,lateral_rs4,medial_rs5,superior_rs6,...,inappropriate_o2_extraction_1.0,normal_1,normal_study_1,preload_insufficiency_1.0,preload_insufficiency_nan,race_White,resting_hfpef_1.0,resting_pah_1,surpassed_ventilatory_ceiling_1.0,systemic_htn_response_1.0
PE1_0,1,-0.060157,-0.600501,1.317463,1.410442,4.643375,1.044058,3.043933,2.202336,2.926237,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_0,0,0.902349,0.053106,-0.622559,-0.520512,-0.524311,-0.222947,1.781747,0.03936,-0.519309,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE12_1,0,-1.022662,-1.174222,0.180224,0.811409,-0.06179,-0.3738,1.554314,-0.388364,-0.148709,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
PE14_0,0,-0.060157,-0.651337,-0.457526,-0.628358,-0.524311,-0.3738,-0.57512,-0.388364,-0.595827,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
PE15_0,0,-0.060157,0.532417,-0.622403,-0.628358,-0.524311,-0.3738,-0.549961,-0.388364,-0.595827,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
def plot_distributions(data, features, fname, pre_or_post, rows, cols, figsize=(12, 4)):
    """Save a grid of 20-bin histograms, one per feature, under ``../figures``.

    Parameters
    ----------
    data : mapping-like (e.g. DataFrame)
        Indexed as ``data[feature_name]`` to obtain the values to plot.
    features : sequence of str
        Feature names to plot, one per subplot, filled row by row. If there
        are more features than ``rows * cols`` the extras are not drawn.
    fname : str
        Output file name, with or without a trailing ``.png``.
    pre_or_post : str
        Text inserted in the figure title ("Distributions: <x>-processing").
    rows, cols : int
        Subplot grid dimensions.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.
    """
    # Callers pass names that already end in '.png'; strip it so the file is
    # not written as '<name>.png.png'.
    stem = str(fname)
    if stem.endswith('.png'):
        stem = stem[:-len('.png')]

    # squeeze=False guarantees a 2-D array of Axes, so a 1x1 grid also works.
    fig, axs = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    # zip stops at the shorter sequence: surplus axes stay empty, exactly as before.
    for ax, feat_name in zip(axs.reshape(-1), features):
        ax.hist(data[feat_name], bins=20)
        ax.set_title(feat_name, fontsize=10)

    # Operate on this figure explicitly rather than on pyplot's "current" figure.
    fig.suptitle(f'Distributions: {pre_or_post}-processing')
    fig.tight_layout()
    fig.savefig(f'../figures/{stem}.png')
    plt.close(fig)

# One entry per figure: (data, features, file name, stage label, grid rows, figure size).
# Every grid is 6 columns wide.
distribution_jobs = (
    (df, model_config.body_feat, 'distribution_body_pre.png', 'pre', 3, (12, 4)),
    (df_pp, model_config.body_feat, 'distribution_body_post.png', 'post', 3, (12, 4)),
    (df, model_config.cardiopulmonary_feat, 'distribution_cardio_pre.png', 'pre', 3, (12, 4)),
    (df_pp, model_config.cardiopulmonary_feat, 'distribution_cardio_post.png', 'post', 3, (12, 4)),
    (df, model_config.num_targets, 'distribution_targets_pre.png', 'pre', 4, (12, 5)),
    (df_pp, model_config.num_targets, 'distribution_targets_post.png', 'post', 4, (12, 5)),
)

for plot_frame, plot_features, plot_fname, plot_stage, plot_rows, plot_figsize in distribution_jobs:
    plot_distributions(
        plot_frame, plot_features, plot_fname, plot_stage, plot_rows, 6,
        figsize=plot_figsize,
    )

In [7]:
targets = model_config.num_targets + model_config.cat_targets

# Features: every preprocessed column that is not a target.
feature_columns = df_pp.columns.difference(targets)
X = df_pp[feature_columns]

# Targets: numeric targets followed by the categorical target(s).
Y = df_pp[targets]

print(f"X.shape: {X.shape}")
print(f"Y.shape: {Y.shape}")

X.shape: (96, 107)
Y.shape: (96, 8)


In [8]:
# Pairwise correlations among body-composition features; show only entries above 0.8.
p = X.loc[:, model_config.body_feat].corr()
p.where(p > 0.8)

Unnamed: 0,volume_visceral_fat,density_visceral_fat,mass_visceral_fat,volume_subcutaneous_fat,density_subcutaneous_fat,mass_subcutaneous_fat,volume_intermuscular_fat,density_intermuscular_fat,mass_intermuscular_fat,volume_muscle,density_muscle,mass_muscle,volume_bone,density_bone,mass_bone,bmi
volume_visceral_fat,1.0,,0.999966,,,,,,,,,,,,,
density_visceral_fat,,1.0,,,,,,,,,,,,,,
mass_visceral_fat,0.999966,,1.0,,,,,,,,,,,,,
volume_subcutaneous_fat,,,,1.0,,0.99988,,,,,,,,,,
density_subcutaneous_fat,,,,,1.0,,,,,,,,,,,
mass_subcutaneous_fat,,,,0.99988,,1.0,,,,,,,,,,
volume_intermuscular_fat,,,,,,,1.0,,0.999875,,,,,,,
density_intermuscular_fat,,,,,,,,1.0,,,,,,,,
mass_intermuscular_fat,,,,,,,0.999875,,1.0,,,,,,,
volume_muscle,,,,,,,,,,1.0,,0.999656,0.826891,,0.843193,


In [9]:
# Pairwise correlations among cardiopulmonary features; show only entries above 0.8.
p = X.loc[:, model_config.cardiopulmonary_feat].corr()
p.where(p > 0.8)

Unnamed: 0,emphysema_volume_950hu,lung_volume,extrapulmonary_artery_volume,extrapulmonary_vein_volume,intrapulmonary_artery_volume,intrapulmonary_vein_volume,artery_vein_ratio,bv5,bv10,pb_larger_10,pv_diameter,a_diameter,pv_a,heart_volume,airway_volume,airway_ratio
emphysema_volume_950hu,1.0,,,,,,,,,,,,,,,
lung_volume,,1.0,,,,,,,,,,,,,,
extrapulmonary_artery_volume,,,1.0,,,,,,,,0.834637,,,,,
extrapulmonary_vein_volume,,,,1.0,,0.878535,,,,,,,,,,
intrapulmonary_artery_volume,,,,,1.0,0.84687,,,,,,,,,,
intrapulmonary_vein_volume,,,,0.878535,0.84687,1.0,,,,,,,,,,
artery_vein_ratio,,,,,,,1.0,,,,,,,,,
bv5,,,,,,,,1.0,0.933521,,,,,,,
bv10,,,,,,,,0.933521,1.0,,,,,,,
pb_larger_10,,,,,,,,,,1.0,,,,,,


### Variance Inflation Factor Elimination

We will loop through multiple VIF elimination thresholds to see which variables remain at each cutoff. From there, we will select a specific cutoff and eliminate features from both the body composition and cardiopulmonary data.

#### All Numerical VIF Feature Elimination 

#### Ensure all necessary columns are in data

In [10]:
# Feature sets for each modelling task. They are defined first so that the
# "needed columns" lists below are derived from them instead of re-typing the
# same concatenation (column order is identical to the hand-written lists).
prediction_features = (
    model_config.body_feat
    + model_config.cardiopulmonary_feat
    + model_config.controls_encoded
)
classification_features = prediction_features + model_config.clot_feat

# Targets + features required by each task.
prediction_needed_columns = model_config.num_targets + prediction_features
classification_needed_columns = model_config.cat_targets + classification_features
all_needed_columns = (
    model_config.cat_targets
    + model_config.num_targets
    + classification_features
)

# Verify every required column exists in the preprocessed frame, and name the
# missing ones on failure so a config/encoding mismatch is easy to diagnose.
for needed_name, needed_columns in (
    ('all_needed_columns', all_needed_columns),
    ('prediction_needed_columns', prediction_needed_columns),
    ('classification_needed_columns', classification_needed_columns),
):
    missing_columns = sorted(set(needed_columns) - set(df_pp.columns))
    assert not missing_columns, (
        f"{needed_name}: columns missing from df_pp: {missing_columns}"
    )

## Prediction

### Drop missing values for needed columns

In [11]:
# Drop columns for ols
df_prediction = df_pp.loc[:, prediction_needed_columns].dropna()

X_prediction = df_prediction.loc[:, prediction_features]
y_prediction = df_prediction.loc[:, model_config.num_targets]

print(f"X.shape: {X_prediction.shape}")
print(f"y.shape: {y_prediction.shape}")

prediction_data = dict(
	X = X_prediction,
	y = y_prediction,
	body_features = model_config.body_feat,
	cardio_features = model_config.cardiopulmonary_feat,
    controls = model_config.controls_encoded
)

X.shape: (96, 34)
y.shape: (96, 7)


### Export

In [12]:
# Persist the prediction bundle for later use.
prediction_path = Path('../data/prediction_data.pkl')
with prediction_path.open('wb') as handle:
    pickle.dump(prediction_data, handle)

## Classification

### Drop missing values for needed columns

### Export initial observations

In [13]:
# Restrict to rows with pe_obs == 0 (the "initial observations" this section
# exports). pe_obs is read from the un-scaled data, and the mask is built from
# df_nonnull because df_pp was constructed on df_nonnull's index; the two
# therefore line up row-for-row. Building the mask from `df` (which has more
# rows) only worked through pandas' implicit index alignment.
initial_obs_mask = df_nonnull['pe_obs'] == 0

# Keep the columns the classification task needs and drop incomplete rows.
df_classification = df_pp.loc[initial_obs_mask, classification_needed_columns].dropna()
X_classification = df_classification.loc[:, classification_features]
y_classification = df_classification.loc[:, model_config.cat_targets]

print(f"X.shape: {X_classification.shape}")
print(f"y.shape: {y_classification.shape}")

# Bundle the data with the feature-group names for export.
classification_data = dict(
    X=X_classification,
    y=y_classification,
    body_features=model_config.body_feat,
    cardio_features=model_config.cardiopulmonary_feat,
    controls=model_config.controls_encoded,
    clot_features=model_config.clot_feat,
)

with open(Path('../data/classification_data_initial.pkl'), 'wb') as f:
    pickle.dump(classification_data, f)

# Also export the same rows as CSV.
df_classification.to_csv("../data/classification_data.csv")

X.shape: (45, 40)
y.shape: (45, 1)
