In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import pickle
from tqdm import tqdm

from sklearn.compose import (
	ColumnTransformer
)
from sklearn.decomposition import (
	PCA
)
from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.feature_selection import(
	RFECV, SequentialFeatureSelector
)
from sklearn.linear_model import (
	LinearRegression, LogisticRegression,
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score,
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)
import statsmodels.api as sm

In [2]:
SEED = 123
TEST_SIZE = 0.25

HEATMAP_COLORS = sns.diverging_palette(h_neg=359, h_pos=250, as_cmap=True)


# Import/Preprocess Data

In [5]:
df = pd.read_pickle(Path('../data/df.pickle'))
print(df.shape)
df.head()

(99, 104)


Unnamed: 0,a_diameter,age,airway_ratio,airway_volume,anterior_basal_rs8,anterior_ls3,anterior_rs3,anteromedial_basal_ls7_8,apical_ls1,apical_rs1,...,ve_vco2_slope,vo2_hr_peak_percent_,vo2_ml_kg_min_at_at,vo2_work_slope_output,volume_bone,volume_intermuscular_fat,volume_muscle,volume_subcutaneous_fat,volume_visceral_fat,weight_kg
PE1_0,26.7526,49.0,0.011805,0.066788,0.201809,0.206637,0.326371,0.06566,0.0,0.399756,...,,45.0,9.6,,2.03983,0.986196,5.87431,6.59418,0.242408,150.0
PE12_0,21.0758,45.0,0.014465,0.06984,0.395899,0.0,0.002098,0.0,0.0,0.041968,...,29.557,75.0,10.7,8.038,1.85461,0.904912,6.00182,4.8675,1.98806,111.5
PE12_1,22.9284,45.0,0.018161,0.047697,0.0,0.0,0.0,0.0,0.0,0.0,...,29.557,75.0,10.7,8.038,2.02425,0.629943,8.01657,6.28189,2.44348,111.5
PE14_0,22.3061,31.0,0.010895,0.054162,0.238598,0.0,0.0,0.0,0.0,0.0,...,32.776,92.0,10.2,7.718,1.68335,0.347496,5.08316,4.88387,0.470233,85.55
PE15_0,17.006,18.0,0.013111,0.029738,0.0,0.0,0.0,0.0,0.0,0.0,...,28.446,80.0,17.1,,1.16264,0.168386,3.62723,1.53712,0.017716,55.0


## Identify Target and Independent Variables

In [6]:
# Identify all target columns
# In this case, they will be PE resolution and all 
# clot burden variables.
CAT_TARGETS = [
	'resolved_pe'
]
NUM_TARGETS = [
	'total_clot_burden',
	'centralartery',
	'apical_rs1',
	'posterior_rs2',
	'anterior_rs3',
	'lateral_rs4',
	'medial_rs5',
	'superior_rs6',
	'medial_basal_rs7',
	'anterior_basal_rs8',
	'lateral_basal_rs9',
	'posterior_basal_rs10',
	'apical_ls1',
	'posterior_ls2',
	'anterior_ls3',
	'superior_ls4',
	'inferior_ls5',
	'superior_ls6',
	'anteromedial_basal_ls7_8',
	'lateral_basal_ls9',
	'posterior_basal_ls10',
]
TARGETS = CAT_TARGETS + NUM_TARGETS
# Check columns to drop are named correctly
assert set(TARGETS).issubset(set(df.columns))

# Identify all body composition variables
VFAT_FEAT = ['volume_visceral_fat', 'density_visceral_fat', 'mass_visceral_fat']
SFAT_FEAT = ['volume_subcutaneous_fat', 'density_subcutaneous_fat', 'mass_subcutaneous_fat']
IFAT_FEAT = ['volume_intermuscular_fat', 'density_intermuscular_fat', 'mass_intermuscular_fat']
MUSCLE_FEAT = ['volume_muscle', 'density_muscle', 'mass_muscle']
BONE_FEAT = ['volume_bone', 'density_bone', 'mass_bone']
COMP_FEAT = ['bmi']
BODY_FEAT = VFAT_FEAT + SFAT_FEAT + IFAT_FEAT + MUSCLE_FEAT + BONE_FEAT + COMP_FEAT

# All necessary columns for clot burden predictions
NECESSARY_COLS = CAT_TARGETS + BODY_FEAT

# Drop columns
df_nonnull = df.dropna(subset=NECESSARY_COLS)
df_nonnull.shape

(90, 104)

In [76]:
# Separate X
X = df_nonnull.loc[:, df_nonnull.columns.difference(TARGETS)].select_dtypes(['int', 'float'])
# Separate Y
Y = df_nonnull.loc[:, TARGETS]

print(f"X_all.shape: {X.shape}")
print(f"Y_all.shape: {Y.shape}")

X_all.shape: (90, 65)
Y_all.shape: (90, 22)


# Logit Regression

## Standardize Data

In [729]:
# Standardize data
scaler = StandardScaler()
X_scale_first = pd.DataFrame(
	scaler.fit_transform(X_first),
	columns=X_first.columns,
	index=X_first.index
)
X_scale_all = pd.DataFrame(
	scaler.fit_transform(X_all),
	columns=X_all.columns,
	index=X_all.index
)

print(f"X_body_first.shape: {X_scale_first.shape}")
print(f"X_body_all.shape: {X_scale_all.shape}")

X_body_first.shape: (47, 65)
X_body_all.shape: (99, 65)


## Dependent Variable: PE Resolution

### All observations

In [728]:
Y = Y_first.copy()
Y['resolved_pe'].value_counts(dropna=False)


Unresolved    27
Resolved      15
NaN            5
Name: resolved_pe, dtype: int64

In [733]:
# # Non-null indices
non_null_resolved = set(Y.loc[pd.notnull(Y['resolved_pe']), :].index)
non_null_exog = set(X_scale_first.loc[pd.notnull(X_scale_first['volume_subcutaneous_fat']), :].index)
non_null_all = list(non_null_resolved.intersection(non_null_exog))
print(len(non_null_all))

Y = pd.DataFrame(
	Y.loc[non_null_all, :],
	columns=['resolved_pe'],
	index=non_null_all
).sort_index()
X_scale_first = pd.DataFrame(
	X_scale_first.loc[non_null_all, :],
	columns=X_scale_first.columns,
	index=non_null_all
).sort_index()
print(Y.shape)
print(X_scale_first.shape)

42
(42, 1)
(42, 65)


In [734]:
y = pd.DataFrame(
	LabelEncoder().fit_transform(Y['resolved_pe']),
	columns=['resolved_pe'],
	index=Y.index
).astype(np.int64)
y.head()

Unnamed: 0,resolved_pe
PE12_0,1
PE14_0,1
PE15_0,1
PE16_0,1
PE17_0,1


In [735]:
y.value_counts(dropna=False)

resolved_pe
1              27
0              15
dtype: int64

In [736]:
X_scale_first.loc[:, 'volume_subcutaneous_fat'].info()

<class 'pandas.core.series.Series'>
Index: 42 entries, PE12_0 to PE9_0
Series name: volume_subcutaneous_fat
Non-Null Count  Dtype  
--------------  -----  
42 non-null     float64
dtypes: float64(1)
memory usage: 672.0+ bytes


In [737]:
X_temp = X_scale_first.loc[:, ['volume_subcutaneous_fat']]
logit_temp = sm.Logit(y, X_temp).fit()
logit_temp.params

Optimization terminated successfully.
         Current function value: 0.682888
         Iterations 4


volume_subcutaneous_fat    0.318115
dtype: float64

In [720]:
X_temp.shape

(42, 1)

In [721]:
y.shape

(47, 1)