In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm

from sklearn.compose import (
	ColumnTransformer
)
from sklearn.decomposition import (
	PCA
)
from sklearn.ensemble import (
	RandomForestClassifier
)
from sklearn.linear_model import (
	LogisticRegression
)
from sklearn.metrics import (
	confusion_matrix,
	classification_report,
	f1_score,
)
from sklearn.model_selection import (
	train_test_split,
	RandomizedSearchCV,
	GridSearchCV,
	cross_val_score,
)
from sklearn.pipeline import (
	Pipeline
)
from sklearn.preprocessing import (
	LabelEncoder,
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	RobustScaler,
	QuantileTransformer,
)


In [2]:
import regression.reg as reg

In [3]:
# Load the pickled clinical table.
# NOTE: unpickling can execute arbitrary code, so only read trusted files.
df = pd.read_pickle('../data/clinical.pickle')

# Index rows by PE study number. drop=False leaves the column in the frame so
# it is removed together with the other unwanted columns below.
df = df.set_index('pe_study_number', drop=False)

# Columns excluded from the analysis
cols_to_drop = [
	'pe_study_number', # identifier, now carried by the index
	'age', # superseded by the more precise study_age
	'dob_mask', # not needed
	'study_date_mask', # not needed
]

# Index.difference returns the remaining labels sorted where possible, so the
# resulting frame has its columns in alphabetical order.
df = df[df.columns.difference(cols_to_drop)]
df.head()

Unnamed: 0_level_0,approached_ventilatory_ceiling,bmi,borderline_ph,bsa,deconditioning,eph,estimated_peak_vo2_ml_kg_min,estimated_peak_vo2_ml_min,exercise_hfpef,exercise_ph_resting_hfpef,...,resting_ph_exercise_hfpef,study_age,surpassed_ventilatory_ceiling,systemic_htn_response,ve_vco2_at_at,ve_vco2_slope,vo2_hr_peak_percent_,vo2_ml_kg_min_at_at,vo2_work_slope_output,weight_kg
pe_study_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PE1,0.0,35.5,1,2.87,1.0,0.0,24.0,3604.0,0.0,0.0,...,0.0,49.886379,0.0,0.0,31.0,,45.0,9.6,,150.0
PE11,,33.07,0,2.07,,,20.9,1993.0,,,...,,37.026694,,,41.0,46.877,83.0,8.1,7.805,95.6
PE12,0.0,32.72,0,2.34,1.0,0.0,27.4,3045.0,0.0,0.0,...,0.0,45.412731,0.0,0.0,31.0,29.557,75.0,10.7,8.038,111.5
PE14,0.0,30.455938,0,1.95,0.0,0.0,23.7,2027.0,0.0,0.0,...,0.0,31.616701,0.0,1.0,30.0,32.776,92.0,10.2,7.718,85.55
PE15,0.0,21.43,0,1.56,0.0,0.0,35.2,1930.0,0.0,0.0,...,0.0,18.699521,0.0,0.0,31.0,28.446,80.0,17.1,,55.0


In [7]:
# Number of rows per resolved_pe label (missing labels are not counted)
df['resolved_pe'].value_counts()

Unresolved    28
Resolved      16
Name: resolved_pe, dtype: int64

In [4]:
# Run the project's numeric_ttests helper grouped on resolved_pe and save the
# complete result table to disk.
ttests_df = reg.numeric_ttests(df, 'resolved_pe')
ttests_df.to_csv('../output/summary/iCPET_numeric_ttests_resolved.csv')

# Display only the rows whose p-value is at most 0.05
ttests_df[ttests_df['p_value'] <= 0.05]

Unnamed: 0,mean_Resolved,mean_Unresolved,count_Resolved,count_Unresolved,std_Resolved,std_Unresolved,t_stat,p_value,significance
peak_cavo2,13.827425,12.125982,15.0,28.0,2.208464,2.344787,2.356054,0.025132,*
percent_co_achieved,73.066667,92.917536,15.0,28.0,20.512581,18.184435,-3.144172,0.004149,**
percent_peak_vo2,68.267023,77.701205,16.0,28.0,12.521529,17.493514,-2.072137,0.044793,*
study_age,58.486653,48.892148,16.0,28.0,13.828059,15.542087,2.115053,0.041726,*
vo2_hr_peak_percent_,79.875,91.928571,16.0,28.0,12.996794,18.348334,-2.536559,0.015213,*


In [6]:
# Same helper, this time grouped on the combination of normal and resolved_pe;
# the complete result table is saved to its own file.
ttests_df = reg.numeric_ttests(df, ['normal', 'resolved_pe'])
ttests_df.to_csv('../output/summary/iCPET_numeric_ttests_resolved_normal.csv')

# Display only the rows whose p-value is at most 0.05
ttests_df[ttests_df['p_value'] <= 0.05]

Unnamed: 0,mean_Resolved_0,mean_Resolved_1,mean_Unresolved_0,mean_Unresolved_1,count_Resolved_0,count_Resolved_1,count_Unresolved_0,count_Unresolved_1,std_Resolved_0,std_Resolved_1,std_Unresolved_0,std_Unresolved_1,t_stat,p_value,significance
peak_cavo2,14.027987,13.02518,11.019484,13.23248,12.0,3.0,14.0,14.0,2.099872,2.946591,1.612898,2.486063,4.691443,0.006836797,**
peak_fick_co,11.22942,12.352693,12.15002,14.720721,12.0,3.0,14.0,14.0,3.516874,2.643598,3.220509,2.534176,3.092703,0.03796509,*
peak_vo2_ml_min,1528.230769,1540.666667,1306.357143,1911.428571,13.0,3.0,14.0,14.0,414.046325,625.743025,386.678083,542.554317,4.065624,0.01301,*
percent_co_achieved,65.319167,104.056667,85.779286,100.055786,12.0,3.0,14.0,14.0,10.008129,24.367081,16.508101,17.450444,11.786838,1.230585e-05,***
percent_peak_vo2,63.710172,88.013377,63.334379,92.068032,13.0,3.0,14.0,14.0,8.67802,2.053624,10.220461,9.303889,31.907072,1.065774e-10,***
percent_vo2_at_at,43.453004,72.449789,46.442196,59.862613,13.0,3.0,14.0,14.0,8.941279,8.446047,8.925455,9.297344,14.289342,1.781012e-06,***
vo2_hr_peak_percent_,78.307692,86.666667,80.071429,103.785714,13.0,3.0,14.0,14.0,13.275194,11.150486,13.228549,14.879922,9.908908,5.156839e-05,***
vo2_work_slope_output,7.815091,7.993333,6.900818,8.803692,11.0,3.0,11.0,13.0,1.296254,1.281058,1.343854,0.946651,5.030363,0.005421008,**


In [22]:
# Disabled cell: pairplot of the float columns of df, coloured by resolved_pe,
# saved to ../figures/iCPET_numeric_pairplot.png. Uncomment to regenerate.
# sns.pairplot(
# 	data=df,
# 	vars=df.select_dtypes(['float']).columns,
# 	hue='resolved_pe',
# 	diag_kind='kde'
# )
# plt.savefig('../figures/iCPET_numeric_pairplot.png')
# plt.show()


In [47]:
# Identify columns to drop
cols_to_drop = [
	've_vco2_slope', # Missing values
	'vo2_work_slope_output', # Missing values
]

# Subset data and remove remaining NaN observations
# Should only lose 2 observations
df_clean = df.loc[:, df.columns.difference(cols_to_drop)]
df_clean = df_clean.dropna()

df_clean.shape

(39, 46)

In [38]:
# Partition df_clean's columns into target, numeric and categorical groups.
target_columns = ['resolved_pe']
numeric_columns = df_clean.select_dtypes(include=['int', 'float']).columns
# Categorical features are all category-dtype columns except the target.
categorical_columns = (
	df_clean
	.select_dtypes(include=['category'])
	.columns
	.difference(target_columns)
)

In [39]:
# Preprocessing for the three column groups, combined into one
# ColumnTransformer whose output columns are ordered target -> numeric ->
# categorical.

# The target is encoded as a column inside the ColumnTransformer, so it needs
# a transformer that accepts 2-D input. LabelEncoder only implements
# fit_transform(y) and raises a TypeError when a Pipeline calls it with (X, y);
# OrdinalEncoder produces the same sorted-category integer codes for a
# 2-D column. The step label is left unchanged so existing references to the
# step by name keep resolving.
target_pipeline = Pipeline([
	('LabelEncoder', OrdinalEncoder())
])

# Pipeline steps must be estimator *instances*; passing the StandardScaler
# class itself fails as soon as the pipeline is cloned or fitted.
numeric_pipeline = Pipeline([
	('Scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
	('OneHotEncoder', OneHotEncoder())
])

preprocessor = ColumnTransformer([
	('target', target_pipeline, target_columns),
	('numeric', numeric_pipeline, numeric_columns),
	('categorical', categorical_pipeline, categorical_columns),
])

In [7]:
# Disabled cell: would fit `preprocessor` on df_clean and preview the result.
# NOTE(review): ColumnTransformer.fit_transform returns an array / sparse
# matrix by default, which has no .head() -- wrap the result in a DataFrame
# (or request pandas output, if the installed scikit-learn supports it)
# before re-enabling.
# df_preprocessed = preprocessor.fit_transform(df_clean)
# df_preprocessed.head()

## Analysis
### Graphical Models

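Plan: use [pgmpy](https://pgmpy.org/index.html) for the graphical models (it has great tutorials).

A good starting point is the "Learning Bayesian Networks from Data" notebook:
https://pgmpy.org/detailed_notebooks/10.%20Learning%20Bayesian%20Networks%20from%20Data.html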