In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display
import warnings
import seaborn as sns
import ehrapy as ep
import anndata as ad

plt.style.use("ggplot")
warnings.filterwarnings("ignore")
from tableone import TableOne

In [48]:
adata = ep.io.read_csv("./notebooks/ehrapy_data/final_icu.csv.gz", index_column = 'stay_id') 

adata2 = ep.io.read_csv("./notebooks/ehrapy_data/icu_stay_diag.csv.gz") 

In [49]:
adata
# we have 9864 unique icu_stays and 166 features

AnnData object with n_obs × n_vars = 9864 × 164
    obs: 'intime', 'outtime'
    layers: 'original'

In [50]:
data = adata.to_df()
data.index = data.index.astype(int)

In [51]:
adata.to_df().head()

Unnamed: 0_level_0,subject_id,Age,gender,ethnicity,insurance,hospital_death,dod,hadm_id,los,has_E87,...,MORGANELLA MORGANII_positive,"NON-FERMENTER, NOT PSEUDOMONAS AERUGINOSA_positive",STREPTOCOCCUS ANGINOSUS (MILLERI) GROUP_positive,SERRATIA MARCESCENS_positive,CANDIDA ALBICANS_positive,CLOSTRIDIUM DIFFICILE_positive,"CANDIDA ALBICANS, PRESUMPTIVE IDENTIFICATION_positive",ACHROMOBACTER SP._positive,STAPH AUREUS COAG +_positive,ACINETOBACTER BAUMANNII COMPLEX_positive
stay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000484,18421337,91,M,MULTIPLE RACE/ETHNICITY,Medicare,0,2136-02-21 00:00:00,22413411,59.493333,0,...,0,0,0,0,0,0,0,0,0,0
30000831,15726459,78,M,WHITE,Private,0,0,22744101,64.923333,1,...,0,0,0,0,0,0,0,0,0,0
30002498,17938576,59,M,WHITE,Medicare,0,2160-11-25 00:00:00,20818145,25.601111,0,...,0,0,0,0,0,0,0,0,0,0
30003598,15332791,64,F,WHITE,Medicare,0,0,20683754,115.504444,0,...,0,0,0,0,0,2189-04-28,0,0,2189-04-14,0
30004320,17686783,52,F,WHITE,Private,0,2172-02-13 00:00:00,22994815,24.750556,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
print(f"Number of ICU stays: {str(len(data.index.unique()))}")
print(f"Number of survivors: {str(len(data[data['hospital_death']==0].index.unique()))}")
print(f"Number of non-survivors: {str(len(data[data['hospital_death']==1].index.unique()))}")
print(
    f"Mortality: {str(round(100*len(data[data['hospital_death']==1].index.unique()) / len(data.index.unique()),1))}%"
)

Number of ICU stays: 9864
Number of survivors: 8514
Number of non-survivors: 1350
Mortality: 13.7%


https://ehrapy.readthedocs.io/en/stable/tutorials/notebooks/ml_usecases.html

In [56]:
adata.obs["mortality_cat"] = adata[:, "hospital_death"].X.flatten()

In [None]:
adata.obs["mortality_cat"] = adata[:, "hospital_death"].X
adata.obs["mortality_cat"] = adata.obs["mortality_cat"].astype(int).astype(str)
adata.obs["stay_id"] = adata.obs.index.astype(int)
adata

https://ehrapy.readthedocs.io/en/stable/tutorials/notebooks/ml_usecases.html

In [58]:
adata_per_patient = ep.ad.df_to_anndata(data.groupby(["stay_id"]).mean())
adata_per_patient

TypeError: agg function failed [how->mean,dtype->object]

In [29]:
# let's try ehrapy's infer feature types
ep.ad.infer_feature_types(adata)

[93m![0m Features 'hospital_death', 'has_E87', 'has_J18', 'has_I50', 'has_N17', 'has_J96', 'has_E11', 'has_I25', 'has_E78', 'has_J98', 'has_Z51', 'has_I47', 'has_I10', 'has_D64', 'has_Z95', 'has_N18', 'proc_224277', 'proc_224275', 'proc_225402', 'proc_225432', 'proc_224263', 'proc_227194', 'proc_225401', 'proc_225459', 'proc_225752', 'proc_224264', 'proc_224274', 'proc_228129', 'proc_225454', 'proc_228128', 'proc_229351', 'proc_225792', 'proc_225400', 'proc_221217', 'proc_225966', 'proc_225451', 'proc_221214', 'proc_224385', 'proc_229581' were detected as categorical features stored numerically.Please verify and correct using `ep.ad.replace_feature_types` if necessary.
[1m Detected feature types for AnnData object [0m
[1mwith 9864 obs and 166 vars[0m
├── 📅[1m Date features[0m
├── 📐[1m Numerical features[0m
│   ├── ACD-A Citrate (1000ml)
│   ├── ACD-A Citrate (500ml)
│   ├── Acetaminophen-IV
│   ├── Acetylcysteine
│   ├── Age
│   ├── Alteplase (TPA)
│   ├── Aminocaproic acid (

In [30]:
adata = ep.pp.encode(adata, autodetect=True)

[2K[1;34mRunning one-hot encoding on passed column…[0m [35m …[0m
[?25h

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['int', 'str']

In [None]:
adata = ep.pp.encode(adata, encodings={"one-hot": ["service_unit", "day_icu_intime"]})


In [None]:
ep.ad.correct_feature_types(adata, )

In [23]:
obs_metric, var_metrics = ep.pp.qc_metrics(adata2)

In [24]:
obs_metric

Unnamed: 0,missing_values_abs,missing_values_pct
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
...,...,...
9859,0,0.0
9860,0,0.0
9861,0,0.0
9862,0,0.0


In [25]:
var_metrics

Unnamed: 0,missing_values_abs,missing_values_pct,mean,median,standard_deviation,min,max,iqr_outliers
subject_id,0,0.0,,,,,,
stay_id,0,0.0,,,,,,
Age,0,0.0,,,,,,
gender,0,0.0,,,,,,
ethnicity,0,0.0,,,,,,
insurance,0,0.0,,,,,,
hospital_death,0,0.0,,,,,,
dod,0,0.0,,,,,,
hadm_id,0,0.0,,,,,,
los,0,0.0,,,,,,


In [19]:
ep.settings.set_figure_params(figsize=(4, 3), dpi=100)
ep.pl.violin(adata, keys=["Age"], groupby="gender")

KeyError: 'gender'

In [15]:
ep.ad.infer_feature_types(adata2)

[93m![0m Features 'hospital_death', 'has_E87', 'has_J18', 'has_I50', 'has_N17', 'has_J96', 'has_E11', 'has_I25', 'has_E78', 'has_J98', 'has_Z51', 'has_I47', 'has_I10', 'has_D64', 'has_Z95', 'has_N18' were detected as categorical features stored numerically.Please verify and correct using `ep.ad.replace_feature_types` if necessary.
[1m Detected feature types for AnnData object [0m
[1mwith 9864 obs and 25 vars[0m
├── 📅[1m Date features[0m
├── 📐[1m Numerical features[0m
│   ├── Age
│   ├── hadm_id
│   ├── los
│   ├── stay_id
│   └── subject_id
└── 🗂️[1m Categorical features[0m
    ├── dod (4169 categories)
    ├── ethnicity (33 categories)
    ├── gender (2 categories)
    ├── has_D64 (2 categories)
    ├── has_E11 (2 categories)
    ├── has_E78 (2 categories)
    ├── has_E87 (2 categories)
    ├── has_I10 (2 categories)
    ├── has_I25 (2 categories)
    ├── has_I47 (2 categories)
    ├── has_I50 (2 categories)
    ├── has_J18 (1 categories)
    ├── has_J96 (2 categories)
   

In [None]:
ep.ad.correct_feature_types # use this to fix if feature types aren't correct

In [9]:
adata.obs_names

Index(['Cell_0', 'Cell_1', 'Cell_2',
       'Cell_3', 'Cell_4', 'Cell_5',
       'Cell_6', 'Cell_7', 'Cell_8',
       'Cell_9',
       ...
       'Cell_9854', 'Cell_9855', 'Cell_9856',
       'Cell_9857', 'Cell_9858', 'Cell_9859',
       'Cell_9860', 'Cell_9861', 'Cell_9862',
       'Cell_9863'],
      dtype='object', length=9864)

In [5]:
TableOne(adata.obs, categorical=['gender', 'ethnicity', 'insurance'])

KeyError: "None of [Index(['gender', 'ethnicity', 'insurance'], dtype='object')] are in the [columns]"