In [95]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency, fisher_exact

In [96]:
# Load the pre-processed per-stay statistics table; the first CSV column is
# used as the row index. The last expression previews the first rows.
stats_csv_path = './data_processed/data_stats_.csv'
data = pd.read_csv(stats_csv_path, index_col=0)
data.head(n=5)

Unnamed: 0,los,gender,age,heart_rate,respiratory_rate,hematocrit,rdw,platelet,mcv,mch,...,race_BLACK,race_HISPANIC/LATINO,race_OTHER,race_WHITE,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,insurance_Medicare,insurance_Other,icu_count
327,4.10571,1,73.210959,70.0,16.0,29.1,12.8,151.0,94.0,30.2,...,False,False,False,True,True,False,False,True,False,1
1022,1.9681,1,84.726027,70.0,19.0,31.2,15.3,192.0,82.0,27.7,...,False,False,False,True,True,False,False,True,False,1
896,4.87824,1,60.043836,90.0,37.0,38.1,16.9,16.0,93.0,30.7,...,False,False,True,False,True,False,False,True,False,1
908,5.04106,1,50.728767,89.0,20.0,26.1,15.4,77.0,86.0,31.0,...,False,True,False,False,False,True,False,False,False,1
559,1.37475,0,56.183562,43.0,17.0,29.6,12.7,274.0,92.0,31.9,...,False,False,False,True,True,False,False,False,True,1


In [97]:
# Univariate comparison of every column of `data` between the two outcome
# groups, die_in_icu == 0 and die_in_icu == 1.
#   * bool columns, plus the integer columns named in BINARY_INT_COLS
#     (presumably 0/1-coded -- TODO confirm for uc_only / cd_only):
#     percentage of positive rows per group, chi-squared test on the
#     outcome-by-column contingency table.
#   * float64 columns: mean±std per group and an independent two-sample
#     t-test with scipy's defaults.
#   * any other dtype (this includes die_in_icu itself) is skipped with a
#     warning line.
# Index_res / A_res / B_res / P_res stay as module-level lists because the
# export cell further down rebuilds the table from them.

BINARY_INT_COLS = ('gender', 'uc_only', 'cd_only')
ALPHA = 0.05  # significance threshold used for the '*' marker


def _format_p(p_val):
    """Return the p-value with 4 decimals, suffixed with '*' when p <= ALPHA."""
    text = f'{p_val:.4f}'
    return f'{text}*' if p_val <= ALPHA else text


Index_res = []
A_res = []
B_res = []
P_res = []

# The group masks do not depend on the column, so compute them once.
not_died_mask = data.die_in_icu == 0
died_mask = data.die_in_icu == 1

for col in data.columns:
    # Select as Series (not one-column frames) so that every statistic below
    # is a plain scalar; missing values are dropped per column.
    not_died_vals = data.loc[not_died_mask, col].dropna()
    died_vals = data.loc[died_mask, col].dropna()

    if data[col].dtype == 'bool' or col in BINARY_INT_COLS:
        # mean() of a boolean / 0-1 column is the fraction of positive rows.
        not_died_pct = not_died_vals.mean() * 100
        died_pct = died_vals.mean() * 100

        group_counts = data.groupby(['die_in_icu', col]).size().unstack(fill_value=0)
        # Only the chi-squared p-value is reported. The earlier, unused
        # fisher_exact() call was dropped: its result was discarded and it
        # can raise on tables that are not 2x2.
        p_val = chi2_contingency(group_counts)[1]

        Index_res.append(col)
        A_res.append(f'{not_died_pct:.2f}%')
        B_res.append(f'{died_pct:.2f}%')
        P_res.append(_format_p(p_val))
    elif data[col].dtype == 'float64':
        _, p_val = ttest_ind(not_died_vals, died_vals)

        Index_res.append(col)
        A_res.append(f'{not_died_vals.mean():.2f}±{not_died_vals.std():.2f}')
        B_res.append(f'{died_vals.mean():.2f}±{died_vals.std():.2f}')
        P_res.append(_format_p(p_val))
    else:
        # Reached when the dtype is neither bool nor float64 and the column is
        # not in BINARY_INT_COLS; the column is left out of the table.
        print(f'WARN: {col} is missing.')

pd.DataFrame(
    {
        'Not Die in ICU': A_res,
        'Die in ICU': B_res,
        'P-Value': P_res,
    },
    index=Index_res,
)

WARN: die_in_icu is missing.
WARN: icu_count is missing.


Unnamed: 0,Not Die in ICU,Die in ICU,P-Value
los,3.11±4.20,3.88±5.64,0.0545
gender,46.66%,38.13%,0.0719
age,62.95±15.69,67.69±13.78,0.0008*
heart_rate,87.55±19.51,93.30±21.11,0.0013*
respiratory_rate,19.16±5.60,19.91±6.95,0.1495
hematocrit,30.20±5.76,30.71±5.39,0.3284
rdw,16.13±2.57,16.04±2.18,0.6985
platelet,247.69±156.37,246.94±152.51,0.9576
mcv,90.51±8.55,91.53±8.71,0.1893
mch,29.57±3.17,29.67±3.02,0.7443


In [98]:
# Rebuild the group-comparison table from the result lists filled in by the
# comparison cell and write it to a CSV file in the working directory.
summary_columns = {
    'Not Die in ICU': A_res,
    'Die in ICU': B_res,
    'P-Value': P_res,
}
summary_table = pd.DataFrame(summary_columns, index=Index_res)
summary_table.to_csv('./temp.csv')